aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/devicetree/bindings/arm/pmu.txt3
-rw-r--r--Documentation/kernel-parameters.txt24
-rw-r--r--MAINTAINERS1
-rw-r--r--Makefile2
-rw-r--r--arch/arm/Kconfig102
-rw-r--r--arch/arm/common/Makefile3
-rw-r--r--arch/arm/common/bL_switcher.c864
-rw-r--r--arch/arm/common/bL_switcher_dummy_if.c71
-rw-r--r--arch/arm/common/mcpm_entry.c12
-rw-r--r--arch/arm/common/mcpm_head.S16
-rw-r--r--arch/arm/include/asm/bL_switcher.h83
-rw-r--r--arch/arm/include/asm/hardirq.h2
-rw-r--r--arch/arm/include/asm/mcpm.h8
-rw-r--r--arch/arm/include/asm/pmu.h12
-rw-r--r--arch/arm/include/asm/smp.h2
-rw-r--r--arch/arm/include/asm/topology.h34
-rw-r--r--arch/arm/kernel/hw_breakpoint.c3
-rw-r--r--arch/arm/kernel/perf_event.c19
-rw-r--r--arch/arm/kernel/perf_event_cpu.c117
-rw-r--r--arch/arm/kernel/perf_event_v7.c57
-rw-r--r--arch/arm/kernel/smp.c21
-rw-r--r--arch/arm/kernel/topology.c135
-rw-r--r--arch/x86/kvm/vmx.c11
-rw-r--r--block/blk-core.c3
-rw-r--r--block/blk-ioc.c3
-rw-r--r--block/genhd.c14
-rw-r--r--crypto/algapi.c3
-rw-r--r--drivers/block/nbd.c3
-rw-r--r--drivers/cdrom/cdrom.c2
-rw-r--r--drivers/cpufreq/cpufreq_stats.c1
-rw-r--r--drivers/irqchip/irq-gic.c145
-rw-r--r--drivers/net/phy/phy.c9
-rw-r--r--drivers/power/charger-manager.c2
-rw-r--r--drivers/scsi/osd/osd_uld.c2
-rw-r--r--drivers/scsi/sd.c2
-rw-r--r--drivers/tty/serial/8250/8250_pci.c4
-rw-r--r--drivers/tty/tty_io.c2
-rw-r--r--drivers/video/console/fbcon.c2
-rw-r--r--fs/ceph/xattr.c9
-rw-r--r--fs/hpfs/map.c3
-rw-r--r--fs/hpfs/super.c8
-rw-r--r--fs/nfs/nfs4state.c23
-rw-r--r--fs/nfsd/nfs4xdr.c2
-rw-r--r--include/linux/ceph/decode.h5
-rw-r--r--include/linux/hugetlb.h16
-rw-r--r--include/linux/irqchip/arm-gic.h7
-rw-r--r--include/linux/sched.h13
-rw-r--r--include/linux/vmstat.h2
-rw-r--r--include/linux/workqueue.h35
-rw-r--r--include/trace/events/power_cpu_migrate.h67
-rw-r--r--include/trace/events/sched.h153
-rw-r--r--kernel/futex.c3
-rw-r--r--kernel/irq/irqdesc.c21
-rw-r--r--kernel/module.c34
-rw-r--r--kernel/power/Kconfig20
-rw-r--r--kernel/sched/core.c23
-rw-r--r--kernel/sched/debug.c3
-rw-r--r--kernel/sched/fair.c1130
-rw-r--r--kernel/sched/sched.h13
-rw-r--r--kernel/workqueue.c26
-rw-r--r--linaro/configs/android.conf31
-rw-r--r--linaro/configs/big-LITTLE-IKS.conf5
-rw-r--r--linaro/configs/big-LITTLE-MP.conf11
-rw-r--r--linaro/configs/debug.conf1
-rw-r--r--linaro/configs/distribution.conf44
-rw-r--r--linaro/configs/kvm-guest.conf11
-rw-r--r--linaro/configs/kvm-host.conf11
-rw-r--r--linaro/configs/linaro-base.conf94
l---------linaro/configs/ubuntu-minimal.conf1
-rw-r--r--linaro/configs/xen.conf7
-rw-r--r--mm/hugetlb.c17
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/vmstat.c95
-rw-r--r--net/ceph/auth_none.c6
74 files changed, 3611 insertions, 140 deletions
diff --git a/Documentation/devicetree/bindings/arm/pmu.txt b/Documentation/devicetree/bindings/arm/pmu.txt
index 343781b9f24..4ce82d045a6 100644
--- a/Documentation/devicetree/bindings/arm/pmu.txt
+++ b/Documentation/devicetree/bindings/arm/pmu.txt
@@ -16,6 +16,9 @@ Required properties:
"arm,arm1176-pmu"
"arm,arm1136-pmu"
- interrupts : 1 combined interrupt or 1 per core.
+- cluster : a phandle to the cluster to which it belongs
+ If there is more than one cluster with the same CPU type
+ then there should be separate PMU nodes per cluster.
Example:
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 2fe6e767b3d..ba7daaa688f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1240,6 +1240,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
See comment before ip2_setup() in
drivers/char/ip2/ip2base.c.
+ irqaffinity= [SMP] Set the default irq affinity mask
+ Format:
+ <cpu number>,...,<cpu number>
+ or
+ <cpu number>-<cpu number>
+ (must be a positive range in ascending order)
+ or a mixture
+ <cpu number>,...,<cpu number>-<cpu number>
+
irqfixup [HW]
When an interrupt is not handled search all handlers
for it. Intended to get systems with badly broken
@@ -3341,6 +3350,21 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
that this also can be controlled per-workqueue for
workqueues visible under /sys/bus/workqueue/.
+ workqueue.power_efficient
+ Per-cpu workqueues are generally preferred because
+ they show better performance thanks to cache
+ locality; unfortunately, per-cpu workqueues tend to
+ be more power hungry than unbound workqueues.
+
+ Enabling this makes the per-cpu workqueues which
+ were observed to contribute significantly to power
+ consumption unbound, leading to measurably lower
+ power usage at the cost of small performance
+ overhead.
+
+ The default value of this parameter is determined by
+ the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT.
+
x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
default x2apic cluster mode on platforms
supporting x2apic.
diff --git a/MAINTAINERS b/MAINTAINERS
index ad7e322ad17..48c748080c9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7667,6 +7667,7 @@ STABLE BRANCH
M: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
L: stable@vger.kernel.org
S: Supported
+F: Documentation/stable_kernel_rules.txt
STAGING SUBSYSTEM
M: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
diff --git a/Makefile b/Makefile
index e5e3ba08519..b75cc30af7b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
VERSION = 3
PATCHLEVEL = 10
-SUBLEVEL = 0
+SUBLEVEL = 1
EXTRAVERSION =
NAME = Unicycling Gorilla
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index c24b364bb8c..effe59d4774 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1494,6 +1494,90 @@ config SCHED_SMT
MultiThreading at a cost of slightly increased overhead in some
places. If unsure say N here.
+config DISABLE_CPU_SCHED_DOMAIN_BALANCE
+ bool "(EXPERIMENTAL) Disable CPU level scheduler load-balancing"
+ help
+ Disables scheduler load-balancing at CPU sched domain level.
+
+config SCHED_HMP
+ bool "(EXPERIMENTAL) Heterogenous multiprocessor scheduling"
+ depends on DISABLE_CPU_SCHED_DOMAIN_BALANCE && SCHED_MC && FAIR_GROUP_SCHED && !SCHED_AUTOGROUP
+ help
+ Experimental scheduler optimizations for heterogeneous platforms.
+ Attempts to introspectively select task affinity to optimize power
+ and performance. Basic support for multiple (>2) cpu types is in place,
+ but it has only been tested with two types of cpus.
+ There is currently no support for migration of task groups, hence
+ !SCHED_AUTOGROUP. Furthermore, normal load-balancing must be disabled
+ between cpus of different type (DISABLE_CPU_SCHED_DOMAIN_BALANCE).
+
+config SCHED_HMP_PRIO_FILTER
+ bool "(EXPERIMENTAL) Filter HMP migrations by task priority"
+ depends on SCHED_HMP
+ help
+ Enables task priority based HMP migration filter. Any task with
+ a NICE value above the threshold will always be on low-power cpus
+ with less compute capacity.
+
+config SCHED_HMP_PRIO_FILTER_VAL
+ int "NICE priority threshold"
+ default 5
+ depends on SCHED_HMP_PRIO_FILTER
+
+config HMP_FAST_CPU_MASK
+ string "HMP scheduler fast CPU mask"
+ depends on SCHED_HMP
+ help
+ Leave empty to use device tree information.
+ Specify the cpuids of the fast CPUs in the system as a list string,
+ e.g. cpuid 0+1 should be specified as 0-1.
+
+config HMP_SLOW_CPU_MASK
+ string "HMP scheduler slow CPU mask"
+ depends on SCHED_HMP
+ help
+ Leave empty to use device tree information.
+ Specify the cpuids of the slow CPUs in the system as a list string,
+ e.g. cpuid 0+1 should be specified as 0-1.
+
+config HMP_VARIABLE_SCALE
+ bool "Allows changing the load tracking scale through sysfs"
+ depends on SCHED_HMP
+ help
+ When turned on, this option exports the thresholds and load average
+ period value for the load tracking patches through sysfs.
+ The values can be modified to change the rate of load accumulation
+ and the thresholds used for HMP migration.
+ The load_avg_period_ms is the time in ms to reach a load average of
+ 0.5 for an idle task of 0 load average ratio that starts a busy loop.
+ The up_threshold and down_threshold are the values at which a task
+ goes to a faster CPU or goes back to a slower CPU.
+ The {up,down}_threshold are divided by 1024 before being compared
+ to the load average.
+ For example, with load_avg_period_ms = 128 and up_threshold = 512,
+ a running task with a load of 0 will be migrated to a bigger CPU after
+ 128ms, because after 128ms its load_avg_ratio is 0.5 and the real
+ up_threshold is 0.5.
+ This patch has the same behavior as changing the Y of the load
+ average computation to
+ (1002/1024)^(LOAD_AVG_PERIOD/load_avg_period_ms)
+ but it removes intermediate overflows in the computation.
+
+config HMP_FREQUENCY_INVARIANT_SCALE
+ bool "(EXPERIMENTAL) Frequency-Invariant Tracked Load for HMP"
+ depends on HMP_VARIABLE_SCALE && CPU_FREQ
+ help
+ Scales the current load contribution in line with the frequency
+ of the CPU that the task was executed on.
+ In this version, we use a simple linear scale derived from the
+ maximum frequency reported by CPUFreq.
+ Restricting tracked load to be scaled by the CPU's frequency
+ represents the consumption of possible compute capacity
+ (rather than consumption of actual instantaneous capacity as
+ normal) and allows the HMP migration's simple threshold
+ migration strategy to interact more predictably with CPUFreq's
+ asynchronous compute capacity changes.
+
config HAVE_ARM_SCU
bool
help
@@ -1528,6 +1612,24 @@ config BIG_LITTLE
help
This option enables support for the big.LITTLE architecture.
+config BL_SWITCHER
+ bool "big.LITTLE switcher support"
+ depends on BIG_LITTLE && MCPM && HOTPLUG_CPU
+ select CPU_PM
+ select ARM_CPU_SUSPEND
+ help
+ The big.LITTLE "switcher" provides the core functionality to
+ transparently handle transition between a cluster of A15's
+ and a cluster of A7's in a big.LITTLE system.
+
+config BL_SWITCHER_DUMMY_IF
+ tristate "Simple big.LITTLE switcher user interface"
+ depends on BL_SWITCHER && DEBUG_KERNEL
+ help
+ This is a simple and dummy char dev interface to control
+ the big.LITTLE switcher core code. It is meant for
+ debugging purposes only.
+
choice
prompt "Memory split"
default VMSPLIT_3G
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index f27d6a7af57..462cd580fc2 100644
--- a/arch/arm/common/Makefile
+++ b/arch/arm/common/Makefile
@@ -14,6 +14,9 @@ obj-$(CONFIG_SHARP_SCOOP) += scoop.o
obj-$(CONFIG_PCI_HOST_ITE8152) += it8152.o
obj-$(CONFIG_ARM_TIMER_SP804) += timer-sp.o
obj-$(CONFIG_MCPM) += mcpm_head.o mcpm_entry.o mcpm_platsmp.o vlock.o
+obj-$(CONFIG_BL_SWITCHER) += bL_switcher.o
+obj-$(CONFIG_BL_SWITCHER_DUMMY_IF) += bL_switcher_dummy_if.o
+
AFLAGS_mcpm_head.o := -march=armv7-a
AFLAGS_vlock.o := -march=armv7-a
CFLAGS_REMOVE_mcpm_entry.o = -pg
diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c
new file mode 100644
index 00000000000..8fee70dfb30
--- /dev/null
+++ b/arch/arm/common/bL_switcher.c
@@ -0,0 +1,864 @@
+/*
+ * arch/arm/common/bL_switcher.c -- big.LITTLE cluster switcher core driver
+ *
+ * Created by: Nicolas Pitre, March 2012
+ * Copyright: (C) 2012 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/cpu_pm.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/time.h>
+#include <linux/clockchips.h>
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+#include <linux/notifier.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/irqchip/arm-gic.h>
+#include <linux/moduleparam.h>
+
+#include <asm/smp_plat.h>
+#include <asm/cacheflush.h>
+#include <asm/cputype.h>
+#include <asm/suspend.h>
+#include <asm/mcpm.h>
+#include <asm/bL_switcher.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/power_cpu_migrate.h>
+
+
+/*
+ * Use our own MPIDR accessors as the generic ones in asm/cputype.h have
+ * __attribute_const__ and we don't want the compiler to assume any
+ * constness here as the value _does_ change along some code paths.
+ */
+
+static int read_mpidr(void)
+{
+ unsigned int id;
+ asm volatile ("mrc\tp15, 0, %0, c0, c0, 5" : "=r" (id));
+ return id & MPIDR_HWID_BITMASK;
+}
+
+/*
+ * Get a global nanosecond time stamp for tracing.
+ */
+static s64 get_ns(void)
+{
+ struct timespec ts;
+ getnstimeofday(&ts);
+ return timespec_to_ns(&ts);
+}
+
+/*
+ * bL switcher core code.
+ */
+
+static void bL_do_switch(void *_arg)
+{
+ unsigned ib_mpidr, ib_cpu, ib_cluster;
+ long volatile handshake, **handshake_ptr = _arg;
+
+ pr_debug("%s\n", __func__);
+
+ ib_mpidr = cpu_logical_map(smp_processor_id());
+ ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
+ ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
+
+ /* Advertise our handshake location */
+ if (handshake_ptr) {
+ handshake = 0;
+ *handshake_ptr = &handshake;
+ } else
+ handshake = -1;
+
+ /*
+ * Our state has been saved at this point. Let's release our
+ * inbound CPU.
+ */
+ mcpm_set_entry_vector(ib_cpu, ib_cluster, cpu_resume);
+ sev();
+
+ /*
+ * From this point, we must assume that our counterpart CPU might
+ * have taken over in its parallel world already, as if execution
+ * just returned from cpu_suspend(). It is therefore important to
+ * be very careful not to make any change the other guy is not
+ * expecting. This is why we need stack isolation.
+ *
+ * Fancy under cover tasks could be performed here. For now
+ * we have none.
+ */
+
+ /*
+ * Let's wait until our inbound is alive.
+ */
+ while (!handshake) {
+ wfe();
+ smp_mb();
+ }
+
+ /* Let's put ourself down. */
+ mcpm_cpu_power_down();
+
+ /* should never get here */
+ BUG();
+}
+
+/*
+ * Stack isolation. To ensure 'current' remains valid, we just use another
+ * piece of our thread's stack space which should be fairly lightly used.
+ * The selected area starts just above the thread_info structure located
+ * at the very bottom of the stack, aligned to a cache line, and indexed
+ * with the cluster number.
+ */
+#define STACK_SIZE 512
+extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
+static int bL_switchpoint(unsigned long _arg)
+{
+ unsigned int mpidr = read_mpidr();
+ unsigned int clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+ void *stack = current_thread_info() + 1;
+ stack = PTR_ALIGN(stack, L1_CACHE_BYTES);
+ stack += clusterid * STACK_SIZE + STACK_SIZE;
+ call_with_stack(bL_do_switch, (void *)_arg, stack);
+ BUG();
+}
+
+/*
+ * Generic switcher interface
+ */
+
+static unsigned int bL_gic_id[MAX_CPUS_PER_CLUSTER][MAX_NR_CLUSTERS];
+static int bL_switcher_cpu_pairing[NR_CPUS];
+
+/*
+ * bL_switch_to - Switch to a specific cluster for the current CPU
+ * @new_cluster_id: the ID of the cluster to switch to.
+ *
+ * This function must be called on the CPU to be switched.
+ * Returns 0 on success, else a negative status code.
+ */
+static int bL_switch_to(unsigned int new_cluster_id)
+{
+ unsigned int mpidr, this_cpu, that_cpu;
+ unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster;
+ struct completion inbound_alive;
+ struct tick_device *tdev;
+ enum clock_event_mode tdev_mode;
+ long volatile *handshake_ptr;
+ int ipi_nr, ret;
+
+ this_cpu = smp_processor_id();
+ ob_mpidr = read_mpidr();
+ ob_cpu = MPIDR_AFFINITY_LEVEL(ob_mpidr, 0);
+ ob_cluster = MPIDR_AFFINITY_LEVEL(ob_mpidr, 1);
+ BUG_ON(cpu_logical_map(this_cpu) != ob_mpidr);
+
+ if (new_cluster_id == ob_cluster)
+ return 0;
+
+ that_cpu = bL_switcher_cpu_pairing[this_cpu];
+ ib_mpidr = cpu_logical_map(that_cpu);
+ ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
+ ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
+
+ pr_debug("before switch: CPU %d MPIDR %#x -> %#x\n",
+ this_cpu, ob_mpidr, ib_mpidr);
+
+ this_cpu = smp_processor_id();
+
+ /* Close the gate for our entry vectors */
+ mcpm_set_entry_vector(ob_cpu, ob_cluster, NULL);
+ mcpm_set_entry_vector(ib_cpu, ib_cluster, NULL);
+
+ /* Install our "inbound alive" notifier. */
+ init_completion(&inbound_alive);
+ ipi_nr = register_ipi_completion(&inbound_alive, this_cpu);
+ ipi_nr |= ((1 << 16) << bL_gic_id[ob_cpu][ob_cluster]);
+ mcpm_set_early_poke(ib_cpu, ib_cluster, gic_get_sgir_physaddr(), ipi_nr);
+
+ /*
+ * Let's wake up the inbound CPU now in case it requires some delay
+ * to come online, but leave it gated in our entry vector code.
+ */
+ ret = mcpm_cpu_power_up(ib_cpu, ib_cluster);
+ if (ret) {
+ pr_err("%s: mcpm_cpu_power_up() returned %d\n", __func__, ret);
+ return ret;
+ }
+
+ /*
+ * Raise a SGI on the inbound CPU to make sure it doesn't stall
+ * in a possible WFI, such as in bL_power_down().
+ */
+ gic_send_sgi(bL_gic_id[ib_cpu][ib_cluster], 0);
+
+ /*
+ * Wait for the inbound to come up. This allows for other
+ * tasks to be scheduled in the mean time.
+ */
+ wait_for_completion(&inbound_alive);
+ mcpm_set_early_poke(ib_cpu, ib_cluster, 0, 0);
+
+ /*
+ * From this point we are entering the switch critical zone
+ * and can't sleep/schedule anymore.
+ */
+ local_irq_disable();
+ local_fiq_disable();
+ trace_cpu_migrate_begin(get_ns(), ob_mpidr);
+
+ /* redirect GIC's SGIs to our counterpart */
+ gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]);
+
+ tdev = tick_get_device(this_cpu);
+ if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu)))
+ tdev = NULL;
+ if (tdev) {
+ tdev_mode = tdev->evtdev->mode;
+ clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
+ }
+
+ ret = cpu_pm_enter();
+
+ /* we can not tolerate errors at this point */
+ if (ret)
+ panic("%s: cpu_pm_enter() returned %d\n", __func__, ret);
+
+ /*
+ * Swap the physical CPUs in the logical map for this logical CPU.
+ * This must be flushed to RAM as the resume code
+ * needs to access it while the caches are still disabled.
+ */
+ cpu_logical_map(this_cpu) = ib_mpidr;
+ cpu_logical_map(that_cpu) = ob_mpidr;
+ sync_cache_w(&cpu_logical_map(this_cpu));
+
+ /* Let's do the actual CPU switch. */
+ ret = cpu_suspend((unsigned long)&handshake_ptr, bL_switchpoint);
+ if (ret > 0)
+ panic("%s: cpu_suspend() returned %d\n", __func__, ret);
+
+ /* We are executing on the inbound CPU at this point */
+ mpidr = read_mpidr();
+ pr_debug("after switch: CPU %d MPIDR %#x\n", this_cpu, mpidr);
+ BUG_ON(mpidr != ib_mpidr);
+
+ mcpm_cpu_powered_up();
+
+ ret = cpu_pm_exit();
+
+ if (tdev) {
+ clockevents_set_mode(tdev->evtdev, tdev_mode);
+ clockevents_program_event(tdev->evtdev,
+ tdev->evtdev->next_event, 1);
+ }
+
+ trace_cpu_migrate_finish(get_ns(), ib_mpidr);
+ local_fiq_enable();
+ local_irq_enable();
+
+ *handshake_ptr = 1;
+ dsb_sev();
+
+ if (ret)
+ pr_err("%s exiting with error %d\n", __func__, ret);
+ return ret;
+}
+
+struct bL_thread {
+ spinlock_t lock;
+ struct task_struct *task;
+ wait_queue_head_t wq;
+ int wanted_cluster;
+ struct completion started;
+ bL_switch_completion_handler completer;
+ void *completer_cookie;
+};
+
+static struct bL_thread bL_threads[NR_CPUS];
+
+static int bL_switcher_thread(void *arg)
+{
+ struct bL_thread *t = arg;
+ struct sched_param param = { .sched_priority = 1 };
+ int cluster;
+ bL_switch_completion_handler completer;
+ void *completer_cookie;
+
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+ complete(&t->started);
+
+ do {
+ if (signal_pending(current))
+ flush_signals(current);
+ wait_event_interruptible(t->wq,
+ t->wanted_cluster != -1 ||
+ kthread_should_stop());
+
+ spin_lock(&t->lock);
+ cluster = t->wanted_cluster;
+ completer = t->completer;
+ completer_cookie = t->completer_cookie;
+ t->wanted_cluster = -1;
+ t->completer = NULL;
+ spin_unlock(&t->lock);
+
+ if (cluster != -1) {
+ bL_switch_to(cluster);
+
+ if (completer)
+ completer(completer_cookie);
+ }
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+static struct task_struct * bL_switcher_thread_create(int cpu, void *arg)
+{
+ struct task_struct *task;
+
+ task = kthread_create_on_node(bL_switcher_thread, arg,
+ cpu_to_node(cpu), "kswitcher_%d", cpu);
+ if (!IS_ERR(task)) {
+ kthread_bind(task, cpu);
+ wake_up_process(task);
+ } else
+ pr_err("%s failed for CPU %d\n", __func__, cpu);
+ return task;
+}
+
+/*
+ * bL_switch_request_cb - Switch to a specific cluster for the given CPU,
+ * with completion notification via a callback
+ *
+ * @cpu: the CPU to switch
+ * @new_cluster_id: the ID of the cluster to switch to.
+ * @completer: switch completion callback. If non-NULL,
+ * @completer(@completer_cookie) will be called on completion of
+ * the switch, in non-atomic context.
+ * @completer_cookie: opaque context argument for @completer.
+ *
+ * This function causes a cluster switch on the given CPU by waking up
+ * the appropriate switcher thread. This function may or may not return
+ * before the switch has occurred.
+ *
+ * If a @completer callback function is supplied, it will be called when
+ * the switch is complete. This can be used to determine asynchronously
+ * when the switch is complete, regardless of when bL_switch_request()
+ * returns. When @completer is supplied, no new switch request is permitted
+ * for the affected CPU until after the switch is complete, and @completer
+ * has returned.
+ */
+int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
+ bL_switch_completion_handler completer,
+ void *completer_cookie)
+{
+ struct bL_thread *t;
+
+ if (cpu >= ARRAY_SIZE(bL_threads)) {
+ pr_err("%s: cpu %d out of bounds\n", __func__, cpu);
+ return -EINVAL;
+ }
+
+ t = &bL_threads[cpu];
+
+ if (IS_ERR(t->task))
+ return PTR_ERR(t->task);
+ if (!t->task)
+ return -ESRCH;
+
+ spin_lock(&t->lock);
+ if (t->completer) {
+ spin_unlock(&t->lock);
+ return -EBUSY;
+ }
+ t->completer = completer;
+ t->completer_cookie = completer_cookie;
+ t->wanted_cluster = new_cluster_id;
+ spin_unlock(&t->lock);
+ wake_up(&t->wq);
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(bL_switch_request_cb);
+
+/*
+ * Detach an outstanding switch request.
+ *
+ * The switcher will continue with the switch request in the background,
+ * but the completer function will not be called.
+ *
+ * This may be necessary if the completer is in a kernel module which is
+ * about to be unloaded.
+ */
+void bL_switch_request_detach(unsigned int cpu,
+ bL_switch_completion_handler completer)
+{
+ struct bL_thread *t;
+
+ if (cpu >= ARRAY_SIZE(bL_threads)) {
+ pr_err("%s: cpu %d out of bounds\n", __func__, cpu);
+ return;
+ }
+
+ t = &bL_threads[cpu];
+
+ if (IS_ERR(t->task) || !t->task)
+ return;
+
+ spin_lock(&t->lock);
+ if (t->completer == completer)
+ t->completer = NULL;
+ spin_unlock(&t->lock);
+}
+
+EXPORT_SYMBOL_GPL(bL_switch_request_detach);
+
+/*
+ * Activation and configuration code.
+ */
+
+static DEFINE_MUTEX(bL_switcher_activation_lock);
+static BLOCKING_NOTIFIER_HEAD(bL_activation_notifier);
+static unsigned int bL_switcher_active;
+static unsigned int bL_switcher_cpu_original_cluster[NR_CPUS];
+static cpumask_t bL_switcher_removed_logical_cpus;
+
+int bL_switcher_register_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&bL_activation_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_register_notifier);
+
+int bL_switcher_unregister_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&bL_activation_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_unregister_notifier);
+
+static int bL_activation_notify(unsigned long val)
+{
+ int ret;
+
+ ret = blocking_notifier_call_chain(&bL_activation_notifier, val, NULL);
+ if (ret & NOTIFY_STOP_MASK)
+ pr_err("%s: notifier chain failed with status 0x%x\n",
+ __func__, ret);
+ return notifier_to_errno(ret);
+}
+
+static void bL_switcher_restore_cpus(void)
+{
+ int i;
+
+ for_each_cpu(i, &bL_switcher_removed_logical_cpus)
+ cpu_up(i);
+}
+
+static int bL_switcher_halve_cpus(void)
+{
+ int i, j, cluster_0, gic_id, ret;
+ unsigned int cpu, cluster, mask;
+ cpumask_t available_cpus;
+
+ /* First pass to validate what we have */
+ mask = 0;
+ for_each_online_cpu(i) {
+ cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
+ cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+ if (cluster >= 2) {
+ pr_err("%s: only dual cluster systems are supported\n", __func__);
+ return -EINVAL;
+ }
+ if (WARN_ON(cpu >= MAX_CPUS_PER_CLUSTER))
+ return -EINVAL;
+ mask |= (1 << cluster);
+ }
+ if (mask != 3) {
+ pr_err("%s: no CPU pairing possible\n", __func__);
+ return -EINVAL;
+ }
+
+ /*
+ * Now let's do the pairing. We match each CPU with another CPU
+ * from a different cluster. To get a uniform scheduling behavior
+ * without fiddling with CPU topology and compute capacity data,
+ * we'll use logical CPUs initially belonging to the same cluster.
+ */
+ memset(bL_switcher_cpu_pairing, -1, sizeof(bL_switcher_cpu_pairing));
+ cpumask_copy(&available_cpus, cpu_online_mask);
+ cluster_0 = -1;
+ for_each_cpu(i, &available_cpus) {
+ int match = -1;
+ cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+ if (cluster_0 == -1)
+ cluster_0 = cluster;
+ if (cluster != cluster_0)
+ continue;
+ cpumask_clear_cpu(i, &available_cpus);
+ for_each_cpu(j, &available_cpus) {
+ cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(j), 1);
+ /*
+ * Let's remember the last match to create "odd"
+ * pairing on purpose in order for other code not
+ * to assume any relation between physical and
+ * logical CPU numbers.
+ */
+ if (cluster != cluster_0)
+ match = j;
+ }
+ if (match != -1) {
+ bL_switcher_cpu_pairing[i] = match;
+ cpumask_clear_cpu(match, &available_cpus);
+ pr_info("CPU%d paired with CPU%d\n", i, match);
+ }
+ }
+
+ /*
+ * Now we disable the unwanted CPUs i.e. everything that has no
+ * pairing information (that includes the pairing counterparts).
+ */
+ cpumask_clear(&bL_switcher_removed_logical_cpus);
+ for_each_online_cpu(i) {
+ cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
+ cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+
+ /* Let's take note of the GIC ID for this CPU */
+ gic_id = gic_get_cpu_id(i);
+ if (gic_id < 0) {
+ pr_err("%s: bad GIC ID for CPU %d\n", __func__, i);
+ bL_switcher_restore_cpus();
+ return -EINVAL;
+ }
+ bL_gic_id[cpu][cluster] = gic_id;
+ pr_info("GIC ID for CPU %u cluster %u is %u\n",
+ cpu, cluster, gic_id);
+
+ if (bL_switcher_cpu_pairing[i] != -1) {
+ bL_switcher_cpu_original_cluster[i] = cluster;
+ continue;
+ }
+
+ ret = cpu_down(i);
+ if (ret) {
+ bL_switcher_restore_cpus();
+ return ret;
+ }
+ cpumask_set_cpu(i, &bL_switcher_removed_logical_cpus);
+ }
+
+ return 0;
+}
+
+/* Determine the logical CPU a given physical CPU is grouped on. */
+int bL_switcher_get_logical_index(u32 mpidr)
+{
+ int cpu;
+
+ if (!bL_switcher_active)
+ return -EUNATCH;
+
+ mpidr &= MPIDR_HWID_BITMASK;
+ for_each_online_cpu(cpu) {
+ int pairing = bL_switcher_cpu_pairing[cpu];
+ if (pairing == -1)
+ continue;
+ if ((mpidr == cpu_logical_map(cpu)) ||
+ (mpidr == cpu_logical_map(pairing)))
+ return cpu;
+ }
+ return -EINVAL;
+}
+
+static void bL_switcher_trace_trigger_cpu(void *__always_unused info)
+{
+ trace_cpu_migrate_current(get_ns(), read_mpidr());
+}
+
+int bL_switcher_trace_trigger(void)
+{
+ int ret;
+
+ preempt_disable();
+
+ bL_switcher_trace_trigger_cpu(NULL);
+ ret = smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true);
+
+ preempt_enable();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bL_switcher_trace_trigger);
+
+static int bL_switcher_enable(void)
+{
+ int cpu, ret;
+
+ mutex_lock(&bL_switcher_activation_lock);
+ cpu_hotplug_driver_lock();
+ if (bL_switcher_active) {
+ cpu_hotplug_driver_unlock();
+ mutex_unlock(&bL_switcher_activation_lock);
+ return 0;
+ }
+
+ pr_info("big.LITTLE switcher initializing\n");
+
+ ret = bL_activation_notify(BL_NOTIFY_PRE_ENABLE);
+ if (ret)
+ goto error;
+
+ ret = bL_switcher_halve_cpus();
+ if (ret)
+ goto error;
+
+ bL_switcher_trace_trigger();
+
+ for_each_online_cpu(cpu) {
+ struct bL_thread *t = &bL_threads[cpu];
+ spin_lock_init(&t->lock);
+ init_waitqueue_head(&t->wq);
+ init_completion(&t->started);
+ t->wanted_cluster = -1;
+ t->task = bL_switcher_thread_create(cpu, t);
+ }
+
+ bL_switcher_active = 1;
+ bL_activation_notify(BL_NOTIFY_POST_ENABLE);
+ pr_info("big.LITTLE switcher initialized\n");
+ goto out;
+
+error:
+ pr_warning("big.LITTLE switcher initialization failed\n");
+ bL_activation_notify(BL_NOTIFY_POST_DISABLE);
+
+out:
+ cpu_hotplug_driver_unlock();
+ mutex_unlock(&bL_switcher_activation_lock);
+ return ret;
+}
+
+#ifdef CONFIG_SYSFS
+
+static void bL_switcher_disable(void)
+{
+ unsigned int cpu, cluster;
+ struct bL_thread *t;
+ struct task_struct *task;
+
+ mutex_lock(&bL_switcher_activation_lock);
+ cpu_hotplug_driver_lock();
+
+ if (!bL_switcher_active)
+ goto out;
+
+ if (bL_activation_notify(BL_NOTIFY_PRE_DISABLE) != 0) {
+ bL_activation_notify(BL_NOTIFY_POST_ENABLE);
+ goto out;
+ }
+
+ bL_switcher_active = 0;
+
+ /*
+ * To deactivate the switcher, we must shut down the switcher
+ * threads to prevent any other requests from being accepted.
+ * Then, if the final cluster for a given logical CPU is not the
+ * same as the original one, we'll recreate a switcher thread
+ * just for the purpose of switching the CPU back without any
+ * possibility for interference from external requests.
+ */
+ for_each_online_cpu(cpu) {
+ t = &bL_threads[cpu];
+ task = t->task;
+ t->task = NULL;
+ if (!task || IS_ERR(task))
+ continue;
+ kthread_stop(task);
+ /* no more switch may happen on this CPU at this point */
+ cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
+ if (cluster == bL_switcher_cpu_original_cluster[cpu])
+ continue;
+ init_completion(&t->started);
+ t->wanted_cluster = bL_switcher_cpu_original_cluster[cpu];
+ task = bL_switcher_thread_create(cpu, t);
+ if (!IS_ERR(task)) {
+ wait_for_completion(&t->started);
+ kthread_stop(task);
+ cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
+ if (cluster == bL_switcher_cpu_original_cluster[cpu])
+ continue;
+ }
+ /* If execution gets here, we're in trouble. */
+ pr_crit("%s: unable to restore original cluster for CPU %d\n",
+ __func__, cpu);
+ pr_crit("%s: CPU %d can't be restored\n",
+ __func__, bL_switcher_cpu_pairing[cpu]);
+ cpumask_clear_cpu(bL_switcher_cpu_pairing[cpu],
+ &bL_switcher_removed_logical_cpus);
+ }
+
+ bL_switcher_restore_cpus();
+ bL_switcher_trace_trigger();
+
+ bL_activation_notify(BL_NOTIFY_POST_DISABLE);
+
+out:
+ cpu_hotplug_driver_unlock();
+ mutex_unlock(&bL_switcher_activation_lock);
+}
+
+static ssize_t bL_switcher_active_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", bL_switcher_active);
+}
+
+static ssize_t bL_switcher_active_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int ret;
+
+ switch (buf[0]) {
+ case '0':
+ bL_switcher_disable();
+ ret = 0;
+ break;
+ case '1':
+ ret = bL_switcher_enable();
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return (ret >= 0) ? count : ret;
+}
+
+static ssize_t bL_switcher_trace_trigger_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int ret = bL_switcher_trace_trigger();
+
+ return ret ? ret : count;
+}
+
+static struct kobj_attribute bL_switcher_active_attr =
+ __ATTR(active, 0644, bL_switcher_active_show, bL_switcher_active_store);
+
+static struct kobj_attribute bL_switcher_trace_trigger_attr =
+ __ATTR(trace_trigger, 0200, NULL, bL_switcher_trace_trigger_store);
+
+static struct attribute *bL_switcher_attrs[] = {
+ &bL_switcher_active_attr.attr,
+ &bL_switcher_trace_trigger_attr.attr,
+ NULL,
+};
+
+static struct attribute_group bL_switcher_attr_group = {
+ .attrs = bL_switcher_attrs,
+};
+
+static struct kobject *bL_switcher_kobj;
+
+static int __init bL_switcher_sysfs_init(void)
+{
+ int ret;
+
+ bL_switcher_kobj = kobject_create_and_add("bL_switcher", kernel_kobj);
+ if (!bL_switcher_kobj)
+ return -ENOMEM;
+ ret = sysfs_create_group(bL_switcher_kobj, &bL_switcher_attr_group);
+ if (ret)
+ kobject_put(bL_switcher_kobj);
+ return ret;
+}
+
+#endif /* CONFIG_SYSFS */
+
+bool bL_switcher_get_enabled(void)
+{
+ mutex_lock(&bL_switcher_activation_lock);
+
+ return bL_switcher_active;
+}
+EXPORT_SYMBOL_GPL(bL_switcher_get_enabled);
+
+void bL_switcher_put_enabled(void)
+{
+ mutex_unlock(&bL_switcher_activation_lock);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_put_enabled);
+
+/*
+ * Veto any CPU hotplug operation while the switcher is active.
+ * We're just not ready to deal with that given the trickery involved.
+ */
+static int bL_switcher_hotplug_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_DOWN_PREPARE:
+ if (bL_switcher_active)
+ return NOTIFY_BAD;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block bL_switcher_hotplug_notifier =
+ { &bL_switcher_hotplug_callback, NULL, 0 };
+
+#ifdef CONFIG_SCHED_HMP
+static bool no_bL_switcher = true;
+#else
+static bool no_bL_switcher;
+#endif
+core_param(no_bL_switcher, no_bL_switcher, bool, 0644);
+
+static int __init bL_switcher_init(void)
+{
+ int ret;
+
+ if (MAX_NR_CLUSTERS != 2) {
+ pr_err("%s: only dual cluster systems are supported\n", __func__);
+ return -EINVAL;
+ }
+
+ register_cpu_notifier(&bL_switcher_hotplug_notifier);
+
+ if (!no_bL_switcher) {
+ ret = bL_switcher_enable();
+ if (ret)
+ return ret;
+ }
+
+#ifdef CONFIG_SYSFS
+ ret = bL_switcher_sysfs_init();
+ if (ret)
+ pr_err("%s: unable to create sysfs entry\n", __func__);
+#endif
+
+ return 0;
+}
+
+late_initcall(bL_switcher_init);
diff --git a/arch/arm/common/bL_switcher_dummy_if.c b/arch/arm/common/bL_switcher_dummy_if.c
new file mode 100644
index 00000000000..5e2dd197e72
--- /dev/null
+++ b/arch/arm/common/bL_switcher_dummy_if.c
@@ -0,0 +1,71 @@
+/*
+ * arch/arm/common/bL_switcher_dummy_if.c -- b.L switcher dummy interface
+ *
+ * Created by: Nicolas Pitre, November 2012
+ * Copyright: (C) 2012 Linaro Limited
+ *
+ * Dummy interface to user space for debugging purpose only.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <asm/uaccess.h>
+#include <asm/bL_switcher.h>
+
+/*
+ * write() handler of the dummy switcher device.
+ *
+ * Only the first three bytes of the buffer are examined. They must read
+ * "<cpu#>,<cluster#>" with cpu# a digit from '0' to '4' and cluster# either
+ * '0' or '1'; the request is then handed to bL_switch_request().
+ *
+ * Returns len on success, -EINVAL if the buffer is shorter than three
+ * bytes or malformed, -EFAULT if it cannot be copied from user space, or
+ * the non-zero value returned by bL_switch_request().
+ */
+static ssize_t bL_switcher_write(struct file *file, const char __user *buf,
+				 size_t len, loff_t *pos)
+{
+	unsigned char req[3];
+	int err;
+
+	pr_debug("%s\n", __func__);
+
+	if (len < sizeof(req))
+		return -EINVAL;
+
+	if (copy_from_user(req, buf, sizeof(req)))
+		return -EFAULT;
+
+	/* format: <cpu#>,<cluster#> */
+	if (req[1] != ',')
+		return -EINVAL;
+	if (req[0] < '0' || req[0] > '4')
+		return -EINVAL;
+	if (req[2] < '0' || req[2] > '1')
+		return -EINVAL;
+
+	err = bL_switch_request(req[0] - '0', req[2] - '0');
+	if (err)
+		return err;
+
+	return len;
+}
+
+/* File operations of the dummy device: only write() is implemented. */
+static const struct file_operations bL_switcher_fops = {
+	.owner	= THIS_MODULE,
+	.write	= bL_switcher_write,
+};
+
+/* Misc device named "b.L_switcher" with a dynamically assigned minor. */
+static struct miscdevice bL_switcher_device = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "b.L_switcher",
+	.fops	= &bL_switcher_fops,
+};
+
+/* Module entry point: register the dummy misc device. */
+static int __init bL_switcher_dummy_if_init(void)
+{
+ return misc_register(&bL_switcher_device);
+}
+
+/* Module exit point: unregister the dummy misc device. */
+static void __exit bL_switcher_dummy_if_exit(void)
+{
+ misc_deregister(&bL_switcher_device);
+}
+
+module_init(bL_switcher_dummy_if_init);
+module_exit(bL_switcher_dummy_if_exit);
+
+/*
+ * Declare the license stated in the file header. Without it, loading this
+ * code as a module taints the kernel and GPL-only exports are unavailable.
+ */
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c
index 370236dd1a0..4a2b32fd53a 100644
--- a/arch/arm/common/mcpm_entry.c
+++ b/arch/arm/common/mcpm_entry.c
@@ -27,6 +27,18 @@ void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
}
+extern unsigned long mcpm_entry_early_pokes[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER][2];
+
+/*
+ * Record an early poke for the given CPU: word 0 of its slot receives the
+ * physical address and word 1 the value. The two words are then flushed
+ * from the data cache and cleaned from the outer cache.
+ */
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+			 unsigned long poke_phys_addr, unsigned long poke_val)
+{
+	unsigned long *slot = mcpm_entry_early_pokes[cluster][cpu];
+
+	slot[0] = poke_phys_addr;
+	slot[1] = poke_val;
+	__cpuc_flush_dcache_area(slot, 8);
+	outer_clean_range(__pa(slot), __pa(slot + 2));
+}
+
static const struct mcpm_platform_ops *platform_ops;
int __init mcpm_platform_register(const struct mcpm_platform_ops *ops)
diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S
index 8178705c4b2..057e9c5a9e1 100644
--- a/arch/arm/common/mcpm_head.S
+++ b/arch/arm/common/mcpm_head.S
@@ -71,12 +71,19 @@ ENTRY(mcpm_entry_point)
* position independent way.
*/
adr r5, 3f
- ldmia r5, {r6, r7, r8, r11}
+ ldmia r5, {r0, r6, r7, r8, r11}
+ add r0, r5, r0 @ r0 = mcpm_entry_early_pokes
add r6, r5, r6 @ r6 = mcpm_entry_vectors
ldr r7, [r5, r7] @ r7 = mcpm_power_up_setup_phys
add r8, r5, r8 @ r8 = mcpm_sync
add r11, r5, r11 @ r11 = first_man_locks
+ @ Perform an early poke, if any
+ add r0, r0, r4, lsl #3
+ ldmia r0, {r0, r1}
+ teq r0, #0
+ strne r1, [r0]
+
mov r0, #MCPM_SYNC_CLUSTER_SIZE
mla r8, r0, r10, r8 @ r8 = sync cluster base
@@ -195,7 +202,8 @@ mcpm_entry_gated:
.align 2
-3: .word mcpm_entry_vectors - .
+3: .word mcpm_entry_early_pokes - .
+ .word mcpm_entry_vectors - 3b
.word mcpm_power_up_setup_phys - 3b
.word mcpm_sync - 3b
.word first_man_locks - 3b
@@ -214,6 +222,10 @@ first_man_locks:
ENTRY(mcpm_entry_vectors)
.space 4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+ @ Early poke table: 8 bytes per CPU, read by mcpm_entry_point as a
+ @ pair of words (address to poke, value); an address of 0 means no poke.
+ .type mcpm_entry_early_pokes, #object
+ENTRY(mcpm_entry_early_pokes)
+ .space 8 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+
.type mcpm_power_up_setup_phys, #object
ENTRY(mcpm_power_up_setup_phys)
.space 4 @ set by mcpm_sync_init()
diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h
new file mode 100644
index 00000000000..482383b45c9
--- /dev/null
+++ b/arch/arm/include/asm/bL_switcher.h
@@ -0,0 +1,83 @@
+/*
+ * arch/arm/include/asm/bL_switcher.h
+ *
+ * Created by: Nicolas Pitre, April 2012
+ * Copyright: (C) 2012 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef ASM_BL_SWITCHER_H
+#define ASM_BL_SWITCHER_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/*
+ * Callback type accepted by bL_switch_request_cb(); it receives an opaque
+ * cookie.
+ * NOTE(review): presumably called once the requested switch has completed,
+ * with the completer_cookie passed in - confirm against bL_switcher.c.
+ */
+typedef void (*bL_switch_completion_handler)(void *cookie);
+
+int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
+ bL_switch_completion_handler completer,
+ void *completer_cookie);
+/* Same as bL_switch_request_cb() with no completion handler and no cookie. */
+static inline int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id)
+{
+ return bL_switch_request_cb(cpu, new_cluster_id, NULL, NULL);
+}
+
+/*
+ * Register here to be notified about runtime enabling/disabling of
+ * the switcher.
+ *
+ * The notifier chain is called with the switcher activation lock held:
+ * the switcher will not be enabled or disabled during callbacks.
+ * Callbacks must not call bL_switcher_{get,put}_enabled().
+ */
+#define BL_NOTIFY_PRE_ENABLE 0
+#define BL_NOTIFY_POST_ENABLE 1
+#define BL_NOTIFY_PRE_DISABLE 2
+#define BL_NOTIFY_POST_DISABLE 3
+
+#ifdef CONFIG_BL_SWITCHER
+
+void bL_switch_request_detach(unsigned int cpu,
+ bL_switch_completion_handler completer);
+
+int bL_switcher_register_notifier(struct notifier_block *nb);
+int bL_switcher_unregister_notifier(struct notifier_block *nb);
+
+/*
+ * Use these functions to temporarily prevent enabling/disabling of
+ * the switcher.
+ * bL_switcher_get_enabled() returns true if the switcher is currently
+ * enabled. Each call to bL_switcher_get_enabled() must be followed
+ * by a call to bL_switcher_put_enabled(). These functions are not
+ * recursive.
+ */
+bool bL_switcher_get_enabled(void);
+void bL_switcher_put_enabled(void);
+
+int bL_switcher_trace_trigger(void);
+int bL_switcher_get_logical_index(u32 mpidr);
+
+#else
+/*
+ * No-op stub used when the switcher is not built. It has to be
+ * "static inline" like the other stubs here: a plain static definition in
+ * a header is emitted, and raises an unused-function warning, in every
+ * file that includes it.
+ */
+static inline void bL_switch_request_detach(unsigned int cpu,
+					    bL_switch_completion_handler completer) { }
+
+/*
+ * Stubs for kernels built without CONFIG_BL_SWITCHER: notifier
+ * (un)registration reports success without doing anything, the switcher
+ * always reports as disabled, the trace trigger returns 0 and no logical
+ * index can be looked up (-EUNATCH).
+ */
+static inline int bL_switcher_register_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline int bL_switcher_unregister_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline bool bL_switcher_get_enabled(void) { return false; }
+static inline void bL_switcher_put_enabled(void) { }
+static inline int bL_switcher_trace_trigger(void) { return 0; }
+static inline int bL_switcher_get_logical_index(u32 mpidr) { return -EUNATCH; }
+#endif /* CONFIG_BL_SWITCHER */
+
+#endif
diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h
index 2740c2a2df6..3d7351c844a 100644
--- a/arch/arm/include/asm/hardirq.h
+++ b/arch/arm/include/asm/hardirq.h
@@ -5,7 +5,7 @@
#include <linux/threads.h>
#include <asm/irq.h>
-#define NR_IPI 6
+#define NR_IPI 7
typedef struct {
unsigned int __softirq_pending;
diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h
index 0f7b7620e9a..7626a7fd493 100644
--- a/arch/arm/include/asm/mcpm.h
+++ b/arch/arm/include/asm/mcpm.h
@@ -42,6 +42,14 @@ extern void mcpm_entry_point(void);
void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
/*
+ * This sets an early poke, i.e. a value to be poked into some address
+ * from very early assembly code before the CPU is ungated. The
+ * address must be physical, and if 0 then nothing will happen.
+ */
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+ unsigned long poke_phys_addr, unsigned long poke_val);
+
+/*
* CPU/cluster power operations API for higher subsystems to use.
*/
diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
index f24edad26c7..0cd7824ca76 100644
--- a/arch/arm/include/asm/pmu.h
+++ b/arch/arm/include/asm/pmu.h
@@ -62,9 +62,19 @@ struct pmu_hw_events {
raw_spinlock_t pmu_lock;
};
+/*
+ * Snapshot of the PMU registers of one CPU, passed to the save_regs() and
+ * restore_regs() hooks of struct arm_pmu.
+ * NOTE(review): the per-counter arrays are fixed at 8 entries - confirm
+ * that no supported PMU indexes beyond that.
+ */
+struct cpupmu_regs {
+ u32 pmc;
+ u32 pmcntenset;
+ u32 pmuseren;
+ u32 pmintenset;
+ u32 pmxevttype[8];
+ u32 pmxevtcnt[8];
+};
+
struct arm_pmu {
struct pmu pmu;
cpumask_t active_irqs;
+ cpumask_t valid_cpus;
char *name;
irqreturn_t (*handle_irq)(int irq_num, void *dev);
void (*enable)(struct perf_event *event);
@@ -81,6 +91,8 @@ struct arm_pmu {
int (*request_irq)(struct arm_pmu *, irq_handler_t handler);
void (*free_irq)(struct arm_pmu *);
int (*map_event)(struct perf_event *event);
+ void (*save_regs)(struct arm_pmu *, struct cpupmu_regs *);
+ void (*restore_regs)(struct arm_pmu *, struct cpupmu_regs *);
int num_events;
atomic_t active_events;
struct mutex reserve_mutex;
diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h
index d3a22bebe6c..610ccf33f5e 100644
--- a/arch/arm/include/asm/smp.h
+++ b/arch/arm/include/asm/smp.h
@@ -81,6 +81,8 @@ extern void arch_send_call_function_single_ipi(int cpu);
extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
+extern int register_ipi_completion(struct completion *completion, int cpu);
+
struct smp_operations {
#ifdef CONFIG_SMP
/*
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 58b8b84adcd..983fa7c153a 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -26,11 +26,45 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
void init_cpu_topology(void);
void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
+int cluster_to_logical_mask(unsigned int socket_id, cpumask_t *cluster_mask);
+
+#ifdef CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE
+/*
+ * Provide SD_CPU_INIT, unless it is already defined, with SD_LOAD_BALANCE
+ * cleared (multiplied by 0 in .flags below); the flags multiplied by 1
+ * remain set.
+ */
+/* Common values for CPUs */
+#ifndef SD_CPU_INIT
+#define SD_CPU_INIT (struct sched_domain) { \
+ .min_interval = 1, \
+ .max_interval = 4, \
+ .busy_factor = 64, \
+ .imbalance_pct = 125, \
+ .cache_nice_tries = 1, \
+ .busy_idx = 2, \
+ .idle_idx = 1, \
+ .newidle_idx = 0, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
+ \
+ .flags = 0*SD_LOAD_BALANCE \
+ | 1*SD_BALANCE_NEWIDLE \
+ | 1*SD_BALANCE_EXEC \
+ | 1*SD_BALANCE_FORK \
+ | 0*SD_BALANCE_WAKE \
+ | 1*SD_WAKE_AFFINE \
+ | 0*SD_SHARE_CPUPOWER \
+ | 0*SD_SHARE_PKG_RESOURCES \
+ | 0*SD_SERIALIZE \
+ , \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+}
+#endif
+#endif /* CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE */
#else
static inline void init_cpu_topology(void) { }
static inline void store_cpu_topology(unsigned int cpuid) { }
+/* Stub for this #else configuration: no mask is produced, always -EINVAL. */
+static inline int cluster_to_logical_mask(unsigned int socket_id,
+ cpumask_t *cluster_mask) { return -EINVAL; }
#endif
diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index 1fd749ee4a1..1b803117ed9 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -1049,7 +1049,8 @@ static struct notifier_block dbg_cpu_pm_nb = {
static void __init pm_init(void)
{
- cpu_pm_register_notifier(&dbg_cpu_pm_nb);
+ if (has_ossr)
+ cpu_pm_register_notifier(&dbg_cpu_pm_nb);
}
#else
static inline void pm_init(void)
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 8c3094d0f7b..d847c622a7b 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -12,6 +12,7 @@
*/
#define pr_fmt(fmt) "hw perfevents: " fmt
+#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
@@ -81,6 +82,9 @@ armpmu_map_event(struct perf_event *event,
return armpmu_map_cache_event(cache_map, config);
case PERF_TYPE_RAW:
return armpmu_map_raw_event(raw_event_mask, config);
+ default:
+ if (event->attr.type >= PERF_TYPE_MAX)
+ return armpmu_map_raw_event(raw_event_mask, config);
}
return -ENOENT;
@@ -158,6 +162,8 @@ armpmu_stop(struct perf_event *event, int flags)
struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
struct hw_perf_event *hwc = &event->hw;
+ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus))
+ return;
/*
* ARM pmu always has to update the counter, so ignore
* PERF_EF_UPDATE, see comments in armpmu_start().
@@ -174,6 +180,8 @@ static void armpmu_start(struct perf_event *event, int flags)
struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
struct hw_perf_event *hwc = &event->hw;
+ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus))
+ return;
/*
* ARM pmu always has to reprogram the period, so ignore
* PERF_EF_RELOAD, see the comment below.
@@ -201,6 +209,9 @@ armpmu_del(struct perf_event *event, int flags)
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
+ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus))
+ return;
+
armpmu_stop(event, PERF_EF_UPDATE);
hw_events->events[idx] = NULL;
clear_bit(idx, hw_events->used_mask);
@@ -217,6 +228,10 @@ armpmu_add(struct perf_event *event, int flags)
int idx;
int err = 0;
+ /* An event following a process won't be stopped earlier */
+ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus))
+ return 0;
+
perf_pmu_disable(event->pmu);
/* If we don't have a space for the counter then finish early. */
@@ -416,6 +431,10 @@ static int armpmu_event_init(struct perf_event *event)
int err = 0;
atomic_t *active_events = &armpmu->active_events;
+ if (event->cpu != -1 &&
+ !cpumask_test_cpu(event->cpu, &armpmu->valid_cpus))
+ return -ENOENT;
+
/* does not support taken branch sampling */
if (has_branch_stack(event))
return -EOPNOTSUPP;
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
index 1f2740e3dbc..0b48a38e3cf 100644
--- a/arch/arm/kernel/perf_event_cpu.c
+++ b/arch/arm/kernel/perf_event_cpu.c
@@ -19,6 +19,7 @@
#define pr_fmt(fmt) "CPU PMU: " fmt
#include <linux/bitmap.h>
+#include <linux/cpu_pm.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/of.h>
@@ -31,33 +32,36 @@
#include <asm/pmu.h>
/* Set at runtime when we know what CPU type we are. */
-static struct arm_pmu *cpu_pmu;
+static DEFINE_PER_CPU(struct arm_pmu *, cpu_pmu);
static DEFINE_PER_CPU(struct perf_event * [ARMPMU_MAX_HWEVENTS], hw_events);
static DEFINE_PER_CPU(unsigned long [BITS_TO_LONGS(ARMPMU_MAX_HWEVENTS)], used_mask);
static DEFINE_PER_CPU(struct pmu_hw_events, cpu_hw_events);
+static DEFINE_PER_CPU(struct cpupmu_regs, cpu_pmu_regs);
+
/*
* Despite the names, these two functions are CPU-specific and are used
* by the OProfile/perf code.
*/
const char *perf_pmu_name(void)
{
- if (!cpu_pmu)
+ struct arm_pmu *pmu = per_cpu(cpu_pmu, 0);
+ if (!pmu)
return NULL;
- return cpu_pmu->name;
+ return pmu->name;
}
EXPORT_SYMBOL_GPL(perf_pmu_name);
int perf_num_counters(void)
{
- int max_events = 0;
+ struct arm_pmu *pmu = per_cpu(cpu_pmu, 0);
- if (cpu_pmu != NULL)
- max_events = cpu_pmu->num_events;
+ if (!pmu)
+ return 0;
- return max_events;
+ return pmu->num_events;
}
EXPORT_SYMBOL_GPL(perf_num_counters);
@@ -75,11 +79,13 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
{
int i, irq, irqs;
struct platform_device *pmu_device = cpu_pmu->plat_device;
+ int cpu = -1;
irqs = min(pmu_device->num_resources, num_possible_cpus());
for (i = 0; i < irqs; ++i) {
- if (!cpumask_test_and_clear_cpu(i, &cpu_pmu->active_irqs))
+ cpu = cpumask_next(cpu, &cpu_pmu->valid_cpus);
+ if (!cpumask_test_and_clear_cpu(cpu, &cpu_pmu->active_irqs))
continue;
irq = platform_get_irq(pmu_device, i);
if (irq >= 0)
@@ -91,6 +97,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
{
int i, err, irq, irqs;
struct platform_device *pmu_device = cpu_pmu->plat_device;
+ int cpu = -1;
if (!pmu_device)
return -ENODEV;
@@ -103,6 +110,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
for (i = 0; i < irqs; ++i) {
err = 0;
+ cpu = cpumask_next(cpu, &cpu_pmu->valid_cpus);
irq = platform_get_irq(pmu_device, i);
if (irq < 0)
continue;
@@ -112,7 +120,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
* assume that we're running on a uniprocessor machine and
* continue. Otherwise, continue without this interrupt.
*/
- if (irq_set_affinity(irq, cpumask_of(i)) && irqs > 1) {
+ if (irq_set_affinity(irq, cpumask_of(cpu)) && irqs > 1) {
pr_warning("unable to set irq affinity (irq=%d, cpu=%u)\n",
irq, i);
continue;
@@ -126,7 +134,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
return err;
}
- cpumask_set_cpu(i, &cpu_pmu->active_irqs);
+ cpumask_set_cpu(cpu, &cpu_pmu->active_irqs);
}
return 0;
@@ -135,7 +143,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
static void cpu_pmu_init(struct arm_pmu *cpu_pmu)
{
int cpu;
- for_each_possible_cpu(cpu) {
+ for_each_cpu_mask(cpu, cpu_pmu->valid_cpus) {
struct pmu_hw_events *events = &per_cpu(cpu_hw_events, cpu);
events->events = per_cpu(hw_events, cpu);
events->used_mask = per_cpu(used_mask, cpu);
@@ -148,7 +156,7 @@ static void cpu_pmu_init(struct arm_pmu *cpu_pmu)
/* Ensure the PMU has sane values out of reset. */
if (cpu_pmu->reset)
- on_each_cpu(cpu_pmu->reset, cpu_pmu, 1);
+ on_each_cpu_mask(&cpu_pmu->valid_cpus, cpu_pmu->reset, cpu_pmu, 1);
}
/*
@@ -160,21 +168,46 @@ static void cpu_pmu_init(struct arm_pmu *cpu_pmu)
static int __cpuinit cpu_pmu_notify(struct notifier_block *b,
unsigned long action, void *hcpu)
{
+ struct arm_pmu *pmu = per_cpu(cpu_pmu, (long)hcpu);
+
if ((action & ~CPU_TASKS_FROZEN) != CPU_STARTING)
return NOTIFY_DONE;
- if (cpu_pmu && cpu_pmu->reset)
- cpu_pmu->reset(cpu_pmu);
+ if (pmu && pmu->reset)
+ pmu->reset(pmu);
else
return NOTIFY_DONE;
return NOTIFY_OK;
}
+/*
+ * CPU PM notifier: save the PMU registers of the current CPU on
+ * CPU_PM_ENTER and restore them on CPU_PM_EXIT, provided the PMU bound to
+ * this CPU implements the corresponding hook.
+ *
+ * Returns NOTIFY_DONE if no PMU is registered for this CPU, NOTIFY_OK
+ * otherwise.
+ */
+static int cpu_pmu_pm_notify(struct notifier_block *b,
+			     unsigned long action, void *hcpu)
+{
+	int this_cpu = smp_processor_id();
+	struct arm_pmu *armpmu = per_cpu(cpu_pmu, this_cpu);
+	struct cpupmu_regs *saved = &per_cpu(cpu_pmu_regs, this_cpu);
+
+	if (armpmu == NULL)
+		return NOTIFY_DONE;
+
+	switch (action) {
+	case CPU_PM_ENTER:
+		if (armpmu->save_regs)
+			armpmu->save_regs(armpmu, saved);
+		break;
+	case CPU_PM_EXIT:
+		if (armpmu->restore_regs)
+			armpmu->restore_regs(armpmu, saved);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
static struct notifier_block __cpuinitdata cpu_pmu_hotplug_notifier = {
.notifier_call = cpu_pmu_notify,
};
+/*
+ * CPU PM notifications keep arriving for the whole lifetime of the system,
+ * so this block must not be placed in an init section: __cpuinitdata
+ * storage is discarded after boot when CONFIG_HOTPLUG_CPU is off, which
+ * would leave a dangling entry on the CPU PM notifier chain.
+ */
+static struct notifier_block cpu_pmu_pm_notifier = {
+	.notifier_call = cpu_pmu_pm_notify,
+};
+
/*
* PMU platform driver and devicetree bindings.
*/
@@ -246,6 +279,9 @@ static int probe_current_pmu(struct arm_pmu *pmu)
}
}
+ /* assume PMU support all the CPUs in this case */
+ cpumask_setall(&pmu->valid_cpus);
+
put_cpu();
return ret;
}
@@ -253,15 +289,10 @@ static int probe_current_pmu(struct arm_pmu *pmu)
static int cpu_pmu_device_probe(struct platform_device *pdev)
{
const struct of_device_id *of_id;
- int (*init_fn)(struct arm_pmu *);
struct device_node *node = pdev->dev.of_node;
struct arm_pmu *pmu;
- int ret = -ENODEV;
-
- if (cpu_pmu) {
- pr_info("attempt to register multiple PMU devices!");
- return -ENOSPC;
- }
+ int ret = 0;
+ int cpu;
pmu = kzalloc(sizeof(struct arm_pmu), GFP_KERNEL);
if (!pmu) {
@@ -270,8 +301,28 @@ static int cpu_pmu_device_probe(struct platform_device *pdev)
}
if (node && (of_id = of_match_node(cpu_pmu_of_device_ids, pdev->dev.of_node))) {
- init_fn = of_id->data;
- ret = init_fn(pmu);
+ smp_call_func_t init_fn = (smp_call_func_t)of_id->data;
+ struct device_node *ncluster;
+ int cluster = -1;
+ cpumask_t sibling_mask;
+
+ ncluster = of_parse_phandle(node, "cluster", 0);
+ if (ncluster) {
+ int len;
+ const u32 *hwid;
+ hwid = of_get_property(ncluster, "reg", &len);
+ if (hwid && len == 4)
+ cluster = be32_to_cpup(hwid);
+ }
+ /* set sibling mask to all cpu mask if socket is not specified */
+ if (cluster == -1 ||
+ cluster_to_logical_mask(cluster, &sibling_mask))
+ cpumask_setall(&sibling_mask);
+
+ smp_call_function_any(&sibling_mask, init_fn, pmu, 1);
+
+ /* now set the valid_cpus after init */
+ cpumask_copy(&pmu->valid_cpus, &sibling_mask);
} else {
ret = probe_current_pmu(pmu);
}
@@ -281,10 +332,12 @@ static int cpu_pmu_device_probe(struct platform_device *pdev)
goto out_free;
}
- cpu_pmu = pmu;
- cpu_pmu->plat_device = pdev;
- cpu_pmu_init(cpu_pmu);
- ret = armpmu_register(cpu_pmu, PERF_TYPE_RAW);
+ for_each_cpu_mask(cpu, pmu->valid_cpus)
+ per_cpu(cpu_pmu, cpu) = pmu;
+
+ pmu->plat_device = pdev;
+ cpu_pmu_init(pmu);
+ ret = armpmu_register(pmu, -1);
if (!ret)
return 0;
@@ -313,9 +366,17 @@ static int __init register_pmu_driver(void)
if (err)
return err;
+ err = cpu_pm_register_notifier(&cpu_pmu_pm_notifier);
+ if (err) {
+ unregister_cpu_notifier(&cpu_pmu_hotplug_notifier);
+ return err;
+ }
+
err = platform_driver_register(&cpu_pmu_driver);
- if (err)
+ if (err) {
+ cpu_pm_unregister_notifier(&cpu_pmu_pm_notifier);
unregister_cpu_notifier(&cpu_pmu_hotplug_notifier);
+ }
return err;
}
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index 039cffb053a..654db5030c3 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -950,6 +950,51 @@ static void armv7_pmnc_dump_regs(struct arm_pmu *cpu_pmu)
}
#endif
+/*
+ * Snapshot the PMU state of the current CPU into @regs.
+ *
+ * The control register is always captured in regs->pmc. If its enable bit
+ * (ARMV7_PMNC_E) is clear nothing else is saved. Otherwise the
+ * counter-enable, user-enable and interrupt-enable registers are recorded,
+ * the counter read via c9, c13, 0 is stored in pmxevtcnt[0], and the event
+ * type and count of every counter from ARMV7_IDX_COUNTER0 up to
+ * ARMV7_IDX_COUNTER_LAST(cpu_pmu) are stored at index cnt.
+ * NOTE(review): cnt indexes the 8-entry arrays of struct cpupmu_regs
+ * directly - confirm ARMV7_IDX_COUNTER_LAST() can never exceed 7.
+ */
+static void armv7pmu_save_regs(struct arm_pmu *cpu_pmu,
+ struct cpupmu_regs *regs)
+{
+ unsigned int cnt;
+ asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (regs->pmc));
+ /* PMU not enabled: there is no further state worth saving. */
+ if (!(regs->pmc & ARMV7_PMNC_E))
+ return;
+
+ asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r" (regs->pmcntenset));
+ asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r" (regs->pmuseren));
+ asm volatile("mrc p15, 0, %0, c9, c14, 1" : "=r" (regs->pmintenset));
+ asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (regs->pmxevtcnt[0]));
+ for (cnt = ARMV7_IDX_COUNTER0;
+ cnt <= ARMV7_IDX_COUNTER_LAST(cpu_pmu); cnt++) {
+ /* Select counter cnt before reading its type and count. */
+ armv7_pmnc_select_counter(cnt);
+ asm volatile("mrc p15, 0, %0, c9, c13, 1"
+ : "=r"(regs->pmxevttype[cnt]));
+ asm volatile("mrc p15, 0, %0, c9, c13, 2"
+ : "=r"(regs->pmxevtcnt[cnt]));
+ }
+ return;
+}
+
+/*
+ * Write the PMU state held in @regs back to the current CPU.
+ *
+ * Does nothing if the saved control register (regs->pmc) has the enable
+ * bit (ARMV7_PMNC_E) clear. Otherwise the registers are written in the
+ * same order armv7pmu_save_regs() reads them, and the control register
+ * itself is written last, once all other state is in place.
+ */
+static void armv7pmu_restore_regs(struct arm_pmu *cpu_pmu,
+ struct cpupmu_regs *regs)
+{
+ unsigned int cnt;
+ if (!(regs->pmc & ARMV7_PMNC_E))
+ return;
+
+ asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (regs->pmcntenset));
+ asm volatile("mcr p15, 0, %0, c9, c14, 0" : : "r" (regs->pmuseren));
+ asm volatile("mcr p15, 0, %0, c9, c14, 1" : : "r" (regs->pmintenset));
+ asm volatile("mcr p15, 0, %0, c9, c13, 0" : : "r" (regs->pmxevtcnt[0]));
+ for (cnt = ARMV7_IDX_COUNTER0;
+ cnt <= ARMV7_IDX_COUNTER_LAST(cpu_pmu); cnt++) {
+ /* Select counter cnt before writing its type and count. */
+ armv7_pmnc_select_counter(cnt);
+ asm volatile("mcr p15, 0, %0, c9, c13, 1"
+ : : "r"(regs->pmxevttype[cnt]));
+ asm volatile("mcr p15, 0, %0, c9, c13, 2"
+ : : "r"(regs->pmxevtcnt[cnt]));
+ }
+ /* Control register last. */
+ asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (regs->pmc));
+}
+
static void armv7pmu_enable_event(struct perf_event *event)
{
unsigned long flags;
@@ -1223,6 +1268,8 @@ static void armv7pmu_init(struct arm_pmu *cpu_pmu)
cpu_pmu->start = armv7pmu_start;
cpu_pmu->stop = armv7pmu_stop;
cpu_pmu->reset = armv7pmu_reset;
+ cpu_pmu->save_regs = armv7pmu_save_regs;
+ cpu_pmu->restore_regs = armv7pmu_restore_regs;
cpu_pmu->max_period = (1LLU << 32) - 1;
};
@@ -1240,7 +1287,7 @@ static u32 armv7_read_num_pmnc_events(void)
static int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu)
{
armv7pmu_init(cpu_pmu);
- cpu_pmu->name = "ARMv7 Cortex-A8";
+ cpu_pmu->name = "ARMv7_Cortex_A8";
cpu_pmu->map_event = armv7_a8_map_event;
cpu_pmu->num_events = armv7_read_num_pmnc_events();
return 0;
@@ -1249,7 +1296,7 @@ static int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu)
static int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu)
{
armv7pmu_init(cpu_pmu);
- cpu_pmu->name = "ARMv7 Cortex-A9";
+ cpu_pmu->name = "ARMv7_Cortex_A9";
cpu_pmu->map_event = armv7_a9_map_event;
cpu_pmu->num_events = armv7_read_num_pmnc_events();
return 0;
@@ -1258,7 +1305,7 @@ static int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu)
static int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu)
{
armv7pmu_init(cpu_pmu);
- cpu_pmu->name = "ARMv7 Cortex-A5";
+ cpu_pmu->name = "ARMv7_Cortex_A5";
cpu_pmu->map_event = armv7_a5_map_event;
cpu_pmu->num_events = armv7_read_num_pmnc_events();
return 0;
@@ -1267,7 +1314,7 @@ static int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu)
static int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu)
{
armv7pmu_init(cpu_pmu);
- cpu_pmu->name = "ARMv7 Cortex-A15";
+ cpu_pmu->name = "ARMv7_Cortex_A15";
cpu_pmu->map_event = armv7_a15_map_event;
cpu_pmu->num_events = armv7_read_num_pmnc_events();
cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
@@ -1277,7 +1324,7 @@ static int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu)
static int armv7_a7_pmu_init(struct arm_pmu *cpu_pmu)
{
armv7pmu_init(cpu_pmu);
- cpu_pmu->name = "ARMv7 Cortex-A7";
+ cpu_pmu->name = "ARMv7_Cortex_A7";
cpu_pmu->map_event = armv7_a7_map_event;
cpu_pmu->num_events = armv7_read_num_pmnc_events();
cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 5919eb451bb..dc05ca77963 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -66,6 +66,7 @@ enum ipi_msg_type {
IPI_CALL_FUNC,
IPI_CALL_FUNC_SINGLE,
IPI_CPU_STOP,
+ IPI_COMPLETION,
};
static DECLARE_COMPLETION(cpu_running);
@@ -463,6 +464,7 @@ static const char *ipi_types[NR_IPI] = {
S(IPI_CALL_FUNC, "Function call interrupts"),
S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"),
S(IPI_CPU_STOP, "CPU stop interrupts"),
+ S(IPI_COMPLETION, "completion interrupts"),
};
void show_ipi_list(struct seq_file *p, int prec)
@@ -588,6 +590,19 @@ static void ipi_cpu_stop(unsigned int cpu)
cpu_relax();
}
+/* Per-CPU completion signalled by ipi_complete() for that CPU. */
+static DEFINE_PER_CPU(struct completion *, cpu_completion);
+
+/*
+ * Record @completion as the completion to signal for @cpu.
+ * Always returns IPI_COMPLETION.
+ * NOTE(review): presumably the return value is the IPI number the caller
+ * has to raise on @cpu - confirm against the callers.
+ */
+int register_ipi_completion(struct completion *completion, int cpu)
+{
+ per_cpu(cpu_completion, cpu) = completion;
+ return IPI_COMPLETION;
+}
+
+/* Signal the completion recorded for @cpu by register_ipi_completion(). */
+static void ipi_complete(unsigned int cpu)
+{
+	struct completion *done = per_cpu(cpu_completion, cpu);
+
+	complete(done);
+}
+
/*
* Main handler for inter-processor interrupts
*/
@@ -638,6 +653,12 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
irq_exit();
break;
+ case IPI_COMPLETION:
+ irq_enter();
+ ipi_complete(cpu);
+ irq_exit();
+ break;
+
default:
printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n",
cpu, ipinr);
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index c5a59546a25..677da58d9e8 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -23,6 +23,7 @@
#include <linux/slab.h>
#include <asm/cputype.h>
+#include <asm/smp_plat.h>
#include <asm/topology.h>
/*
@@ -289,6 +290,140 @@ void store_cpu_topology(unsigned int cpuid)
cpu_topology[cpuid].socket_id, mpidr);
}
+
+#ifdef CONFIG_SCHED_HMP
+
+/*
+ * NULL-terminated list of device tree "compatible" strings that
+ * is_little_cpu() treats as little cores.
+ */
+static const char * const little_cores[] = {
+ "arm,cortex-a7",
+ NULL,
+};
+
+/*
+ * Return true if the device tree node @cn is compatible with any entry of
+ * little_cores[], false otherwise.
+ */
+static bool is_little_cpu(struct device_node *cn)
+{
+	int i;
+
+	for (i = 0; little_cores[i] != NULL; i++) {
+		if (of_device_is_compatible(cn, little_cores[i]))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * arch_get_fast_and_slow_cpus - split the CPUs into a fast and a slow mask
+ * @fast: mask receiving the fast CPUs
+ * @slow: mask receiving the slow (little) CPUs
+ *
+ * Both masks are cleared first. If CONFIG_HMP_FAST_CPU_MASK and
+ * CONFIG_HMP_SLOW_CPU_MASK are both non-empty they are parsed and used as
+ * given. Otherwise every device tree node of type "cpu" is classified with
+ * is_little_cpu(). If that does not yield at least one CPU of each kind,
+ * all CPUs are reported as fast and none as slow.
+ */
+void __init arch_get_fast_and_slow_cpus(struct cpumask *fast,
+					struct cpumask *slow)
+{
+	struct device_node *cn = NULL;
+	int cpu;
+
+	cpumask_clear(fast);
+	cpumask_clear(slow);
+
+	/*
+	 * Use the config options if they are given. This helps testing
+	 * HMP scheduling on systems without a big.LITTLE architecture.
+	 */
+	if (strlen(CONFIG_HMP_FAST_CPU_MASK) && strlen(CONFIG_HMP_SLOW_CPU_MASK)) {
+		if (cpulist_parse(CONFIG_HMP_FAST_CPU_MASK, fast))
+			WARN(1, "Failed to parse HMP fast cpu mask!\n");
+		if (cpulist_parse(CONFIG_HMP_SLOW_CPU_MASK, slow))
+			WARN(1, "Failed to parse HMP slow cpu mask!\n");
+		return;
+	}
+
+	/*
+	 * Else, parse device tree for little cores.
+	 */
+	while ((cn = of_find_node_by_type(cn, "cpu"))) {
+
+		const u32 *mpidr;
+		int len;
+
+		mpidr = of_get_property(cn, "reg", &len);
+		if (!mpidr || len != 4) {
+			pr_err("* %s missing reg property\n", cn->full_name);
+			continue;
+		}
+
+		cpu = get_logical_index(be32_to_cpup(mpidr));
+		if (cpu == -EINVAL) {
+			pr_err("couldn't get logical index for mpidr %x\n",
+			       be32_to_cpup(mpidr));
+			/*
+			 * of_find_node_by_type() only drops the reference on
+			 * a node when that node is handed back to it; since
+			 * we stop iterating here we must drop it ourselves.
+			 */
+			of_node_put(cn);
+			break;
+		}
+
+		if (is_little_cpu(cn))
+			cpumask_set_cpu(cpu, slow);
+		else
+			cpumask_set_cpu(cpu, fast);
+	}
+
+	if (!cpumask_empty(fast) && !cpumask_empty(slow))
+		return;
+
+	/*
+	 * We didn't find both big and little cores so let's call all cores
+	 * fast as this will keep the system running, with all cores being
+	 * treated equal.
+	 */
+	cpumask_setall(fast);
+	cpumask_clear(slow);
+}
+
+struct cpumask hmp_slow_cpu_mask;
+
+/*
+ * Allocate an hmp_domain spanning @cpus and insert it at the head of @list.
+ * The domain's online mask is @cpus restricted to cpu_online_mask.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int __init hmp_domain_add(struct list_head *list,
+				 const struct cpumask *cpus)
+{
+	struct hmp_domain *domain = kmalloc(sizeof(*domain), GFP_KERNEL);
+
+	if (!domain)
+		return -ENOMEM;
+
+	cpumask_copy(&domain->possible_cpus, cpus);
+	cpumask_and(&domain->cpus, cpu_online_mask, &domain->possible_cpus);
+	list_add(&domain->hmp_domains, list);
+	return 0;
+}
+
+/*
+ * arch_get_hmp_domains - build the list of HMP domains
+ * @hmp_domains_list: list head the domains are added to
+ *
+ * A domain for the slow CPUs is added only if there are any; a domain for
+ * the fast CPUs is always added. An allocation failure is logged and the
+ * affected domain is left out instead of dereferencing a NULL pointer.
+ */
+void __init arch_get_hmp_domains(struct list_head *hmp_domains_list)
+{
+	struct cpumask hmp_fast_cpu_mask;
+
+	arch_get_fast_and_slow_cpus(&hmp_fast_cpu_mask, &hmp_slow_cpu_mask);
+
+	/*
+	 * Initialize hmp_domains
+	 * Must be ordered with respect to compute capacity.
+	 * Fastest domain at head of list: each domain is inserted at the
+	 * head, so the slow one goes in first and the fast one last.
+	 */
+	if (!cpumask_empty(&hmp_slow_cpu_mask) &&
+	    hmp_domain_add(hmp_domains_list, &hmp_slow_cpu_mask))
+		pr_err("failed to allocate HMP slow domain\n");
+
+	if (hmp_domain_add(hmp_domains_list, &hmp_fast_cpu_mask))
+		pr_err("failed to allocate HMP fast domain\n");
+}
+#endif /* CONFIG_SCHED_HMP */
+
+
+/*
+ * cluster_to_logical_mask - look up the logical CPU mask of a cluster
+ * @socket_id: cluster HW identifier
+ * @cluster_mask: cpumask to fill in; written only when 0 is returned
+ *
+ * Only online CPUs are searched. The mask copied out is the core cpumask
+ * of the first online CPU whose physical package id equals @socket_id.
+ *
+ * Return:
+ *
+ * 0 on success
+ * -EINVAL if cluster_mask is NULL or no online CPU matches socket_id
+ */
+int cluster_to_logical_mask(unsigned int socket_id, cpumask_t *cluster_mask)
+{
+	int cpu;
+
+	if (cluster_mask == NULL)
+		return -EINVAL;
+
+	for_each_online_cpu(cpu) {
+		if (socket_id != topology_physical_package_id(cpu))
+			continue;
+
+		cpumask_copy(cluster_mask, topology_core_cpumask(cpu));
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+
/*
* init_cpu_topology is called at boot when only one cpu is running
* which prevent simultaneous write access to cpu_topology array
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 260a9193955..5402c94ab76 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3399,15 +3399,22 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
var->limit = vmx_read_guest_seg_limit(vmx, seg);
var->selector = vmx_read_guest_seg_selector(vmx, seg);
ar = vmx_read_guest_seg_ar(vmx, seg);
+ var->unusable = (ar >> 16) & 1;
var->type = ar & 15;
var->s = (ar >> 4) & 1;
var->dpl = (ar >> 5) & 3;
- var->present = (ar >> 7) & 1;
+ /*
+ * Some userspaces do not preserve unusable property. Since usable
+ * segment has to be present according to VMX spec we can use present
+ * property to amend userspace bug by making unusable segment always
+ * nonpresent. vmx_segment_access_rights() already marks nonpresent
+ * segment as unusable.
+ */
+ var->present = !var->unusable;
var->avl = (ar >> 12) & 1;
var->l = (ar >> 13) & 1;
var->db = (ar >> 14) & 1;
var->g = (ar >> 15) & 1;
- var->unusable = (ar >> 16) & 1;
}
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
diff --git a/block/blk-core.c b/block/blk-core.c
index d5745b5833c..0852e5d4343 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3180,7 +3180,8 @@ int __init blk_dev_init(void)
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
- WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ WQ_MEM_RECLAIM | WQ_HIGHPRI |
+ WQ_POWER_EFFICIENT, 0);
if (!kblockd_workqueue)
panic("Failed to create kblockd\n");
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 9c4bb8266bc..4464c823cff 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -144,7 +144,8 @@ void put_io_context(struct io_context *ioc)
if (atomic_long_dec_and_test(&ioc->refcount)) {
spin_lock_irqsave(&ioc->lock, flags);
if (!hlist_empty(&ioc->icq_list))
- schedule_work(&ioc->release_work);
+ queue_work(system_power_efficient_wq,
+ &ioc->release_work);
else
free_ioc = true;
spin_unlock_irqrestore(&ioc->lock, flags);
diff --git a/block/genhd.c b/block/genhd.c
index 20625eed551..dadf42b454a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -512,7 +512,7 @@ static void register_disk(struct gendisk *disk)
ddev->parent = disk->driverfs_dev;
- dev_set_name(ddev, disk->disk_name);
+ dev_set_name(ddev, "%s", disk->disk_name);
/* delay uevents, until we scanned partition table */
dev_set_uevent_suppress(ddev, 1);
@@ -1489,9 +1489,11 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now)
intv = disk_events_poll_jiffies(disk);
set_timer_slack(&ev->dwork.timer, intv / 4);
if (check_now)
- queue_delayed_work(system_freezable_wq, &ev->dwork, 0);
+ queue_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, 0);
else if (intv)
- queue_delayed_work(system_freezable_wq, &ev->dwork, intv);
+ queue_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, intv);
out_unlock:
spin_unlock_irqrestore(&ev->lock, flags);
}
@@ -1534,7 +1536,8 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
spin_lock_irq(&ev->lock);
ev->clearing |= mask;
if (!ev->block)
- mod_delayed_work(system_freezable_wq, &ev->dwork, 0);
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, 0);
spin_unlock_irq(&ev->lock);
}
@@ -1627,7 +1630,8 @@ static void disk_check_events(struct disk_events *ev,
intv = disk_events_poll_jiffies(disk);
if (!ev->block && intv)
- queue_delayed_work(system_freezable_wq, &ev->dwork, intv);
+ queue_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, intv);
spin_unlock_irq(&ev->lock);
diff --git a/crypto/algapi.c b/crypto/algapi.c
index 6149a6e0964..7a1ae87f168 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -495,7 +495,8 @@ static struct crypto_template *__crypto_lookup_template(const char *name)
struct crypto_template *crypto_lookup_template(const char *name)
{
- return try_then_request_module(__crypto_lookup_template(name), name);
+ return try_then_request_module(__crypto_lookup_template(name), "%s",
+ name);
}
EXPORT_SYMBOL_GPL(crypto_lookup_template);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 037288e7874..46b35f7acfd 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -714,7 +714,8 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
else
blk_queue_flush(nbd->disk->queue, 0);
- thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name);
+ thread = kthread_create(nbd_thread, nbd, "%s",
+ nbd->disk->disk_name);
if (IS_ERR(thread)) {
mutex_lock(&nbd->tx_lock);
return PTR_ERR(thread);
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index d620b449574..8a3aff724d9 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2882,7 +2882,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
if (lba < 0)
return -EINVAL;
- cgc->buffer = kmalloc(blocksize, GFP_KERNEL);
+ cgc->buffer = kzalloc(blocksize, GFP_KERNEL);
if (cgc->buffer == NULL)
return -ENOMEM;
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index cd98e162a23..c44338b60fc 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -350,6 +350,7 @@ static int __cpuinit cpufreq_stat_cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
cpufreq_update_policy(cpu);
break;
case CPU_DOWN_PREPARE:
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index fe44d3e2c70..6f00dfa7d4f 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -253,10 +253,9 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids)
return -EINVAL;
+ raw_spin_lock(&irq_controller_lock);
mask = 0xff << shift;
bit = gic_cpu_map[cpu] << shift;
-
- raw_spin_lock(&irq_controller_lock);
val = readl_relaxed(reg) & ~mask;
writel_relaxed(val | bit, reg);
raw_spin_unlock(&irq_controller_lock);
@@ -652,7 +651,9 @@ static void __init gic_pm_init(struct gic_chip_data *gic)
void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
{
int cpu;
- unsigned long map = 0;
+ unsigned long flags, map = 0;
+
+ raw_spin_lock_irqsave(&irq_controller_lock, flags);
/* Convert our logical CPU mask into a physical one. */
for_each_cpu(cpu, mask)
@@ -666,9 +667,145 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
/* this always happens on GIC0 */
writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
+
+ raw_spin_unlock_irqrestore(&irq_controller_lock, flags);
}
#endif
+#ifdef CONFIG_BL_SWITCHER
+/*
+ * gic_send_sgi - send a SGI directly to given CPU interface number
+ *
+ * cpu_id: the ID for the destination CPU interface
+ * irq: the IPI number to send a SGI for
+ */
+void gic_send_sgi(unsigned int cpu_id, unsigned int irq)
+{
+ BUG_ON(cpu_id >= NR_GIC_CPU_IF);
+ cpu_id = 1 << cpu_id;
+ /* this always happens on GIC0 */
+ writel_relaxed((cpu_id << 16) | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
+}
+
+/*
+ * gic_get_cpu_id - get the CPU interface ID for the specified CPU
+ *
+ * @cpu: the logical CPU number to get the GIC ID for.
+ *
+ * Return the CPU interface ID for the given logical CPU number,
+ * or -1 if the CPU number is too large or the interface ID is
+ * unknown (more than one bit set).
+ */
+int gic_get_cpu_id(unsigned int cpu)
+{
+ unsigned int cpu_bit;
+
+ if (cpu >= NR_GIC_CPU_IF)
+ return -1;
+ cpu_bit = gic_cpu_map[cpu];
+ if (cpu_bit & (cpu_bit - 1))
+ return -1;
+ return __ffs(cpu_bit);
+}
+
+/*
+ * gic_migrate_target - migrate IRQs to another CPU interface
+ *
+ * @new_cpu_id: the CPU target ID to migrate IRQs to
+ *
+ * Migrate all peripheral interrupts with a target matching the current CPU
+ * to the interface corresponding to @new_cpu_id. The CPU interface mapping
+ * is also updated. Targets to other CPU interfaces are unchanged.
+ * This must be called with IRQs locally disabled.
+ */
+void gic_migrate_target(unsigned int new_cpu_id)
+{
+ unsigned int old_cpu_id, gic_irqs, gic_nr = 0;
+ void __iomem *dist_base;
+ int i, ror_val, cpu = smp_processor_id();
+ u32 val, old_mask, active_mask;
+
+ if (gic_nr >= MAX_GIC_NR)
+ BUG();
+
+ dist_base = gic_data_dist_base(&gic_data[gic_nr]);
+ if (!dist_base)
+ return;
+ gic_irqs = gic_data[gic_nr].gic_irqs;
+
+ old_cpu_id = __ffs(gic_cpu_map[cpu]);
+ old_mask = 0x01010101 << old_cpu_id;
+ ror_val = (old_cpu_id - new_cpu_id) & 31;
+
+ raw_spin_lock(&irq_controller_lock);
+
+ gic_cpu_map[cpu] = 1 << new_cpu_id;
+
+ for (i = 8; i < DIV_ROUND_UP(gic_irqs, 4); i++) {
+ val = readl_relaxed(dist_base + GIC_DIST_TARGET + i * 4);
+ active_mask = val & old_mask;
+ if (active_mask) {
+ val &= ~active_mask;
+ val |= ror32(active_mask, ror_val);
+ writel_relaxed(val, dist_base + GIC_DIST_TARGET + i * 4);
+ }
+ }
+
+ raw_spin_unlock(&irq_controller_lock);
+
+ /*
+ * Now let's migrate and clear any potential SGIs that might be
+ * pending for us (old_cpu_id). Since GIC_DIST_SGI_PENDING_SET
+ * is a banked register, we can only forward the SGI using
+ * GIC_DIST_SOFTINT. The original SGI source is lost but Linux
+ * doesn't use that information anyway.
+ *
+ * For the same reason we do not adjust SGI source information
+ * for previously sent SGIs by us to other CPUs either.
+ */
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ val = readl_relaxed(dist_base + GIC_DIST_SGI_PENDING_SET + i);
+ if (!val)
+ continue;
+ writel_relaxed(val, dist_base + GIC_DIST_SGI_PENDING_CLEAR + i);
+ for (j = i; j < i + 4; j++) {
+ if (val & 0xff)
+ writel_relaxed((1 << (new_cpu_id + 16)) | j,
+ dist_base + GIC_DIST_SOFTINT);
+ val >>= 8;
+ }
+ }
+}
+
+/*
+ * gic_get_sgir_physaddr - get the physical address for the SGI register
+ *
+ * Return the physical address of the SGI register to be used
+ * by some early assembly code when the kernel is not yet available.
+ */
+static unsigned long gic_dist_physaddr;
+
+unsigned long gic_get_sgir_physaddr(void)
+{
+ if (!gic_dist_physaddr)
+ return 0;
+ return gic_dist_physaddr + GIC_DIST_SOFTINT;
+}
+
+void __init gic_init_physaddr(struct device_node *node)
+{
+ struct resource res;
+ if (of_address_to_resource(node, 0, &res) == 0) {
+ gic_dist_physaddr = res.start;
+ pr_info("GIC physical location is %#lx\n", gic_dist_physaddr);
+ }
+}
+
+#else
+#define gic_init_physaddr(node) do { } while(0)
+#endif
+
static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
irq_hw_number_t hw)
{
@@ -850,6 +987,8 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent)
percpu_offset = 0;
gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node);
+ if (!gic_cnt)
+ gic_init_physaddr(node);
if (parent) {
irq = irq_of_parse_and_map(node, 0);
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 38f0b312ff8..663d2d0448b 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -439,7 +439,7 @@ void phy_start_machine(struct phy_device *phydev,
{
phydev->adjust_state = handler;
- schedule_delayed_work(&phydev->state_queue, HZ);
+ queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, HZ);
}
/**
@@ -500,7 +500,7 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat)
disable_irq_nosync(irq);
atomic_inc(&phydev->irq_disable);
- schedule_work(&phydev->phy_queue);
+ queue_work(system_power_efficient_wq, &phydev->phy_queue);
return IRQ_HANDLED;
}
@@ -655,7 +655,7 @@ static void phy_change(struct work_struct *work)
/* reschedule state queue work to run as soon as possible */
cancel_delayed_work_sync(&phydev->state_queue);
- schedule_delayed_work(&phydev->state_queue, 0);
+ queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, 0);
return;
@@ -918,7 +918,8 @@ void phy_state_machine(struct work_struct *work)
if (err < 0)
phy_error(phydev);
- schedule_delayed_work(&phydev->state_queue, PHY_STATE_TIME * HZ);
+ queue_delayed_work(system_power_efficient_wq, &phydev->state_queue,
+ PHY_STATE_TIME * HZ);
}
static inline void mmd_phy_indirect(struct mii_bus *bus, int prtad, int devad,
diff --git a/drivers/power/charger-manager.c b/drivers/power/charger-manager.c
index fefc39fe42b..98de1ddce45 100644
--- a/drivers/power/charger-manager.c
+++ b/drivers/power/charger-manager.c
@@ -450,7 +450,7 @@ static void uevent_notify(struct charger_manager *cm, const char *event)
strncpy(env_str, event, UEVENT_BUF_SIZE);
kobject_uevent(&cm->dev->kobj, KOBJ_CHANGE);
- dev_info(cm->dev, event);
+ dev_info(cm->dev, "%s", event);
}
/**
diff --git a/drivers/scsi/osd/osd_uld.c b/drivers/scsi/osd/osd_uld.c
index 0fab6b5c7b8..9d86947d67f 100644
--- a/drivers/scsi/osd/osd_uld.c
+++ b/drivers/scsi/osd/osd_uld.c
@@ -485,7 +485,7 @@ static int osd_probe(struct device *dev)
oud->class_dev.class = &osd_uld_class;
oud->class_dev.parent = dev;
oud->class_dev.release = __remove;
- error = dev_set_name(&oud->class_dev, disk->disk_name);
+ error = dev_set_name(&oud->class_dev, "%s", disk->disk_name);
if (error) {
OSD_ERR("dev_set_name failed => %d\n", error);
goto err_put_cdev;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index c1c555242d0..6f6a1b48f99 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -142,7 +142,7 @@ sd_store_cache_type(struct device *dev, struct device_attribute *attr,
char *buffer_data;
struct scsi_mode_data data;
struct scsi_sense_hdr sshdr;
- const char *temp = "temporary ";
+ static const char temp[] = "temporary ";
int len;
if (sdp->type != TYPE_DISK)
diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index 26e3a97ab15..c52948b368d 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c
@@ -4797,10 +4797,6 @@ static struct pci_device_id serial_pci_tbl[] = {
PCI_VENDOR_ID_IBM, 0x0299,
0, 0, pbn_b0_bt_2_115200 },
- { PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9835,
- 0x1000, 0x0012,
- 0, 0, pbn_b0_bt_2_115200 },
-
{ PCI_VENDOR_ID_NETMOS, PCI_DEVICE_ID_NETMOS_9901,
0xA000, 0x1000,
0, 0, pbn_b0_1_115200 },
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 6464029e486..44766821390 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -1618,6 +1618,8 @@ static void release_tty(struct tty_struct *tty, int idx)
tty_free_termios(tty);
tty_driver_remove_tty(tty->driver, tty);
tty->port->itty = NULL;
+ if (tty->link)
+ tty->link->port->itty = NULL;
cancel_work_sync(&tty->port->buf.work);
if (tty->link)
diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index a92783e480e..0d8f98c79a6 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -404,7 +404,7 @@ static void cursor_timer_handler(unsigned long dev_addr)
struct fb_info *info = (struct fb_info *) dev_addr;
struct fbcon_ops *ops = info->fbcon_par;
- schedule_work(&info->queue);
+ queue_work(system_power_efficient_wq, &info->queue);
mod_timer(&ops->cursor_timer, jiffies + HZ/5);
}
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9b6b2b6dd16..be661d8f532 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
if (!ceph_is_valid_xattr(name))
return -ENODATA;
- spin_lock(&ci->i_ceph_lock);
- dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
err = vxattr->getxattr_cb(ci, value, size);
- goto out;
+ return err;
}
+ spin_lock(&ci->i_ceph_lock);
+ dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+ ci->i_xattrs.version, ci->i_xattrs.index_version);
+
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
(ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto get_xattr;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index 4acb19d7835..803d3da3a0f 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -17,7 +17,8 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
struct quad_buffer_head *qbh, char *id)
{
secno sec;
- if (hpfs_sb(s)->sb_chk) if (bmp_block * 16384 > hpfs_sb(s)->sb_fs_size) {
+ unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
+ if (hpfs_sb(s)->sb_chk) if (bmp_block >= n_bands) {
hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id);
return NULL;
}
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a0617e70695..962e90c37ae 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -558,7 +558,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
sbi->sb_cp_table = NULL;
sbi->sb_c_bitmap = -1;
sbi->sb_max_fwd_alloc = 0xffffff;
-
+
+ if (sbi->sb_fs_size >= 0x80000000) {
+ hpfs_error(s, "invalid size in superblock: %08x",
+ (unsigned)sbi->sb_fs_size);
+ goto bail4;
+ }
+
/* Load bitmap directory */
if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps))))
goto bail4;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1fab140764c..2c37442ed93 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -228,19 +228,8 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
return status;
}
-/*
- * Back channel returns NFS4ERR_DELAY for new requests when
- * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
- * is ended.
- */
-static void nfs4_end_drain_session(struct nfs_client *clp)
+static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
{
- struct nfs4_session *ses = clp->cl_session;
- struct nfs4_slot_table *tbl;
-
- if (ses == NULL)
- return;
- tbl = &ses->fc_slot_table;
if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
spin_lock(&tbl->slot_tbl_lock);
nfs41_wake_slot_table(tbl);
@@ -248,6 +237,16 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
}
}
+static void nfs4_end_drain_session(struct nfs_client *clp)
+{
+ struct nfs4_session *ses = clp->cl_session;
+
+ if (ses != NULL) {
+ nfs4_end_drain_slot_table(&ses->bc_slot_table);
+ nfs4_end_drain_slot_table(&ses->fc_slot_table);
+ }
+}
+
/*
* Signal state manager thread if session fore channel is drained
*/
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6cd86e0fe45..582321a978b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -162,8 +162,8 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
*/
memcpy(p, argp->p, avail);
/* step to next page */
- argp->p = page_address(argp->pagelist[0]);
argp->pagelist++;
+ argp->p = page_address(argp->pagelist[0]);
if (argp->pagelen < PAGE_SIZE) {
argp->end = argp->p + (argp->pagelen>>2);
argp->pagelen = 0;
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 379f7150899..0442c3d800f 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -160,11 +160,6 @@ static inline void ceph_decode_timespec(struct timespec *ts,
static inline void ceph_encode_timespec(struct ceph_timespec *tv,
const struct timespec *ts)
{
- BUG_ON(ts->tv_sec < 0);
- BUG_ON(ts->tv_sec > (__kernel_time_t)U32_MAX);
- BUG_ON(ts->tv_nsec < 0);
- BUG_ON(ts->tv_nsec > (long)U32_MAX);
-
tv->tv_sec = cpu_to_le32((u32)ts->tv_sec);
tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec);
}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6b4890fa57e..feaf0c7fb7d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -358,6 +358,17 @@ static inline int hstate_index(struct hstate *h)
return h - hstates;
}
+pgoff_t __basepage_index(struct page *page);
+
+/* Return page->index in PAGE_SIZE units */
+static inline pgoff_t basepage_index(struct page *page)
+{
+ if (!PageCompound(page))
+ return page->index;
+
+ return __basepage_index(page);
+}
+
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
#define alloc_huge_page_node(h, nid) NULL
@@ -378,6 +389,11 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
}
#define hstate_index_to_shift(index) 0
#define hstate_index(h) 0
+
+static inline pgoff_t basepage_index(struct page *page)
+{
+ return page->index;
+}
#endif /* CONFIG_HUGETLB_PAGE */
#endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index b5696108c06..40643ca79cd 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -31,6 +31,8 @@
#define GIC_DIST_TARGET 0x800
#define GIC_DIST_CONFIG 0xc00
#define GIC_DIST_SOFTINT 0xf00
+#define GIC_DIST_SGI_PENDING_CLEAR 0xf10
+#define GIC_DIST_SGI_PENDING_SET 0xf20
#define GICH_HCR 0x0
#define GICH_VTR 0x4
@@ -75,6 +77,11 @@ static inline void gic_init(unsigned int nr, int start,
gic_init_bases(nr, start, dist, cpu, 0, NULL);
}
+void gic_send_sgi(unsigned int cpu_id, unsigned int irq);
+int gic_get_cpu_id(unsigned int cpu);
+void gic_migrate_target(unsigned int new_cpu_id);
+unsigned long gic_get_sgir_physaddr(void);
+
#endif /* __ASSEMBLY */
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 178a8d909f1..0e2a546cdad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -885,6 +885,13 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
bool cpus_share_cache(int this_cpu, int that_cpu);
+#ifdef CONFIG_SCHED_HMP
+struct hmp_domain {
+ struct cpumask cpus;
+ struct cpumask possible_cpus;
+ struct list_head hmp_domains;
+};
+#endif /* CONFIG_SCHED_HMP */
#else /* CONFIG_SMP */
struct sched_domain_attr;
@@ -931,6 +938,12 @@ struct sched_avg {
u64 last_runnable_update;
s64 decay_count;
unsigned long load_avg_contrib;
+ unsigned long load_avg_ratio;
+#ifdef CONFIG_SCHED_HMP
+ u64 hmp_last_up_migration;
+ u64 hmp_last_down_migration;
+#endif
+ u32 usage_avg_sum;
};
#ifdef CONFIG_SCHEDSTATS
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index c586679b6fe..a30ab7910ff 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -198,7 +198,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item);
extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
-void refresh_cpu_vm_stats(int);
+bool refresh_cpu_vm_stats(int);
void refresh_zone_stat_thresholds(void);
void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 623488fdc1f..a9f4119c7e2 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -303,6 +303,33 @@ enum {
WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */
WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */
+ /*
+ * Per-cpu workqueues are generally preferred because they tend to
+ * show better performance thanks to cache locality. Per-cpu
+ * workqueues exclude the scheduler from choosing the CPU to
+ * execute the worker threads, which has an unfortunate side effect
+ * of increasing power consumption.
+ *
+ * The scheduler considers a CPU idle if it doesn't have any task
+ * to execute and tries to keep idle cores idle to conserve power;
+ * however, for example, a per-cpu work item scheduled from an
+ * interrupt handler on an idle CPU will force the scheduler to
+ * execute the work item on that CPU breaking the idleness, which in
+ * turn may lead to more scheduling choices which are sub-optimal
+ * in terms of power consumption.
+ *
+ * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default
+ * but become unbound if workqueue.power_efficient kernel param is
+ * specified. Per-cpu workqueues which are found to
+ * contribute significantly to power consumption are
+ * marked with this flag; enabling the power_efficient mode
+ * leads to noticeable power saving at the cost of a small
+ * performance disadvantage.
+ *
+ * http://thread.gmane.org/gmane.linux.kernel/1480396
+ */
+ WQ_POWER_EFFICIENT = 1 << 7,
+
__WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */
__WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */
@@ -333,11 +360,19 @@ enum {
*
* system_freezable_wq is equivalent to system_wq except that it's
* freezable.
+ *
+ * *_power_efficient_wq are inclined towards saving power and converted
+ * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise,
+ * they are same as their non-power-efficient counterparts - e.g.
+ * system_power_efficient_wq is identical to system_wq if
+ * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info.
*/
extern struct workqueue_struct *system_wq;
extern struct workqueue_struct *system_long_wq;
extern struct workqueue_struct *system_unbound_wq;
extern struct workqueue_struct *system_freezable_wq;
+extern struct workqueue_struct *system_power_efficient_wq;
+extern struct workqueue_struct *system_freezable_power_efficient_wq;
static inline struct workqueue_struct * __deprecated __system_nrt_wq(void)
{
diff --git a/include/trace/events/power_cpu_migrate.h b/include/trace/events/power_cpu_migrate.h
new file mode 100644
index 00000000000..f76dd4de625
--- /dev/null
+++ b/include/trace/events/power_cpu_migrate.h
@@ -0,0 +1,67 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM power
+
+#if !defined(_TRACE_POWER_CPU_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_POWER_CPU_MIGRATE_H
+
+#include <linux/tracepoint.h>
+
+#define __cpu_migrate_proto \
+ TP_PROTO(u64 timestamp, \
+ u32 cpu_hwid)
+#define __cpu_migrate_args \
+ TP_ARGS(timestamp, \
+ cpu_hwid)
+
+DECLARE_EVENT_CLASS(cpu_migrate,
+
+ __cpu_migrate_proto,
+ __cpu_migrate_args,
+
+ TP_STRUCT__entry(
+ __field(u64, timestamp )
+ __field(u32, cpu_hwid )
+ ),
+
+ TP_fast_assign(
+ __entry->timestamp = timestamp;
+ __entry->cpu_hwid = cpu_hwid;
+ ),
+
+ TP_printk("timestamp=%llu cpu_hwid=0x%08lX",
+ (unsigned long long)__entry->timestamp,
+ (unsigned long)__entry->cpu_hwid
+ )
+);
+
+#define __define_cpu_migrate_event(name) \
+ DEFINE_EVENT(cpu_migrate, cpu_migrate_##name, \
+ __cpu_migrate_proto, \
+ __cpu_migrate_args \
+ )
+
+__define_cpu_migrate_event(begin);
+__define_cpu_migrate_event(finish);
+__define_cpu_migrate_event(current);
+
+#undef __define_cpu_migrate_event
+#undef __cpu_migrate_proto
+#undef __cpu_migrate_args
+
+/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
+#ifndef _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
+#define _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
+
+/*
+ * Set from_phys_cpu and to_phys_cpu to CPU_MIGRATE_ALL_CPUS to indicate
+ * a whole-cluster migration:
+ */
+#define CPU_MIGRATE_ALL_CPUS 0x80000000U
+#endif
+
+#endif /* _TRACE_POWER_CPU_MIGRATE_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE power_cpu_migrate
+#include <trace/define_trace.h>
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index e5586caff67..203e8e9933b 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -430,6 +430,159 @@ TRACE_EVENT(sched_pi_setprio,
__entry->oldprio, __entry->newprio)
);
+/*
+ * Tracepoint for showing tracked load contribution.
+ */
+TRACE_EVENT(sched_task_load_contrib,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long load_contrib),
+
+ TP_ARGS(tsk, load_contrib),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, load_contrib)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->load_contrib = load_contrib;
+ ),
+
+ TP_printk("comm=%s pid=%d load_contrib=%lu",
+ __entry->comm, __entry->pid,
+ __entry->load_contrib)
+);
+
+/*
+ * Tracepoint for showing tracked task runnable ratio [0..1023].
+ */
+TRACE_EVENT(sched_task_runnable_ratio,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long ratio),
+
+ TP_ARGS(tsk, ratio),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("comm=%s pid=%d ratio=%lu",
+ __entry->comm, __entry->pid,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for showing tracked rq runnable ratio [0..1023].
+ */
+TRACE_EVENT(sched_rq_runnable_ratio,
+
+ TP_PROTO(int cpu, unsigned long ratio),
+
+ TP_ARGS(cpu, ratio),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("cpu=%d ratio=%lu",
+ __entry->cpu,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for showing tracked rq runnable load.
+ */
+TRACE_EVENT(sched_rq_runnable_load,
+
+ TP_PROTO(int cpu, u64 load),
+
+ TP_ARGS(cpu, load),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(u64, load)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->load = load;
+ ),
+
+ TP_printk("cpu=%d load=%llu",
+ __entry->cpu,
+ __entry->load)
+);
+
+/*
+ * Tracepoint for showing tracked task cpu usage ratio [0..1023].
+ */
+TRACE_EVENT(sched_task_usage_ratio,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long ratio),
+
+ TP_ARGS(tsk, ratio),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(unsigned long, ratio)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->ratio = ratio;
+ ),
+
+ TP_printk("comm=%s pid=%d ratio=%lu",
+ __entry->comm, __entry->pid,
+ __entry->ratio)
+);
+
+/*
+ * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations.
+ */
+TRACE_EVENT(sched_hmp_migrate,
+
+ TP_PROTO(struct task_struct *tsk, int dest, int force),
+
+ TP_ARGS(tsk, dest, force),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(int, dest)
+ __field(int, force)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->dest = dest;
+ __entry->force = force;
+ ),
+
+ TP_printk("comm=%s pid=%d dest=%d force=%d",
+ __entry->comm, __entry->pid,
+ __entry->dest, __entry->force)
+);
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/kernel/futex.c b/kernel/futex.c
index b26dcfc02c9..49dacfb4574 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -61,6 +61,7 @@
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
#include <linux/sched/rt.h>
+#include <linux/hugetlb.h>
#include <asm/futex.h>
@@ -365,7 +366,7 @@ again:
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.inode = page_head->mapping->host;
- key->shared.pgoff = page_head->index;
+ key->shared.pgoff = basepage_index(page);
}
get_futex_key_refs(key);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 192a302d6cf..473b2b6eccb 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -23,10 +23,27 @@
static struct lock_class_key irq_desc_lock_class;
#if defined(CONFIG_SMP)
+static int __init irq_affinity_setup(char *str)
+{
+ zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+ cpulist_parse(str, irq_default_affinity);
+ /*
+ * Set at least the boot cpu. We don't want to end up with
+ * bugreports caused by random commandline masks
+ */
+ cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
+ return 1;
+}
+__setup("irqaffinity=", irq_affinity_setup);
+
static void __init init_irq_default_affinity(void)
{
- alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
- cpumask_setall(irq_default_affinity);
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ if (!irq_default_affinity)
+ zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+#endif
+ if (cpumask_empty(irq_default_affinity))
+ cpumask_setall(irq_default_affinity);
}
#else
static void __init init_irq_default_affinity(void)
diff --git a/kernel/module.c b/kernel/module.c
index cab4bce49c2..fa53db8aade 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2927,7 +2927,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
{
/* Module within temporary copy. */
struct module *mod;
- Elf_Shdr *pcpusec;
int err;
mod = setup_load_info(info, flags);
@@ -2942,17 +2941,10 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
err = module_frob_arch_sections(info->hdr, info->sechdrs,
info->secstrings, mod);
if (err < 0)
- goto out;
+ return ERR_PTR(err);
- pcpusec = &info->sechdrs[info->index.pcpu];
- if (pcpusec->sh_size) {
- /* We have a special allocation for this section. */
- err = percpu_modalloc(mod,
- pcpusec->sh_size, pcpusec->sh_addralign);
- if (err)
- goto out;
- pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
- }
+ /* We will do a special allocation for per-cpu sections later. */
+ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
/* Determine total sizes, and put offsets in sh_entsize. For now
this is done generically; there doesn't appear to be any
@@ -2963,17 +2955,22 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
/* Allocate and move to the final place */
err = move_module(mod, info);
if (err)
- goto free_percpu;
+ return ERR_PTR(err);
/* Module has been copied to its final place now: return it. */
mod = (void *)info->sechdrs[info->index.mod].sh_addr;
kmemleak_load_module(mod, info);
return mod;
+}
-free_percpu:
- percpu_modfree(mod);
-out:
- return ERR_PTR(err);
+static int alloc_module_percpu(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
+ if (!pcpusec->sh_size)
+ return 0;
+
+ /* We have a special allocation for this section. */
+ return percpu_modalloc(mod, pcpusec->sh_size, pcpusec->sh_addralign);
}
/* mod is no longer valid after this! */
@@ -3237,6 +3234,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
}
#endif
+ /* To avoid stressing percpu allocator, do this once we're unique. */
+ err = alloc_module_percpu(mod, info);
+ if (err)
+ goto unlink_mod;
+
/* Now module is in final location, initialize linked lists, etc. */
err = module_unload_init(mod);
if (err)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5dfdc9ea180..46455961a88 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -263,6 +263,26 @@ config PM_GENERIC_DOMAINS
bool
depends on PM
+config WQ_POWER_EFFICIENT_DEFAULT
+ bool "Enable workqueue power-efficient mode by default"
+ depends on PM
+ default n
+ help
+ Per-cpu workqueues are generally preferred because they show
+ better performance thanks to cache locality; unfortunately,
+ per-cpu workqueues tend to be more power hungry than unbound
+ workqueues.
+
+ Enabling workqueue.power_efficient kernel parameter makes the
+ per-cpu workqueues which were observed to contribute
+ significantly to power consumption unbound, leading to measurably
+ lower power usage at the cost of small performance overhead.
+
+ This config option determines whether workqueue.power_efficient
+ is enabled by default.
+
+ If in doubt, say N.
+
config PM_GENERIC_DOMAINS_SLEEP
def_bool y
depends on PM_SLEEP && PM_GENERIC_DOMAINS
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8b335016c5..50d9e9849ce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1617,6 +1617,20 @@ static void __sched_fork(struct task_struct *p)
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
p->se.avg.runnable_avg_period = 0;
p->se.avg.runnable_avg_sum = 0;
+#ifdef CONFIG_SCHED_HMP
+ /* keep LOAD_AVG_MAX in sync with fair.c if load avg series is changed */
+#define LOAD_AVG_MAX 47742
+ if (p->mm) {
+ p->se.avg.hmp_last_up_migration = 0;
+ p->se.avg.hmp_last_down_migration = 0;
+ p->se.avg.load_avg_ratio = 1023;
+ p->se.avg.load_avg_contrib =
+ (1023 * scale_load_down(p->se.load.weight));
+ p->se.avg.runnable_avg_period = LOAD_AVG_MAX;
+ p->se.avg.runnable_avg_sum = LOAD_AVG_MAX;
+ p->se.avg.usage_avg_sum = LOAD_AVG_MAX;
+ }
+#endif
#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -3813,6 +3827,8 @@ static struct task_struct *find_process_by_pid(pid_t pid)
return pid ? find_task_by_vpid(pid) : current;
}
+extern struct cpumask hmp_slow_cpu_mask;
+
/* Actually do priority change: must hold rq lock. */
static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
@@ -3822,8 +3838,13 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
p->normal_prio = normal_prio(p);
/* we are holding p->pi_lock already */
p->prio = rt_mutex_getprio(p);
- if (rt_prio(p->prio))
+ if (rt_prio(p->prio)) {
p->sched_class = &rt_sched_class;
+#ifdef CONFIG_SCHED_HMP
+ if (cpumask_equal(&p->cpus_allowed, cpu_all_mask))
+ do_set_cpus_allowed(p, &hmp_slow_cpu_mask);
+#endif
+ }
else
p->sched_class = &fair_sched_class;
set_load_weight(p);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 75024a67352..fbd8caa83ef 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#ifdef CONFIG_SMP
P(se->avg.runnable_avg_sum);
P(se->avg.runnable_avg_period);
+ P(se->avg.usage_avg_sum);
P(se->avg.load_avg_contrib);
P(se->avg.decay_count);
#endif
@@ -223,6 +224,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->tg_runnable_contrib);
SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
atomic_read(&cfs_rq->tg->runnable_avg));
+ SEQ_printf(m, " .%-30s: %d\n", "tg->usage_avg",
+ atomic_read(&cfs_rq->tg->usage_avg));
#endif
print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c61a614465c..c849d68a9b7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -31,9 +31,20 @@
#include <linux/task_work.h>
#include <trace/events/sched.h>
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Include cpufreq header to add a notifier so that cpu frequency
+ * scaling can track the current CPU frequency
+ */
+#include <linux/cpufreq.h>
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
#include "sched.h"
+
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -1201,8 +1212,95 @@ static u32 __compute_runnable_contrib(u64 n)
return contrib + runnable_avg_yN_sum[n];
}
-/*
- * We can represent the historical contribution to runnable average as the
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+
+#define HMP_VARIABLE_SCALE_SHIFT 16ULL
+struct hmp_global_attr {
+ struct attribute attr; /* embedded sysfs attribute; hmp_show()/hmp_store() recover this wrapper from it */
+ ssize_t (*show)(struct kobject *kobj,
+ struct attribute *attr, char *buf);
+ ssize_t (*store)(struct kobject *a, struct attribute *b,
+ const char *c, size_t count);
+ int *value; /* variable printed by hmp_show() and updated by hmp_store() */
+ int (*to_sysfs)(int); /* optional: converts *value before it is printed */
+ int (*from_sysfs)(int); /* optional: converts parsed input; a negative result is rejected */
+};
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+#define HMP_DATA_SYSFS_MAX 4
+#else
+#define HMP_DATA_SYSFS_MAX 3
+#endif
+
+struct hmp_data_struct {
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ int freqinvar_load_scale_enabled; /* non-zero: load updates use freq_scale[cpu].curr_scale */
+#endif
+ int multiplier; /* used to scale the time delta */
+ struct attribute_group attr_group; /* registered under kernel_kobj with the name "hmp" */
+ struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1]; /* NULL-terminated list; the extra slot holds the terminator */
+ struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX]; /* storage for the entries the list above points at */
+} hmp_data;
+
+static u64 hmp_variable_scale_convert(u64 delta);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Frequency-Invariant Load Modification:
+ * Loads are calculated as in PJT's patch however we also scale the current
+ * contribution in line with the frequency of the CPU that the task was
+ * executed on.
+ * In this version, we use a simple linear scale derived from the maximum
+ * frequency reported by CPUFreq. As an example:
+ *
+ * Consider that we ran a task for 100% of the previous interval.
+ *
+ * Our CPU was under asynchronous frequency control through one of the
+ * CPUFreq governors.
+ *
+ * The CPUFreq governor reports that it is able to scale the CPU between
+ * 500MHz and 1GHz.
+ *
+ * During the period, the CPU was running at 1GHz.
+ *
+ * In this case, our load contribution for that period is calculated as
+ * 1 * (number_of_active_microseconds)
+ *
+ * This results in our task being able to accumulate maximum load as normal.
+ *
+ *
+ * Consider now that our CPU was executing at 500MHz.
+ *
+ * We now scale the load contribution such that it is calculated as
+ * 0.5 * (number_of_active_microseconds)
+ *
+ * Our task can only record 50% maximum load during this period.
+ *
+ * This represents the task consuming 50% of the CPU's *possible* compute
+ * capacity. However the task did consume 100% of the CPU's *available*
+ * compute capacity which is the value seen by the CPUFreq governor and
+ * user-side CPU Utilization tools.
+ *
+ * Restricting tracked load to be scaled by the CPU's frequency accurately
+ * represents the consumption of possible compute capacity and allows the
+ * HMP migration's simple threshold migration strategy to interact more
+ * predictably with CPUFreq's asynchronous compute capacity changes.
+ */
+#define SCHED_FREQSCALE_SHIFT 10
+struct cpufreq_extents {
+ u32 curr_scale; /* load multiplier, applied as (x * curr_scale) >> SCHED_FREQSCALE_SHIFT */
+ u32 min; /* NOTE(review): presumably the lowest cpufreq frequency; not used in this chunk */
+ u32 max; /* NOTE(review): presumably the highest cpufreq frequency; not used in this chunk */
+ u32 flags; /* NOTE(review): presumably carries SCHED_LOAD_FREQINVAR_SINGLEFREQ; confirm */
+};
+/* Flag set when the governor in use only allows one frequency.
+ * Disables scaling.
+ */
+#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
+
+static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
+
+/* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
* history into segments of approximately 1ms (1024us); label the segment that
* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
@@ -1231,13 +1329,24 @@ static u32 __compute_runnable_contrib(u64 n)
*/
static __always_inline int __update_entity_runnable_avg(u64 now,
struct sched_avg *sa,
- int runnable)
+ int runnable,
+ int running,
+ int cpu)
{
u64 delta, periods;
u32 runnable_contrib;
int delta_w, decayed = 0;
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ u64 scaled_delta;
+ u32 scaled_runnable_contrib;
+ int scaled_delta_w;
+ u32 curr_scale = 1024;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
delta = now - sa->last_runnable_update;
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+ delta = hmp_variable_scale_convert(delta);
+#endif
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
@@ -1256,6 +1365,12 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
return 0;
sa->last_runnable_update = now;
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ /* retrieve scale factor for load */
+ if (hmp_data.freqinvar_load_scale_enabled)
+ curr_scale = freq_scale[cpu].curr_scale;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+
/* delta_w is the amount already accumulated against our next period */
delta_w = sa->runnable_avg_period % 1024;
if (delta + delta_w >= 1024) {
@@ -1268,8 +1383,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
* period and accrue it.
*/
delta_w = 1024 - delta_w;
+ /* scale runnable time if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_delta_w = (delta_w * curr_scale)
+ >> SCHED_FREQSCALE_SHIFT;
+ if (runnable)
+ sa->runnable_avg_sum += scaled_delta_w;
+ if (running)
+ sa->usage_avg_sum += scaled_delta_w;
+#else
if (runnable)
sa->runnable_avg_sum += delta_w;
+ if (running)
+ sa->usage_avg_sum += delta_w;
+#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
sa->runnable_avg_period += delta_w;
delta -= delta_w;
@@ -1277,22 +1404,49 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
/* Figure out how many additional periods this update spans */
periods = delta / 1024;
delta %= 1024;
-
+ /* decay the load we have accumulated so far */
sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
periods + 1);
sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
periods + 1);
-
+ sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
+ /* add the contribution from this period */
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
runnable_contrib = __compute_runnable_contrib(periods);
+ /* Apply load scaling if necessary.
+ * Note that multiplying the whole series is same as
+ * multiplying all terms
+ */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_runnable_contrib = (runnable_contrib * curr_scale)
+ >> SCHED_FREQSCALE_SHIFT;
+ if (runnable)
+ sa->runnable_avg_sum += scaled_runnable_contrib;
+ if (running)
+ sa->usage_avg_sum += scaled_runnable_contrib;
+#else
if (runnable)
sa->runnable_avg_sum += runnable_contrib;
+ if (running)
+ sa->usage_avg_sum += runnable_contrib;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
sa->runnable_avg_period += runnable_contrib;
}
/* Remainder of delta accrued against u_0` */
+ /* scale if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
+ if (runnable)
+ sa->runnable_avg_sum += scaled_delta;
+ if (running)
+ sa->usage_avg_sum += scaled_delta;
+#else
if (runnable)
sa->runnable_avg_sum += delta;
+ if (running)
+ sa->usage_avg_sum += delta;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
sa->runnable_avg_period += delta;
return decayed;
@@ -1338,16 +1492,28 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
- long contrib;
+ long contrib, usage_contrib;
/* The fraction of a cpu used by this cfs_rq */
contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
sa->runnable_avg_period + 1);
contrib -= cfs_rq->tg_runnable_contrib;
- if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+ usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT,
+ sa->runnable_avg_period + 1);
+ usage_contrib -= cfs_rq->tg_usage_contrib;
+
+ /*
+ * contrib/usage at this point represent deltas, only update if they
+ * are substantive.
+ */
+ if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
+ (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
atomic_add(contrib, &tg->runnable_avg);
cfs_rq->tg_runnable_contrib += contrib;
+
+ atomic_add(usage_contrib, &tg->usage_avg);
+ cfs_rq->tg_usage_contrib += usage_contrib;
}
}
@@ -1408,6 +1574,11 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
contrib /= (se->avg.runnable_avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
+ trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
+ contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_ratio = scale_load(contrib);
+ trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio);
}
/* Compute the current contribution to load_avg by se, return any delta */
@@ -1443,7 +1614,11 @@ static inline void update_entity_load_avg(struct sched_entity *se,
struct cfs_rq *cfs_rq = cfs_rq_of(se);
long contrib_delta;
u64 now;
+ int cpu = -1; /* not used in normal case */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ cpu = cfs_rq->rq->cpu;
+#endif
/*
* For a group entity we need to use their owned cfs_rq_clock_task() in
* case they are the parent of a throttled hierarchy.
@@ -1453,7 +1628,8 @@ static inline void update_entity_load_avg(struct sched_entity *se,
else
now = cfs_rq_clock_task(group_cfs_rq(se));
- if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+ if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
+ cfs_rq->curr == se, cpu))
return;
contrib_delta = __update_entity_load_avg_contrib(se);
@@ -1497,8 +1673,19 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
- __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+ u32 contrib;
+ int cpu = -1; /* not used in normal case */
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ cpu = rq->cpu;
+#endif
+ __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
+ runnable, cpu);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
+ contrib = rq->avg.runnable_avg_sum * scale_load_down(1024);
+ contrib /= (rq->avg.runnable_avg_period + 1);
+ trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib));
+ trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
}
/* Add the load generated by se into cfs_rq's child load-average */
@@ -1886,6 +2073,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
+ update_entity_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -3314,6 +3502,437 @@ done:
return target;
}
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Heterogeneous multiprocessor (HMP) optimizations
+ *
+ * The cpu types are distinguished using a list of hmp_domains
+ * which each represent one cpu type using a cpumask.
+ * The list is assumed ordered by compute capacity with the
+ * fastest domain first.
+ */
+DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
+
+extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+
+/* Setup hmp_domains */
+static int __init hmp_cpu_mask_setup(void)
+{
+	struct hmp_domain *hmpd;
+	char cpulist[64];
+	int idx = 0;
+	int cpu;
+
+	pr_debug("Initializing HMP scheduler:\n");
+
+	/* Platform code fills hmp_domains; report 0 if it left the list empty */
+	arch_get_hmp_domains(&hmp_domains);
+	if (list_empty(&hmp_domains)) {
+		pr_debug("HMP domain list is empty!\n");
+		return 0;
+	}
+
+	/*
+	 * Log every domain and record it as the per-cpu hmp_cpu_domain of
+	 * each of its possible cpus.
+	 */
+	list_for_each_entry(hmpd, &hmp_domains, hmp_domains) {
+		cpulist_scnprintf(cpulist, sizeof(cpulist),
+				  &hmpd->possible_cpus);
+		pr_debug(" HMP domain %d: %s\n", idx, cpulist);
+		idx++;
+		for_each_cpu_mask(cpu, hmpd->possible_cpus)
+			per_cpu(hmp_cpu_domain, cpu) = hmpd;
+	}
+
+	return 1;
+}
+
+static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu)
+{
+	struct hmp_domain *hmpd;
+
+	/* Walk the list; the first domain listing cpu as possible wins */
+	list_for_each_entry(hmpd, &hmp_domains, hmp_domains) {
+		if (cpumask_test_cpu(cpu, &hmpd->possible_cpus))
+			return hmpd;
+	}
+
+	return NULL;
+}
+
+static void hmp_online_cpu(int cpu)
+{
+	struct hmp_domain *hmpd = hmp_get_hmp_domain_for_cpu(cpu);
+
+	if (hmpd)	/* NULL when no domain lists cpu as possible */
+		cpumask_set_cpu(cpu, &hmpd->cpus);
+}
+
+static void hmp_offline_cpu(int cpu)
+{
+	struct hmp_domain *hmpd = hmp_get_hmp_domain_for_cpu(cpu);
+
+	if (hmpd)	/* NULL when no domain lists cpu as possible */
+		cpumask_clear_cpu(cpu, &hmpd->cpus);
+}
+
+/*
+ * Migration thresholds should be in the range [0..1023]
+ * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
+ * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
+ * The default values (512, 256) offer good responsiveness, but may need
+ * tweaking to suit particular needs.
+ *
+ * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio)
+ * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
+ * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
+ */
+unsigned int hmp_up_threshold = 512;
+unsigned int hmp_down_threshold = 256;
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL);
+#endif
+unsigned int hmp_next_up_threshold = 4096;
+unsigned int hmp_next_down_threshold = 4096;
+
+static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se);
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
+static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
+ int *min_cpu);
+
+/* Check if cpu is in fastest hmp_domain */
+static inline unsigned int hmp_cpu_is_fastest(int cpu)
+{
+	struct list_head *node = &hmp_cpu_domain(cpu)->hmp_domains;
+
+	/* True when cpu's domain is the first entry of hmp_domains */
+	return hmp_domains.next == node;
+}
+
+/* Check if cpu is in slowest hmp_domain */
+static inline unsigned int hmp_cpu_is_slowest(int cpu)
+{
+	struct list_head *node = &hmp_cpu_domain(cpu)->hmp_domains;
+
+	/* True when cpu's domain is the last entry of hmp_domains */
+	return list_is_last(node, &hmp_domains);
+}
+
+/* Next (slower) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_slower_domain(int cpu)
+{
+	struct list_head *node = &hmp_cpu_domain(cpu)->hmp_domains;
+
+	/* No check is made here that a following list entry exists */
+	return list_entry(node->next, struct hmp_domain, hmp_domains);
+}
+
+/* Previous (faster) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_faster_domain(int cpu)
+{
+	struct list_head *node = &hmp_cpu_domain(cpu)->hmp_domains;
+
+	/* No check is made here that a preceding list entry exists */
+	return list_entry(node->prev, struct hmp_domain, hmp_domains);
+}
+
+/*
+ * Selects a cpu in previous (faster) hmp_domain
+ * Note that cpumask_any_and() returns the first cpu in the cpumask
+ */
+static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
+							int cpu)
+{
+	struct hmp_domain *faster = hmp_faster_domain(cpu);
+	int least_loaded = NR_CPUS;
+
+	/* Only the least-loaded cpu is of interest here, not its load */
+	hmp_domain_min_load(faster, &least_loaded);
+
+	/* Use it if the task's affinity allows; else any allowed cpu */
+	if (least_loaded != NR_CPUS &&
+	    cpumask_test_cpu(least_loaded, tsk_cpus_allowed(tsk)))
+		return least_loaded;
+	return cpumask_any_and(&faster->cpus, tsk_cpus_allowed(tsk));
+}
+
+/*
+ * Selects a cpu in next (slower) hmp_domain
+ * Note that cpumask_any_and() returns the first cpu in the cpumask
+ */
+static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
+							int cpu)
+{
+	struct hmp_domain *slower = hmp_slower_domain(cpu);
+	int least_loaded = NR_CPUS;
+
+	/* Only the least-loaded cpu is of interest here, not its load */
+	hmp_domain_min_load(slower, &least_loaded);
+
+	/* Use it if the task's affinity allows; else any allowed cpu */
+	if (least_loaded != NR_CPUS &&
+	    cpumask_test_cpu(least_loaded, tsk_cpus_allowed(tsk)))
+		return least_loaded;
+	return cpumask_any_and(&slower->cpus, tsk_cpus_allowed(tsk));
+}
+
+static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
+{
+	/* Stamp the up migration with the task clock of cpu's cfs_rq ... */
+	se->avg.hmp_last_up_migration = cfs_rq_clock_task(&cpu_rq(cpu)->cfs);
+	/* ... and clear the record of any earlier down migration */
+	se->avg.hmp_last_down_migration = 0;
+}
+
+static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
+{
+	/* Stamp the down migration with the task clock of cpu's cfs_rq ... */
+	se->avg.hmp_last_down_migration = cfs_rq_clock_task(&cpu_rq(cpu)->cfs);
+	/* ... and clear the record of any earlier up migration */
+	se->avg.hmp_last_up_migration = 0;
+}
+
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+/*
+ * Heterogeneous multiprocessor (HMP) optimizations
+ *
+ * These functions allow changing the growth rate of the load_avg_ratio;
+ * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms.
+ * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
+ *
+ * These functions also allow to change the up and down threshold of HMP
+ * using /sys/kernel/hmp/{up,down}_threshold.
+ * Both must be between 0 and 1023. The threshold that is compared
+ * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
+ *
+ * For instance, if load_avg_period = 64 and up_threshold = 512, an idle
+ * task with a load of 0 will reach the threshold after 64ms of busy loop.
+ *
+ * Changing load_avg_period_ms has the same effect as changing the
+ * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
+ * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the latter
+ * could trigger overflows.
+ * For instance, with Y = 1023/1024 in __update_task_entity_contrib(),
+ * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
+ * could overflow for a weight > 2^12 even if the load_avg_contrib
+ * should still be a 32-bit result. This would not happen when multiplying
+ * the delta time by 1/22 and setting load_avg_period_ms = 706.
+ */
+
+/*
+ * Scaling the delta time ends up increasing or decreasing the
+ * growth rate of the per-entity load_avg_ratio.
+ * The scale factor hmp_data.multiplier is a fixed point
+ * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
+ */
+static u64 hmp_variable_scale_convert(u64 delta)
+{
+	/* Multiply the upper and lower 32-bit halves of delta separately */
+	u64 scaled_high = (delta >> 32ULL) * hmp_data.multiplier;
+	u64 scaled_low = (delta & 0xffffffffULL) * hmp_data.multiplier;
+
+	return (scaled_high << (32ULL - HMP_VARIABLE_SCALE_SHIFT))
+		+ (scaled_low >> HMP_VARIABLE_SCALE_SHIFT);
+}
+
+static ssize_t hmp_show(struct kobject *kobj,
+				struct attribute *attr, char *buf)
+{
+	struct hmp_global_attr *hmp_attr =
+		container_of(attr, struct hmp_global_attr, attr);
+	int shown = *(hmp_attr->value);
+
+	/* Translate the stored value for display when a hook is set */
+	if (hmp_attr->to_sysfs)
+		shown = hmp_attr->to_sysfs(shown);
+	return sprintf(buf, "%d\n", shown);
+}
+
+static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
+				const char *buf, size_t count)
+{
+	struct hmp_global_attr *hmp_attr =
+		container_of(attr, struct hmp_global_attr, attr);
+	ssize_t ret = -EINVAL;
+	int parsed;
+	/* Parse from a private, NUL-terminated copy of the input */
+	char *copy = vmalloc(count + 1);
+	if (!copy)
+		return -ENOMEM;
+	memcpy(copy, buf, count);
+	copy[count] = 0;
+	if (sscanf(copy, "%d", &parsed) >= 1) {
+		if (hmp_attr->from_sysfs)
+			parsed = hmp_attr->from_sysfs(parsed);
+		/* A negative value is treated as rejected input */
+		if (parsed >= 0) {
+			*(hmp_attr->value) = parsed;
+			ret = count;
+		}
+	}
+	vfree(copy);
+	return ret;
+}
+
+static int hmp_period_tofrom_sysfs(int value)
+{
+	/* A value <= 0 would divide by zero or go negative: return -1 so hmp_store() rejects it */
+	return value > 0 ? (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value : -1;
+}
+/* max value for threshold is 1024 */
+static int hmp_theshold_from_sysfs(int value)
+{
+	/* Values above 1024 map to -1, which hmp_store() then refuses;
+	 * anything else is handed back untouched. */
+	return (value > 1024) ? -1 : value;
+}
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* freqinvar control is only 0,1 off/on */
+static int hmp_freqinvar_from_sysfs(int value)
+{
+	/* Only 0 (off) and 1 (on) pass through; the rest become -1,
+	 * which hmp_store() refuses. */
+	return (value == 0 || value == 1) ? value : -1;
+}
+#endif
+static void hmp_attr_add(
+	const char *name,
+	int *value,
+	int (*to_sysfs)(int),
+	int (*from_sysfs)(int))
+{
+	struct hmp_global_attr *slot;
+	int idx = 0;
+	while (hmp_data.attributes[idx] != NULL)	/* seek the first free slot */
+		if (++idx >= HMP_DATA_SYSFS_MAX)
+			return;	/* table full: the attribute is dropped */
+	slot = &hmp_data.attr[idx];
+	slot->attr.name = name;
+	slot->attr.mode = 0644;
+	slot->show = hmp_show;
+	slot->store = hmp_store;
+	slot->value = value;
+	slot->to_sysfs = to_sysfs;
+	slot->from_sysfs = from_sysfs;
+	hmp_data.attributes[idx] = &slot->attr;
+	hmp_data.attributes[idx + 1] = NULL;
+}
+
+static int hmp_attr_init(void)
+{
+	/* Start from a clean table: memset() takes (dest, fill byte, length) */
+	memset(&hmp_data, 0, sizeof(hmp_data));
+	/* by default load_avg_period_ms == LOAD_AVG_PERIOD
+	 * meaning no change
+	 */
+	hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD);
+
+	hmp_attr_add("load_avg_period_ms",
+		&hmp_data.multiplier,
+		hmp_period_tofrom_sysfs,
+		hmp_period_tofrom_sysfs);
+	hmp_attr_add("up_threshold",
+		&hmp_up_threshold,
+		NULL,
+		hmp_theshold_from_sysfs);
+	hmp_attr_add("down_threshold",
+		&hmp_down_threshold,
+		NULL,
+		hmp_theshold_from_sysfs);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+	/* default frequency-invariant scaling ON */
+	hmp_data.freqinvar_load_scale_enabled = 1;
+	hmp_attr_add("frequency_invariant_load_scale",
+		&hmp_data.freqinvar_load_scale_enabled,
+		NULL,
+		hmp_freqinvar_from_sysfs);
+#endif
+	hmp_data.attr_group.name = "hmp";
+	hmp_data.attr_group.attrs = hmp_data.attributes;
+	/* Create the "hmp" group under kernel_kobj and report any failure */
+	return sysfs_create_group(kernel_kobj,
+		&hmp_data.attr_group);
+}
+late_initcall(hmp_attr_init);
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
+
+static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
+						int *min_cpu)
+{
+	unsigned long lowest = INT_MAX;
+	unsigned long load;
+	int lowest_cpu = NR_CPUS;
+	int cpu;
+
+	/* Compare undivided sums in the loop; divide once at the end */
+	for_each_cpu_mask(cpu, hmpd->cpus) {
+		load = cpu_rq(cpu)->avg.runnable_avg_sum * scale_load_down(1024);
+		if (load >= lowest)
+			continue;
+		lowest = load;
+		lowest_cpu = cpu;
+	}
+
+	/* Reporting the cpu is optional; it stays NR_CPUS if none qualified */
+	if (min_cpu)
+		*min_cpu = lowest_cpu;
+
+	return lowest / (LOAD_AVG_MAX + 1);
+}
+
+/*
+ * Calculate the task starvation
+ * This is the ratio of actually running time vs. runnable time.
+ * If the two are equal the task is getting the cpu time it needs or
+ * it is alone on the cpu and the cpu is fully utilized.
+ */
+static inline unsigned int hmp_task_starvation(struct sched_entity *se)
+{
+	/* Scale the running time up before dividing by the runnable time */
+	u32 ratio = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD);
+
+	/* The +1 keeps the divisor non-zero for an empty runnable sum */
+	ratio /= (se->avg.runnable_avg_sum + 1);
+	return scale_load(ratio);
+}
+
+static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se)
+{
+ int min_usage;
+ int dest_cpu = NR_CPUS;
+ /* Returns a cpu of the next slower domain to offload se to, or NR_CPUS when no offload applies */
+ if (hmp_cpu_is_slowest(cpu))
+ return NR_CPUS;
+
+ /* Is the current domain fully loaded? */
+ /* least-loaded cpu below half of NICE_0_LOAD: domain not full, no offload */
+ min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL);
+ if (min_usage < (NICE_0_LOAD>>1))
+ return NR_CPUS;
+
+ /* Is the task alone on the cpu? */
+ if (cpu_rq(cpu)->cfs.nr_running < 2)
+ return NR_CPUS;
+
+ /* Is the task actually starving? */
+ /* running/runnable ratio above 768: not treated as starved, no offload */
+ if (hmp_task_starvation(se) > 768)
+ return NR_CPUS;
+
+ /* Does the slower domain have spare cycles? */
+ min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu);
+ /* least-loaded slower cpu above half of NICE_0_LOAD: no spare cycles */
+ if (min_usage > NICE_0_LOAD/2)
+ return NR_CPUS;
+
+ if (cpumask_test_cpu(dest_cpu, &hmp_slower_domain(cpu)->cpus)) /* dest_cpu was set by the min-load scan above */
+ return dest_cpu;
+
+ return NR_CPUS;
+}
+#endif /* CONFIG_SCHED_HMP */
+
/*
* sched_balance_self: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -3338,6 +3957,28 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
if (p->nr_cpus_allowed == 1)
return prev_cpu;
+#ifdef CONFIG_SCHED_HMP
+ /* always put non-kernel forking tasks on a big domain */
+ if (p->mm && (sd_flag & SD_BALANCE_FORK)) {
+ if(hmp_cpu_is_fastest(prev_cpu)) {
+ struct hmp_domain *hmpdom = list_entry(&hmp_cpu_domain(prev_cpu)->hmp_domains, struct hmp_domain, hmp_domains);
+ __always_unused int lowest_ratio = hmp_domain_min_load(hmpdom, &new_cpu);
+ if(new_cpu != NR_CPUS && cpumask_test_cpu(new_cpu,tsk_cpus_allowed(p)))
+ return new_cpu;
+ else {
+ new_cpu = cpumask_any_and(&hmp_faster_domain(cpu)->cpus,
+ tsk_cpus_allowed(p));
+ if(new_cpu < nr_cpu_ids)
+ return new_cpu;
+ }
+ } else {
+ new_cpu = hmp_select_faster_cpu(p, prev_cpu);
+ if (new_cpu != NR_CPUS)
+ return new_cpu;
+ }
+ }
+#endif
+
if (sd_flag & SD_BALANCE_WAKE) {
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
want_affine = 1;
@@ -3412,6 +4053,23 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
unlock:
rcu_read_unlock();
+#ifdef CONFIG_SCHED_HMP
+ if (hmp_up_migration(prev_cpu, &new_cpu, &p->se)) {
+ hmp_next_up_delay(&p->se, new_cpu);
+ trace_sched_hmp_migrate(p, new_cpu, 0);
+ return new_cpu;
+ }
+ if (hmp_down_migration(prev_cpu, &p->se)) {
+ new_cpu = hmp_select_slower_cpu(p, prev_cpu);
+ hmp_next_down_delay(&p->se, new_cpu);
+ trace_sched_hmp_migrate(p, new_cpu, 0);
+ return new_cpu;
+ }
+ /* Make sure that the task stays in its previous hmp domain */
+ if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus))
+ return prev_cpu;
+#endif
+
return new_cpu;
}
@@ -3945,7 +4603,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 1) task is cache cold, or
* 2) too many balance attempts have failed.
*/
-
tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
if (!tsk_cache_hot ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
@@ -5371,7 +6028,11 @@ static struct {
static inline int find_new_ilb(int call_cpu)
{
int ilb = cpumask_first(nohz.idle_cpus_mask);
-
+#ifdef CONFIG_SCHED_HMP
+ /* restrict nohz balancing to occur in the same hmp domain */
+ ilb = cpumask_first_and(nohz.idle_cpus_mask,
+ &((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus);
+#endif
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
@@ -5650,6 +6311,18 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
if (time_before(now, nohz.next_balance))
return 0;
+#ifdef CONFIG_SCHED_HMP
+ /*
+ * Bail out if there are no nohz CPUs in our
+ * HMP domain, since we will move tasks between
+ * domains through wakeup and force balancing
+ * as necessary based upon task load.
+ */
+ if (cpumask_first_and(nohz.idle_cpus_mask,
+ &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)
+ return 0;
+#endif
+
if (rq->nr_running >= 2)
goto need_kick;
@@ -5682,6 +6355,300 @@ need_kick:
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
#endif
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Check if task should migrate to a faster cpu.
+ *
+ * @cpu: CPU the entity is evaluated against; its cfs clock and its
+ * faster HMP domain are used.
+ * @target_cpu: optional. When non-NULL it is reset to NR_CPUS on entry and
+ * then handed to hmp_domain_min_load() (helper not visible in
+ * this hunk - presumably it stores the least loaded CPU of the
+ * faster domain there; confirm).
+ * @se: scheduling entity; task_of() is applied to it.
+ *
+ * Returns 1 only when every filter below passes and the faster domain
+ * intersects the task's allowed CPUs, otherwise 0.
+ */
+static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ u64 now;
+
+ if (target_cpu)
+ *target_cpu = NR_CPUS;
+
+ if (hmp_cpu_is_fastest(cpu))
+ return 0;
+
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+ /* Filter by task priority */
+ if (p->prio >= hmp_up_prio)
+ return 0;
+#endif
+ if (se->avg.load_avg_ratio < hmp_up_threshold)
+ return 0;
+
+ /* Let the task load settle before doing another up migration */
+ now = cfs_rq_clock_task(cfs_rq);
+ if (((now - se->avg.hmp_last_up_migration) >> 10)
+ < hmp_next_up_threshold)
+ return 0;
+
+ /* Target domain load < 94% */
+ if (hmp_domain_min_load(hmp_faster_domain(cpu), target_cpu)
+ > NICE_0_LOAD-64)
+ return 0;
+
+ if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus,
+ tsk_cpus_allowed(p)))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Check if task should migrate to a slower cpu.
+ *
+ * @cpu: CPU the entity is evaluated against; its cfs clock and its
+ * slower HMP domain are used.
+ * @se: scheduling entity; task_of() is applied to it.
+ *
+ * Returns 1 when the task should move to the slower domain, 0 otherwise.
+ * With CONFIG_SCHED_HMP_PRIO_FILTER a task whose prio is >= hmp_up_prio
+ * and that may run in the slower domain is accepted immediately, before
+ * the settle delay and the load threshold are looked at.
+ */
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ u64 now;
+
+ if (hmp_cpu_is_slowest(cpu))
+ return 0;
+
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+ /* Filter by task priority */
+ if ((p->prio >= hmp_up_prio) &&
+ cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
+ tsk_cpus_allowed(p))) {
+ return 1;
+ }
+#endif
+
+ /* Let the task load settle before doing another down migration */
+ now = cfs_rq_clock_task(cfs_rq);
+ if (((now - se->avg.hmp_last_down_migration) >> 10)
+ < hmp_next_down_threshold)
+ return 0;
+
+ if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
+ tsk_cpus_allowed(p))
+ && se->avg.load_avg_ratio < hmp_down_threshold) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ * Ideally this function should be merged with can_migrate_task() to avoid
+ * redundant code.
+ *
+ * @p: candidate task.
+ * @env: balancing environment; dst_cpu, src_rq, sd and idle are read, and
+ * LBF_ALL_PINNED is cleared in env->flags once the affinity check
+ * has passed.
+ *
+ * Returns 0 if dst_cpu is outside p's allowed CPUs or if p is currently
+ * running on src_rq, and 1 in every other case.
+ */
+static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+ int tsk_cache_hot = 0;
+
+ /*
+ * We do not migrate tasks that are:
+ * 1) not allowed on the destination CPU (cpus_allowed), or
+ * 2) running (obviously)
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ return 0;
+ }
+ env->flags &= ~LBF_ALL_PINNED;
+
+ if (task_running(env->src_rq, p)) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ return 0;
+ }
+
+ /*
+ * Aggressive migration if:
+ * 1) task is cache cold, or
+ * 2) too many balance attempts have failed.
+ *
+ * Note that the fall-through path below returns 1 as well, so in this
+ * function cache hotness only decides whether the forced-migration
+ * schedstat counters are bumped; it never vetoes the migration.
+ * NOTE(review): presumably intentional for HMP forced migration -
+ * confirm against can_migrate_task().
+ */
+
+ tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+ if (!tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+ if (tsk_cache_hot) {
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
+ }
+#endif
+ return 1;
+ }
+
+ return 1;
+}
+
+/*
+ * move_specific_task tries to move a specific task.
+ * Returns 1 if successful and 0 otherwise.
+ * Called with both runqueues locked.
+ *
+ * @env: balancing environment naming src_rq, dst_cpu and sd.
+ * @pm: the one task that is to be moved.
+ *
+ * The cfs_tasks list of env->src_rq is walked; entries that sit in a
+ * throttled src/dst group pair or that hmp_can_migrate_task() rejects are
+ * skipped, and only the entry equal to @pm is handed to move_task().
+ * 0 is returned when @pm is not found among the eligible entries.
+ */
+static int move_specific_task(struct lb_env *env, struct task_struct *pm)
+{
+ struct task_struct *p, *n;
+
+ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+ env->dst_cpu))
+ continue;
+
+ if (!hmp_can_migrate_task(p, env))
+ continue;
+ /* Check if we found the right task */
+ if (p != pm)
+ continue;
+
+ move_task(p, env);
+ /*
+ * Right now, this is only the third place move_task()
+ * is called, so we can safely collect move_task()
+ * stats here rather than inside move_task().
+ */
+ schedstat_inc(env->sd, lb_gained[env->idle]);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
+ * migrate a specific task from one runqueue to another.
+ * hmp_force_up_migration uses this to push a currently running task
+ * off a runqueue.
+ * Based on active_load_balance_stop_cpu and can potentially be merged.
+ *
+ * @data: the source runqueue; its migrate_task and push_cpu fields name
+ * the task to move and the destination CPU.
+ *
+ * The requester holds a reference on migrate_task for the lifetime of the
+ * request (taken in hmp_force_up_migration()). It is dropped here on every
+ * path, after the runqueue lock has been released, so the task cannot be
+ * freed between the request being queued and this function looking at it.
+ * Always returns 0.
+ */
+static int hmp_active_task_migration_cpu_stop(void *data)
+{
+ struct rq *busiest_rq = data;
+ struct task_struct *p = busiest_rq->migrate_task;
+ int busiest_cpu = cpu_of(busiest_rq);
+ int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
+ struct sched_domain *sd;
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance)) {
+ goto out_unlock;
+ }
+ /* Is there any task to move? */
+ if (busiest_rq->nr_running <= 1)
+ goto out_unlock;
+ /* Task has migrated meanwhile, abort forced migration */
+ if (task_rq(p) != busiest_rq)
+ goto out_unlock;
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+
+ /* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
+ for_each_domain(target_cpu, sd) {
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
+ }
+
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
+
+ schedstat_inc(sd, alb_count);
+
+ if (move_specific_task(&env, p))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
+ rcu_read_unlock();
+ double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+ busiest_rq->active_balance = 0;
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ /* Pairs with get_task_struct() in hmp_force_up_migration(). */
+ put_task_struct(p);
+ return 0;
+}
+
+static DEFINE_SPINLOCK(hmp_force_migration);
+
+/*
+ * hmp_force_up_migration checks runqueues for tasks that need to
+ * be actively migrated to a faster cpu.
+ *
+ * @this_cpu: CPU running the check; not used by the body.
+ *
+ * Only one CPU performs the scan at a time (trylock on
+ * hmp_force_migration); the others return immediately. For every online
+ * CPU the currently running cfs task is examined under that CPU's rq lock.
+ * If it qualifies for an up migration, or failing that for an offload to a
+ * slower CPU, the request is recorded in the runqueue (active_balance,
+ * push_cpu, migrate_task) and the cpu stopper is kicked once the lock has
+ * been dropped.
+ *
+ * A reference is taken on the task whenever it is published through
+ * rq->migrate_task, because the stopper runs asynchronously and would
+ * otherwise dereference a task that may already have exited. The stopper
+ * callback drops that reference.
+ */
+static void hmp_force_up_migration(int this_cpu)
+{
+ int cpu, target_cpu;
+ struct sched_entity *curr;
+ struct rq *target;
+ unsigned long flags;
+ unsigned int force;
+ struct task_struct *p;
+
+ if (!spin_trylock(&hmp_force_migration))
+ return;
+ for_each_online_cpu(cpu) {
+ force = 0;
+ target = cpu_rq(cpu);
+ raw_spin_lock_irqsave(&target->lock, flags);
+ curr = target->cfs.curr;
+ if (!curr) {
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ continue;
+ }
+ if (!entity_is_task(curr)) {
+ struct cfs_rq *cfs_rq;
+
+ /* descend the group hierarchy down to the running task */
+ cfs_rq = group_cfs_rq(curr);
+ while (cfs_rq) {
+ curr = cfs_rq->curr;
+ cfs_rq = group_cfs_rq(curr);
+ }
+ }
+ p = task_of(curr);
+ if (hmp_up_migration(cpu, &target_cpu, curr)) {
+ if (!target->active_balance) {
+ /* keep p alive until the stopper has run */
+ get_task_struct(p);
+ target->active_balance = 1;
+ target->push_cpu = target_cpu;
+ target->migrate_task = p;
+ force = 1;
+ trace_sched_hmp_migrate(p, target->push_cpu, 1);
+ hmp_next_up_delay(&p->se, target->push_cpu);
+ }
+ }
+ if (!force && !target->active_balance) {
+ /*
+ * For now we just check the currently running task.
+ * Selecting the lightest task for offloading will
+ * require extensive book keeping.
+ */
+ target->push_cpu = hmp_offload_down(cpu, curr);
+ if (target->push_cpu < NR_CPUS) {
+ /* keep p alive until the stopper has run */
+ get_task_struct(p);
+ target->active_balance = 1;
+ target->migrate_task = p;
+ force = 1;
+ trace_sched_hmp_migrate(p, target->push_cpu, 2);
+ hmp_next_down_delay(&p->se, target->push_cpu);
+ }
+ }
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ if (force)
+ stop_one_cpu_nowait(cpu_of(target),
+ hmp_active_task_migration_cpu_stop,
+ target, &target->active_balance_work);
+ }
+ spin_unlock(&hmp_force_migration);
+}
+#else
+static void hmp_force_up_migration(int this_cpu) { }
+#endif /* CONFIG_SCHED_HMP */
+
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
@@ -5693,6 +6660,8 @@ static void run_rebalance_domains(struct softirq_action *h)
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
+ hmp_force_up_migration(this_cpu);
+
rebalance_domains(this_cpu, idle);
/*
@@ -5725,11 +6694,17 @@ void trigger_load_balance(struct rq *rq, int cpu)
static void rq_online_fair(struct rq *rq)
{
+#ifdef CONFIG_SCHED_HMP
+ hmp_online_cpu(rq->cpu);
+#endif
update_sysctl();
}
static void rq_offline_fair(struct rq *rq)
{
+#ifdef CONFIG_SCHED_HMP
+ hmp_offline_cpu(rq->cpu);
+#endif
update_sysctl();
/* Ensure any throttled groups are reachable by pick_next_task */
@@ -6192,6 +7167,139 @@ __init void init_sched_fair_class(void)
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
#endif
+
+#ifdef CONFIG_SCHED_HMP
+ hmp_cpu_mask_setup();
+#endif
#endif /* SMP */
}
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/*
+ * cpufreq_calc_scale - scale factor of @curr relative to @max.
+ *
+ * Returns the integer quotient curr / max. @min is accepted but not used.
+ *
+ * The callers in this file pass @max already shifted right by
+ * SCHED_FREQSCALE_SHIFT, so it can be 0 for a very small policy maximum.
+ * Fall back to 1024, the value this file uses for a scale of 1.0, instead
+ * of dividing by zero.
+ */
+static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
+{
+ if (unlikely(!max))
+ return 1024;
+
+ return curr / max;
+}
+
+/* Called when the CPU Frequency is changed.
+ * Once for each CPU.
+ *
+ * Refreshes freq_scale[freq->cpu].curr_scale after a frequency change.
+ * Nothing is done when freq->flags has CPUFREQ_CONST_LOOPS set or when
+ * val is anything other than CPUFREQ_POSTCHANGE. Always returns NOTIFY_OK.
+ */
+static int cpufreq_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ int cpu = freq->cpu;
+ struct cpufreq_extents *extents;
+
+ if (freq->flags & CPUFREQ_CONST_LOOPS)
+ return NOTIFY_OK;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return NOTIFY_OK;
+
+ /* if dynamic load scale is disabled, set the load scale to 1.0 */
+ if (!hmp_data.freqinvar_load_scale_enabled) {
+ freq_scale[cpu].curr_scale = 1024;
+ return NOTIFY_OK;
+ }
+
+ extents = &freq_scale[cpu];
+ if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
+ /* If our governor was recognised as a single-freq governor,
+ * use 1.0
+ */
+ extents->curr_scale = 1024;
+ } else {
+ extents->curr_scale = cpufreq_calc_scale(extents->min,
+ extents->max, freq->new);
+ }
+
+ return NOTIFY_OK;
+}
+
+/* Called when the CPUFreq governor is changed.
+ * Only called for the CPUs which are actually changed by the
+ * userspace.
+ *
+ * Only CPUFREQ_INCOMPATIBLE events are processed; every other event
+ * returns 0 without touching anything. For each CPU in policy->cpus the
+ * min/max extents are refreshed from the policy (shifted right by
+ * SCHED_FREQSCALE_SHIFT), the single-frequency flag is set or cleared and
+ * curr_scale is recomputed. Always returns 0.
+ */
+static int cpufreq_policy_callback(struct notifier_block *nb,
+ unsigned long event, void *data)
+{
+ struct cpufreq_policy *policy = data;
+ struct cpufreq_extents *extents;
+ int cpu, singleFreq = 0;
+ static const char performance_governor[] = "performance";
+ static const char powersave_governor[] = "powersave";
+
+ if (event == CPUFREQ_START)
+ return 0;
+
+ if (event != CPUFREQ_INCOMPATIBLE)
+ return 0;
+
+ /* CPUFreq governors do not accurately report the range of
+ * CPU Frequencies they will choose from.
+ * We recognise performance and powersave governors as
+ * single-frequency only.
+ * The comparison covers only the length of the literal, so any
+ * governor whose name merely starts with one of the two strings
+ * is treated the same way.
+ */
+ if (!strncmp(policy->governor->name, performance_governor,
+ strlen(performance_governor)) ||
+ !strncmp(policy->governor->name, powersave_governor,
+ strlen(powersave_governor)))
+ singleFreq = 1;
+
+ /* Make sure that all CPUs impacted by this policy are
+ * updated since we will only get a notification when the
+ * user explicitly changes the policy on a CPU.
+ */
+ for_each_cpu(cpu, policy->cpus) {
+ extents = &freq_scale[cpu];
+ extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
+ extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
+ if (!hmp_data.freqinvar_load_scale_enabled) {
+ extents->curr_scale = 1024;
+ } else if (singleFreq) {
+ extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+ extents->curr_scale = 1024;
+ } else {
+ extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+ extents->curr_scale = cpufreq_calc_scale(extents->min,
+ extents->max, policy->cur);
+ }
+ }
+
+ return 0;
+}
+
+/* Transition notifier: keeps curr_scale in step with frequency changes. */
+static struct notifier_block cpufreq_notifier = {
+ .notifier_call = cpufreq_callback,
+};
+/* Policy notifier: keeps the min/max extents in step with policy changes. */
+static struct notifier_block cpufreq_policy_notifier = {
+ .notifier_call = cpufreq_policy_callback,
+};
+
+/*
+ * Initialise the per-cpu frequency scale table and hook the scheduler
+ * into cpufreq.
+ *
+ * Every freq_scale[] slot starts out with min = max = curr_scale = 1024
+ * because no policy exists yet when this initcall runs. The policy
+ * notifier is registered first; the transition notifier is only
+ * registered when that succeeded.
+ *
+ * Returns 0 on success or the error of the registration that failed.
+ */
+static int __init register_sched_cpufreq_notifier(void)
+{
+ int cpu;
+ int ret;
+
+ /* init safe defaults since there are no policies at registration */
+ for (cpu = 0; cpu < CONFIG_NR_CPUS; cpu++) {
+ /* safe defaults */
+ freq_scale[cpu].max = 1024;
+ freq_scale[cpu].min = 1024;
+ freq_scale[cpu].curr_scale = 1024;
+ }
+
+ pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
+ ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
+ CPUFREQ_POLICY_NOTIFIER);
+ /* do not let the second registration hide a failure of the first */
+ if (ret)
+ return ret;
+
+ return cpufreq_register_notifier(&cpufreq_notifier,
+ CPUFREQ_TRANSITION_NOTIFIER);
+}
+
+core_initcall(register_sched_cpufreq_notifier);
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce39224d615..27f51ac8670 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -142,7 +142,7 @@ struct task_group {
atomic_t load_weight;
atomic64_t load_avg;
- atomic_t runnable_avg;
+ atomic_t runnable_avg, usage_avg;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -279,7 +279,7 @@ struct cfs_rq {
#endif /* CONFIG_FAIR_GROUP_SCHED */
/* These always depend on CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
- u32 tg_runnable_contrib;
+ u32 tg_runnable_contrib, tg_usage_contrib;
u64 tg_load_contrib;
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -464,6 +464,9 @@ struct rq {
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
+#ifdef CONFIG_SCHED_HMP
+ struct task_struct *migrate_task;
+#endif
/* cpu of this runqueue: */
int cpu;
int online;
@@ -642,6 +645,12 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
+#ifdef CONFIG_SCHED_HMP
+static LIST_HEAD(hmp_domains);
+DECLARE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
+#define hmp_cpu_domain(cpu) (per_cpu(hmp_cpu_domain, (cpu)))
+#endif /* CONFIG_SCHED_HMP */
+
#endif /* CONFIG_SMP */
#include "stats.h"
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee8e29a2320..f02c4a4a0c3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask;
static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);
+/* see the comment above the definition of WQ_POWER_EFFICIENT */
+#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
+static bool wq_power_efficient = true;
+#else
+static bool wq_power_efficient;
+#endif
+
+module_param_named(power_efficient, wq_power_efficient, bool, 0444);
+
static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -305,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
+struct workqueue_struct *system_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_power_efficient_wq);
+struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
static void copy_workqueue_attrs(struct workqueue_attrs *to,
@@ -4086,6 +4099,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
+ /* see the comment above the definition of WQ_POWER_EFFICIENT */
+ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
+ flags |= WQ_UNBOUND;
+
/* allocate wq and format name */
if (flags & WQ_UNBOUND)
tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
@@ -4985,8 +5002,15 @@ static int __init init_workqueues(void)
WQ_UNBOUND_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE, 0);
+ system_power_efficient_wq = alloc_workqueue("events_power_efficient",
+ WQ_POWER_EFFICIENT, 0);
+ system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
+ WQ_FREEZABLE | WQ_POWER_EFFICIENT,
+ 0);
BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
- !system_unbound_wq || !system_freezable_wq);
+ !system_unbound_wq || !system_freezable_wq ||
+ !system_power_efficient_wq ||
+ !system_freezable_power_efficient_wq);
return 0;
}
early_initcall(init_workqueues);
diff --git a/linaro/configs/android.conf b/linaro/configs/android.conf
new file mode 100644
index 00000000000..bb90ecd9e16
--- /dev/null
+++ b/linaro/configs/android.conf
@@ -0,0 +1,31 @@
+CONFIG_IPV6=y
+# CONFIG_IPV6_SIT is not set
+CONFIG_PANIC_TIMEOUT=0
+CONFIG_HAS_WAKELOCK=y
+CONFIG_WAKELOCK=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_DM_CRYPT=y
+CONFIG_POWER_SUPPLY=y
+CONFIG_ANDROID_PARANOID_NETWORK=y
+CONFIG_NET_ACTIVITY_STATS=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+CONFIG_USB_G_ANDROID=y
+CONFIG_SWITCH=y
+CONFIG_STAGING=y
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_LOGGER=y
+CONFIG_ANDROID_TIMED_OUTPUT=y
+CONFIG_ANDROID_TIMED_GPIO=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_ANDROID_INTF_ALARM_DEV=y
+CONFIG_CRYPTO_TWOFISH=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=16384
+CONFIG_FUSE_FS=y
+CONFIG_CPU_FREQ_GOV_INTERACTIVE=y
+CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE=y
diff --git a/linaro/configs/big-LITTLE-IKS.conf b/linaro/configs/big-LITTLE-IKS.conf
new file mode 100644
index 00000000000..b067fde86ea
--- /dev/null
+++ b/linaro/configs/big-LITTLE-IKS.conf
@@ -0,0 +1,5 @@
+CONFIG_BIG_LITTLE=y
+CONFIG_BL_SWITCHER=y
+CONFIG_ARM_DT_BL_CPUFREQ=y
+CONFIG_ARM_VEXPRESS_BL_CPUFREQ=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf
new file mode 100644
index 00000000000..0bbc603a13e
--- /dev/null
+++ b/linaro/configs/big-LITTLE-MP.conf
@@ -0,0 +1,11 @@
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_NO_HZ=y
+CONFIG_SCHED_MC=y
+CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y
+CONFIG_SCHED_HMP=y
+CONFIG_HMP_FAST_CPU_MASK=""
+CONFIG_HMP_SLOW_CPU_MASK=""
+CONFIG_HMP_VARIABLE_SCALE=y
+CONFIG_HMP_FREQUENCY_INVARIANT_SCALE=y
diff --git a/linaro/configs/debug.conf b/linaro/configs/debug.conf
new file mode 100644
index 00000000000..36980566b2d
--- /dev/null
+++ b/linaro/configs/debug.conf
@@ -0,0 +1 @@
+CONFIG_PROVE_LOCKING=y
diff --git a/linaro/configs/distribution.conf b/linaro/configs/distribution.conf
new file mode 100644
index 00000000000..fbcfed1b6ce
--- /dev/null
+++ b/linaro/configs/distribution.conf
@@ -0,0 +1,44 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_CGROUPS=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_DEFAULT_MMAP_MIN_ADDR=32768
+CONFIG_SECCOMP=y
+CONFIG_CC_STACKPROTECTOR=y
+CONFIG_SYN_COOKIES=y
+CONFIG_IPV6=y
+CONFIG_NETLABEL=y
+CONFIG_BRIDGE_NETFILTER=y
+CONFIG_NF_CONNTRACK=m
+CONFIG_NETFILTER_XT_CONNMARK=m
+CONFIG_NETFILTER_XT_MARK=m
+CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
+CONFIG_NF_CONNTRACK_IPV4=m
+CONFIG_NF_NAT_IPV4=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_NF_CONNTRACK_IPV6=m
+CONFIG_NF_NAT_IPV6=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_BRIDGE_EBT_MARK_T=m
+CONFIG_BRIDGE=m
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=65536
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_UINPUT=y
+# CONFIG_DEVKMEM is not set
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_AUTOFS4_FS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_STRICT_DEVMEM=y
+CONFIG_SECURITY=y
+CONFIG_LSM_MMAP_MIN_ADDR=0
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SMACK=y
+CONFIG_SECURITY_APPARMOR=y
+CONFIG_DEFAULT_SECURITY_APPARMOR=y
diff --git a/linaro/configs/kvm-guest.conf b/linaro/configs/kvm-guest.conf
new file mode 100644
index 00000000000..00e84a3ba1e
--- /dev/null
+++ b/linaro/configs/kvm-guest.conf
@@ -0,0 +1,11 @@
+CONFIG_BALLOON_COMPACTION=y
+CONFIG_VIRTIO_BLK=y
+CONFIG_VIRTIO_NET=y
+CONFIG_HVC_DRIVER=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO=y
+CONFIG_VIRTIO_BALLOON=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y
+CONFIG_VIRTUALIZATION=y
+# CONFIG_THUMB2_KERNEL is not set
diff --git a/linaro/configs/kvm-host.conf b/linaro/configs/kvm-host.conf
new file mode 100644
index 00000000000..21a40e03137
--- /dev/null
+++ b/linaro/configs/kvm-host.conf
@@ -0,0 +1,11 @@
+CONFIG_VIRTUALIZATION=y
+CONFIG_ARM_LPAE=y
+CONFIG_ARM_VIRT_EXT=y
+CONFIG_HAVE_KVM_IRQCHIP=y
+CONFIG_KVM_ARM_HOST=y
+CONFIG_KVM_ARM_MAX_VCPUS=4
+CONFIG_KVM_ARM_TIMER=y
+CONFIG_KVM_ARM_VGIC=y
+CONFIG_KVM_MMIO=y
+CONFIG_KVM=y
+CONFIG_BLK_DEV_NBD=m
diff --git a/linaro/configs/linaro-base.conf b/linaro/configs/linaro-base.conf
new file mode 100644
index 00000000000..947ca1f5093
--- /dev/null
+++ b/linaro/configs/linaro-base.conf
@@ -0,0 +1,94 @@
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=16
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_EMBEDDED=y
+CONFIG_HOTPLUG=y
+CONFIG_PERF_EVENTS=y
+CONFIG_SLAB=y
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_SMP=y
+CONFIG_SCHED_MC=y
+CONFIG_SCHED_SMT=y
+CONFIG_THUMB2_KERNEL=y
+CONFIG_AEABI=y
+# CONFIG_OABI_COMPAT is not set
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_IDLE=y
+CONFIG_BINFMT_MISC=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_NET_KEY_MIGRATE=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+# CONFIG_INET_LRO is not set
+CONFIG_NETFILTER=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_CONNECTOR=y
+CONFIG_MTD=y
+CONFIG_MTD_CMDLINE_PARTS=y
+CONFIG_MTD_BLOCK=y
+CONFIG_MTD_OOPS=y
+CONFIG_MTD_CFI=y
+CONFIG_MTD_CFI_INTELEXT=y
+CONFIG_MTD_NAND=y
+CONFIG_NETDEVICES=y
+CONFIG_EXT2_FS=y
+CONFIG_EXT3_FS=y
+CONFIG_EXT4_FS=y
+CONFIG_BTRFS_FS=y
+CONFIG_QUOTA=y
+CONFIG_QFMT_V2=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_TMPFS=y
+CONFIG_ECRYPT_FS=y
+CONFIG_JFFS2_FS=y
+CONFIG_JFFS2_SUMMARY=y
+CONFIG_JFFS2_FS_XATTR=y
+CONFIG_JFFS2_COMPRESSION_OPTIONS=y
+CONFIG_JFFS2_LZO=y
+CONFIG_JFFS2_RUBIN=y
+CONFIG_CRAMFS=y
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_NFS_FS=y
+# CONFIG_NFS_V2 is not set
+CONFIG_NFS_V3=y
+CONFIG_NFS_V3_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_ROOT_NFS=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_PRINTK_TIME=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_FS=y
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_KEYS=y
+CONFIG_CRYPTO_MICHAEL_MIC=y
+CONFIG_CRC_CCITT=y
+CONFIG_CRC_T10DIF=y
+CONFIG_CRC_ITU_T=y
+CONFIG_CRC7=y
+CONFIG_HW_PERF_EVENTS=y
+CONFIG_FUNCTION_TRACER=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_PROC_DEVICETREE=y
diff --git a/linaro/configs/ubuntu-minimal.conf b/linaro/configs/ubuntu-minimal.conf
new file mode 120000
index 00000000000..794e82f3bc1
--- /dev/null
+++ b/linaro/configs/ubuntu-minimal.conf
@@ -0,0 +1 @@
+distribution.conf \ No newline at end of file
diff --git a/linaro/configs/xen.conf b/linaro/configs/xen.conf
new file mode 100644
index 00000000000..d24fabbea07
--- /dev/null
+++ b/linaro/configs/xen.conf
@@ -0,0 +1,7 @@
+CONFIG_XEN=y
+CONFIG_XEN_NETDEV_FRONTEND=y
+CONFIG_XEN_NETDEV_BACKEND=y
+CONFIG_XEN_BLKDEV_FRONTEND=y
+CONFIG_XEN_BLKDEV_BACKEND=y
+CONFIG_XENFS=y
+CONFIG_XEN_COMPAT_XENFS=y
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e2bfbf73a55..5cf99bf8cce 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -690,6 +690,23 @@ int PageHuge(struct page *page)
}
EXPORT_SYMBOL_GPL(PageHuge);
+/*
+ * __basepage_index - index of @page counted in base pages.
+ *
+ * For a page whose compound head is not a hugetlb page this is simply
+ * page_index(page). Otherwise the head's index is scaled by the compound
+ * order and the offset of @page inside the compound page is added.
+ */
+pgoff_t __basepage_index(struct page *page)
+{
+ struct page *head = compound_head(page);
+ pgoff_t head_index = page_index(head);
+ unsigned long offset;
+
+ if (!PageHuge(head))
+ return page_index(page);
+
+ /*
+ * Below MAX_ORDER plain struct page pointer arithmetic is used; at or
+ * above it the offset is derived from the pfns instead (presumably
+ * because the struct pages need not be contiguous there - confirm).
+ */
+ if (compound_order(head) < MAX_ORDER)
+ offset = page - head;
+ else
+ offset = page_to_pfn(page) - page_to_pfn(head);
+
+ return (head_index << compound_order(head)) + offset;
+}
+
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf..fd79df5d315 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6303,8 +6303,6 @@ mem_cgroup_css_online(struct cgroup *cont)
* call __mem_cgroup_free, so return directly
*/
mem_cgroup_put(memcg);
- if (parent->use_hierarchy)
- mem_cgroup_put(parent);
}
return error;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f42745e6578..b916a43a6b3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
+#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>
@@ -432,11 +433,12 @@ EXPORT_SYMBOL(dec_zone_page_state);
* with the global counters. These could cause remote node cache line
* bouncing and will have to be only done when necessary.
*/
-void refresh_cpu_vm_stats(int cpu)
+bool refresh_cpu_vm_stats(int cpu)
{
struct zone *zone;
int i;
int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+ bool vm_activity = false;
for_each_populated_zone(zone) {
struct per_cpu_pageset *p;
@@ -483,14 +485,21 @@ void refresh_cpu_vm_stats(int cpu)
if (p->expire)
continue;
- if (p->pcp.count)
+ if (p->pcp.count) {
+ vm_activity = true;
drain_zone_pages(zone, &p->pcp);
+ }
#endif
}
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (global_diff[i])
+ if (global_diff[i]) {
atomic_long_add(global_diff[i], &vm_stat[i]);
+ vm_activity = true;
+ }
+
+ return vm_activity;
+
}
/*
@@ -1174,22 +1183,72 @@ static const struct file_operations proc_vmstat_file_operations = {
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
+static struct cpumask vmstat_off_cpus;
+struct delayed_work vmstat_monitor_work;
-static void vmstat_update(struct work_struct *w)
+/*
+ * need_vmstat - does @cpu have vmstat work pending?
+ *
+ * Returns true if, in any populated zone, @cpu has a non-zero per-cpu
+ * vm_stat_diff counter, or holds per-cpu pages of a zone that lives on a
+ * node other than @cpu's own node. Returns false otherwise.
+ *
+ * This is called for CPUs other than the one executing it, so the node
+ * comparison uses cpu_to_node(cpu) rather than the caller's node.
+ */
+static inline bool need_vmstat(int cpu)
{
- refresh_cpu_vm_stats(smp_processor_id());
- schedule_delayed_work(&__get_cpu_var(vmstat_work),
- round_jiffies_relative(sysctl_stat_interval));
+ struct zone *zone;
+ int i;
+
+ for_each_populated_zone(zone) {
+ struct per_cpu_pageset *p;
+
+ p = per_cpu_ptr(zone->pageset, cpu);
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (p->vm_stat_diff[i])
+ return true;
+
+ /* remote pages cached by @cpu still need draining */
+ if (zone_to_nid(zone) != cpu_to_node(cpu) && p->pcp.count)
+ return true;
+ }
+
+ return false;
}
-static void __cpuinit start_cpu_timer(int cpu)
+static void vmstat_update(struct work_struct *w);
+
+static void start_cpu_timer(int cpu) /* (re)arm the vmstat worker of @cpu */
{
struct delayed_work *work = &per_cpu(vmstat_work, cpu);
- INIT_DEFERRABLE_WORK(work, vmstat_update);
+ cpumask_clear_cpu(cpu, &vmstat_off_cpus); /* worker is about to be queued again, so @cpu is no longer marked as off */
schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}
+/*
+ * Bind the per-cpu vmstat_work item of @cpu to vmstat_update() as
+ * deferrable work, then arm it through start_cpu_timer().
+ */
+static void __cpuinit setup_cpu_timer(int cpu)
+{
+ struct delayed_work *dwork;
+
+ dwork = &per_cpu(vmstat_work, cpu);
+ INIT_DEFERRABLE_WORK(dwork, vmstat_update);
+ start_cpu_timer(cpu);
+}
+
+/*
+ * Monitor work: for every online CPU that is currently recorded in
+ * vmstat_off_cpus, restart its vmstat worker when need_vmstat() reports
+ * pending work. The monitor then requeues itself on system_unbound_wq
+ * after sysctl_stat_interval (rounded).
+ */
+static void vmstat_update_monitor(struct work_struct *w)
+{
+ int cpu;
+
+ for_each_cpu_and(cpu, &vmstat_off_cpus, cpu_online_mask) {
+ if (!need_vmstat(cpu))
+ continue;
+ start_cpu_timer(cpu);
+ }
+
+ queue_delayed_work(system_unbound_wq, &vmstat_monitor_work,
+ round_jiffies_relative(sysctl_stat_interval));
+}
+
+
+/*
+ * Per-cpu vmstat worker. Folds this CPU's counters through
+ * refresh_cpu_vm_stats(); if that reports no activity the CPU is recorded
+ * in vmstat_off_cpus and the work is not requeued, otherwise the work is
+ * scheduled again after sysctl_stat_interval (rounded).
+ */
+static void vmstat_update(struct work_struct *w)
+{
+ int this_cpu = smp_processor_id();
+
+ if (unlikely(!refresh_cpu_vm_stats(this_cpu))) {
+ cpumask_set_cpu(this_cpu, &vmstat_off_cpus);
+ return;
+ }
+
+ schedule_delayed_work(&__get_cpu_var(vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+}
+
/*
* Use the cpu notifier to insure that the thresholds are recalculated
* when necessary.
@@ -1204,17 +1263,19 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
refresh_zone_stat_thresholds();
- start_cpu_timer(cpu);
+ setup_cpu_timer(cpu);
node_set_state(cpu_to_node(cpu), N_CPU);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
- per_cpu(vmstat_work, cpu).work.func = NULL;
+ if (!cpumask_test_cpu(cpu, &vmstat_off_cpus)) {
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ per_cpu(vmstat_work, cpu).work.func = NULL;
+ }
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- start_cpu_timer(cpu);
+ setup_cpu_timer(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -1237,8 +1298,14 @@ static int __init setup_vmstat(void)
register_cpu_notifier(&vmstat_notifier);
+ INIT_DEFERRABLE_WORK(&vmstat_monitor_work,
+ vmstat_update_monitor);
+ queue_delayed_work(system_unbound_wq,
+ &vmstat_monitor_work,
+ round_jiffies_relative(HZ));
+
for_each_online_cpu(cpu)
- start_cpu_timer(cpu);
+ setup_cpu_timer(cpu);
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index 925ca583c09..8c93fa8d81b 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -39,6 +39,11 @@ static int should_authenticate(struct ceph_auth_client *ac)
return xi->starting;
}
+static int build_request(struct ceph_auth_client *ac, void *buf, void *end)
+{
+ return 0; /* nothing is encoded: ac, buf and end are left untouched */
+}
+
/*
* the generic auth code decode the global_id, and we carry no actual
* authenticate state, so nothing happens here.
@@ -106,6 +111,7 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = {
.destroy = destroy,
.is_authenticated = is_authenticated,
.should_authenticate = should_authenticate,
+ .build_request = build_request,
.handle_reply = handle_reply,
.create_authorizer = ceph_auth_none_create_authorizer,
.destroy_authorizer = ceph_auth_none_destroy_authorizer,