author    Steve Capper <steve.capper@linaro.org>  2013-05-30 14:21:13 +0100
committer Steve Capper <steve.capper@linaro.org>  2013-06-03 16:13:18 +0100
commit    6a45d7d08f9614928cf2372fcd773d7d8d14ff70 (patch)
tree      e96351345cf7c858abe209d2c24e0bd1328317a3
parent    00226e2acfd50c101c2d1f6141a0a16c256c8fea (diff)
ARM: mm: Add NUMA support.
This patch adds support for NUMA running on sparse memory. At the moment, the
number of nodes has to be specified on the command line. One can also,
optionally, specify the memory size of each node (otherwise the memory range
is split roughly equally between nodes).

CPUs can be striped across nodes (cpu number modulo the number of nodes), or
assigned to a node based on their topology_physical_package_id. So, for
instance, on a TC2 the A7 cores can be grouped together in one node and the
A15s grouped together in another node.

Signed-off-by: Steve Capper <steve.capper@linaro.org>
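As an illustration of the syntax described above (the values here are arbitrary, chosen only for the example), a boot command line requesting two 512 MB nodes grouped by cluster might read:

    numa=fake=2,512m,512m,usetopology

Omitting the sizes (numa=fake=2) splits memory roughly evenly, and omitting "usetopology" stripes CPUs across the nodes by CPU number instead.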
-rw-r--r--  arch/arm/Kconfig               |  27
-rw-r--r--  arch/arm/include/asm/mmzone.h  |  30
-rw-r--r--  arch/arm/include/asm/numa.h    |  28
-rw-r--r--  arch/arm/kernel/setup.c        |   6
-rw-r--r--  arch/arm/mm/Makefile           |   2
-rw-r--r--  arch/arm/mm/init.c             |  10
-rw-r--r--  arch/arm/mm/numa.c             | 216
7 files changed, 319 insertions(+), 0 deletions(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index ace7c08960b..86cbced8939 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1061,6 +1061,33 @@ config ARM_TIMER_SP804
source arch/arm/mm/Kconfig
+config NUMA
+ bool "NUMA Support"
+ depends on SPARSEMEM
+ help
+ Say Y to compile the kernel to support NUMA (Non-Uniform Memory
+ Access). At the moment, one has to specify the number of nodes using
+ the command line:
+ numa=fake=x,[size0],[size1],...,[sizeN-1],[usetopology]
+ where x is the number of nodes, and sizeY is the size of node Y in
+ bytes (one can suffix m or g for megabytes or gigabytes). If no sizes
+ are specified, the memory is distributed roughly evenly between nodes.
+ If "usetopology" is specified, the "topology_physical_package_id" is
+ used to assign CPUs to nodes.
+
+config NODES_SHIFT
+ int "Maximum NUMA Nodes (as a power of 2)" if NUMA
+ range 1 10
+ default "1"
+ depends on NEED_MULTIPLE_NODES
+ ---help---
+ Specify the maximum number of NUMA Nodes available on the target
+ system. Increases memory reserved to accommodate various tables.
+
+config USE_PERCPU_NUMA_NODE_ID
+ def_bool y
+ depends on NUMA
+
config ARM_NR_BANKS
int
default 16 if ARCH_EP93XX
diff --git a/arch/arm/include/asm/mmzone.h b/arch/arm/include/asm/mmzone.h
new file mode 100644
index 00000000000..5af2f798854
--- /dev/null
+++ b/arch/arm/include/asm/mmzone.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2013 Linaro Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __ASM_MMZONE_H
+#define __ASM_MMZONE_H
+
+#define NODE_DATA(nid) (node_data[nid])
+extern struct pglist_data *node_data[];
+extern cpumask_var_t *node_to_cpumask_map;
+
+#define cpumask_of_node(node) ((node) == -1 ? \
+ cpu_all_mask : \
+ node_to_cpumask_map[node])
+
+#define parent_node(node) (node)
+
+#endif /* __ASM_MMZONE_H */
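As a minimal usage sketch of these helpers (not part of the patch; the function name and includes are illustrative assumptions), generic code can walk a node's CPUs and inspect its pg_data_t like this:

    #include <linux/cpumask.h>
    #include <linux/mmzone.h>
    #include <linux/printk.h>

    /* Illustrative helper only; not part of this patch. */
    static void dump_node_info(int nid)
    {
            int cpu;

            /* NODE_DATA() resolves to the pg_data_t installed for each node. */
            pr_info("node %d spans %lu pages\n",
                    nid, NODE_DATA(nid)->node_spanned_pages);

            /* cpumask_of_node(-1) falls back to cpu_all_mask. */
            for_each_cpu(cpu, cpumask_of_node(nid))
                    pr_info("  cpu%d is on node %d\n", cpu, nid);
    }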
diff --git a/arch/arm/include/asm/numa.h b/arch/arm/include/asm/numa.h
new file mode 100644
index 00000000000..cb1b49a75ef
--- /dev/null
+++ b/arch/arm/include/asm/numa.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2013 Linaro Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ASM_NUMA_H
+#define __ASM_NUMA_H
+
+#ifdef CONFIG_NUMA
+void __init arm_numa_init(void);
+#else
+static inline void arm_numa_init(void)
+{
+}
+#endif
+
+#endif /* __ASM_NUMA_H */
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 1522c7ae31b..e73e879b42b 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -829,6 +829,12 @@ static int __init topology_init(void)
{
int cpu;
+#ifdef CONFIG_NUMA
+ int node;
+ for_each_online_node(node)
+ register_one_node(node);
+#endif
+
for_each_possible_cpu(cpu) {
struct cpuinfo_arm *cpuinfo = &per_cpu(cpu_data, cpu);
cpuinfo->cpu.hotpluggable = 1;
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 9e51be96f63..199935473c7 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -8,6 +8,8 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
obj-$(CONFIG_MMU) += fault-armv.o flush.o idmap.o ioremap.o \
mmap.o pgd.o mmu.o
+obj-$(CONFIG_NUMA) += numa.o
+
ifneq ($(CONFIG_MMU),y)
obj-y += nommu.o
endif
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index b45f69ab1e5..fe723979efd 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -30,6 +30,7 @@
#include <asm/setup.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
+#include <asm/numa.h>
#include <asm/mach/arch.h>
#include <asm/mach/map.h>
@@ -181,6 +182,11 @@ static void __init arm_bootmem_init(unsigned long start_pfn,
pg_data_t *pgdat;
/*
+ * If we need NUMA, initialise it here.
+ */
+ arm_numa_init();
+
+ /*
* Allocate the bootmem bitmap page. This must be in a region
* of memory which has already been mapped.
*/
@@ -276,8 +282,10 @@ static void __init arm_bootmem_free(unsigned long min, unsigned long max_low,
max_zone_pfns[ZONE_DMA] = __phys_to_pfn(arm_dma_limit);
#endif
+#ifndef CONFIG_NUMA
memblock_set_node(__pfn_to_phys(min),
__pfn_to_phys(max_high - min), 0);
+#endif
free_area_init_nodes(max_zone_pfns);
}
@@ -577,7 +585,9 @@ void __init mem_init(void)
extern u32 itcm_end;
#endif
+#ifndef CONFIG_NEED_MULTIPLE_NODES
max_mapnr = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;
+#endif
/* this will put all unused low memory onto the freelists */
free_unused_memmap(&meminfo);
diff --git a/arch/arm/mm/numa.c b/arch/arm/mm/numa.c
new file mode 100644
index 00000000000..62765d39bdd
--- /dev/null
+++ b/arch/arm/mm/numa.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (C) 2013 Linaro Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/cpu.h>
+#include <linux/bootmem.h>
+
+#include <asm/numa.h>
+
+struct pglist_data *node_data[MAX_NUMNODES];
+EXPORT_SYMBOL(node_data);
+
+cpumask_var_t *node_to_cpumask_map;
+EXPORT_SYMBOL(node_to_cpumask_map);
+
+static unsigned int numa_use_topology;
+static unsigned int numa_node_count = 1;
+
+static char *memcmdline __initdata;
+
+/*
+ * Add a CPU to a NUMA node.
+ * Default assignment policy is the cpu number modulo the number of nodes.
+ *
+ * We can also group CPUs via the topology_physical_package_id.
+ * (if the user adds "usetopology" to the command line).
+ * When we add CPU 0 (the boot CPU), it is always assigned to node 0, as we
+ * don't have the topology information at that time.
+ * Subsequent CPUs get added based on the topology_physical_package_id.
+ * To stop CPU0 being added to the same node as CPUs on a different cluster,
+ * we subtract the topology_physical_package_id of node 0.
+ */
+static void __cpuinit add_cpu_to_node(int cpu)
+{
+ unsigned int node;
+ unsigned int n0 = topology_physical_package_id(0);
+ unsigned int nc = topology_physical_package_id(cpu);
+
+ if (numa_use_topology)
+ node = cpu ? (numa_node_count + nc - n0) % numa_node_count : 0;
+ else
+ node = cpu % numa_node_count;
+
+ cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
+ set_cpu_numa_node(cpu, node);
+ pr_info("NUMA: Adding CPU %d to node %d\n", cpu, node);
+}
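A worked example of the assignment policy (assuming two nodes, and a TC2 where the boot CPU's cluster reports topology_physical_package_id 0 and the A15 cluster reports 1):

    default striping:      cpu 3 -> node 3 % 2        = 1
    usetopology, A15 cpu:  node = (2 + 1 - 0) % 2     = 1
    usetopology, A7 cpu:   node = (2 + 0 - 0) % 2     = 0  (same node as the boot CPU)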
+
+static int __cpuinit numa_add_cpu(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ if (action == CPU_ONLINE) {
+ int cpu = (long)hcpu;
+ add_cpu_to_node(cpu);
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata numa_node_nb = {
+ .notifier_call = numa_add_cpu,
+ .priority = 1, /* Must run before sched domains notifier. */
+};
+
+/*
+ * Split the available memory between the NUMA nodes.
+ *
+ * By default, the memory is distributed roughly evenly between nodes.
+ *
+ * One can also specify requested node sizes on the command line; if
+ * "memcmdline" is not NULL, we try to parse it as a size.
+ *
+ * We traverse memory blocks rather than the pfn addressable range to allow for
+ * sparse memory configurations and memory holes.
+static void __init numa_split_memblocks(void)
+{
+ int node;
+ unsigned long pfnsrem = 0, pfnsblock, pfncurr, pfnend = 0;
+ unsigned long pfn_starts[MAX_NUMNODES];
+ struct memblock_region *reg;
+
+ for_each_memblock(memory, reg) {
+ pfnend = memblock_region_memory_end_pfn(reg);
+ pfnsrem += pfnend - memblock_region_memory_base_pfn(reg);
+ }
+
+ reg = memblock.memory.regions;
+ pfncurr = memblock_region_memory_base_pfn(reg);
+ pfnsblock = memblock_region_memory_end_pfn(reg) - pfncurr;
+ pfn_starts[0] = pfncurr;
+
+ for (node = 0; node < numa_node_count - 1; node++) {
+ unsigned long pfnsnode = pfnsrem / (numa_node_count - node);
+
+ if (memcmdline) {
+ unsigned long nsize = __phys_to_pfn(
+ memparse(memcmdline, &memcmdline));
+
+ if (*memcmdline == ',')
+ ++memcmdline;
+
+ if ((nsize > 0) && (nsize < pfnsrem))
+ pfnsnode = nsize;
+ else
+ memcmdline = NULL;
+ }
+
+ while (pfnsnode > 0) {
+ unsigned long pfnsset = min(pfnsnode, pfnsblock);
+
+ pfncurr += pfnsset;
+
+ pfnsblock -= pfnsset;
+ pfnsrem -= pfnsset;
+ pfnsnode -= pfnsset;
+
+ if (pfnsblock == 0) {
+ reg++;
+ pfncurr = memblock_region_memory_base_pfn(reg);
+ pfnsblock = memblock_region_memory_end_pfn(reg)
+ - pfncurr;
+ }
+ }
+
+ pfn_starts[node + 1] = pfncurr;
+ }
+
+ for (node = 0; node < numa_node_count - 1; node++)
+ memblock_set_node(__pfn_to_phys(pfn_starts[node]),
+ __pfn_to_phys(pfn_starts[node + 1]
+ - pfn_starts[node]),
+ node);
+
+ memblock_set_node(__pfn_to_phys(pfn_starts[node]),
+ __pfn_to_phys(pfnend - pfn_starts[node]), node);
+}
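To illustrate the split with made-up numbers: assume "numa=fake=2" and a single 1 GiB memory block whose pages start at PFN 0x60000 (4 KB pages, i.e. 0x40000 pages in total):

    node 0: pfn_starts[0] = 0x60000, gets 0x40000 / 2 = 0x20000 pages
    node 1: pfn_starts[1] = 0x80000, gets the remaining 0x20000 pages

With "numa=fake=2,256m", node 0 would instead receive exactly 256 MB (0x10000 pages) and node 1 the remainder.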
+
+void __init arm_numa_init(void)
+{
+ int node;
+ size_t size;
+
+ for (node = 0; node < numa_node_count; node++) {
+ phys_addr_t pa = memblock_alloc(sizeof(pg_data_t),
+ L1_CACHE_BYTES);
+ NODE_DATA(node) = __va(pa);
+ memset(NODE_DATA(node), 0, sizeof(pg_data_t));
+ NODE_DATA(node)->bdata = &bootmem_node_data[node];
+ node_set_online(node);
+ }
+
+ size = sizeof(cpumask_var_t) * numa_node_count;
+ node_to_cpumask_map = __va(memblock_alloc(size, L1_CACHE_BYTES));
+ memset(node_to_cpumask_map, 0, size);
+
+ numa_split_memblocks();
+ register_cpu_notifier(&numa_node_nb);
+
+ add_cpu_to_node(0);
+}
+
+static int __init early_numa(char *p)
+{
+ if (!p)
+ return 0;
+
+ p = strstr(p, "fake=");
+ if (p) {
+ int num_nodes = 0;
+ int optres;
+
+ p += strlen("fake=");
+ optres = get_option(&p, &num_nodes);
+ if ((optres == 0) || (optres == 3))
+ return -EINVAL;
+
+ if ((num_nodes > 0) && (num_nodes <= MAX_NUMNODES)) {
+ pr_info("NUMA: setting up fake NUMA with %d nodes.\n",
+ num_nodes);
+ numa_node_count = num_nodes;
+ } else {
+ pr_info("NUMA: can't set up %d nodes for NUMA.\n"
+ "(MAX_NUMNODES = %d)\n",
+ num_nodes, MAX_NUMNODES);
+ return -EINVAL;
+ }
+
+ if (optres == 2)
+ memcmdline = p;
+
+ if (strstr(p, "usetopology")) {
+ numa_use_topology = 1;
+ pr_info("NUMA: using CPU topology to assign nodes.\n");
+ } else
+ pr_info("NUMA: NOT using CPU topology.\n");
+ }
+
+ return 0;
+}
+
+early_param("numa", early_numa);
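For reference, a walk-through of the parser on a sample option (the values are illustrative): given "numa=fake=2,512m,512m,usetopology", get_option() consumes the leading "2" and returns 2 because a comma follows, so numa_node_count becomes 2 and memcmdline is left pointing at "512m,512m,usetopology" for numa_split_memblocks() to parse later; the trailing "usetopology" substring then switches on topology-based CPU grouping. A bare "numa=fake=4" makes get_option() return 1, memcmdline stays NULL, and memory is split roughly evenly across the four nodes.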