Skip to content

Commit

Permalink
mm: handle uninitialized numa nodes gracefully
Browse files Browse the repository at this point in the history
We have had several reports [1][2][3] that page allocator blows up when an
allocation from a possible node is requested.  The underlying reason is
that NODE_DATA for the specific node is not allocated.

NUMA specific initialization is arch specific and it can vary a lot.  E.g.
x86 tries to initialize all nodes that have some cpu affinity (see
init_cpu_to_node) but this can be insufficient because the node might be
cpuless for example.

One way to address this problem would be to check for !node_online nodes
when trying to get a zonelist and silently fall back to another node.
That is unfortunately adding a branch into allocator hot path and it
doesn't handle any other potential NODE_DATA users.

This patch takes a different approach (following a lead of [3]) and it pre
allocates pgdat for all possible nodes in an arch independent code -
free_area_init.  All uninitialized nodes are treated as memoryless nodes.
node_state of the node is not changed because that would lead to other
side effects - e.g.  sysfs representation of such a node and from past
discussions [4] it is known that some tools might have problems digesting
that.

Newly allocated pgdat only gets a minimal initialization and the rest of
the work is expected to be done by the memory hotplug - hotadd_new_pgdat
(renamed to hotadd_init_pgdat).

generic_alloc_nodedata is changed to use the memblock allocator because
neither page nor slab allocators are available at the stage when all
pgdats are allocated.  Hotplug doesn't allocate pgdat anymore so we can
use the early boot allocator.  The only arch specific implementation is
ia64 and that is changed to use the early allocator as well.

[1] http://lkml.kernel.org/r/[email protected]
[2] http://lkml.kernel.org/r/[email protected]
[3] http://lkml.kernel.org/r/[email protected]
[4] http://lkml.kernel.org/r/[email protected]

Link: https://lkml.kernel.org/r/[email protected]
Reported-by: Alexey Makhalov <[email protected]>
Tested-by: Alexey Makhalov <[email protected]>
Reported-by: Nico Pache <[email protected]>
Acked-by: Rafael Aquini <[email protected]>
Tested-by: Rafael Aquini <[email protected]>
Acked-by: David Hildenbrand <[email protected]>
Reviewed-by: Oscar Salvador <[email protected]>
Acked-by: Mike Rapoport <[email protected]>
Signed-off-by: Michal Hocko <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Dennis Zhou <[email protected]>
Cc: Eric Dumazet <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Wei Yang <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Stephen Rothwell <[email protected]>
  • Loading branch information
Michal Hocko authored and sfrothwell committed Feb 17, 2022
1 parent 39b049b commit da4490c
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 19 deletions.
4 changes: 2 additions & 2 deletions arch/ia64/mm/discontig.c
Original file line number Diff line number Diff line change
Expand Up @@ -608,11 +608,11 @@ void __init paging_init(void)
zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
}

pg_data_t *arch_alloc_nodedata(int nid)
pg_data_t * __init arch_alloc_nodedata(int nid)
{
unsigned long size = compute_pernodesize(nid);

return kzalloc(size, GFP_KERNEL);
return memblock_alloc(size, SMP_CACHE_BYTES);
}

void arch_free_nodedata(pg_data_t *pgdat)
Expand Down
2 changes: 1 addition & 1 deletion include/linux/memory_hotplug.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
*/
#define generic_alloc_nodedata(nid) \
({ \
kzalloc(sizeof(pg_data_t), GFP_KERNEL); \
memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \
})
/*
* This definition is just for error path in node hotadd.
Expand Down
2 changes: 2 additions & 0 deletions mm/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -717,4 +717,6 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);

DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);

#endif /* __MM_INTERNAL_H */
21 changes: 9 additions & 12 deletions mm/memory_hotplug.c
Original file line number Diff line number Diff line change
Expand Up @@ -1162,19 +1162,21 @@ static void reset_node_present_pages(pg_data_t *pgdat)
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid)
static pg_data_t __ref *hotadd_init_pgdat(int nid)
{
struct pglist_data *pgdat;

pgdat = NODE_DATA(nid);
if (!pgdat) {
pgdat = arch_alloc_nodedata(nid);
if (!pgdat)
return NULL;

/*
* NODE_DATA is preallocated (free_area_init) but its internal
* state is not allocated completely. Add missing pieces.
* Completely offline nodes stay around and they just need
* reinitialization.
*/
if (pgdat->per_cpu_nodestats == &boot_nodestats) {
pgdat->per_cpu_nodestats =
alloc_percpu(struct per_cpu_nodestat);
arch_refresh_nodedata(nid, pgdat);
} else {
int cpu;
/*
Expand All @@ -1193,8 +1195,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid)
}
}

/* we can use NODE_DATA(nid) from here */
pgdat->node_id = nid;
pgdat->node_start_pfn = 0;

/* init node's zones as empty zones, we don't have any present pages.*/
Expand Down Expand Up @@ -1246,7 +1246,7 @@ static int __try_online_node(int nid, bool set_node_online)
if (node_online(nid))
return 0;

pgdat = hotadd_new_pgdat(nid);
pgdat = hotadd_init_pgdat(nid);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
Expand Down Expand Up @@ -1445,9 +1445,6 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)

return ret;
error:
/* rollback pgdat allocation and others */
if (new_node)
rollback_node_hotadd(nid);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
error_mem_hotplug_end:
Expand Down
36 changes: 32 additions & 4 deletions mm/page_alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -6407,7 +6407,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
#define BOOT_PAGESET_BATCH 1
static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);

static void __build_all_zonelists(void *data)
{
Expand All @@ -6429,7 +6429,11 @@ static void __build_all_zonelists(void *data)
if (self && !node_online(self->node_id)) {
build_zonelists(self);
} else {
for_each_online_node(nid) {
/*
* All possible nodes have pgdat preallocated
* in free_area_init
*/
for_each_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);

build_zonelists(pgdat);
Expand Down Expand Up @@ -8129,8 +8133,32 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
for_each_node(nid) {
pg_data_t *pgdat;

if (!node_online(nid)) {
pr_info("Initializing node %d as memoryless\n", nid);

/* Allocator not initialized yet */
pgdat = arch_alloc_nodedata(nid);
if (!pgdat) {
pr_err("Cannot allocate %zuB for node %d.\n",
sizeof(*pgdat), nid);
continue;
}
arch_refresh_nodedata(nid, pgdat);
free_area_init_memoryless_node(nid);
/*
* not marking this node online because we do not want to
* confuse userspace by sysfs files/directories for node
* without any memory attached to it (see topology_init)
* The pgdat will get fully initialized when memory is
* hotplugged into it by hotadd_init_pgdat
*/
continue;
}

pgdat = NODE_DATA(nid);
free_area_init_node(nid);

/* Any memory on that node */
Expand Down

0 comments on commit da4490c

Please sign in to comment.