Skip to content

Commit 9626c1a

Browse files
ldu4 authored and gregkh committed
mm: don't rely on system state to detect hot-plug operations
commit f85086f upstream. In register_mem_sect_under_node() the system_state's value is checked to detect whether the call is made during boot time or during a hot-plug operation. Unfortunately, that check against SYSTEM_BOOTING is wrong because regular memory is registered at SYSTEM_SCHEDULING state. In addition, memory hot-plug operations can be triggered at this system state by ACPI [1]. So checking against the system state is not enough. The consequence can be seen on a system with interleaved node ranges like this: Early memory node ranges node 1: [mem 0x0000000000000000-0x000000011fffffff] node 2: [mem 0x0000000120000000-0x000000014fffffff] node 1: [mem 0x0000000150000000-0x00000001ffffffff] node 0: [mem 0x0000000200000000-0x000000048fffffff] node 2: [mem 0x0000000490000000-0x00000007ffffffff] Such a layout can be seen on a PowerPC LPAR after multiple memory hot-plug and hot-unplug operations are done. At the next reboot the node's memory ranges can be interleaved and since the call to link_mem_sections() is made in topology_init() while the system is in the SYSTEM_SCHEDULING state, the node's id is not checked, and the sections are registered to multiple nodes: $ ls -l /sys/devices/system/memory/memory21/node* total 0 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node1 -> ../../node/node1 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node2 -> ../../node/node2 In that case, the system is able to boot but if later one of these memory blocks is hot-unplugged and then hot-plugged, the sysfs inconsistency is detected and this triggers a BUG_ON(): kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084!
Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4 CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25 Call Trace: add_memory_resource+0x23c/0x340 (unreliable) __add_memory+0x5c/0xf0 dlpar_add_lmb+0x1b4/0x500 dlpar_memory+0x1f8/0xb80 handle_dlpar_errorlog+0xc0/0x190 dlpar_store+0x198/0x4a0 kobj_attr_store+0x30/0x50 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1b0/0x290 vfs_write+0xe8/0x290 ksys_write+0xdc/0x130 system_call_exception+0x160/0x270 system_call_common+0xf0/0x27c This patch addresses the root cause by not relying on the system_state value to detect whether the call is due to a hot-plug operation. An extra parameter is added to link_mem_sections() detailing whether the operation is due to a hot-plug operation. [1] According to Oscar Salvador, using this qemu command line, ACPI memory hotplug operations are raised at SYSTEM_SCHEDULING state: $QEMU -enable-kvm -machine pc -smp 4,sockets=4,cores=1,threads=1 -cpu host -monitor pty \ -m size=$MEM,slots=255,maxmem=4294967296k \ -numa node,nodeid=0,cpus=0-3,mem=512 -numa node,nodeid=1,mem=512 \ -object memory-backend-ram,id=memdimm0,size=134217728 -device pc-dimm,node=0,memdev=memdimm0,id=dimm0,slot=0 \ -object memory-backend-ram,id=memdimm1,size=134217728 -device pc-dimm,node=0,memdev=memdimm1,id=dimm1,slot=1 \ -object memory-backend-ram,id=memdimm2,size=134217728 -device pc-dimm,node=0,memdev=memdimm2,id=dimm2,slot=2 \ -object memory-backend-ram,id=memdimm3,size=134217728 -device pc-dimm,node=0,memdev=memdimm3,id=dimm3,slot=3 \ -object memory-backend-ram,id=memdimm4,size=134217728 -device pc-dimm,node=1,memdev=memdimm4,id=dimm4,slot=4 \ -object memory-backend-ram,id=memdimm5,size=134217728 -device pc-dimm,node=1,memdev=memdimm5,id=dimm5,slot=5 \ -object memory-backend-ram,id=memdimm6,size=134217728 
-device pc-dimm,node=1,memdev=memdimm6,id=dimm6,slot=6 \ Fixes: 4fbce63 ("mm/memory_hotplug.c: make register_mem_sect_under_node() a callback of walk_memory_range()") Signed-off-by: Laurent Dufour <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Reviewed-by: David Hildenbrand <[email protected]> Reviewed-by: Oscar Salvador <[email protected]> Acked-by: Michal Hocko <[email protected]> Cc: Greg Kroah-Hartman <[email protected]> Cc: "Rafael J. Wysocki" <[email protected]> Cc: Fenghua Yu <[email protected]> Cc: Nathan Lynch <[email protected]> Cc: Scott Cheloha <[email protected]> Cc: Tony Luck <[email protected]> Cc: <[email protected]> Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Linus Torvalds <[email protected]> Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent 42b7153 commit 9626c1a

File tree

3 files changed

+64
-35
lines changed

3 files changed

+64
-35
lines changed

drivers/base/node.c

Lines changed: 55 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -758,14 +758,36 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
758758
return pfn_to_nid(pfn);
759759
}
760760

761+
static int do_register_memory_block_under_node(int nid,
762+
struct memory_block *mem_blk)
763+
{
764+
int ret;
765+
766+
/*
767+
* If this memory block spans multiple nodes, we only indicate
768+
* the last processed node.
769+
*/
770+
mem_blk->nid = nid;
771+
772+
ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
773+
&mem_blk->dev.kobj,
774+
kobject_name(&mem_blk->dev.kobj));
775+
if (ret)
776+
return ret;
777+
778+
return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
779+
&node_devices[nid]->dev.kobj,
780+
kobject_name(&node_devices[nid]->dev.kobj));
781+
}
782+
761783
/* register memory section under specified node if it spans that node */
762-
static int register_mem_sect_under_node(struct memory_block *mem_blk,
763-
void *arg)
784+
static int register_mem_block_under_node_early(struct memory_block *mem_blk,
785+
void *arg)
764786
{
765787
unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE;
766788
unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
767789
unsigned long end_pfn = start_pfn + memory_block_pfns - 1;
768-
int ret, nid = *(int *)arg;
790+
int nid = *(int *)arg;
769791
unsigned long pfn;
770792

771793
for (pfn = start_pfn; pfn <= end_pfn; pfn++) {
@@ -782,38 +804,33 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
782804
}
783805

784806
/*
785-
* We need to check if page belongs to nid only for the boot
786-
* case, during hotplug we know that all pages in the memory
787-
* block belong to the same node.
788-
*/
789-
if (system_state == SYSTEM_BOOTING) {
790-
page_nid = get_nid_for_pfn(pfn);
791-
if (page_nid < 0)
792-
continue;
793-
if (page_nid != nid)
794-
continue;
795-
}
796-
797-
/*
798-
* If this memory block spans multiple nodes, we only indicate
799-
* the last processed node.
807+
* We need to check if page belongs to nid only at the boot
808+
* case because node's ranges can be interleaved.
800809
*/
801-
mem_blk->nid = nid;
802-
803-
ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
804-
&mem_blk->dev.kobj,
805-
kobject_name(&mem_blk->dev.kobj));
806-
if (ret)
807-
return ret;
810+
page_nid = get_nid_for_pfn(pfn);
811+
if (page_nid < 0)
812+
continue;
813+
if (page_nid != nid)
814+
continue;
808815

809-
return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
810-
&node_devices[nid]->dev.kobj,
811-
kobject_name(&node_devices[nid]->dev.kobj));
816+
return do_register_memory_block_under_node(nid, mem_blk);
812817
}
813818
/* mem section does not span the specified node */
814819
return 0;
815820
}
816821

822+
/*
823+
* During hotplug we know that all pages in the memory block belong to the same
824+
* node.
825+
*/
826+
static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk,
827+
void *arg)
828+
{
829+
int nid = *(int *)arg;
830+
831+
return do_register_memory_block_under_node(nid, mem_blk);
832+
}
833+
817834
/*
818835
* Unregister a memory block device under the node it spans. Memory blocks
819836
* with multiple nodes cannot be offlined and therefore also never be removed.
@@ -829,11 +846,19 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
829846
kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
830847
}
831848

832-
int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
849+
int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn,
850+
enum meminit_context context)
833851
{
852+
walk_memory_blocks_func_t func;
853+
854+
if (context == MEMINIT_HOTPLUG)
855+
func = register_mem_block_under_node_hotplug;
856+
else
857+
func = register_mem_block_under_node_early;
858+
834859
return walk_memory_blocks(PFN_PHYS(start_pfn),
835860
PFN_PHYS(end_pfn - start_pfn), (void *)&nid,
836-
register_mem_sect_under_node);
861+
func);
837862
}
838863

839864
#ifdef CONFIG_HUGETLBFS

include/linux/node.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,13 @@ extern struct node *node_devices[];
9999
typedef void (*node_registration_func_t)(struct node *);
100100

101101
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA)
102-
extern int link_mem_sections(int nid, unsigned long start_pfn,
103-
unsigned long end_pfn);
102+
int link_mem_sections(int nid, unsigned long start_pfn,
103+
unsigned long end_pfn,
104+
enum meminit_context context);
104105
#else
105106
static inline int link_mem_sections(int nid, unsigned long start_pfn,
106-
unsigned long end_pfn)
107+
unsigned long end_pfn,
108+
enum meminit_context context)
107109
{
108110
return 0;
109111
}
@@ -128,7 +130,8 @@ static inline int register_one_node(int nid)
128130
if (error)
129131
return error;
130132
/* link memory sections under this node */
131-
error = link_mem_sections(nid, start_pfn, end_pfn);
133+
error = link_mem_sections(nid, start_pfn, end_pfn,
134+
MEMINIT_EARLY);
132135
}
133136

134137
return error;

mm/memory_hotplug.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1082,7 +1082,8 @@ int __ref add_memory_resource(int nid, struct resource *res)
10821082
}
10831083

10841084
/* link memory sections under this node.*/
1085-
ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
1085+
ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
1086+
MEMINIT_HOTPLUG);
10861087
BUG_ON(ret);
10871088

10881089
/* create new memmap entry */

0 commit comments

Comments
 (0)