linux_dsm_epyc7002/arch/powerpc/mm/drmem.c
Scott Cheloha e5e179aa3a pseries/drmem: don't cache node id in drmem_lmb struct
At memory hot-remove time we can retrieve an LMB's nid from its
corresponding memory_block.  There is no need to store the nid
in multiple locations.

Note that lmb_to_memblock() uses find_memory_block() to get the
corresponding memory_block.  As find_memory_block() runs in sub-linear
time this approach is negligibly slower than what we do at present.

In exchange for this lookup at hot-remove time we no longer need to
call memory_add_physaddr_to_nid() during drmem_init() for each LMB.
On powerpc, memory_add_physaddr_to_nid() is a linear search, so this
spares us an O(n^2) initialization during boot.

On systems with many LMBs that initialization overhead is palpable and
disruptive.  For example, on a box with 249854 LMBs we're seeing
drmem_init() take upwards of 30 seconds to complete:

[   53.721639] drmem: initializing drmem v2
[   80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1]
[   80.604377] Modules linked in:
[   80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4
[   80.604397] NIP:  c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000
[   80.604407] REGS: c0002dbff8493830 TRAP: 0901   Not tainted  (5.6.0-rc2+)
[   80.604412] MSR:  8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE>  CR: 44000248  XER: 0000000d
[   80.604431] CFAR: c0000000000a4a38 IRQMASK: 0
[   80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30
[   80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000
[   80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001
[   80.604431] GPR12: 0000000000000000 c00000001e8ec200
[   80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0
[   80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0
[   80.604492] Call Trace:
[   80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable)
[   80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60
[   80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0
[   80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0
[   80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0
[   80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148
[   80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74
[   80.604567] Instruction dump:
[   80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214
[   80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040
[   89.047390] drmem: 249854 LMB(s)

With a patched kernel on the same machine we're no longer seeing the
soft lockup.  drmem_init() now completes in negligible time, even when
the LMB count is large.

Fixes: b2d3b5ee66 ("powerpc/pseries: Track LMB nid instead of using device tree")
Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
Reviewed-by: Nathan Lynch <nathanl@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200811015115.63677-1-cheloha@linux.ibm.com
2020-09-02 11:00:21 +10:00

471 lines
10 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Dynamic reconfiguration memory support
*
* Copyright 2017 IBM Corporation
*/
#define pr_fmt(fmt) "drmem: " fmt
#include <linux/kernel.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <asm/prom.h>
#include <asm/drmem.h>
/*
 * Address and size cell counts of the device tree root.  They are filled in
 * by walk_drmem_lmbs_early() and walk_drmem_lmbs() and consumed when parsing
 * LMB cells and the "ibm,lmb-size" property.
 */
static int n_root_addr_cells, n_root_size_cells;
/* Backing storage for the LMB bookkeeping; reached through drmem_info. */
static struct drmem_lmb_info __drmem_info;
/* Non-static handle to the LMB bookkeeping used throughout this file. */
struct drmem_lmb_info *drmem_info = &__drmem_info;
/*
 * Return the address just past the last LMB, i.e. the base address of the
 * final entry in drmem_info->lmbs plus the LMB size.
 *
 * Returns 0 when no LMB array has been set up (no entries, or the array
 * was never allocated) instead of indexing lmbs[-1].
 */
u64 drmem_lmb_memory_max(void)
{
	struct drmem_lmb *last_lmb;

	/* Nothing to index: avoid computing &lmbs[-1] on an empty table. */
	if (!drmem_info->lmbs || !drmem_info->n_lmbs)
		return 0;

	last_lmb = &drmem_info->lmbs[drmem_info->n_lmbs - 1];
	return last_lmb->base_addr + drmem_lmb_size();
}
/*
 * Flags of @lmb with DRMEM_LMB_RESERVED masked off.  That bit is only used
 * internally for hotplug processing and is not part of the reported value.
 */
static u32 drmem_lmb_flags(struct drmem_lmb *lmb)
{
	u32 visible = lmb->flags;

	visible &= ~DRMEM_LMB_RESERVED;
	return visible;
}
/*
 * Allocate a new property that carries the same name as @prop and a
 * zero-filled value buffer of @prop_sz bytes.  The value of @prop itself
 * is not copied; callers fill the buffer in afterwards.
 *
 * Returns the new property, or NULL if any allocation fails (in which
 * case everything allocated so far is released).
 */
static struct property *clone_property(struct property *prop, u32 prop_sz)
{
	struct property *copy = kzalloc(sizeof(*copy), GFP_KERNEL);

	if (!copy)
		return NULL;

	copy->name = kstrdup(prop->name, GFP_KERNEL);
	copy->value = kzalloc(prop_sz, GFP_KERNEL);

	if (copy->name && copy->value) {
		copy->length = prop_sz;
#if defined(CONFIG_OF_DYNAMIC)
		of_property_set_flag(copy, OF_DYNAMIC);
#endif
		return copy;
	}

	/* At least one of the two allocations failed; kfree(NULL) is fine. */
	kfree(copy->name);
	kfree(copy->value);
	kfree(copy);
	return NULL;
}
/*
 * Rebuild the v1 dynamic-memory property of @memory from the in-kernel
 * LMB array and install it with of_update_property().
 *
 * The new value has the same length as @prop: a big-endian LMB count
 * followed by one of_drconf_cell_v1 per LMB.
 *
 * Returns 0 on success, -1 if the replacement property cannot be allocated.
 */
static int drmem_update_dt_v1(struct device_node *memory,
			      struct property *prop)
{
	struct of_drconf_cell_v1 *cell;
	struct property *updated;
	struct drmem_lmb *lmb;
	u32 *count;

	updated = clone_property(prop, prop->length);
	if (!updated)
		return -1;

	/* Leading word: number of LMB cells that follow. */
	count = updated->value;
	*count = cpu_to_be32(drmem_info->n_lmbs);

	cell = (struct of_drconf_cell_v1 *)(count + 1);
	for_each_drmem_lmb(lmb) {
		cell->base_addr = cpu_to_be64(lmb->base_addr);
		cell->drc_index = cpu_to_be32(lmb->drc_index);
		cell->aa_index = cpu_to_be32(lmb->aa_index);
		cell->flags = cpu_to_be32(drmem_lmb_flags(lmb));
		cell++;
	}

	of_update_property(memory, updated);
	return 0;
}
static void init_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
struct drmem_lmb *lmb)
{
dr_cell->base_addr = cpu_to_be64(lmb->base_addr);
dr_cell->drc_index = cpu_to_be32(lmb->drc_index);
dr_cell->aa_index = cpu_to_be32(lmb->aa_index);
dr_cell->flags = cpu_to_be32(drmem_lmb_flags(lmb));
}
/*
 * Rebuild the v2 dynamic-memory property of @memory from the in-kernel
 * LMB array and install it with of_update_property().
 *
 * Consecutive LMBs that share the same aa_index and the same flags (with
 * the internal reserved bit ignored) are folded into one of_drconf_cell_v2
 * whose seq_lmbs field holds the number of LMBs in the run.  The value is a
 * big-endian set count followed by one cell per set.
 *
 * Returns 0 on success, -1 if the replacement property cannot be allocated.
 */
static int drmem_update_dt_v2(struct device_node *memory,
			      struct property *prop)
{
	struct property *new_prop;
	struct of_drconf_cell_v2 *dr_cell;
	struct drmem_lmb *lmb, *prev_lmb;
	u32 lmb_sets, prop_sz, seq_lmbs;
	u32 *p;

	/* First pass, determine how many LMB sets are needed. */
	lmb_sets = 0;
	prev_lmb = NULL;
	for_each_drmem_lmb(lmb) {
		if (!prev_lmb) {
			prev_lmb = lmb;
			lmb_sets++;
			continue;
		}

		if (prev_lmb->aa_index != lmb->aa_index ||
		    drmem_lmb_flags(prev_lmb) != drmem_lmb_flags(lmb))
			lmb_sets++;

		prev_lmb = lmb;
	}

	prop_sz = lmb_sets * sizeof(*dr_cell) + sizeof(__be32);
	new_prop = clone_property(prop, prop_sz);
	if (!new_prop)
		return -1;

	p = new_prop->value;
	*p++ = cpu_to_be32(lmb_sets);

	dr_cell = (struct of_drconf_cell_v2 *)p;

	/* Second pass, populate the LMB set data */
	prev_lmb = NULL;
	seq_lmbs = 0;
	for_each_drmem_lmb(lmb) {
		if (prev_lmb == NULL) {
			/* Start of first LMB set */
			prev_lmb = lmb;
			init_drconf_v2_cell(dr_cell, lmb);
			seq_lmbs++;
			continue;
		}

		if (prev_lmb->aa_index != lmb->aa_index ||
		    drmem_lmb_flags(prev_lmb) != drmem_lmb_flags(lmb)) {
			/* end of one set, start of another */
			dr_cell->seq_lmbs = cpu_to_be32(seq_lmbs);
			dr_cell++;

			init_drconf_v2_cell(dr_cell, lmb);
			seq_lmbs = 1;
		} else {
			seq_lmbs++;
		}

		prev_lmb = lmb;
	}

	/*
	 * Close out the last LMB set.  When there were no LMBs at all the
	 * value buffer only holds the set count, so there is no cell to
	 * write to and the store must be skipped.
	 */
	if (prev_lmb)
		dr_cell->seq_lmbs = cpu_to_be32(seq_lmbs);

	of_update_property(memory, new_prop);
	return 0;
}
/*
 * Write the current LMB state back to the device tree node
 * /ibm,dynamic-reconfiguration-memory, using whichever of the
 * "ibm,dynamic-memory" (v1) or "ibm,dynamic-memory-v2" properties exists.
 * The v1 property takes precedence when present.
 *
 * Returns the result of the v1/v2 update helper, or -1 if the node or
 * both properties are missing.
 */
int drmem_update_dt(void)
{
	struct device_node *memory;
	struct property *prop;
	int rc;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (!memory)
		return -1;

	prop = of_find_property(memory, "ibm,dynamic-memory", NULL);
	if (prop) {
		rc = drmem_update_dt_v1(memory, prop);
		goto out_put;
	}

	prop = of_find_property(memory, "ibm,dynamic-memory-v2", NULL);
	rc = prop ? drmem_update_dt_v2(memory, prop) : -1;

out_put:
	of_node_put(memory);
	return rc;
}
/*
 * Decode one v1 LMB cell at *@prop into @lmb and advance *@prop past it.
 *
 * Cell layout as read here: base address (n_root_addr_cells words),
 * drc index, one reserved word, aa index, flags.
 */
static void read_drconf_v1_cell(struct drmem_lmb *lmb,
				const __be32 **prop)
{
	const __be32 *cur = *prop;

	lmb->base_addr = of_read_number(cur, n_root_addr_cells);
	cur += n_root_addr_cells;

	lmb->drc_index = of_read_number(&cur[0], 1);
	/* cur[1] is the reserved field and is skipped */
	lmb->aa_index = of_read_number(&cur[2], 1);
	lmb->flags = of_read_number(&cur[3], 1);

	*prop = cur + 4;
}
static int
__walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm, void *data,
int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
struct drmem_lmb lmb;
u32 i, n_lmbs;
int ret = 0;
n_lmbs = of_read_number(prop++, 1);
for (i = 0; i < n_lmbs; i++) {
read_drconf_v1_cell(&lmb, &prop);
ret = func(&lmb, &usm, data);
if (ret)
break;
}
return ret;
}
/*
 * Decode one v2 LMB-set cell at *@prop into @dr_cell (values are stored
 * in CPU byte order) and advance *@prop past it.
 *
 * Cell layout as read here: number of sequential LMBs, base address
 * (n_root_addr_cells words), drc index, aa index, flags.
 */
static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
				const __be32 **prop)
{
	const __be32 *cur = *prop;

	dr_cell->seq_lmbs = of_read_number(cur, 1);
	cur++;

	dr_cell->base_addr = of_read_number(cur, n_root_addr_cells);
	cur += n_root_addr_cells;

	dr_cell->drc_index = of_read_number(&cur[0], 1);
	dr_cell->aa_index = of_read_number(&cur[1], 1);
	dr_cell->flags = of_read_number(&cur[2], 1);

	*prop = cur + 3;
}
/*
 * Invoke @func for every LMB described by the v2 property value @prop.
 *
 * @prop starts with the number of LMB sets, followed by one v2 cell per
 * set.  Each set is expanded into seq_lmbs individual LMBs: the base
 * address advances by the LMB size and the drc index by one per LMB, while
 * aa index and flags are shared by the whole set.  Every expanded LMB is
 * passed to @func along with the address of the @usm cursor and @data.
 *
 * The walk stops at the first non-zero value returned by @func and that
 * value is returned; no further LMBs or sets are visited.  Returns 0 if
 * every call succeeded or there was nothing to walk.
 */
static int
__walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm, void *data,
		     int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
	struct of_drconf_cell_v2 dr_cell;
	struct drmem_lmb lmb;
	u32 i, j, lmb_sets;
	int ret;

	lmb_sets = of_read_number(prop++, 1);

	for (i = 0; i < lmb_sets; i++) {
		read_drconf_v2_cell(&dr_cell, &prop);

		for (j = 0; j < dr_cell.seq_lmbs; j++) {
			lmb.base_addr = dr_cell.base_addr;
			dr_cell.base_addr += drmem_lmb_size();

			lmb.drc_index = dr_cell.drc_index;
			dr_cell.drc_index++;

			lmb.aa_index = dr_cell.aa_index;
			lmb.flags = dr_cell.flags;

			/*
			 * Return rather than break: a break would only leave
			 * the inner loop, letting the walk continue with the
			 * next set and lose this error.
			 */
			ret = func(&lmb, &usm, data);
			if (ret)
				return ret;
		}
	}

	return 0;
}
#ifdef CONFIG_PPC_PSERIES
/*
 * Walk the LMBs of flattened device tree node @node, calling @func for
 * each one with @data.
 *
 * Records the root address/size cell counts and the LMB size from
 * "ibm,lmb-size", then walks "ibm,dynamic-memory" if present, otherwise
 * "ibm,dynamic-memory-v2".  memblock_dump_all() is called once the walk
 * has been attempted.
 *
 * Returns -ENODEV if "ibm,lmb-size" is missing or too short, or if neither
 * dynamic-memory property exists; otherwise the result of the walk.
 */
int __init walk_drmem_lmbs_early(unsigned long node, void *data,
		int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
	const __be32 *size_prop, *dyn_mem, *usm;
	int len, ret = -ENODEV;

	size_prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
	if (!size_prop || len < dt_root_size_cells * sizeof(__be32))
		return ret;

	/* Get the address & size cells */
	n_root_addr_cells = dt_root_addr_cells;
	n_root_size_cells = dt_root_size_cells;

	drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &size_prop);

	usm = of_get_flat_dt_prop(node, "linux,drconf-usable-memory", &len);

	dyn_mem = of_get_flat_dt_prop(node, "ibm,dynamic-memory", &len);
	if (dyn_mem) {
		ret = __walk_drmem_v1_lmbs(dyn_mem, usm, data, func);
	} else {
		dyn_mem = of_get_flat_dt_prop(node, "ibm,dynamic-memory-v2",
					      &len);
		if (dyn_mem)
			ret = __walk_drmem_v2_lmbs(dyn_mem, usm, data, func);
	}

	memblock_dump_all();
	return ret;
}
#endif
/*
 * Make sure drmem_info->lmb_size is set.
 *
 * If it is already non-zero nothing is done.  Otherwise it is read from
 * the "ibm,lmb-size" property of @dn, which must hold at least
 * n_root_size_cells words.
 *
 * Returns 0 when the size is available, -1 (after logging) when the
 * property is missing or too short.
 */
static int init_drmem_lmb_size(struct device_node *dn)
{
	const __be32 *prop;
	int len;

	if (drmem_info->lmb_size)
		return 0;

	prop = of_get_property(dn, "ibm,lmb-size", &len);
	if (prop && len >= n_root_size_cells * sizeof(__be32)) {
		drmem_info->lmb_size = of_read_number(prop, n_root_size_cells);
		return 0;
	}

	pr_info("Could not determine LMB size\n");
	return -1;
}
/*
 * Look up linux,drconf-usable-memory on @dn.  The property exists only
 * in kexec/kdump kernels, where kexec-tools adds it.
 *
 * Returns the property value, or NULL if the property is absent or
 * shorter than one unsigned int.
 */
static const __be32 *of_get_usable_memory(struct device_node *dn)
{
	const __be32 *usable;
	u32 len;

	usable = of_get_property(dn, "linux,drconf-usable-memory", &len);

	return (usable && len >= sizeof(unsigned int)) ? usable : NULL;
}
/*
 * Walk the LMBs described by device tree node @dn, calling @func for each
 * one with @data.
 *
 * Refreshes the cached root address/size cell counts from of_root, makes
 * sure the LMB size is known, then walks "ibm,dynamic-memory" if present,
 * otherwise "ibm,dynamic-memory-v2".
 *
 * Returns -ENODEV if there is no of_root, the LMB size cannot be
 * determined, or neither property exists; otherwise the result of the walk.
 */
int walk_drmem_lmbs(struct device_node *dn, void *data,
		    int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
	const __be32 *dyn_mem, *usm;

	if (!of_root)
		return -ENODEV;

	/* Get the address & size cells */
	of_node_get(of_root);
	n_root_addr_cells = of_n_addr_cells(of_root);
	n_root_size_cells = of_n_size_cells(of_root);
	of_node_put(of_root);

	if (init_drmem_lmb_size(dn))
		return -ENODEV;

	usm = of_get_usable_memory(dn);

	dyn_mem = of_get_property(dn, "ibm,dynamic-memory", NULL);
	if (dyn_mem)
		return __walk_drmem_v1_lmbs(dyn_mem, usm, data, func);

	dyn_mem = of_get_property(dn, "ibm,dynamic-memory-v2", NULL);
	if (dyn_mem)
		return __walk_drmem_v2_lmbs(dyn_mem, usm, data, func);

	return -ENODEV;
}
/*
 * Populate drmem_info->lmbs from a v1 dynamic-memory property value.
 *
 * @prop starts with the LMB count followed by that many v1 cells; one
 * drmem_lmb is allocated and filled in per cell.
 *
 * If the count is zero nothing is allocated.  If the allocation fails the
 * LMB count is reset to zero so that the count never claims entries in an
 * array that does not exist.
 */
static void __init init_drmem_v1_lmbs(const __be32 *prop)
{
	struct drmem_lmb *lmb;

	drmem_info->n_lmbs = of_read_number(prop++, 1);
	if (drmem_info->n_lmbs == 0)
		return;

	drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb),
				   GFP_KERNEL);
	if (!drmem_info->lmbs) {
		/* Keep count and array consistent for later iteration. */
		drmem_info->n_lmbs = 0;
		return;
	}

	for_each_drmem_lmb(lmb)
		read_drconf_v1_cell(lmb, &prop);
}
/*
 * Populate drmem_info->lmbs from a v2 dynamic-memory property value.
 *
 * @prop starts with the number of LMB sets followed by one v2 cell per
 * set.  A first pass adds up seq_lmbs over all sets to size the array; a
 * second pass expands every set into individual LMBs, advancing the base
 * address by the LMB size and the drc index by one for each LMB in a set.
 *
 * If there are no sets nothing is done.  If the allocation fails the LMB
 * count is reset to zero so that the count never claims entries in an
 * array that does not exist.
 */
static void __init init_drmem_v2_lmbs(const __be32 *prop)
{
	struct drmem_lmb *lmb;
	struct of_drconf_cell_v2 dr_cell;
	const __be32 *p;
	u32 i, j, lmb_sets;
	int lmb_index;

	lmb_sets = of_read_number(prop++, 1);
	if (lmb_sets == 0)
		return;

	/* first pass, calculate the number of LMBs */
	p = prop;
	for (i = 0; i < lmb_sets; i++) {
		read_drconf_v2_cell(&dr_cell, &p);
		drmem_info->n_lmbs += dr_cell.seq_lmbs;
	}

	drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb),
				   GFP_KERNEL);
	if (!drmem_info->lmbs) {
		/* Keep count and array consistent for later iteration. */
		drmem_info->n_lmbs = 0;
		return;
	}

	/* second pass, read in the LMB information */
	lmb_index = 0;
	p = prop;

	for (i = 0; i < lmb_sets; i++) {
		read_drconf_v2_cell(&dr_cell, &p);

		for (j = 0; j < dr_cell.seq_lmbs; j++) {
			lmb = &drmem_info->lmbs[lmb_index++];

			lmb->base_addr = dr_cell.base_addr;
			dr_cell.base_addr += drmem_info->lmb_size;

			lmb->drc_index = dr_cell.drc_index;
			dr_cell.drc_index++;

			lmb->aa_index = dr_cell.aa_index;
			lmb->flags = dr_cell.flags;
		}
	}
}
/*
 * Late initcall: build the in-kernel LMB array from the
 * /ibm,dynamic-reconfiguration-memory node.
 *
 * The "ibm,dynamic-memory" (v1) property is used when present, otherwise
 * "ibm,dynamic-memory-v2".  A missing node, an undeterminable LMB size or
 * missing properties are not treated as errors; the function always
 * returns 0.
 */
static int __init drmem_init(void)
{
	struct device_node *dn;
	const __be32 *prop;

	dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (!dn) {
		pr_info("No dynamic reconfiguration memory found\n");
		return 0;
	}

	if (init_drmem_lmb_size(dn))
		goto out_put;

	prop = of_get_property(dn, "ibm,dynamic-memory", NULL);
	if (prop) {
		init_drmem_v1_lmbs(prop);
		goto out_put;
	}

	prop = of_get_property(dn, "ibm,dynamic-memory-v2", NULL);
	if (prop)
		init_drmem_v2_lmbs(prop);

out_put:
	of_node_put(dn);
	return 0;
}
late_initcall(drmem_init);