mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-28 11:18:45 +07:00
d4edc5b6c4
On POWER platforms, the hypervisor can notify the guest kernel about dynamic
changes in the cpu-numa associativity (VPHN topology update). Hence the
cpu-to-node mappings that we got from the firmware during boot, may no longer
be valid after such updates. This is handled using the arch_update_cpu_topology()
hook in the scheduler, and the sched-domains are rebuilt according to the new
mappings.
But unfortunately, at the moment, CPU hotplug ignores these updated mappings
and instead queries the firmware for the cpu-to-numa relationships and uses
them during CPU online. So the kernel can end up assigning wrong NUMA nodes
to CPUs during subsequent CPU hotplug online operations (after booting).
Further, a particularly problematic scenario can result from this bug:
On POWER platforms, the SMT mode can be switched between 1, 2, 4 (and even 8)
threads per core. The switch to Single-Threaded (ST) mode is performed by
offlining all except the first CPU thread in each core. Switching back to
SMT mode involves onlining those other threads back, in each core.
Now consider this scenario:
1. During boot, the kernel gets the cpu-to-node mappings from the firmware
and assigns the CPUs to NUMA nodes appropriately, during CPU online.
2. Later on, the hypervisor updates the cpu-to-node mappings dynamically and
communicates this update to the kernel. The kernel in turn updates its
cpu-to-node associations and rebuilds its sched domains. Everything is
fine so far.
3. Now, the user switches the machine from SMT to ST mode (say, by running
ppc64_cpu --smt=1). This involves offlining all except 1 thread in each
core.
4. The user then tries to switch back from ST to SMT mode (say, by running
ppc64_cpu --smt=4), and this involves onlining those threads back. Since
CPU hotplug ignores the new mappings, it queries the firmware and tries to
associate the newly onlined sibling threads to the old NUMA nodes. This
results in sibling threads within the same core getting associated with
different NUMA nodes, which is incorrect.
The scheduler's build-sched-domains code gets thoroughly confused with this
and enters an infinite loop and causes soft-lockups, as explained in detail
in commit 3be7db6ab
(powerpc: VPHN topology change updates all siblings).
So to fix this, use the numa_cpu_lookup_table to remember the updated
cpu-to-node mappings, and use them during CPU hotplug online operations.
Further, we also need to ensure that all threads in a core are assigned to a
common NUMA node, irrespective of whether all those threads were online during
the topology update. To achieve this, we take care not to use cpu_sibling_mask()
since it is not hotplug invariant. Instead, we use cpu_first_sibling_thread()
and set up the mappings manually using the 'threads_per_core' value for that
particular platform. This helps us ensure that we don't hit this bug with any
combination of CPU hotplug and SMT mode switching.
Cc: stable@vger.kernel.org
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
116 lines
2.6 KiB
C
116 lines
2.6 KiB
C
#ifndef _ASM_POWERPC_TOPOLOGY_H
|
|
#define _ASM_POWERPC_TOPOLOGY_H
|
|
#ifdef __KERNEL__
|
|
|
|
|
|
struct device;
|
|
struct device_node;
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
/*
|
|
* Before going off node we want the VM to try and reclaim from the local
|
|
* node. It does this if the remote distance is larger than RECLAIM_DISTANCE.
|
|
* With the default REMOTE_DISTANCE of 20 and the default RECLAIM_DISTANCE of
|
|
* 20, we never reclaim and go off node straight away.
|
|
*
|
|
* To fix this we choose a smaller value of RECLAIM_DISTANCE.
|
|
*/
|
|
#define RECLAIM_DISTANCE 10
|
|
|
|
#include <asm/mmzone.h>
|
|
|
|
static inline int cpu_to_node(int cpu)
|
|
{
|
|
int nid;
|
|
|
|
nid = numa_cpu_lookup_table[cpu];
|
|
|
|
/*
|
|
* During early boot, the numa-cpu lookup table might not have been
|
|
* setup for all CPUs yet. In such cases, default to node 0.
|
|
*/
|
|
return (nid < 0) ? 0 : nid;
|
|
}
|
|
|
|
#define parent_node(node) (node)
|
|
|
|
#define cpumask_of_node(node) ((node) == -1 ? \
|
|
cpu_all_mask : \
|
|
node_to_cpumask_map[node])
|
|
|
|
struct pci_bus;
|
|
#ifdef CONFIG_PCI
|
|
extern int pcibus_to_node(struct pci_bus *bus);
|
|
#else
|
|
static inline int pcibus_to_node(struct pci_bus *bus)
|
|
{
|
|
return -1;
|
|
}
|
|
#endif
|
|
|
|
#define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? \
|
|
cpu_all_mask : \
|
|
cpumask_of_node(pcibus_to_node(bus)))
|
|
|
|
extern int __node_distance(int, int);
|
|
#define node_distance(a, b) __node_distance(a, b)
|
|
|
|
extern void __init dump_numa_cpu_topology(void);
|
|
|
|
extern int sysfs_add_device_to_node(struct device *dev, int nid);
|
|
extern void sysfs_remove_device_from_node(struct device *dev, int nid);
|
|
|
|
#else
|
|
|
|
static inline void dump_numa_cpu_topology(void) {}
|
|
|
|
static inline int sysfs_add_device_to_node(struct device *dev, int nid)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static inline void sysfs_remove_device_from_node(struct device *dev,
|
|
int nid)
|
|
{
|
|
}
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
#if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
|
|
extern int start_topology_update(void);
|
|
extern int stop_topology_update(void);
|
|
extern int prrn_is_enabled(void);
|
|
#else
|
|
static inline int start_topology_update(void)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline int stop_topology_update(void)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline int prrn_is_enabled(void)
|
|
{
|
|
return 0;
|
|
}
|
|
#endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
|
|
|
|
#include <asm-generic/topology.h>
|
|
|
|
#ifdef CONFIG_SMP
|
|
#include <asm/cputable.h>
|
|
#define smt_capable() (cpu_has_feature(CPU_FTR_SMT))
|
|
|
|
#ifdef CONFIG_PPC64
|
|
#include <asm/smp.h>
|
|
|
|
#define topology_physical_package_id(cpu) (cpu_to_chip_id(cpu))
|
|
#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
|
|
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
|
|
#define topology_core_id(cpu) (cpu_to_core_id(cpu))
|
|
#endif
|
|
#endif
|
|
|
|
#endif /* __KERNEL__ */
|
|
#endif /* _ASM_POWERPC_TOPOLOGY_H */
|