mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-13 04:06:37 +07:00
956f8b4450
Patch series "mm/memory_hotplug: allow to specify a default online_type", v3. Distributions nowadays use udev rules ([1] [2]) to specify if and how to online hotplugged memory. The rules seem to get more complex with many special cases. Due to the various special cases, CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE cannot be used. All memory hotplug is handled via udev rules. Every time we hotplug memory, the udev rule will come to the same conclusion. Especially Hyper-V (but also soon virtio-mem) add a lot of memory in separate memory blocks and wait for memory to get onlined by user space before continuing to add more memory blocks (to not add memory faster than it is getting onlined). This of course slows down the whole memory hotplug process. To make the job of distributions easier and to avoid udev rules that get more and more complicated, let's extend the mechanism provided by - /sys/devices/system/memory/auto_online_blocks - "memhp_default_state=" on the kernel cmdline to be able to specify also "online_movable" as well as "online_kernel" === Example /usr/libexec/config-memhotplug === #!/bin/bash VIRT=`systemd-detect-virt --vm` ARCH=`uname -p` sense_virtio_mem() { if [ -d "/sys/bus/virtio/drivers/virtio_mem/" ]; then DEVICES=`find /sys/bus/virtio/drivers/virtio_mem/ -maxdepth 1 -type l | wc -l` if [ $DEVICES != "0" ]; then return 0 fi fi return 1 } if [ ! 
-e "/sys/devices/system/memory/auto_online_blocks" ]; then echo "Memory hotplug configuration support missing in the kernel" exit 1 fi if grep "memhp_default_state=" /proc/cmdline > /dev/null; then echo "Memory hotplug configuration overridden in kernel cmdline (memhp_default_state=)" exit 1 fi if [ $VIRT == "microsoft" ]; then echo "Detected Hyper-V on $ARCH" # Hyper-V wants all memory in ZONE_NORMAL ONLINE_TYPE="online_kernel" elif sense_virtio_mem; then echo "Detected virtio-mem on $ARCH" # virtio-mem wants all memory in ZONE_NORMAL ONLINE_TYPE="online_kernel" elif [ $ARCH == "s390x" ] || [ $ARCH == "s390" ]; then echo "Detected $ARCH" # standby memory should not be onlined automatically ONLINE_TYPE="offline" elif [ $ARCH == "ppc64" ] || [ $ARCH == "ppc64le" ]; then echo "Detected" $ARCH # PPC64 onlines all hotplugged memory right from the kernel ONLINE_TYPE="offline" elif [ $VIRT == "none" ]; then echo "Detected bare-metal on $ARCH" # Bare metal users expect hotplugged memory to be unpluggable. We assume # that ZONE imbalances on such enterpise servers cannot happen and is # properly documented ONLINE_TYPE="online_movable" else # TODO: Hypervisors that want to unplug DIMMs and can guarantee that ZONE # imbalances won't happen echo "Detected $VIRT on $ARCH" # Usually, ballooning is used in virtual environments, so memory should go to # ZONE_NORMAL. However, sometimes "movable_node" is relevant. ONLINE_TYPE="online" fi echo "Selected online_type:" $ONLINE_TYPE # Configure what to do with memory that will be hotplugged in the future echo $ONLINE_TYPE 2>/dev/null > /sys/devices/system/memory/auto_online_blocks if [ $? 
!= "0" ]; then echo "Memory hotplug cannot be configured (e.g., old kernel or missing permissions)" # A backup udev rule should handle old kernels if necessary exit 1 fi # Process all already pluggedd blocks (e.g., DIMMs, but also Hyper-V or virtio-mem) if [ $ONLINE_TYPE != "offline" ]; then for MEMORY in /sys/devices/system/memory/memory*; do STATE=`cat $MEMORY/state` if [ $STATE == "offline" ]; then echo $ONLINE_TYPE > $MEMORY/state fi done fi === Example /usr/lib/systemd/system/config-memhotplug.service === [Unit] Description=Configure memory hotplug behavior DefaultDependencies=no Conflicts=shutdown.target Before=sysinit.target shutdown.target After=systemd-modules-load.service ConditionPathExists=|/sys/devices/system/memory/auto_online_blocks [Service] ExecStart=/usr/libexec/config-memhotplug Type=oneshot TimeoutSec=0 RemainAfterExit=yes [Install] WantedBy=sysinit.target === Example modification to the 40-redhat.rules [2] === : diff --git a/40-redhat.rules b/40-redhat.rules-new : index 2c690e5..168fd03 100644 : --- a/40-redhat.rules : +++ b/40-redhat.rules-new : @@ -6,6 +6,9 @@ SUBSYSTEM=="cpu", ACTION=="add", TEST=="online", ATTR{online}=="0", ATTR{online} : # Memory hotadd request : SUBSYSTEM!="memory", GOTO="memory_hotplug_end" : ACTION!="add", GOTO="memory_hotplug_end" : +# memory hotplug behavior configured : +PROGRAM=="grep online /sys/devices/system/memory/auto_online_blocks", GOTO="memory_hotplug_end" : + : PROGRAM="/bin/uname -p", RESULT=="s390*", GOTO="memory_hotplug_end" : : ENV{.state}="online" === [1] https://github.com/lnykryn/systemd-rhel/pull/281 [2] https://github.com/lnykryn/systemd-rhel/blob/staging/rules/40-redhat.rules This patch (of 8): The name is misleading and it's not really clear what is "kept". Let's just name it like the online_type name we expose to user space ("online"). Add some documentation to the types. 
Signed-off-by: David Hildenbrand <david@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Wei Yang <richard.weiyang@gmail.com> Reviewed-by: Baoquan He <bhe@redhat.com> Acked-by: Pankaj Gupta <pankaj.gupta.linux@gmail.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Michal Hocko <mhocko@kernel.org> Cc: Oscar Salvador <osalvador@suse.de> Cc: "Rafael J. Wysocki" <rafael@kernel.org> Cc: Wei Yang <richard.weiyang@gmail.com> Cc: Vitaly Kuznetsov <vkuznets@redhat.com> Cc: Yumei Huang <yuhuang@redhat.com> Cc: Igor Mammedov <imammedo@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Haiyang Zhang <haiyangz@microsoft.com> Cc: K. Y. Srinivasan <kys@microsoft.com> Cc: Michael Ellerman <mpe@ellerman.id.au> (powerpc) Cc: Paul Mackerras <paulus@samba.org> Cc: Stephen Hemminger <sthemmin@microsoft.com> Cc: Wei Liu <wei.liu@kernel.org> Link: http://lkml.kernel.org/r/20200319131221.14044-1-david@redhat.com Link: http://lkml.kernel.org/r/20200317104942.11178-2-david@redhat.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
366 lines
11 KiB
C
366 lines
11 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef __LINUX_MEMORY_HOTPLUG_H
|
|
#define __LINUX_MEMORY_HOTPLUG_H
|
|
|
|
#include <linux/mmzone.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/notifier.h>
|
|
#include <linux/bug.h>
|
|
|
|
struct page;
|
|
struct zone;
|
|
struct pglist_data;
|
|
struct mem_section;
|
|
struct memory_block;
|
|
struct resource;
|
|
struct vmem_altmap;
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
/*
 * pfn_to_online_page() - look up the page of a pfn, but only if it is online.
 *
 * Evaluates to the struct page of @pfn when the pfn's section number is below
 * NR_MEM_SECTIONS, that section is online and pfn_valid_within() accepts the
 * pfn; evaluates to NULL otherwise. @pfn is evaluated exactly once.
 *
 * All pfn walkers which rely on fully initialized page->flags (and other
 * fields) should use this instead of pfn_valid() && pfn_to_page().
 */
#define pfn_to_online_page(pfn)						\
({									\
	unsigned long ___pfn = pfn;					\
	unsigned long ___nr = pfn_to_section_nr(___pfn);		\
									\
	(___nr < NR_MEM_SECTIONS && online_section_nr(___nr) &&	\
	 pfn_valid_within(___pfn)) ? pfn_to_page(___pfn)		\
				   : (struct page *)NULL;		\
})
|
|
|
|
/*
 * Type tags for free bootmem, stored in page->lru.next. The values live in
 * an arbitrary range of the unsigned long space so that they stand out when
 * debugging.
 */
enum {
	MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE	= 12,	/* lowest valid tag */
	SECTION_INFO			= MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
	MIX_SECTION_INFO		= SECTION_INFO + 1,
	NODE_INFO			= MIX_SECTION_INFO + 1,
	MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE	= NODE_INFO,	/* highest valid tag */
};
|
|
|
|
/* Operation types that control the zone of onlined and offlined memory. */
enum {
	/* Take the memory offline. */
	MMOP_OFFLINE		= -1,
	/* Bring the memory online; the zone is chosen by default_zone_for_pfn(). */
	MMOP_ONLINE		= 0,
	/* Bring the memory online into ZONE_NORMAL. */
	MMOP_ONLINE_KERNEL	= 1,
	/* Bring the memory online into ZONE_MOVABLE. */
	MMOP_ONLINE_MOVABLE	= 2,
};
|
|
|
|
/**
 * struct mhp_restrictions - restrictions for the memory hotplug
 * @flags:  MHP_ flags
 * @altmap: alternative allocator for the memmap array
 */
struct mhp_restrictions {
	unsigned long flags;
	struct vmem_altmap *altmap;
};
|
|
|
|
/*
 * Zone resizing functions
 *
 * Note: any attempt to resize a zone should hold both pgdat_resize_lock()
 * and zone_span_writelock(). This ensures that the size of a zone can't be
 * changed while pgdat_resize_lock() is held.
 */
|
|
static inline unsigned zone_span_seqbegin(struct zone *zone)
|
|
{
|
|
return read_seqbegin(&zone->span_seqlock);
|
|
}
|
|
static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
|
|
{
|
|
return read_seqretry(&zone->span_seqlock, iv);
|
|
}
|
|
static inline void zone_span_writelock(struct zone *zone)
|
|
{
|
|
write_seqlock(&zone->span_seqlock);
|
|
}
|
|
static inline void zone_span_writeunlock(struct zone *zone)
|
|
{
|
|
write_sequnlock(&zone->span_seqlock);
|
|
}
|
|
static inline void zone_seqlock_init(struct zone *zone)
|
|
{
|
|
seqlock_init(&zone->span_seqlock);
|
|
}
|
|
int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
/*
 * NOTE(review): @pfn is a plain int here, unlike the unsigned long pfns used
 * by the other prototypes in this header - confirm against the definition.
 */
int add_one_highpage(struct page *page, int pfn, int bad_ppro);

/* VM interface that may be used by the firmware interface. */
int online_pages(unsigned long pfn, unsigned long nr_pages,
		 int online_type, int nid);
struct zone *test_pages_in_a_zone(unsigned long start_pfn,
				  unsigned long end_pfn);
unsigned long __offline_isolated_pages(unsigned long start_pfn,
				       unsigned long end_pfn);
|
|
|
|
/*
 * Function-pointer type accepted by set_online_page_callback() and
 * restore_online_page_callback(); generic_online_page() has this signature.
 */
typedef void (*online_page_callback_t)(struct page *page, unsigned int order);

void generic_online_page(struct page *page, unsigned int order);
int set_online_page_callback(online_page_callback_t callback);
int restore_online_page_callback(online_page_callback_t callback);
|
|
|
|
extern int try_online_node(int nid);
|
|
|
|
extern int arch_add_memory(int nid, u64 start, u64 size,
|
|
struct mhp_restrictions *restrictions);
|
|
extern u64 max_mem_size;
|
|
|
|
extern bool memhp_auto_online;
|
|
/* If movable_node boot option specified */
|
|
extern bool movable_node_enabled;
|
|
static inline bool movable_node_is_enabled(void)
|
|
{
|
|
return movable_node_enabled;
|
|
}
|
|
|
|
void arch_remove_memory(int nid, u64 start, u64 size,
			struct vmem_altmap *altmap);
void __remove_pages(unsigned long start_pfn, unsigned long nr_pages,
		    struct vmem_altmap *altmap);

/* Reasonably generic interface to expand the physical pages. */
int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
		struct mhp_restrictions *restrictions);
|
|
|
|
#ifndef CONFIG_ARCH_HAS_ADD_PAGES
/*
 * Default add_pages(): forwards every argument unchanged to __add_pages()
 * and returns its result.
 */
static inline int add_pages(int nid, unsigned long start_pfn,
			    unsigned long nr_pages,
			    struct mhp_restrictions *restrictions)
{
	const int ret = __add_pages(nid, start_pfn, nr_pages, restrictions);

	return ret;
}
#else /* ARCH_HAS_ADD_PAGES */
/* With CONFIG_ARCH_HAS_ADD_PAGES, add_pages() is only declared here. */
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
	      struct mhp_restrictions *restrictions);
#endif /* ARCH_HAS_ADD_PAGES */
|
|
|
|
#ifdef CONFIG_NUMA
|
|
extern int memory_add_physaddr_to_nid(u64 start);
|
|
#else
|
|
static inline int memory_add_physaddr_to_nid(u64 start)
|
|
{
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
|
|
/*
|
|
* For supporting node-hotadd, we have to allocate a new pgdat.
|
|
*
|
|
* If an arch has generic style NODE_DATA(),
|
|
* node_data[nid] = kzalloc() works well. But it depends on the architecture.
|
|
*
|
|
* In general, generic_alloc_nodedata() is used.
|
|
* Now, arch_free_nodedata() is just defined for error path of node_hot_add.
|
|
*
|
|
*/
|
|
extern pg_data_t *arch_alloc_nodedata(int nid);
|
|
extern void arch_free_nodedata(pg_data_t *pgdat);
|
|
extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
|
|
|
|
#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
|
|
|
|
/* Without an arch-specific extension, fall back to the generic helpers. */
#define arch_alloc_nodedata(nid)	generic_alloc_nodedata(nid)
#define arch_free_nodedata(pgdat)	generic_free_nodedata(pgdat)
|
|
|
|
#ifdef CONFIG_NUMA
|
|
/*
 * Used to allocate a pgdat when ARCH_HAS_NODEDATA_EXTENSION=n. Expands to a
 * zeroed GFP_KERNEL allocation of sizeof(pg_data_t); @nid is not used by the
 * expansion.
 *
 * XXX: kmalloc_node() can't work well to get the new node's memory at this
 * time, because the pgdat for the new node is itself not yet allocated and
 * initialized. Using the new node's memory will need more consideration.
 */
#define generic_alloc_nodedata(nid)	\
	(kzalloc(sizeof(pg_data_t), GFP_KERNEL))
|
|
/*
 * Only meant for the error path of node hot-add: simply kfree()s @pgdat.
 * Node hot-remove will need a replacement for this.
 */
#define generic_free_nodedata(pgdat)	kfree(pgdat)
|
|
|
|
extern pg_data_t *node_data[];

/* Record @pgdat as the node data of node @nid in the node_data[] table. */
static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
{
	node_data[nid] = pgdat;
}
|
|
|
|
#else /* !CONFIG_NUMA */
|
|
|
|
/* never called */
|
|
static inline pg_data_t *generic_alloc_nodedata(int nid)
|
|
{
|
|
BUG();
|
|
return NULL;
|
|
}
|
|
/* !CONFIG_NUMA variant: nothing to free. */
static inline void generic_free_nodedata(pg_data_t *pgdat) {}

/* !CONFIG_NUMA variant: nothing to refresh. */
static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) {}
|
|
#endif /* CONFIG_NUMA */
|
|
#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
|
|
|
|
#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
#else /* !CONFIG_HAVE_BOOTMEM_INFO_NODE */
/* Empty stub used when CONFIG_HAVE_BOOTMEM_INFO_NODE is not set. */
static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) {}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
|
|
extern void put_page_bootmem(struct page *page);
/*
 * @info: value associated with @page for the given @type.
 * @type: presumably one of the bootmem type constants of this header
 *        (SECTION_INFO, MIX_SECTION_INFO, NODE_INFO) - confirm against the
 *        definition.
 */
extern void get_page_bootmem(unsigned long info, struct page *page,
			     unsigned long type);
|
|
|
|
/* Paired begin/end style entry points; each pair is declared together. */
void get_online_mems(void);
void put_online_mems(void);

void mem_hotplug_begin(void);
void mem_hotplug_done(void);
|
|
|
|
#else /* ! CONFIG_MEMORY_HOTPLUG */
|
|
/*
 * pfn_to_online_page() without CONFIG_MEMORY_HOTPLUG: evaluates to the
 * struct page of @pfn when pfn_valid() accepts it, NULL otherwise.
 *
 * @pfn is captured in a local first so that the argument is evaluated exactly
 * once, matching the CONFIG_MEMORY_HOTPLUG variant; an argument with side
 * effects (e.g. pfn++) would otherwise be evaluated by both pfn_valid() and
 * pfn_to_page().
 */
#define pfn_to_online_page(pfn)			\
({						\
	struct page *___page = NULL;		\
	unsigned long ___pfn = (pfn);		\
						\
	if (pfn_valid(___pfn))			\
		___page = pfn_to_page(___pfn);	\
	___page;				\
})
|
|
|
|
/* Stub for !CONFIG_MEMORY_HOTPLUG: always reports sequence value 0. */
static inline unsigned zone_span_seqbegin(struct zone *zone)
{
	return 0U;
}
|
|
/* Stub for !CONFIG_MEMORY_HOTPLUG: always returns 0, whatever @iv is. */
static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
{
	const int retry = 0;

	return retry;
}
|
|
/* Stubs for !CONFIG_MEMORY_HOTPLUG: the span seqlock helpers do nothing. */
static inline void zone_span_writelock(struct zone *zone)
{
}

static inline void zone_span_writeunlock(struct zone *zone)
{
}

static inline void zone_seqlock_init(struct zone *zone)
{
}
|
|
|
|
static inline int mhp_notimplemented(const char *func)
|
|
{
|
|
printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func);
|
|
dump_stack();
|
|
return -ENOSYS;
|
|
}
|
|
|
|
/* Stub for !CONFIG_MEMORY_HOTPLUG: does nothing. */
static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) {}
|
|
|
|
/* Stub for !CONFIG_MEMORY_HOTPLUG: always returns 0 for any @nid. */
static inline int try_online_node(int nid)
{
	const int ret = 0;

	return ret;
}
|
|
|
|
/* Stubs for !CONFIG_MEMORY_HOTPLUG: all four helpers are no-ops. */
static inline void get_online_mems(void)
{
}

static inline void put_online_mems(void)
{
}

static inline void mem_hotplug_begin(void)
{
}

static inline void mem_hotplug_done(void)
{
}
|
|
|
|
/* Stub for !CONFIG_MEMORY_HOTPLUG: always reports false. */
static inline bool movable_node_is_enabled(void)
{
	const bool enabled = false;

	return enabled;
}
|
|
#endif /* ! CONFIG_MEMORY_HOTPLUG */
|
|
|
|
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
|
|
/*
|
|
* pgdat resizing functions
|
|
*/
|
|
static inline
|
|
void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
|
|
{
|
|
spin_lock_irqsave(&pgdat->node_size_lock, *flags);
|
|
}
|
|
static inline
|
|
void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
|
|
{
|
|
spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
|
|
}
|
|
static inline
|
|
void pgdat_resize_init(struct pglist_data *pgdat)
|
|
{
|
|
spin_lock_init(&pgdat->node_size_lock);
|
|
}
|
|
#else /* !(CONFIG_MEMORY_HOTPLUG || CONFIG_DEFERRED_STRUCT_PAGE_INIT) */
|
|
/*
 * No-op stubs, used when neither CONFIG_MEMORY_HOTPLUG nor
 * CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
 */
static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f)
{
}

static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f)
{
}

static inline void pgdat_resize_init(struct pglist_data *pgdat)
{
}
|
|
#endif /* !(CONFIG_MEMORY_HOTPLUG || CONFIG_DEFERRED_STRUCT_PAGE_INIT) */
|
|
|
|
#ifdef CONFIG_MEMORY_HOTREMOVE
|
|
|
|
extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
|
|
extern void try_offline_node(int nid);
|
|
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
|
|
extern int remove_memory(int nid, u64 start, u64 size);
|
|
extern void __remove_memory(int nid, u64 start, u64 size);
|
|
|
|
#else
|
|
/* Stub for !CONFIG_MEMORY_HOTREMOVE: no range is ever reported removable. */
static inline bool is_mem_section_removable(unsigned long pfn,
					    unsigned long nr_pages)
{
	const bool removable = false;

	return removable;
}
|
|
|
|
/* Stub for !CONFIG_MEMORY_HOTREMOVE: does nothing. */
static inline void try_offline_node(int nid)
{
}
|
|
|
|
/* Stub for !CONFIG_MEMORY_HOTREMOVE: offlining always fails with -EINVAL. */
static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	const int err = -EINVAL;

	return err;
}
|
|
|
|
/* Stub for !CONFIG_MEMORY_HOTREMOVE: removal always fails with -EBUSY. */
static inline int remove_memory(int nid, u64 start, u64 size)
{
	const int err = -EBUSY;

	return err;
}
|
|
|
|
/* Stub for !CONFIG_MEMORY_HOTREMOVE: does nothing. */
static inline void __remove_memory(int nid, u64 start, u64 size)
{
}
|
|
#endif /* CONFIG_MEMORY_HOTREMOVE */
|
|
|
|
extern void set_zone_contiguous(struct zone *zone);
|
|
extern void clear_zone_contiguous(struct zone *zone);
|
|
|
|
extern void __ref free_area_init_core_hotplug(int nid);
|
|
extern int __add_memory(int nid, u64 start, u64 size);
|
|
extern int add_memory(int nid, u64 start, u64 size);
|
|
extern int add_memory_resource(int nid, struct resource *resource);
|
|
extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
|
|
unsigned long nr_pages, struct vmem_altmap *altmap);
|
|
extern void remove_pfn_range_from_zone(struct zone *zone,
|
|
unsigned long start_pfn,
|
|
unsigned long nr_pages);
|
|
extern bool is_memblock_offlined(struct memory_block *mem);
|
|
extern int sparse_add_section(int nid, unsigned long pfn,
|
|
unsigned long nr_pages, struct vmem_altmap *altmap);
|
|
extern void sparse_remove_section(struct mem_section *ms,
|
|
unsigned long pfn, unsigned long nr_pages,
|
|
unsigned long map_offset, struct vmem_altmap *altmap);
|
|
extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
|
|
unsigned long pnum);
|
|
extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
|
|
int online_type);
|
|
extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
|
|
unsigned long nr_pages);
|
|
#endif /* __LINUX_MEMORY_HOTPLUG_H */
|