linux_dsm_epyc7002/include/linux/memblock.h
Naoya Horiguchi 907ec5fca3 mm: zero remaining unavailable struct pages
Patch series "mm: Fix for movable_node boot option", v3.

This patch series contains a fix for the movable_node boot option issue
which was introduced by commit 124049decb ("x86/e820: put !E820_TYPE_RAM
regions into memblock.reserved").

The commit breaks the option because it changed the memory gap range to
reserved memblock.  So, the node is marked as Normal zone even if the SRAT
has Hot pluggable affinity.

First and second patch fix the original issue which the commit tried to
fix, then revert the commit.

This patch (of 3):

There is a kernel panic that is triggered when reading /proc/kpageflags on
the kernel booted with kernel parameter 'memmap=nn[KMG]!ss[KMG]':

  BUG: unable to handle kernel paging request at fffffffffffffffe
  PGD 9b20e067 P4D 9b20e067 PUD 9b210067 PMD 0
  Oops: 0000 [#1] SMP PTI
  CPU: 2 PID: 1728 Comm: page-types Not tainted 4.17.0-rc6-mm1-v4.17-rc6-180605-0816-00236-g2dfb086ef02c+ #160
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.fc28 04/01/2014
  RIP: 0010:stable_page_flags+0x27/0x3c0
  Code: 00 00 00 0f 1f 44 00 00 48 85 ff 0f 84 a0 03 00 00 41 54 55 49 89 fc 53 48 8b 57 08 48 8b 2f 48 8d 42 ff 83 e2 01 48 0f 44 c7 <48> 8b 00 f6 c4 01 0f 84 10 03 00 00 31 db 49 8b 54 24 08 4c 89 e7
  RSP: 0018:ffffbbd44111fde0 EFLAGS: 00010202
  RAX: fffffffffffffffe RBX: 00007fffffffeff9 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: 0000000000000202 RDI: ffffed1182fff5c0
  RBP: ffffffffffffffff R08: 0000000000000001 R09: 0000000000000001
  R10: ffffbbd44111fed8 R11: 0000000000000000 R12: ffffed1182fff5c0
  R13: 00000000000bffd7 R14: 0000000002fff5c0 R15: ffffbbd44111ff10
  FS:  00007efc4335a500(0000) GS:ffff93a5bfc00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: fffffffffffffffe CR3: 00000000b2a58000 CR4: 00000000001406e0
  Call Trace:
   kpageflags_read+0xc7/0x120
   proc_reg_read+0x3c/0x60
   __vfs_read+0x36/0x170
   vfs_read+0x89/0x130
   ksys_pread64+0x71/0x90
   do_syscall_64+0x5b/0x160
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7efc42e75e23
  Code: 09 00 ba 9f 01 00 00 e8 ab 81 f4 ff 66 2e 0f 1f 84 00 00 00 00 00 90 83 3d 29 0a 2d 00 00 75 13 49 89 ca b8 11 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 db d3 01 00 48 89 04 24

According to kernel bisection, this problem became visible due to commit
f7f99100d8 which changes how struct pages are initialized.

Memblock layout affects the pfn ranges covered by node/zone.  Consider
that we have a VM with 2 NUMA nodes and each node has 4GB memory, and the
default (no memmap= given) memblock layout is like below:

  MEMBLOCK configuration:
   memory size = 0x00000001fff75c00 reserved size = 0x000000000300c000
   memory.cnt  = 0x4
   memory[0x0]     [0x0000000000001000-0x000000000009efff], 0x000000000009e000 bytes on node 0 flags: 0x0
   memory[0x1]     [0x0000000000100000-0x00000000bffd6fff], 0x00000000bfed7000 bytes on node 0 flags: 0x0
   memory[0x2]     [0x0000000100000000-0x000000013fffffff], 0x0000000040000000 bytes on node 0 flags: 0x0
   memory[0x3]     [0x0000000140000000-0x000000023fffffff], 0x0000000100000000 bytes on node 1 flags: 0x0
   ...

If you give memmap=1G!4G (so it just covers memory[0x2]),
the range [0x100000000-0x13fffffff] is gone:

  MEMBLOCK configuration:
   memory size = 0x00000001bff75c00 reserved size = 0x000000000300c000
   memory.cnt  = 0x3
   memory[0x0]     [0x0000000000001000-0x000000000009efff], 0x000000000009e000 bytes on node 0 flags: 0x0
   memory[0x1]     [0x0000000000100000-0x00000000bffd6fff], 0x00000000bfed7000 bytes on node 0 flags: 0x0
   memory[0x2]     [0x0000000140000000-0x000000023fffffff], 0x0000000100000000 bytes on node 1 flags: 0x0
   ...

This causes shrinking node 0's pfn range because it is calculated by the
address range of memblock.memory.  So some of struct pages in the gap
range are left uninitialized.

We have a function zero_resv_unavail() which does zeroing the struct pages
outside memblock.memory, but currently it covers only the reserved
unavailable range (i.e.  memblock.memory && !memblock.reserved).  This
patch extends it to cover all unavailable range, which fixes the reported
issue.

Link: http://lkml.kernel.org/r/20181002143821.5112-2-msys.mizuma@gmail.com
Fixes: f7f99100d8 ("mm: stop zeroing memory during allocation in vmemmap")
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
Tested-by: Oscar Salvador <osalvador@suse.de>
Tested-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
Reviewed-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-26 16:38:15 -07:00

453 lines
15 KiB
C

#ifndef _LINUX_MEMBLOCK_H
#define _LINUX_MEMBLOCK_H
#ifdef __KERNEL__
#ifdef CONFIG_HAVE_MEMBLOCK
/*
* Logical memory blocks.
*
* Copyright (C) 2001 Peter Bergner, IBM Corp.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/init.h>
#include <linux/mm.h>
#define INIT_MEMBLOCK_REGIONS 128
#define INIT_PHYSMEM_REGIONS 4
/**
* enum memblock_flags - definition of memory region attributes
* @MEMBLOCK_NONE: no special request
* @MEMBLOCK_HOTPLUG: hotpluggable region
* @MEMBLOCK_MIRROR: mirrored region
* @MEMBLOCK_NOMAP: don't add to kernel direct mapping
*/
enum memblock_flags {
MEMBLOCK_NONE = 0x0, /* No special request */
MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */
MEMBLOCK_MIRROR = 0x2, /* mirrored region */
MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */
};
/**
* struct memblock_region - represents a memory region
* @base: physical address of the region
* @size: size of the region
* @flags: memory region attributes
* @nid: NUMA node id
*/
struct memblock_region {
phys_addr_t base;
phys_addr_t size;
enum memblock_flags flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int nid;
#endif
};
/**
* struct memblock_type - collection of memory regions of certain type
* @cnt: number of regions
* @max: size of the allocated array
* @total_size: size of all regions
* @regions: array of regions
* @name: the memory type symbolic name
*/
struct memblock_type {
unsigned long cnt;
unsigned long max;
phys_addr_t total_size;
struct memblock_region *regions;
char *name;
};
/**
* struct memblock - memblock allocator metadata
* @bottom_up: is bottom up direction?
* @current_limit: physical address of the current allocation limit
* @memory: usabe memory regions
* @reserved: reserved memory regions
* @physmem: all physical memory
*/
struct memblock {
bool bottom_up; /* is bottom up direction? */
phys_addr_t current_limit;
struct memblock_type memory;
struct memblock_type reserved;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
struct memblock_type physmem;
#endif
};
extern struct memblock memblock;
extern int memblock_debug;
#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
#define __init_memblock __meminit
#define __initdata_memblock __meminitdata
void memblock_discard(void);
#else
#define __init_memblock
#define __initdata_memblock
#endif
#define memblock_dbg(fmt, ...) \
if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end,
int nid, enum memblock_flags flags);
phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align);
void memblock_allow_resize(void);
int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
int memblock_add(phys_addr_t base, phys_addr_t size);
int memblock_remove(phys_addr_t base, phys_addr_t size);
int memblock_free(phys_addr_t base, phys_addr_t size);
int memblock_reserve(phys_addr_t base, phys_addr_t size);
void memblock_trim_memory(phys_addr_t align);
bool memblock_overlaps_region(struct memblock_type *type,
phys_addr_t base, phys_addr_t size);
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
enum memblock_flags choose_memblock_flags(void);
/* Low level functions */
int memblock_add_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size,
int nid, enum memblock_flags flags);
void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
struct memblock_type *type_a,
struct memblock_type *type_b, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);
void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags,
struct memblock_type *type_a,
struct memblock_type *type_b, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);
void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
phys_addr_t *out_end);
void __memblock_free_early(phys_addr_t base, phys_addr_t size);
void __memblock_free_late(phys_addr_t base, phys_addr_t size);
/**
* for_each_mem_range - iterate through memblock areas from type_a and not
* included in type_b. Or just type_a if type_b is NULL.
* @i: u64 used as loop variable
* @type_a: ptr to memblock_type to iterate
* @type_b: ptr to memblock_type which excludes from the iteration
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*/
#define for_each_mem_range(i, type_a, type_b, nid, flags, \
p_start, p_end, p_nid) \
for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
__next_mem_range(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid))
/**
* for_each_mem_range_rev - reverse iterate through memblock areas from
* type_a and not included in type_b. Or just type_a if type_b is NULL.
* @i: u64 used as loop variable
* @type_a: ptr to memblock_type to iterate
* @type_b: ptr to memblock_type which excludes from the iteration
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*/
#define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \
p_start, p_end, p_nid) \
for (i = (u64)ULLONG_MAX, \
__next_mem_range_rev(&i, nid, flags, type_a, type_b,\
p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
__next_mem_range_rev(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid))
/**
* for_each_reserved_mem_region - iterate over all reserved memblock areas
* @i: u64 used as loop variable
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
*
* Walks over reserved areas of memblock. Available as soon as memblock
* is initialized.
*/
#define for_each_reserved_mem_region(i, p_start, p_end) \
for (i = 0UL, __next_reserved_mem_region(&i, p_start, p_end); \
i != (u64)ULLONG_MAX; \
__next_reserved_mem_region(&i, p_start, p_end))
static inline bool memblock_is_hotpluggable(struct memblock_region *m)
{
return m->flags & MEMBLOCK_HOTPLUG;
}
static inline bool memblock_is_mirror(struct memblock_region *m)
{
return m->flags & MEMBLOCK_MIRROR;
}
static inline bool memblock_is_nomap(struct memblock_region *m)
{
return m->flags & MEMBLOCK_NOMAP;
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
unsigned long *end_pfn);
void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
unsigned long *out_end_pfn, int *out_nid);
/**
* for_each_mem_pfn_range - early memory pfn range iterator
* @i: an integer used as loop variable
* @nid: node selector, %MAX_NUMNODES for all nodes
* @p_start: ptr to ulong for start pfn of the range, can be %NULL
* @p_end: ptr to ulong for end pfn of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*
* Walks over configured memory ranges.
*/
#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \
for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
/**
* for_each_free_mem_range - iterate through free memblock areas
* @i: u64 used as loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*
* Walks over free (memory && !reserved) areas of memblock. Available as
* soon as memblock is initialized.
*/
#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \
for_each_mem_range(i, &memblock.memory, &memblock.reserved, \
nid, flags, p_start, p_end, p_nid)
/**
* for_each_free_mem_range_reverse - rev-iterate through free memblock areas
* @i: u64 used as loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*
* Walks over free (memory && !reserved) areas of memblock in reverse
* order. Available as soon as memblock is initialized.
*/
#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \
p_nid) \
for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
nid, flags, p_start, p_end, p_nid)
static inline void memblock_set_region_flags(struct memblock_region *r,
enum memblock_flags flags)
{
r->flags |= flags;
}
static inline void memblock_clear_region_flags(struct memblock_region *r,
enum memblock_flags flags)
{
r->flags &= ~flags;
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_set_node(phys_addr_t base, phys_addr_t size,
struct memblock_type *type, int nid);
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
r->nid = nid;
}
static inline int memblock_get_region_node(const struct memblock_region *r)
{
return r->nid;
}
#else
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
}
static inline int memblock_get_region_node(const struct memblock_region *r)
{
return 0;
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid);
phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align);
/*
* Set the allocation direction to bottom-up or top-down.
*/
static inline void __init memblock_set_bottom_up(bool enable)
{
memblock.bottom_up = enable;
}
/*
* Check if the allocation direction is bottom-up or not.
* if this is true, that said, memblock will allocate memory
* in bottom-up direction.
*/
static inline bool memblock_bottom_up(void)
{
return memblock.bottom_up;
}
/* Flags for memblock_alloc_base() amd __memblock_alloc_base() */
#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)
#define MEMBLOCK_ALLOC_ACCESSIBLE 0
phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end,
enum memblock_flags flags);
phys_addr_t memblock_alloc_base_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t max_addr,
int nid, enum memblock_flags flags);
phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
phys_addr_t max_addr);
phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
phys_addr_t max_addr);
phys_addr_t memblock_phys_mem_size(void);
phys_addr_t memblock_reserved_size(void);
phys_addr_t memblock_mem_size(unsigned long limit_pfn);
phys_addr_t memblock_start_of_DRAM(void);
phys_addr_t memblock_end_of_DRAM(void);
void memblock_enforce_memory_limit(phys_addr_t memory_limit);
void memblock_cap_memory_range(phys_addr_t base, phys_addr_t size);
void memblock_mem_limit_remove_map(phys_addr_t limit);
bool memblock_is_memory(phys_addr_t addr);
bool memblock_is_map_memory(phys_addr_t addr);
bool memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
bool memblock_is_reserved(phys_addr_t addr);
bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
extern void __memblock_dump_all(void);
static inline void memblock_dump_all(void)
{
if (memblock_debug)
__memblock_dump_all();
}
/**
* memblock_set_current_limit - Set the current allocation limit to allow
* limiting allocations to what is currently
* accessible during boot
* @limit: New limit value (physical address)
*/
void memblock_set_current_limit(phys_addr_t limit);
phys_addr_t memblock_get_current_limit(void);
/*
* pfn conversion functions
*
* While the memory MEMBLOCKs should always be page aligned, the reserved
* MEMBLOCKs may not be. This accessor attempt to provide a very clear
* idea of what they return for such non aligned MEMBLOCKs.
*/
/**
* memblock_region_memory_base_pfn - get the lowest pfn of the memory region
* @reg: memblock_region structure
*
* Return: the lowest pfn intersecting with the memory region
*/
static inline unsigned long memblock_region_memory_base_pfn(const struct memblock_region *reg)
{
return PFN_UP(reg->base);
}
/**
* memblock_region_memory_end_pfn - get the end pfn of the memory region
* @reg: memblock_region structure
*
* Return: the end_pfn of the reserved region
*/
static inline unsigned long memblock_region_memory_end_pfn(const struct memblock_region *reg)
{
return PFN_DOWN(reg->base + reg->size);
}
/**
* memblock_region_reserved_base_pfn - get the lowest pfn of the reserved region
* @reg: memblock_region structure
*
* Return: the lowest pfn intersecting with the reserved region
*/
static inline unsigned long memblock_region_reserved_base_pfn(const struct memblock_region *reg)
{
return PFN_DOWN(reg->base);
}
/**
* memblock_region_reserved_end_pfn - get the end pfn of the reserved region
* @reg: memblock_region structure
*
* Return: the end_pfn of the reserved region
*/
static inline unsigned long memblock_region_reserved_end_pfn(const struct memblock_region *reg)
{
return PFN_UP(reg->base + reg->size);
}
#define for_each_memblock(memblock_type, region) \
for (region = memblock.memblock_type.regions; \
region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \
region++)
#define for_each_memblock_type(i, memblock_type, rgn) \
for (i = 0, rgn = &memblock_type->regions[0]; \
i < memblock_type->cnt; \
i++, rgn = &memblock_type->regions[i])
#ifdef CONFIG_MEMTEST
extern void early_memtest(phys_addr_t start, phys_addr_t end);
#else
static inline void early_memtest(phys_addr_t start, phys_addr_t end)
{
}
#endif
#else
static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
{
return 0;
}
#endif /* CONFIG_HAVE_MEMBLOCK */
#endif /* __KERNEL__ */
#endif /* _LINUX_MEMBLOCK_H */