Mirror of https://github.com/AuxXxilium/linux_dsm_epyc7002.git (synced 2025-02-12 08:25:07 +07:00)
Merge branch 'akpm' (patches from Andrew)
Merge misc VM fixes from Andrew Morton:
 "A bunch of hotfixes, all affecting mm/.

  The two-patch series from Andrea may be controversial. This restores
  patches which were reverted in Dec 2018 due to a regression report [*].

  After extensive discussion it is evident that the problems which these
  patches solved were significantly more serious than the problems they
  introduced. I am told that major distros are already carrying these two
  patches for this reason"

[*] See

      https://lore.kernel.org/lkml/alpine.DEB.2.21.1812061343240.144733@chino.kir.corp.google.com/
      https://lore.kernel.org/lkml/alpine.DEB.2.21.1812031545560.161134@chino.kir.corp.google.com/

    for the google-specific issues brought up by David Rientjes. And as
    Andrew says:

     "I'm unaware of anyone else who will be adversely affected by this,
      and google already carries over a thousand kernel patches - another
      won't kill them.

      There has been sporadic discussion about fixing these things for
      real but it's clear that nobody apart from David is particularly
      motivated"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  hugetlbfs: fix hugetlb page migration/fault race causing SIGBUS
  mm, vmscan: do not special-case slab reclaim when watermarks are boosted
  Revert "mm, thp: restore node-local hugepage allocations"
  Revert "Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask""
  include/asm-generic/5level-fixup.h: fix variable 'p4d' set but not used
  seq_file: fix problem when seeking mid-record
  mm: workingset: fix vmstat counters for shadow nodes
  mm/usercopy: use memory range to be accessed for wraparound check
  mm: kmemleak: disable early logging in case of error
  mm/vmalloc.c: fix percpu free VM area search criteria
  mm/memcontrol.c: fix use after free in mem_cgroup_iter()
  mm/z3fold.c: fix z3fold_destroy_pool() race condition
  mm/z3fold.c: fix z3fold_destroy_pool() ordering
  mm: mempolicy: handle vma with unmovable pages mapped correctly in mbind
  mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified
  mm/hmm: fix bad subpage pointer in try_to_unmap_one
  mm/hmm: fix ZONE_DEVICE anon page mapping reuse
  mm: document zone device struct page field usage
commit cab6d5b66b
@@ -119,6 +119,7 @@ static int traverse(struct seq_file *m, loff_t offset)
		}
		if (seq_has_overflowed(m))
			goto Eoverflow;
		p = m->op->next(m, p, &m->index);
		if (pos + m->count > offset) {
			m->from = offset - pos;
			m->count -= m->from;
@@ -126,7 +127,6 @@ static int traverse(struct seq_file *m, loff_t offset)
		}
		pos += m->count;
		m->count = 0;
		p = m->op->next(m, p, &m->index);
		if (pos == offset)
			break;
	}
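For context, traverse() drives the standard seq_file iterator callbacks; as far as can be read from the hunk above, the fix moves the ->next() call so the iterator is advanced before deciding whether the seek offset lands inside the record that was just formatted. A minimal, hedged sketch of such an iterator using the stock seq_file API (all demo_* names are invented for the example, not part of the patch):

#include <linux/seq_file.h>

/* Pretend we have four fixed records, identified by their index. */
static void *demo_start(struct seq_file *m, loff_t *pos)
{
	return (*pos < 4) ? pos : NULL;	/* token for record *pos, NULL past the end */
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;				/* advance the iterator */
	return (*pos < 4) ? pos : NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "record %lld\n", *(loff_t *)v);
	return 0;
}

static const struct seq_operations demo_seq_ops = {
	.start	= demo_start,
	.next	= demo_next,
	.stop	= demo_stop,
	.show	= demo_show,
};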
@@ -19,9 +19,24 @@
#define p4d_alloc(mm, pgd, address) (pgd)
#define p4d_offset(pgd, start) (pgd)
#define p4d_none(p4d) 0
#define p4d_bad(p4d) 0
#define p4d_present(p4d) 1

#ifndef __ASSEMBLY__
static inline int p4d_none(p4d_t p4d)
{
	return 0;
}

static inline int p4d_bad(p4d_t p4d)
{
	return 0;
}

static inline int p4d_present(p4d_t p4d)
{
	return 1;
}
#endif

#define p4d_ERROR(p4d) do { } while (0)
#define p4d_clear(p4d) pgd_clear(p4d)
#define p4d_val(p4d) pgd_val(p4d)
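For background, the "variable 'p4d' set but not used" warning comes from macros that never evaluate their argument; a static inline makes the argument a real use. A hedged, stand-alone illustration (names are invented, this is not kernel code):

/* A macro that expands to a constant: the argument vanishes entirely. */
#define p4d_none_macro(p4d)	0

/* A static inline: the argument is consumed by the call expression. */
static inline int p4d_none_inline(int p4d)
{
	return 0;
}

int with_macro(int *pgd)
{
	int p4d = *pgd;		/* gcc -Wall: "variable 'p4d' set but not used" */

	return p4d_none_macro(p4d);
}

int with_inline(int *pgd)
{
	int p4d = *pgd;		/* no warning: p4d is passed to the inline */

	return p4d_none_inline(p4d);
}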
@@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
}
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
			struct vm_area_struct *vma, unsigned long addr,
			int node, bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
			int node);
#else
#define alloc_pages(gfp_mask, order) \
		alloc_pages_node(numa_node_id(), gfp_mask, order)
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
	alloc_pages(gfp_mask, order)
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\
	alloc_pages(gfp_mask, order)
#endif
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_page_vma(gfp_mask, vma, addr) \
	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
	alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
	alloc_pages_vma(gfp_mask, 0, vma, addr, node)

extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
@@ -668,6 +668,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,

void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val);
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val);

static inline void mod_lruvec_state(struct lruvec *lruvec,
				    enum node_stat_item idx, int val)
@@ -1072,6 +1073,14 @@ static inline void mod_lruvec_page_state(struct page *page,
	mod_node_page_state(page_pgdat(page), idx, val);
}

static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx,
					   int val)
{
	struct page *page = virt_to_head_page(p);

	__mod_node_page_state(page_pgdat(page), idx, val);
}

static inline
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
					    gfp_t gfp_mask,
@@ -1159,6 +1168,16 @@ static inline void __dec_lruvec_page_state(struct page *page,
	__mod_lruvec_page_state(page, idx, -1);
}

static inline void __inc_lruvec_slab_state(void *p, enum node_stat_item idx)
{
	__mod_lruvec_slab_state(p, idx, 1);
}

static inline void __dec_lruvec_slab_state(void *p, enum node_stat_item idx)
{
	__mod_lruvec_slab_state(p, idx, -1);
}

/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void inc_memcg_state(struct mem_cgroup *memcg,
				   int idx)
@@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,

struct mempolicy *get_task_policy(struct task_struct *p);
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
		unsigned long addr);
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
		unsigned long addr);
bool vma_policy_mof(struct vm_area_struct *vma);

extern void numa_default_policy(void);
@@ -159,7 +159,16 @@ struct page {
			/** @pgmap: Points to the hosting device page map. */
			struct dev_pagemap *pgmap;
			void *zone_device_data;
			unsigned long _zd_pad_1;	/* uses mapping */
			/*
			 * ZONE_DEVICE private pages are counted as being
			 * mapped so the next 3 words hold the mapping, index,
			 * and private fields from the source anonymous or
			 * page cache page while the page is migrated to device
			 * private memory.
			 * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
			 * use the mapping, index, and private fields when
			 * pmem backed DAX files are mapped.
			 */
		};

		/** @rcu_head: You can use this to free a page by RCU. */
@@ -644,30 +644,40 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 *	    available
 * never: never stall for any thp allocation
 */
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
{
	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
	gfp_t this_node = 0;

#ifdef CONFIG_NUMA
	struct mempolicy *pol;
	/*
	 * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
	 * specified, to express a general desire to stay on the current
	 * node for optimistic allocation attempts. If the defrag mode
	 * and/or madvise hint requires the direct reclaim then we prefer
	 * to fallback to other node rather than node reclaim because that
	 * can lead to excessive reclaim even though there is free memory
	 * on other nodes. We expect that NUMA preferences are specified
	 * by memory policies.
	 */
	pol = get_vma_policy(vma, addr);
	if (pol->mode != MPOL_BIND)
		this_node = __GFP_THISNODE;
	mpol_cond_put(pol);
#endif

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
						     __GFP_KSWAPD_RECLAIM | this_node);
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
	return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
						     this_node);
	return GFP_TRANSHUGE_LIGHT | this_node;
}

/* Caller must hold page table lock. */
@@ -739,8 +749,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
		pte_free(vma->vm_mm, pgtable);
		return ret;
	}
	gfp = alloc_hugepage_direct_gfpmask(vma);
	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
	gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
@@ -1347,8 +1357,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
alloc:
	if (__transparent_hugepage_enabled(vma) &&
	    !transparent_hugepage_debug_cow()) {
		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
		huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
		new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
					   haddr, numa_node_id());
	} else
		new_page = NULL;
mm/hugetlb.c
@@ -3856,6 +3856,25 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,

		page = alloc_huge_page(vma, haddr, 0);
		if (IS_ERR(page)) {
			/*
			 * Returning error will result in faulting task being
			 * sent SIGBUS. The hugetlb fault mutex prevents two
			 * tasks from racing to fault in the same page which
			 * could result in false unable to allocate errors.
			 * Page migration does not take the fault mutex, but
			 * does a clear then write of pte's under page table
			 * lock. Page fault code could race with migration,
			 * notice the clear pte and try to allocate a page
			 * here. Before returning error, get ptl and make
			 * sure there really is no pte entry.
			 */
			ptl = huge_pte_lock(h, mm, ptep);
			if (!huge_pte_none(huge_ptep_get(ptep))) {
				ret = 0;
				spin_unlock(ptl);
				goto out;
			}
			spin_unlock(ptl);
			ret = vmf_error(PTR_ERR(page));
			goto out;
		}
@@ -1966,6 +1966,7 @@ static void kmemleak_disable(void)

	/* stop any memory operation tracing */
	kmemleak_enabled = 0;
	kmemleak_early_log = 0;

	/* check whether it is too early for a kernel thread */
	if (kmemleak_initialized)
@@ -2009,7 +2010,6 @@ void __init kmemleak_init(void)

#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
	if (!kmemleak_skip_disable) {
		kmemleak_early_log = 0;
		kmemleak_disable();
		return;
	}
@@ -768,6 +768,26 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}

void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
	struct page *page = virt_to_head_page(p);
	pg_data_t *pgdat = page_pgdat(page);
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = memcg_from_slab_page(page);

	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!memcg || memcg == root_mem_cgroup) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(pgdat, memcg);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
@@ -1130,26 +1150,45 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
		css_put(&prev->css);
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
					   struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;
	int i;

	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		for_each_node(nid) {
			mz = mem_cgroup_nodeinfo(memcg, nid);
			for (i = 0; i <= DEF_PRIORITY; i++) {
				iter = &mz->iter[i];
				cmpxchg(&iter->position,
					dead_memcg, NULL);
			}
	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(from, nid);
		for (i = 0; i <= DEF_PRIORITY; i++) {
			iter = &mz->iter[i];
			cmpxchg(&iter->position,
				dead_memcg, NULL);
		}
	}
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup *last;

	do {
		__invalidate_reclaim_iterators(memcg, dead_memcg);
		last = memcg;
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgruop1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from cgroup root separately.
	 */
	if (last != root_mem_cgroup)
		__invalidate_reclaim_iterators(root_mem_cgroup,
						dead_memcg);
}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
mm/mempolicy.c
@@ -403,7 +403,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	},
};

static void migrate_page_add(struct page *page, struct list_head *pagelist,
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

struct queue_pages {
@@ -429,11 +429,14 @@ static inline bool queue_pages_required(struct page *page,
}

/*
 * queue_pages_pmd() has three possible return values:
 * 1 - pages are placed on the right node or queued successfully.
 * 0 - THP was split.
 * -EIO - is migration entry or MPOL_MF_STRICT was specified and an existing
 *        page was already on a node that does not follow the policy.
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 *        existing page was already on a node that does not follow the
 *        policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
@@ -451,23 +454,20 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
	if (is_huge_zero_page(page)) {
		spin_unlock(ptl);
		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
		ret = 2;
		goto out;
	}
	if (!queue_pages_required(page, qp)) {
		ret = 1;
	if (!queue_pages_required(page, qp))
		goto unlock;
	}

	ret = 1;
	flags = qp->flags;
	/* go to thp migration */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma)) {
			ret = -EIO;
		if (!vma_migratable(walk->vma) ||
		    migrate_page_add(page, qp->pagelist, flags)) {
			ret = 1;
			goto unlock;
		}

		migrate_page_add(page, qp->pagelist, flags);
	} else
		ret = -EIO;
unlock:
@@ -479,6 +479,13 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
@@ -488,17 +495,17 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	int ret;
	bool has_unmovable = false;
	pte_t *pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
		if (ret > 0)
			return 0;
		else if (ret < 0)
		if (ret != 2)
			return ret;
	}
	/* THP was split, fall through to pte walk */

	if (pmd_trans_unstable(pmd))
		return 0;
@@ -519,14 +526,28 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
		if (!queue_pages_required(page, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			if (!vma_migratable(vma))
			/* MPOL_MF_STRICT must be specified if we get here */
			if (!vma_migratable(vma)) {
				has_unmovable = true;
				break;
			migrate_page_add(page, qp->pagelist, flags);
			}

			/*
			 * Do not abort immediately since there may be
			 * temporary off LRU pages in the range. Still
			 * need migrate other LRU pages.
			 */
			if (migrate_page_add(page, qp->pagelist, flags))
				has_unmovable = true;
		} else
			break;
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (has_unmovable)
		return 1;

	return addr != end ? -EIO : 0;
}

@@ -639,7 +660,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags,) it's isolated and queued to the pagelist which is
 * passed via @private.)
 * passed via @private.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * -EIO - there is misplaced page and only MPOL_MF_STRICT was specified.
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
@@ -940,7 +967,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
/*
 * page migration, thp tail pages can be passed.
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	struct page *head = compound_head(page);
@@ -953,8 +980,19 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
			mod_node_page_state(page_pgdat(head),
				NR_ISOLATED_ANON + page_is_file_cache(head),
				hpage_nr_pages(head));
		} else if (flags & MPOL_MF_STRICT) {
			/*
			 * Non-movable page may reach here. And, there may be
			 * temporary off LRU pages or non-LRU movable pages.
			 * Treat them as unmovable pages since they can't be
			 * isolated, so they can't be moved at the moment. It
			 * should return -EIO for this case too.
			 */
			return -EIO;
		}
	}

	return 0;
}

/* page allocation callback for NUMA node migration */
@@ -1142,8 +1180,8 @@ static struct page *new_page(struct page *page, unsigned long start)
	} else if (PageTransHuge(page)) {
		struct page *thp;

		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
					 HPAGE_PMD_ORDER);
		thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
				address, numa_node_id());
		if (!thp)
			return NULL;
		prep_transhuge_page(thp);
@@ -1157,9 +1195,10 @@ static struct page *new_page(struct page *page, unsigned long start)
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	return -EIO;
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
@@ -1182,6 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len,
	struct mempolicy *new;
	unsigned long end;
	int err;
	int ret;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
@@ -1243,10 +1283,15 @@ static long do_mbind(unsigned long start, unsigned long len,
	if (err)
		goto mpol_out;

	err = queue_pages_range(mm, start, end, nmask,
	ret = queue_pages_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);
	if (!err)
		err = mbind_range(mm, start, end, new);

	if (ret < 0) {
		err = -EIO;
		goto up_out;
	}

	err = mbind_range(mm, start, end, new);

	if (!err) {
		int nr_failed = 0;
@@ -1259,13 +1304,14 @@ static long do_mbind(unsigned long start, unsigned long len,
			putback_movable_pages(&pagelist);
	}

	if (nr_failed && (flags & MPOL_MF_STRICT))
	if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
		err = -EIO;
	} else
		putback_movable_pages(&pagelist);

up_out:
	up_write(&mm->mmap_sem);
mpol_out:
mpol_out:
	mpol_put(new);
	return err;
}
@@ -1688,7 +1734,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
 * freeing by another task. It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct mempolicy *pol = __get_vma_policy(vma, addr);
@@ -2037,7 +2083,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 * @node: Which node to prefer for allocation (modulo policy).
 * @hugepage: for hugepages try only the preferred node if possible
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
@@ -2048,7 +2093,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 */
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
		unsigned long addr, int node, bool hugepage)
		unsigned long addr, int node)
{
	struct mempolicy *pol;
	struct page *page;
@@ -2066,31 +2111,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
		goto out;
	}

	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
		int hpage_node = node;

		/*
		 * For hugepage allocation and non-interleave policy which
		 * allows the current node (or other explicitly preferred
		 * node) we only try to allocate from the current/preferred
		 * node and don't fall back to other nodes, as the cost of
		 * remote accesses would likely offset THP benefits.
		 *
		 * If the policy is interleave, or does not allow the current
		 * node in its nodemask, we allocate the standard way.
		 */
		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
			hpage_node = pol->v.preferred_node;

		nmask = policy_nodemask(gfp, pol);
		if (!nmask || node_isset(hpage_node, *nmask)) {
			mpol_cond_put(pol);
			page = __alloc_pages_node(hpage_node,
						gfp | __GFP_THISNODE, order);
			goto out;
		}
	}

	nmask = policy_nodemask(gfp, pol);
	preferred_nid = policy_node(gfp, pol, node);
	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
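The user-visible effect of the queue_pages_*() return-value rework above is that mbind(2) with MPOL_MF_MOVE* plus MPOL_MF_STRICT now consistently fails with EIO when existing pages in the range cannot be moved. A hedged userspace sketch (node number and mapping size are arbitrary choices; build with -lnuma):

#include <numaif.h>
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	size_t len = 2UL << 20;
	unsigned long nodemask = 1UL << 0;	/* bind to node 0 */
	void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (addr == MAP_FAILED)
		return 1;
	memset(addr, 0, len);			/* fault the pages in */

	/* With MOVE|STRICT, errno is EIO if some pages could not be moved. */
	if (mbind(addr, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
		perror("mbind");

	munmap(addr, len);
	return 0;
}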
@@ -403,6 +403,30 @@ void __put_devmap_managed_page(struct page *page)

		mem_cgroup_uncharge(page);

		/*
		 * When a device_private page is freed, the page->mapping field
		 * may still contain a (stale) mapping value. For example, the
		 * lower bits of page->mapping may still identify the page as
		 * an anonymous page. Ultimately, this entire field is just
		 * stale and wrong, and it will cause errors if not cleared.
		 * One example is:
		 *
		 *  migrate_vma_pages()
		 *    migrate_vma_insert_page()
		 *      page_add_new_anon_rmap()
		 *        __page_set_anon_rmap()
		 *          ...checks page->mapping, via PageAnon(page) call,
		 *            and incorrectly concludes that the page is an
		 *            anonymous page. Therefore, it incorrectly,
		 *            silently fails to set up the new anon rmap.
		 *
		 * For other types of ZONE_DEVICE pages, migration is either
		 * handled differently or not done at all, so there is no need
		 * to clear page->mapping.
		 */
		if (is_device_private_page(page))
			page->mapping = NULL;

		page->pgmap->ops->page_free(page);
	} else if (!count)
		__put_page(page);
@@ -1475,7 +1475,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
			/*
			 * No need to invalidate here it will synchronize on
			 * against the special swap migration pte.
			 *
			 * The assignment to subpage above was computed from a
			 * swap PTE which results in an invalid pointer.
			 * Since only PAGE_SIZE pages can currently be
			 * migrated, just set it to page. This will need to be
			 * changed when hugepage migrations to device private
			 * memory are supported.
			 */
			subpage = page;
			goto discard;
		}
@@ -1466,7 +1466,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,

	shmem_pseudo_vma_init(&pvma, info, hindex);
	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
	shmem_pseudo_vma_destroy(&pvma);
	if (page)
		prep_transhuge_page(page);
@@ -147,7 +147,7 @@ static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
				       bool to_user)
{
	/* Reject if object wraps past end of memory. */
	if (ptr + n < ptr)
	if (ptr + (n - 1) < ptr)
		usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n);

	/* Reject if NULL or ZERO-allocation. */
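A hedged, stand-alone illustration of the boundary case this hunk addresses (userspace C, not the kernel code): for an object that ends exactly at the top of the address space, ptr + n wraps to 0 and the old check rejects an access that is actually in range, while ptr + (n - 1) only trips on a genuine wraparound.

#include <stdio.h>

int main(void)
{
	unsigned long ptr = ~0UL - 15;	/* object occupying the last 16 bytes */
	unsigned long n = 16;		/* last byte accessed is ptr + 15 == ~0UL */

	printf("old check rejects: %d\n", ptr + n < ptr);	/* 1: false positive */
	printf("new check rejects: %d\n", ptr + (n - 1) < ptr);	/* 0: accepted */
	return 0;
}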
mm/vmalloc.c
@@ -3278,10 +3278,20 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
		if (va == NULL)
			goto overflow;

		/*
		 * If required width exeeds current VA block, move
		 * base downwards and then recheck.
		 */
		if (base + end > va->va_end) {
			base = pvm_determine_end_from_reverse(&va, align) - end;
			term_area = area;
			continue;
		}

		/*
		 * If this VA does not fit, move base downwards and recheck.
		 */
		if (base + start < va->va_start || base + end > va->va_end) {
		if (base + start < va->va_start) {
			va = node_to_va(rb_prev(&va->rb_node));
			base = pvm_determine_end_from_reverse(&va, align) - end;
			term_area = area;
mm/vmscan.c
@@ -88,9 +88,6 @@ struct scan_control {
	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/* e.g. boosted watermark reclaim leaves slabs alone */
	unsigned int may_shrinkslab:1;

	/*
	 * Cgroups are not reclaimed below their configured memory.low,
	 * unless we threaten to OOM. If any cgroups are skipped due to
@@ -2714,10 +2711,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
			node_lru_pages += lru_pages;

			if (sc->may_shrinkslab) {
				shrink_slab(sc->gfp_mask, pgdat->node_id,
					    memcg, sc->priority);
			}
			shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
					sc->priority);

			/* Record the group's reclaim efficiency */
			vmpressure(sc->gfp_mask, memcg, false,
@@ -3194,7 +3189,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = 1,
		.may_shrinkslab = 1,
	};

	/*
@@ -3238,7 +3232,6 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
		.may_unmap = 1,
		.reclaim_idx = MAX_NR_ZONES - 1,
		.may_swap = !noswap,
		.may_shrinkslab = 1,
	};
	unsigned long lru_pages;

@@ -3286,7 +3279,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = may_swap,
		.may_shrinkslab = 1,
	};

	set_task_reclaim_state(current, &sc.reclaim_state);
@@ -3598,7 +3590,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
		 */
		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
		sc.may_swap = !nr_boost_reclaim;
		sc.may_shrinkslab = !nr_boost_reclaim;

		/*
		 * Do some background aging of the anon list, to give
@@ -380,14 +380,12 @@ void workingset_update_node(struct xa_node *node)
	if (node->count && node->count == node->nr_values) {
		if (list_empty(&node->private_list)) {
			list_lru_add(&shadow_nodes, &node->private_list);
			__inc_lruvec_page_state(virt_to_page(node),
						WORKINGSET_NODES);
			__inc_lruvec_slab_state(node, WORKINGSET_NODES);
		}
	} else {
		if (!list_empty(&node->private_list)) {
			list_lru_del(&shadow_nodes, &node->private_list);
			__dec_lruvec_page_state(virt_to_page(node),
						WORKINGSET_NODES);
			__dec_lruvec_slab_state(node, WORKINGSET_NODES);
		}
	}
}
@@ -480,7 +478,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
	}

	list_lru_isolate(lru, item);
	__dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES);
	__dec_lruvec_slab_state(node, WORKINGSET_NODES);

	spin_unlock(lru_lock);

@@ -503,7 +501,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
	 * shadow entries we were tracking ...
	 */
	xas_store(&xas, NULL);
	__inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
	__inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM);

out_invalid:
	xa_unlock_irq(&mapping->i_pages);
mm/z3fold.c
@@ -817,9 +817,19 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
	kmem_cache_destroy(pool->c_handle);
	z3fold_unregister_migration(pool);
	destroy_workqueue(pool->release_wq);

	/*
	 * We need to destroy pool->compact_wq before pool->release_wq,
	 * as any pending work on pool->compact_wq will call
	 * queue_work(pool->release_wq, &pool->work).
	 *
	 * There are still outstanding pages until both workqueues are drained,
	 * so we cannot unregister migration until then.
	 */

	destroy_workqueue(pool->compact_wq);
	destroy_workqueue(pool->release_wq);
	z3fold_unregister_migration(pool);
	kfree(pool);
}
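As a side note, the ordering rule in this hunk generalizes: a workqueue whose work items queue further work onto a second workqueue must be drained first, otherwise destroy_workqueue() on the second one can race with late queue_work() calls. A hedged sketch of that pattern with made-up names (not the z3fold code):

#include <linux/workqueue.h>
#include <linux/slab.h>

struct demo_pool {
	struct workqueue_struct *compact_wq;	/* producer: queues onto release_wq */
	struct workqueue_struct *release_wq;	/* consumer */
	struct work_struct work;
};

static void demo_destroy(struct demo_pool *pool)
{
	/*
	 * Drain the producer first so nothing can call
	 * queue_work(pool->release_wq, &pool->work) afterwards,
	 * then it is safe to drain and free the consumer.
	 */
	destroy_workqueue(pool->compact_wq);
	destroy_workqueue(pool->release_wq);
	kfree(pool);
}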