linux_dsm_epyc7002/mm/util.c
Michal Hocko 9fbeb5ab59 mm: make vm_mmap killable
All the callers of vm_mmap seem to check for the failure already and
bail out in one way or another on the error which means that we can
change it to use killable version of vm_mmap_pgoff and return -EINTR if
the current task gets killed while waiting for mmap_sem.  This also
means that vm_mmap_pgoff can be killable by default and drop the
additional parameter.

This will help in the OOM conditions when the oom victim might be stuck
waiting for the mmap_sem for write which in turn can block oom_reaper
which relies on the mmap_sem for read to make a forward progress and
reclaim the address space of the victim.

Please note that load_elf_binary is ignoring vm_mmap error for
current->personality & MMAP_PAGE_ZERO case but that shouldn't be a
problem because the address is not used anywhere and we never return to
the userspace if we got killed.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-05-23 17:04:14 -07:00

642 lines
16 KiB
C

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <asm/sections.h>
#include <asm/uaccess.h>
#include "internal.h"
static inline int is_kernel_rodata(unsigned long addr)
{
return addr >= (unsigned long)__start_rodata &&
addr < (unsigned long)__end_rodata;
}
/**
* kfree_const - conditionally free memory
* @x: pointer to the memory
*
* Function calls kfree only if @x is not in .rodata section.
*/
void kfree_const(const void *x)
{
if (!is_kernel_rodata((unsigned long)x))
kfree(x);
}
EXPORT_SYMBOL(kfree_const);
/**
* kstrdup - allocate space for and copy an existing string
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*/
char *kstrdup(const char *s, gfp_t gfp)
{
size_t len;
char *buf;
if (!s)
return NULL;
len = strlen(s) + 1;
buf = kmalloc_track_caller(len, gfp);
if (buf)
memcpy(buf, s, len);
return buf;
}
EXPORT_SYMBOL(kstrdup);
/**
* kstrdup_const - conditionally duplicate an existing const string
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
* Function returns source string if it is in .rodata section otherwise it
* fallbacks to kstrdup.
* Strings allocated by kstrdup_const should be freed by kfree_const.
*/
const char *kstrdup_const(const char *s, gfp_t gfp)
{
if (is_kernel_rodata((unsigned long)s))
return s;
return kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);
/**
* kstrndup - allocate space for and copy an existing string
* @s: the string to duplicate
* @max: read at most @max chars from @s
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*/
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
size_t len;
char *buf;
if (!s)
return NULL;
len = strnlen(s, max);
buf = kmalloc_track_caller(len+1, gfp);
if (buf) {
memcpy(buf, s, len);
buf[len] = '\0';
}
return buf;
}
EXPORT_SYMBOL(kstrndup);
/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
* @len: memory region length
* @gfp: GFP mask to use
*/
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
void *p;
p = kmalloc_track_caller(len, gfp);
if (p)
memcpy(p, src, len);
return p;
}
EXPORT_SYMBOL(kmemdup);
/**
* memdup_user - duplicate memory region from user space
*
* @src: source address in user space
* @len: number of bytes to copy
*
* Returns an ERR_PTR() on failure.
*/
void *memdup_user(const void __user *src, size_t len)
{
void *p;
/*
* Always use GFP_KERNEL, since copy_from_user() can sleep and
* cause pagefault, which makes it pointless to use GFP_NOFS
* or GFP_ATOMIC.
*/
p = kmalloc_track_caller(len, GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
if (copy_from_user(p, src, len)) {
kfree(p);
return ERR_PTR(-EFAULT);
}
return p;
}
EXPORT_SYMBOL(memdup_user);
/*
* strndup_user - duplicate an existing string from user space
* @s: The string to duplicate
* @n: Maximum number of bytes to copy, including the trailing NUL.
*/
char *strndup_user(const char __user *s, long n)
{
char *p;
long length;
length = strnlen_user(s, n);
if (!length)
return ERR_PTR(-EFAULT);
if (length > n)
return ERR_PTR(-EINVAL);
p = memdup_user(s, length);
if (IS_ERR(p))
return p;
p[length - 1] = '\0';
return p;
}
EXPORT_SYMBOL(strndup_user);
/**
* memdup_user_nul - duplicate memory region from user space and NUL-terminate
*
* @src: source address in user space
* @len: number of bytes to copy
*
* Returns an ERR_PTR() on failure.
*/
void *memdup_user_nul(const void __user *src, size_t len)
{
char *p;
/*
* Always use GFP_KERNEL, since copy_from_user() can sleep and
* cause pagefault, which makes it pointless to use GFP_NOFS
* or GFP_ATOMIC.
*/
p = kmalloc_track_caller(len + 1, GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
if (copy_from_user(p, src, len)) {
kfree(p);
return ERR_PTR(-EFAULT);
}
p[len] = '\0';
return p;
}
EXPORT_SYMBOL(memdup_user_nul);
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent)
{
struct vm_area_struct *next;
vma->vm_prev = prev;
if (prev) {
next = prev->vm_next;
prev->vm_next = vma;
} else {
mm->mmap = vma;
if (rb_parent)
next = rb_entry(rb_parent,
struct vm_area_struct, vm_rb);
else
next = NULL;
}
vma->vm_next = next;
if (next)
next->vm_prev = vma;
}
/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t)
{
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
}
#endif
/*
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
* back to the regular GUP.
* If the architecture not support this function, simply return with no
* page pinned
*/
int __weak __get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
return 0;
}
EXPORT_SYMBOL_GPL(__get_user_pages_fast);
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @write: whether pages will be written to
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*
* get_user_pages_fast provides equivalent functionality to get_user_pages,
* operating on current and current->mm, with force=0 and vma=NULL. However
* unlike get_user_pages, it must be called without mmap_sem held.
*
* get_user_pages_fast may take mmap_sem and page table locks, so no
* assumptions can be made about lack of locking. get_user_pages_fast is to be
* implemented in a way that is advantageous (vs get_user_pages()) when the
* user memory area is already faulted in and present in ptes. However if the
* pages have to be faulted in, it may turn out to be slightly slower so
* callers need to carefully consider what to use. On many architectures,
* get_user_pages_fast simply falls back to get_user_pages.
*/
int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
return get_user_pages_unlocked(start, nr_pages, write, 0, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
unsigned long ret;
struct mm_struct *mm = current->mm;
unsigned long populate;
ret = security_mmap_file(file, prot, flag);
if (!ret) {
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
&populate);
up_write(&mm->mmap_sem);
if (populate)
mm_populate(ret, populate);
}
return ret;
}
unsigned long vm_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long offset)
{
if (unlikely(offset + PAGE_ALIGN(len) < offset))
return -EINVAL;
if (unlikely(offset_in_page(offset)))
return -EINVAL;
return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);
void kvfree(const void *addr)
{
if (is_vmalloc_addr(addr))
vfree(addr);
else
kfree(addr);
}
EXPORT_SYMBOL(kvfree);
static inline void *__page_rmapping(struct page *page)
{
unsigned long mapping;
mapping = (unsigned long)page->mapping;
mapping &= ~PAGE_MAPPING_FLAGS;
return (void *)mapping;
}
/* Neutral page->mapping pointer to address_space or anon_vma or other */
void *page_rmapping(struct page *page)
{
page = compound_head(page);
return __page_rmapping(page);
}
/*
* Return true if this page is mapped into pagetables.
* For compound page it returns true if any subpage of compound page is mapped.
*/
bool page_mapped(struct page *page)
{
int i;
if (likely(!PageCompound(page)))
return atomic_read(&page->_mapcount) >= 0;
page = compound_head(page);
if (atomic_read(compound_mapcount_ptr(page)) >= 0)
return true;
if (PageHuge(page))
return false;
for (i = 0; i < hpage_nr_pages(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
return false;
}
EXPORT_SYMBOL(page_mapped);
struct anon_vma *page_anon_vma(struct page *page)
{
unsigned long mapping;
page = compound_head(page);
mapping = (unsigned long)page->mapping;
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
return NULL;
return __page_rmapping(page);
}
struct address_space *page_mapping(struct page *page)
{
struct address_space *mapping;
page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
return NULL;
if (unlikely(PageSwapCache(page))) {
swp_entry_t entry;
entry.val = page_private(page);
return swap_address_space(entry);
}
mapping = page->mapping;
if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
return NULL;
return mapping;
}
/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
{
int ret;
ret = atomic_read(&page->_mapcount) + 1;
page = compound_head(page);
ret += atomic_read(compound_mapcount_ptr(page)) + 1;
if (PageDoubleMap(page))
ret--;
return ret;
}
EXPORT_SYMBOL_GPL(__page_mapcount);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int overcommit_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
sysctl_overcommit_kbytes = 0;
return ret;
}
int overcommit_kbytes_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
sysctl_overcommit_ratio = 0;
return ret;
}
/*
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
*/
unsigned long vm_commit_limit(void)
{
unsigned long allowed;
if (sysctl_overcommit_kbytes)
allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
else
allowed = ((totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100);
allowed += total_swap_pages;
return allowed;
}
/*
* Make sure vm_committed_as in one cacheline and not cacheline shared with
* other variables. It can be updated by several CPUs frequently.
*/
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
/*
* The global memory commitment made in the system can be a metric
* that can be used to drive ballooning decisions when Linux is hosted
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
* balancing memory across competing virtual machines that are hosted.
* Several metrics drive this policy engine including the guest reported
* memory commitment.
*/
unsigned long vm_memory_committed(void)
{
return percpu_counter_read_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);
/*
* Check that a process has enough memory to allocate a new virtual
* mapping. 0 means there is enough memory for the allocation to
* succeed and -ENOMEM implies there is not.
*
* We currently support three overcommit policies, which are set via the
* vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
*
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
* Additional code 2002 Jul 20 by Robert Love.
*
* cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
*
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic.
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
long free, allowed, reserve;
VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
-(s64)vm_committed_as_batch * num_online_cpus(),
"memory commitment underflow");
vm_acct_memory(pages);
/*
* Sometimes we want to use more memory than we have
*/
if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
free = global_page_state(NR_FREE_PAGES);
free += global_page_state(NR_FILE_PAGES);
/*
* shmem pages shouldn't be counted as free in this
* case, they can't be purged, only swapped out, and
* that won't affect the overall amount of available
* memory in the system.
*/
free -= global_page_state(NR_SHMEM);
free += get_nr_swap_pages();
/*
* Any slabs which are created with the
* SLAB_RECLAIM_ACCOUNT flag claim to have contents
* which are reclaimable, under pressure. The dentry
* cache and most inode caches should fall into this
*/
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
* Leave reserved pages. The pages are not for anonymous pages.
*/
if (free <= totalreserve_pages)
goto error;
else
free -= totalreserve_pages;
/*
* Reserve some for root
*/
if (!cap_sys_admin)
free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages)
return 0;
goto error;
}
allowed = vm_commit_limit();
/*
* Reserve some for root
*/
if (!cap_sys_admin)
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
/*
* Don't let a single process grow so big a user can't recover
*/
if (mm) {
reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed -= min_t(long, mm->total_vm / 32, reserve);
}
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
vm_unacct_memory(pages);
return -ENOMEM;
}
/**
* get_cmdline() - copy the cmdline value to a buffer.
* @task: the task whose cmdline value to copy.
* @buffer: the buffer to copy to.
* @buflen: the length of the buffer. Larger cmdline values are truncated
* to this length.
* Returns the size of the cmdline field copied. Note that the copy does
* not guarantee an ending NULL byte.
*/
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
int res = 0;
unsigned int len;
struct mm_struct *mm = get_task_mm(task);
unsigned long arg_start, arg_end, env_start, env_end;
if (!mm)
goto out;
if (!mm->arg_end)
goto out_mm; /* Shh! No looking before we're done */
down_read(&mm->mmap_sem);
arg_start = mm->arg_start;
arg_end = mm->arg_end;
env_start = mm->env_start;
env_end = mm->env_end;
up_read(&mm->mmap_sem);
len = arg_end - arg_start;
if (len > buflen)
len = buflen;
res = access_process_vm(task, arg_start, buffer, len, 0);
/*
* If the nul at the end of args has been overwritten, then
* assume application is using setproctitle(3).
*/
if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
len = strnlen(buffer, res);
if (len < res) {
res = len;
} else {
len = env_end - env_start;
if (len > buflen - res)
len = buflen - res;
res += access_process_vm(task, env_start,
buffer+res, len, 0);
res = strnlen(buffer, res);
}
}
out_mm:
mmput(mm);
out:
return res;
}