mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-13 07:56:47 +07:00
6b31d5955c
Wenwei Tao has noticed that our current assumption that the oom victim
is dying and never doing any visible changes after it dies, and so the
oom_reaper can tear it down, is not entirely true.
__task_will_free_mem considers a task dying when SIGNAL_GROUP_EXIT is set
but do_group_exit sends SIGKILL to all threads _after_ the flag is set.
So there is a race window when some threads won't have
fatal_signal_pending while the oom_reaper could start unmapping the
address space. Moreover some paths might not check for fatal signals
before each PF/g-u-p/copy_from_user.
We already have a protection for oom_reaper vs. PF races by checking
MMF_UNSTABLE. This has been, however, checked only for kernel threads
(use_mm users) which can outlive the oom victim. A simple fix would be
to extend the current check in handle_mm_fault for all tasks but that
wouldn't be sufficient because the current check assumes that a kernel
thread would bail out after EFAULT from get_user*/copy_from_user and
never re-read the same address which would succeed because the PF path
has established page tables already. This seems to be the case for the
only existing use_mm user currently (virtio driver) but it is rather
fragile in general.
This is even more fragile in general for more complex paths such as
generic_perform_write which can re-read the same address more times
(e.g. iov_iter_copy_from_user_atomic fails and then
iov_iter_fault_in_readable is called on retry).
Therefore we have to implement MMF_UNSTABLE protection in a robust way
and never make a potentially corrupted content visible. That requires
to hook deeper into the PF path and check for the flag _every time_
before a pte for anonymous memory is established (that means all
!VM_SHARED mappings).
The corruption can be triggered artificially
(http://lkml.kernel.org/r/201708040646.v746kkhC024636@www262.sakura.ne.jp)
but there doesn't seem to be any real life bug report. The race window
is likely too tight to be hit most of the time.
Link: http://lkml.kernel.org/r/20170807113839.16695-3-mhocko@kernel.org
Fixes: aac4536355 ("mm, oom: introduce oom reaper")
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Wenwei Tao <wenwei.tww@alibaba-inc.com>
Tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andrea Argangeli <andrea@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
109 lines
2.8 KiB
C
109 lines
2.8 KiB
C
#ifndef __INCLUDE_LINUX_OOM_H
|
|
#define __INCLUDE_LINUX_OOM_H
|
|
|
|
|
|
#include <linux/sched/signal.h>
|
|
#include <linux/types.h>
|
|
#include <linux/nodemask.h>
|
|
#include <uapi/linux/oom.h>
|
|
#include <linux/sched/coredump.h> /* MMF_* */
|
|
#include <linux/mm.h> /* VM_FAULT* */
|
|
|
|
struct zonelist;
|
|
struct notifier_block;
|
|
struct mem_cgroup;
|
|
struct task_struct;
|
|
|
|
/*
|
|
* Details of the page allocation that triggered the oom killer that are used to
|
|
* determine what should be killed.
|
|
*/
|
|
struct oom_control {
|
|
/* Used to determine cpuset */
|
|
struct zonelist *zonelist;
|
|
|
|
/* Used to determine mempolicy */
|
|
nodemask_t *nodemask;
|
|
|
|
/* Memory cgroup in which oom is invoked, or NULL for global oom */
|
|
struct mem_cgroup *memcg;
|
|
|
|
/* Used to determine cpuset and node locality requirement */
|
|
const gfp_t gfp_mask;
|
|
|
|
/*
|
|
* order == -1 means the oom kill is required by sysrq, otherwise only
|
|
* for display purposes.
|
|
*/
|
|
const int order;
|
|
|
|
/* Used by oom implementation, do not set */
|
|
unsigned long totalpages;
|
|
struct task_struct *chosen;
|
|
unsigned long chosen_points;
|
|
};
|
|
|
|
extern struct mutex oom_lock;
|
|
|
|
static inline void set_current_oom_origin(void)
|
|
{
|
|
current->signal->oom_flag_origin = true;
|
|
}
|
|
|
|
static inline void clear_current_oom_origin(void)
|
|
{
|
|
current->signal->oom_flag_origin = false;
|
|
}
|
|
|
|
static inline bool oom_task_origin(const struct task_struct *p)
|
|
{
|
|
return p->signal->oom_flag_origin;
|
|
}
|
|
|
|
static inline bool tsk_is_oom_victim(struct task_struct * tsk)
|
|
{
|
|
return tsk->signal->oom_mm;
|
|
}
|
|
|
|
/*
|
|
* Checks whether a page fault on the given mm is still reliable.
|
|
* This is no longer true if the oom reaper started to reap the
|
|
* address space which is reflected by MMF_UNSTABLE flag set in
|
|
* the mm. At that moment any !shared mapping would lose the content
|
|
* and could cause a memory corruption (zero pages instead of the
|
|
* original content).
|
|
*
|
|
* User should call this before establishing a page table entry for
|
|
* a !shared mapping and under the proper page table lock.
|
|
*
|
|
* Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise.
|
|
*/
|
|
static inline int check_stable_address_space(struct mm_struct *mm)
|
|
{
|
|
if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags)))
|
|
return VM_FAULT_SIGBUS;
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Everything below is only declared here; the definitions live outside
 * this header.
 */
extern unsigned long oom_badness(struct task_struct *p,
				 struct mem_cgroup *memcg,
				 const nodemask_t *nodemask,
				 unsigned long totalpages);

extern bool out_of_memory(struct oom_control *oc);

extern void exit_oom_victim(void);

/* oom notifier chain registration */
extern int register_oom_notifier(struct notifier_block *nb);
extern int unregister_oom_notifier(struct notifier_block *nb);

/* Switching the oom killer off and back on */
extern bool oom_killer_disable(signed long timeout);
extern void oom_killer_enable(void);

extern struct task_struct *find_lock_task_mm(struct task_struct *p);

/* sysctls */
extern int sysctl_oom_dump_tasks;
extern int sysctl_oom_kill_allocating_task;
extern int sysctl_panic_on_oom;
|
|
#endif /* __INCLUDE_LINUX_OOM_H */
|