linux_dsm_epyc7002/kernel/ptrace.c
Andrea Arcangeli 30e0fca6c1 [PATCH] ptrace/coredump/exit_group deadlock
I could seldom reproduce a deadlock with a task not killable in T state
(TASK_STOPPED, not TASK_TRACED) by attaching a NPTL threaded program to
gdb, by segfaulting the task and triggering a core dump while some other
task is executing exit_group and while one task is in ptrace_attached
TASK_STOPPED state (not TASK_TRACED yet).  This originated from a gdb
bugreport (the fact gdb was segfaulting the task wasn't a kernel bug), but
I just incidentally noticed the gdb bug triggered a real kernel bug as
well.

Most threads hangs in exit_mm because the core_dumping is still going, the
core dumping hangs because the stopped task doesn't exit, the stopped task
can't wakeup because it has SIGNAL_GROUP_EXIT set, hence the deadlock.

To me it seems that the problem is that the force_sig_specific(SIGKILL) in
zap_threads is a noop if the task has PF_PTRACED set (like in this case
because gdb is attached).  The __ptrace_unlink does nothing because the
signal->flags is set to SIGNAL_GROUP_EXIT|SIGNAL_STOP_DEQUEUED (verified).

The above info also shows that the stopped task hit a race and got the stop
signal (presumably by the ptrace_attach, only the attach, state is still
TASK_STOPPED and gdb hangs waiting the core before it can set it to
TASK_TRACED) after one of the thread invoked the core dump (it's the core
dump that sets signal->flags to SIGNAL_GROUP_EXIT).

So beside the fact nobody would wakeup the task in __ptrace_unlink (the
state is _not_ TASK_TRACED), there's a secondary problem in the signal
handling code, where a task should ignore the ptrace-sigstops as long as
SIGNAL_GROUP_EXIT is set (or the wakeup in __ptrace_unlink path wouldn't be
enough).

So I attempted to make this patch that seems to fix the problem.  There
were various ways to fix it, perhaps you prefer a different one, I just
opted to the one that looked safer to me.

I also removed the clearing of the stopped bits from the zap_other_threads
(zap_other_threads was safe unlike zap_threads).  I don't like useless
code, this whole NPTL signal/ptrace thing is already unreadable enough and
full of corner cases without confusing useless code into it to make it even
less readable.  And if this code is really needed, then you may want to
explain why it's not being done in the other paths that sets
SIGNAL_GROUP_EXIT at least.

Even after this patch I still wonder who serializes the read of
p->ptrace in zap_threads.

Patch is called ptrace-core_dump-exit_group-deadlock-1.

This was the trace I've got:

test          T ffff81003e8118c0     0 14305      1         14311 14309 (NOTLB)
ffff810058ccdde8 0000000000000082 000001f4000037e1 ffff810000000013
       00000000000000f8 ffff81003e811b00 ffff81003e8118c0 ffff810011362100
       0000000000000012 ffff810017ca4180
Call Trace:<ffffffff801317ed>{try_to_wake_up+893} <ffffffff80141677>{finish_stop+87}
       <ffffffff8014367f>{get_signal_to_deliver+1359} <ffffffff8010d3ad>{do_signal+157}
       <ffffffff8013deee>{ptrace_check_attach+222} <ffffffff80111575>{sys_ptrace+2293}
       <ffffffff80131810>{default_wake_function+0} <ffffffff80196399>{sys_ioctl+73}
       <ffffffff8010dd27>{sysret_signal+28} <ffffffff8010e00f>{ptregscall_common+103}

test          D ffff810011362100     0 14309      1         14305 14312 (NOTLB)
ffff810053c81cf8 0000000000000082 0000000000000286 0000000000000001
       0000000000000195 ffff810011362340 ffff810011362100 ffff81002e338040
       ffff810001e0ca80 0000000000000001
Call Trace:<ffffffff801317ed>{try_to_wake_up+893} <ffffffff8044677d>{wait_for_completion+173}
       <ffffffff80131810>{default_wake_function+0} <ffffffff80137435>{exit_mm+149}
       <ffffffff801381af>{do_exit+479} <ffffffff80138d0c>{do_group_exit+252}
       <ffffffff801436db>{get_signal_to_deliver+1451} <ffffffff8010d3ad>{do_signal+157}
       <ffffffff8013deee>{ptrace_check_attach+222} <ffffffff80140850>{specific_send_sig_info+2

       <ffffffff8014208a>{force_sig_info+186} <ffffffff804479a0>{do_int3+112}
       <ffffffff8010e308>{retint_signal+61}
test          D ffff81002e338040     0 14311      1         14716 14305 (NOTLB)
ffff81005ca8dcf8 0000000000000082 0000000000000286 0000000000000001
       0000000000000120 ffff81002e338280 ffff81002e338040 ffff8100481cb740
       ffff810001e0ca80 0000000000000001
Call Trace:<ffffffff801317ed>{try_to_wake_up+893} <ffffffff8044677d>{wait_for_completion+173}
       <ffffffff80131810>{default_wake_function+0} <ffffffff80137435>{exit_mm+149}
       <ffffffff801381af>{do_exit+479} <ffffffff80142d0e>{__dequeue_signal+558}
       <ffffffff80138d0c>{do_group_exit+252} <ffffffff801436db>{get_signal_to_deliver+1451}
       <ffffffff8010d3ad>{do_signal+157} <ffffffff8013deee>{ptrace_check_attach+222}
       <ffffffff80140850>{specific_send_sig_info+208} <ffffffff8014208a>{force_sig_info+186}
       <ffffffff804479a0>{do_int3+112} <ffffffff8010e308>{retint_signal+61}

test          D ffff810017ca4180     0 14312      1         14309 13882 (NOTLB)
ffff81005d15fcb8 0000000000000082 ffff81005d15fc58 ffffffff80130816
       0000000000000897 ffff810017ca43c0 ffff810017ca4180 ffff81003e8118c0
       0000000000000082 ffffffff801317ed
Call Trace:<ffffffff80130816>{activate_task+150} <ffffffff801317ed>{try_to_wake_up+893}
       <ffffffff8044677d>{wait_for_completion+173} <ffffffff80131810>{default_wake_function+0}
       <ffffffff8018cdc3>{do_coredump+819} <ffffffff80445f52>{thread_return+82}
       <ffffffff801436d4>{get_signal_to_deliver+1444} <ffffffff8010d3ad>{do_signal+157}
       <ffffffff8013deee>{ptrace_check_attach+222} <ffffffff80140850>{specific_send_sig_info+2

       <ffffffff804472e5>{_spin_unlock_irqrestore+5} <ffffffff8014208a>{force_sig_info+186}
       <ffffffff804476ff>{do_general_protection+159} <ffffffff8010e308>{retint_signal+61}

Signed-off-by: Andrea Arcangeli <andrea@suse.de>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 17:37:22 -08:00

409 lines
9.1 KiB
C

/*
* linux/kernel/ptrace.c
*
* (C) Copyright 1999 Linus Torvalds
*
* Common interfaces for "ptrace()" which we do not want
* to continually duplicate across every architecture.
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/signal.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
*
* Must be called with the tasklist lock write-held.
*/
void __ptrace_link(task_t *child, task_t *new_parent)
{
if (!list_empty(&child->ptrace_list))
BUG();
if (child->parent == new_parent)
return;
list_add(&child->ptrace_list, &child->parent->ptrace_children);
REMOVE_LINKS(child);
child->parent = new_parent;
SET_LINKS(child);
}
/*
* Turn a tracing stop into a normal stop now, since with no tracer there
* would be no way to wake it up with SIGCONT or SIGKILL. If there was a
* signal sent that would resume the child, but didn't because it was in
* TASK_TRACED, resume it now.
* Requires that irqs be disabled.
*/
void ptrace_untrace(task_t *child)
{
spin_lock(&child->sighand->siglock);
if (child->state == TASK_TRACED) {
if (child->signal->flags & SIGNAL_STOP_STOPPED) {
child->state = TASK_STOPPED;
} else {
signal_wake_up(child, 1);
}
}
if (child->signal->flags & SIGNAL_GROUP_EXIT) {
sigaddset(&child->pending.signal, SIGKILL);
signal_wake_up(child, 1);
}
spin_unlock(&child->sighand->siglock);
}
/*
* unptrace a task: move it back to its original parent and
* remove it from the ptrace list.
*
* Must be called with the tasklist lock write-held.
*/
void __ptrace_unlink(task_t *child)
{
if (!child->ptrace)
BUG();
child->ptrace = 0;
if (!list_empty(&child->ptrace_list)) {
list_del_init(&child->ptrace_list);
REMOVE_LINKS(child);
child->parent = child->real_parent;
SET_LINKS(child);
}
ptrace_untrace(child);
}
/*
* Check that we have indeed attached to the thing..
*/
int ptrace_check_attach(struct task_struct *child, int kill)
{
int ret = -ESRCH;
/*
* We take the read lock around doing both checks to close a
* possible race where someone else was tracing our child and
* detached between these two checks. After this locked check,
* we are sure that this is our traced child and that can only
* be changed by us so it's not changing right after this.
*/
read_lock(&tasklist_lock);
if ((child->ptrace & PT_PTRACED) && child->parent == current &&
(!(child->ptrace & PT_ATTACHED) || child->real_parent != current)
&& child->signal != NULL) {
ret = 0;
spin_lock_irq(&child->sighand->siglock);
if (child->state == TASK_STOPPED) {
child->state = TASK_TRACED;
} else if (child->state != TASK_TRACED && !kill) {
ret = -ESRCH;
}
spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
if (!ret && !kill) {
wait_task_inactive(child);
}
/* All systems go.. */
return ret;
}
static int may_attach(struct task_struct *task)
{
if (!task->mm)
return -EPERM;
if (((current->uid != task->euid) ||
(current->uid != task->suid) ||
(current->uid != task->uid) ||
(current->gid != task->egid) ||
(current->gid != task->sgid) ||
(current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
return -EPERM;
smp_rmb();
if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
return -EPERM;
return security_ptrace(current, task);
}
int ptrace_may_attach(struct task_struct *task)
{
int err;
task_lock(task);
err = may_attach(task);
task_unlock(task);
return !err;
}
int ptrace_attach(struct task_struct *task)
{
int retval;
task_lock(task);
retval = -EPERM;
if (task->pid <= 1)
goto bad;
if (task == current)
goto bad;
/* the same process cannot be attached many times */
if (task->ptrace & PT_PTRACED)
goto bad;
retval = may_attach(task);
if (retval)
goto bad;
/* Go */
task->ptrace |= PT_PTRACED | ((task->real_parent != current)
? PT_ATTACHED : 0);
if (capable(CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
task_unlock(task);
write_lock_irq(&tasklist_lock);
__ptrace_link(task, current);
write_unlock_irq(&tasklist_lock);
force_sig_specific(SIGSTOP, task);
return 0;
bad:
task_unlock(task);
return retval;
}
int ptrace_detach(struct task_struct *child, unsigned int data)
{
if (!valid_signal(data))
return -EIO;
/* Architecture-specific hardware disable .. */
ptrace_disable(child);
/* .. re-parent .. */
child->exit_code = data;
write_lock_irq(&tasklist_lock);
__ptrace_unlink(child);
/* .. and wake it up. */
if (child->exit_state != EXIT_ZOMBIE)
wake_up_process(child);
write_unlock_irq(&tasklist_lock);
return 0;
}
/*
* Access another process' address space.
* Source/target buffer must be kernel space,
* Do not walk the page table directly, use get_user_pages
*/
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
{
struct mm_struct *mm;
struct vm_area_struct *vma;
struct page *page;
void *old_buf = buf;
mm = get_task_mm(tsk);
if (!mm)
return 0;
down_read(&mm->mmap_sem);
/* ignore errors, just check how much was sucessfully transfered */
while (len) {
int bytes, ret, offset;
void *maddr;
ret = get_user_pages(tsk, mm, addr, 1,
write, 1, &page, &vma);
if (ret <= 0)
break;
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
maddr = kmap(page);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes);
set_page_dirty_lock(page);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
kunmap(page);
page_cache_release(page);
len -= bytes;
buf += bytes;
addr += bytes;
}
up_read(&mm->mmap_sem);
mmput(mm);
return buf - old_buf;
}
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
{
int copied = 0;
while (len > 0) {
char buf[128];
int this_len, retval;
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
retval = access_process_vm(tsk, src, buf, this_len, 0);
if (!retval) {
if (copied)
break;
return -EIO;
}
if (copy_to_user(dst, buf, retval))
return -EFAULT;
copied += retval;
src += retval;
dst += retval;
len -= retval;
}
return copied;
}
int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len)
{
int copied = 0;
while (len > 0) {
char buf[128];
int this_len, retval;
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
if (copy_from_user(buf, src, this_len))
return -EFAULT;
retval = access_process_vm(tsk, dst, buf, this_len, 1);
if (!retval) {
if (copied)
break;
return -EIO;
}
copied += retval;
src += retval;
dst += retval;
len -= retval;
}
return copied;
}
static int ptrace_setoptions(struct task_struct *child, long data)
{
child->ptrace &= ~PT_TRACE_MASK;
if (data & PTRACE_O_TRACESYSGOOD)
child->ptrace |= PT_TRACESYSGOOD;
if (data & PTRACE_O_TRACEFORK)
child->ptrace |= PT_TRACE_FORK;
if (data & PTRACE_O_TRACEVFORK)
child->ptrace |= PT_TRACE_VFORK;
if (data & PTRACE_O_TRACECLONE)
child->ptrace |= PT_TRACE_CLONE;
if (data & PTRACE_O_TRACEEXEC)
child->ptrace |= PT_TRACE_EXEC;
if (data & PTRACE_O_TRACEVFORKDONE)
child->ptrace |= PT_TRACE_VFORK_DONE;
if (data & PTRACE_O_TRACEEXIT)
child->ptrace |= PT_TRACE_EXIT;
return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
}
static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data)
{
siginfo_t lastinfo;
int error = -ESRCH;
read_lock(&tasklist_lock);
if (likely(child->sighand != NULL)) {
error = -EINVAL;
spin_lock_irq(&child->sighand->siglock);
if (likely(child->last_siginfo != NULL)) {
lastinfo = *child->last_siginfo;
error = 0;
}
spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
if (!error)
return copy_siginfo_to_user(data, &lastinfo);
return error;
}
static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
{
siginfo_t newinfo;
int error = -ESRCH;
if (copy_from_user(&newinfo, data, sizeof (siginfo_t)))
return -EFAULT;
read_lock(&tasklist_lock);
if (likely(child->sighand != NULL)) {
error = -EINVAL;
spin_lock_irq(&child->sighand->siglock);
if (likely(child->last_siginfo != NULL)) {
*child->last_siginfo = newinfo;
error = 0;
}
spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
return error;
}
int ptrace_request(struct task_struct *child, long request,
long addr, long data)
{
int ret = -EIO;
switch (request) {
#ifdef PTRACE_OLDSETOPTIONS
case PTRACE_OLDSETOPTIONS:
#endif
case PTRACE_SETOPTIONS:
ret = ptrace_setoptions(child, data);
break;
case PTRACE_GETEVENTMSG:
ret = put_user(child->ptrace_message, (unsigned long __user *) data);
break;
case PTRACE_GETSIGINFO:
ret = ptrace_getsiginfo(child, (siginfo_t __user *) data);
break;
case PTRACE_SETSIGINFO:
ret = ptrace_setsiginfo(child, (siginfo_t __user *) data);
break;
default:
break;
}
return ret;
}