mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-28 11:18:45 +07:00
7cf97b1254
The current SECCOMP_RET_USER_NOTIF API allows for syscall supervision over an fd. It is often used in settings where a supervising task emulates syscalls on behalf of a supervised task in userspace, either to further restrict the supervisee's syscall abilities or to circumvent kernel enforced restrictions the supervisor deems safe to lift (e.g. actually performing a mount(2) for an unprivileged container). While SECCOMP_RET_USER_NOTIF allows for the interception of any syscall, only a certain subset of syscalls could be correctly emulated. Over the last few development cycles, the set of syscalls which can't be emulated has been reduced due to the addition of pidfd_getfd(2). With this we are now able to, for example, intercept syscalls that require the supervisor to operate on file descriptors of the supervisee such as connect(2). However, syscalls that cause new file descriptors to be installed can not currently be correctly emulated since there is no way for the supervisor to inject file descriptors into the supervisee. This patch adds a new addfd ioctl to remove this restriction by allowing the supervisor to install file descriptors into the intercepted task. By implementing this feature via seccomp the supervisor effectively instructs the supervisee to install a set of file descriptors into its own file descriptor table during the intercepted syscall. This way it is possible to intercept syscalls such as open() or accept(), and install (or replace, like dup2(2)) the supervisor's resulting fd into the supervisee. One replacement use-case would be to redirect the stdout and stderr of a supervisee into log file descriptors opened by the supervisor. The ioctl handling is based on the discussions[1] of how Extensible Arguments should interact with ioctls. Instead of building size into the addfd structure, make it a function of the ioctl command (which is how sizes are normally passed to ioctls). To support forward and backward compatibility, just mask out the direction and size, and match everything. The size (and any future direction) checks are done along with copy_struct_from_user() logic. As a note, the seccomp_notif_addfd structure is laid out based on 8-byte alignment without requiring packing as there have been packing issues with uapi highlighted before[2][3]. Although we could overload the newfd field and use -1 to indicate that it is not to be used, doing so requires changing the size of the fd field, and introduces struct packing complexity. [1]: https://lore.kernel.org/lkml/87o8w9bcaf.fsf@mid.deneb.enyo.de/ [2]: https://lore.kernel.org/lkml/a328b91d-fd8f-4f27-b3c2-91a9c45f18c0@rasmusvillemoes.dk/ [3]: https://lore.kernel.org/lkml/20200612104629.GA15814@ircssh-2.c.rugged-nimbus-611.internal Cc: Christoph Hellwig <hch@lst.de> Cc: Christian Brauner <christian.brauner@ubuntu.com> Cc: Tycho Andersen <tycho@tycho.ws> Cc: Jann Horn <jannh@google.com> Cc: Robert Sesek <rsesek@google.com> Cc: Chris Palmer <palmer@google.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-api@vger.kernel.org Suggested-by: Matt Denton <mpdenton@google.com> Link: https://lore.kernel.org/r/20200603011044.7972-4-sargun@sargun.me Signed-off-by: Sargun Dhillon <sargun@sargun.me> Reviewed-by: Will Drewry <wad@chromium.org> Co-developed-by: Kees Cook <keescook@chromium.org> Signed-off-by: Kees Cook <keescook@chromium.org>
151 lines
5.6 KiB
C
151 lines
5.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
|
#ifndef _UAPI_LINUX_SECCOMP_H
|
|
#define _UAPI_LINUX_SECCOMP_H
|
|
|
|
#include <linux/compiler.h>
|
|
#include <linux/types.h>
|
|
|
|
|
|
/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
|
|
#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */
|
|
#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
|
|
#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
|
|
|
|
/* Valid operations for seccomp syscall. */
|
|
#define SECCOMP_SET_MODE_STRICT 0
|
|
#define SECCOMP_SET_MODE_FILTER 1
|
|
#define SECCOMP_GET_ACTION_AVAIL 2
|
|
#define SECCOMP_GET_NOTIF_SIZES 3
|
|
|
|
/* Valid flags for SECCOMP_SET_MODE_FILTER */
|
|
#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
|
|
#define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
|
|
#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
|
|
#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
|
|
#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
|
|
|
|
/*
|
|
* All BPF programs must return a 32-bit value.
|
|
* The bottom 16-bits are for optional return data.
|
|
* The upper 16-bits are ordered from least permissive values to most,
|
|
* as a signed value (so 0x8000000 is negative).
|
|
*
|
|
* The ordering ensures that a min_t() over composed return values always
|
|
* selects the least permissive choice.
|
|
*/
|
|
#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
|
|
#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */
|
|
#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD
|
|
#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */
|
|
#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */
|
|
#define SECCOMP_RET_USER_NOTIF 0x7fc00000U /* notifies userspace */
|
|
#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */
|
|
#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */
|
|
#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
|
|
|
|
/* Masks for the return value sections. */
|
|
#define SECCOMP_RET_ACTION_FULL 0xffff0000U
|
|
#define SECCOMP_RET_ACTION 0x7fff0000U
|
|
#define SECCOMP_RET_DATA 0x0000ffffU
|
|
|
|
/**
|
|
* struct seccomp_data - the format the BPF program executes over.
|
|
* @nr: the system call number
|
|
* @arch: indicates system call convention as an AUDIT_ARCH_* value
|
|
* as defined in <linux/audit.h>.
|
|
* @instruction_pointer: at the time of the system call.
|
|
* @args: up to 6 system call arguments always stored as 64-bit values
|
|
* regardless of the architecture.
|
|
*/
|
|
struct seccomp_data {
|
|
int nr;
|
|
__u32 arch;
|
|
__u64 instruction_pointer;
|
|
__u64 args[6];
|
|
};
|
|
|
|
struct seccomp_notif_sizes {
|
|
__u16 seccomp_notif;
|
|
__u16 seccomp_notif_resp;
|
|
__u16 seccomp_data;
|
|
};
|
|
|
|
struct seccomp_notif {
|
|
__u64 id;
|
|
__u32 pid;
|
|
__u32 flags;
|
|
struct seccomp_data data;
|
|
};
|
|
|
|
/*
|
|
* Valid flags for struct seccomp_notif_resp
|
|
*
|
|
* Note, the SECCOMP_USER_NOTIF_FLAG_CONTINUE flag must be used with caution!
|
|
* If set by the process supervising the syscalls of another process the
|
|
* syscall will continue. This is problematic because of an inherent TOCTOU.
|
|
* An attacker can exploit the time while the supervised process is waiting on
|
|
* a response from the supervising process to rewrite syscall arguments which
|
|
* are passed as pointers of the intercepted syscall.
|
|
* It should be absolutely clear that this means that the seccomp notifier
|
|
* _cannot_ be used to implement a security policy! It should only ever be used
|
|
* in scenarios where a more privileged process supervises the syscalls of a
|
|
* lesser privileged process to get around kernel-enforced security
|
|
* restrictions when the privileged process deems this safe. In other words,
|
|
* in order to continue a syscall the supervising process should be sure that
|
|
* another security mechanism or the kernel itself will sufficiently block
|
|
* syscalls if arguments are rewritten to something unsafe.
|
|
*
|
|
* Similar precautions should be applied when stacking SECCOMP_RET_USER_NOTIF
|
|
* or SECCOMP_RET_TRACE. For SECCOMP_RET_USER_NOTIF filters acting on the
|
|
* same syscall, the most recently added filter takes precedence. This means
|
|
* that the new SECCOMP_RET_USER_NOTIF filter can override any
|
|
* SECCOMP_IOCTL_NOTIF_SEND from earlier filters, essentially allowing all
|
|
* such filtered syscalls to be executed by sending the response
|
|
* SECCOMP_USER_NOTIF_FLAG_CONTINUE. Note that SECCOMP_RET_TRACE can equally
|
|
* be overriden by SECCOMP_USER_NOTIF_FLAG_CONTINUE.
|
|
*/
|
|
#define SECCOMP_USER_NOTIF_FLAG_CONTINUE (1UL << 0)
|
|
|
|
struct seccomp_notif_resp {
|
|
__u64 id;
|
|
__s64 val;
|
|
__s32 error;
|
|
__u32 flags;
|
|
};
|
|
|
|
/* valid flags for seccomp_notif_addfd */
|
|
#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */
|
|
|
|
/**
|
|
* struct seccomp_notif_addfd
|
|
* @id: The ID of the seccomp notification
|
|
* @flags: SECCOMP_ADDFD_FLAG_*
|
|
* @srcfd: The local fd number
|
|
* @newfd: Optional remote FD number if SETFD option is set, otherwise 0.
|
|
* @newfd_flags: The O_* flags the remote FD should have applied
|
|
*/
|
|
struct seccomp_notif_addfd {
|
|
__u64 id;
|
|
__u32 flags;
|
|
__u32 srcfd;
|
|
__u32 newfd;
|
|
__u32 newfd_flags;
|
|
};
|
|
|
|
#define SECCOMP_IOC_MAGIC '!'
|
|
#define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr)
|
|
#define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type)
|
|
#define SECCOMP_IOW(nr, type) _IOW(SECCOMP_IOC_MAGIC, nr, type)
|
|
#define SECCOMP_IOWR(nr, type) _IOWR(SECCOMP_IOC_MAGIC, nr, type)
|
|
|
|
/* Flags for seccomp notification fd ioctl. */
|
|
#define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif)
|
|
#define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \
|
|
struct seccomp_notif_resp)
|
|
#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64)
|
|
/* On success, the return value is the remote process's added fd number */
|
|
#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \
|
|
struct seccomp_notif_addfd)
|
|
|
|
#endif /* _UAPI_LINUX_SECCOMP_H */
|