mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-28 11:18:45 +07:00
fb3c5386b3
This allows the seccomp notifier to continue a syscall. A positive
discussion about this feature was triggered by a post to the
ksummit-discuss mailing list (cf. [3]) and took place during KSummit
(cf. [1]) and again at the containers/checkpoint-restore
micro-conference at Linux Plumbers.
Recently we landed seccomp support for SECCOMP_RET_USER_NOTIF (cf. [4])
which enables a process (watchee) to retrieve an fd for its seccomp
filter. This fd can then be handed to another (usually more privileged)
process (watcher). The watcher will then be able to receive seccomp
messages about the syscalls having been performed by the watchee.
This feature is heavily used in some userspace workloads. For example,
it is currently used to intercept mknod() syscalls in user namespaces
aka in containers.
The mknod() syscall can be easily filtered based on dev_t. This allows
us to only intercept a very specific subset of mknod() syscalls.
Furthermore, mknod() is not possible in user namespaces toto coelo and
so intercepting and denying syscalls that are not in the whitelist on
accident is not a big deal. The watchee won't notice a difference.
In contrast to mknod(), a lot of other syscalls we intercept (e.g.
setxattr()) cannot be easily filtered like mknod() because they have
pointer arguments. Additionally, some of them might actually succeed in
user namespaces (e.g. setxattr() for all "user.*" xattrs). Since we
currently cannot tell seccomp to continue from a user notifier we are
stuck with performing all of the syscalls in lieu of the container. This
is a huge security liability since it is extremely difficult to
correctly assume all of the necessary privileges of the calling task
such that the syscall can be successfully emulated without escaping
other additional security restrictions (think missing CAP_MKNOD for
mknod(), or MS_NODEV on a filesystem etc.). This can be solved by
telling seccomp to resume the syscall.
One thing that came up in the discussion was the problem that another
thread could change the memory after userspace has decided to let the
syscall continue which is a well known TOCTOU with seccomp which is
present in other ways already.
The discussion showed that this feature is already very useful for any
syscall without pointer arguments. For any accidentally intercepted
non-pointer syscall it is safe to continue.
For syscalls with pointer arguments there is a race but for any cautious
userspace and the main use cases the race doesn't matter. The notifier
is intended to be used in a scenario where a more privileged watcher
supervises the syscalls of a lesser privileged watchee to allow it to get
around kernel-enforced limitations by performing the syscall for it
whenever deemed safe by the watcher. Hence, if a user tricks the watcher
into allowing a syscall they will either get a deny based on
kernel-enforced restrictions later or they will have changed the
arguments in such a way that they manage to perform a syscall with
arguments that they would've been allowed to do anyway.
In general, it is good to point out again, that the notifier fd was not
intended to allow userspace to implement a security policy but rather to
work around kernel security mechanisms in cases where the watcher knows
that a given action is safe to perform.
/* References */
[1]: https://linuxplumbersconf.org/event/4/contributions/560
[2]: https://linuxplumbersconf.org/event/4/contributions/477
[3]: https://lore.kernel.org/r/20190719093538.dhyopljyr5ns33qx@brauner.io
[4]: commit 6a21cc50f0
("seccomp: add a return code to trap to userspace")
Co-developed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Tycho Andersen <tycho@tycho.ws>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
CC: Tyler Hicks <tyhicks@canonical.com>
Link: https://lore.kernel.org/r/20190920083007.11475-2-christian.brauner@ubuntu.com
Signed-off-by: Kees Cook <keescook@chromium.org>
127 lines
4.9 KiB
C
127 lines
4.9 KiB
C
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
|
#ifndef _UAPI_LINUX_SECCOMP_H
|
|
#define _UAPI_LINUX_SECCOMP_H
|
|
|
|
#include <linux/compiler.h>
|
|
#include <linux/types.h>
|
|
|
|
|
|
/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
#define SECCOMP_MODE_DISABLED	0 /* seccomp is not in use. */
#define SECCOMP_MODE_STRICT	1 /* uses hard-coded filter. */
#define SECCOMP_MODE_FILTER	2 /* uses user-supplied filter. */
|
|
|
|
/* Valid operations for seccomp syscall. */
#define SECCOMP_SET_MODE_STRICT		0
#define SECCOMP_SET_MODE_FILTER		1
#define SECCOMP_GET_ACTION_AVAIL	2
#define SECCOMP_GET_NOTIF_SIZES		3
|
|
|
|
/* Valid flags for SECCOMP_SET_MODE_FILTER (one bit per flag) */
#define SECCOMP_FILTER_FLAG_TSYNC		(1UL << 0)
#define SECCOMP_FILTER_FLAG_LOG			(1UL << 1)
#define SECCOMP_FILTER_FLAG_SPEC_ALLOW		(1UL << 2)
#define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
|
|
|
|
/*
 * All BPF programs must return a 32-bit value.
 * The bottom 16-bits are for optional return data.
 * The upper 16-bits are ordered from least permissive values to most,
 * as a signed value (so 0x80000000 is negative).
 *
 * The ordering ensures that a min_t() over composed return values always
 * selects the least permissive choice.
 */
#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
#define SECCOMP_RET_KILL_THREAD	 0x00000000U /* kill the thread */
#define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
#define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
#define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
#define SECCOMP_RET_USER_NOTIF	 0x7fc00000U /* notifies userspace */
#define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
#define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
#define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
|
|
|
|
/* Masks for the return value sections. */
#define SECCOMP_RET_ACTION_FULL	0xffff0000U
#define SECCOMP_RET_ACTION	0x7fff0000U
#define SECCOMP_RET_DATA	0x0000ffffU
|
|
|
|
/**
 * struct seccomp_data - the format the BPF program executes over.
 * @nr: the system call number
 * @arch: indicates system call convention as an AUDIT_ARCH_* value
 *        as defined in <linux/audit.h>.
 * @instruction_pointer: at the time of the system call.
 * @args: up to 6 system call arguments always stored as 64-bit values
 *        regardless of the architecture.
 */
struct seccomp_data {
	int nr;
	__u32 arch;
	__u64 instruction_pointer;
	__u64 args[6];
};
|
|
|
|
/**
 * struct seccomp_notif_sizes - three 16-bit size fields, one per structure
 *                              used with the seccomp notifier.
 * @seccomp_notif: named after struct seccomp_notif.
 * @seccomp_notif_resp: named after struct seccomp_notif_resp.
 * @seccomp_data: named after struct seccomp_data.
 *
 * NOTE(review): presumably filled in by the SECCOMP_GET_NOTIF_SIZES operation
 * with the kernel's size of each same-named structure - confirm against the
 * kernel implementation.
 */
struct seccomp_notif_sizes {
	__u16 seccomp_notif;
	__u16 seccomp_notif_resp;
	__u16 seccomp_data;
};
|
|
|
|
struct seccomp_notif {
|
|
__u64 id;
|
|
__u32 pid;
|
|
__u32 flags;
|
|
struct seccomp_data data;
|
|
};
|
|
|
|
/*
 * Valid flags for struct seccomp_notif_resp
 *
 * Note, the SECCOMP_USER_NOTIF_FLAG_CONTINUE flag must be used with caution!
 * If set by the process supervising the syscalls of another process the
 * syscall will continue. This is problematic because of an inherent TOCTOU.
 * An attacker can exploit the time while the supervised process is waiting on
 * a response from the supervising process to rewrite syscall arguments which
 * are passed as pointers of the intercepted syscall.
 * It should be absolutely clear that this means that the seccomp notifier
 * _cannot_ be used to implement a security policy! It should only ever be used
 * in scenarios where a more privileged process supervises the syscalls of a
 * lesser privileged process to get around kernel-enforced security
 * restrictions when the privileged process deems this safe. In other words,
 * in order to continue a syscall the supervising process should be sure that
 * another security mechanism or the kernel itself will sufficiently block
 * syscalls if arguments are rewritten to something unsafe.
 *
 * Similar precautions should be applied when stacking SECCOMP_RET_USER_NOTIF
 * or SECCOMP_RET_TRACE. For SECCOMP_RET_USER_NOTIF filters acting on the
 * same syscall, the most recently added filter takes precedence. This means
 * that the new SECCOMP_RET_USER_NOTIF filter can override any
 * SECCOMP_IOCTL_NOTIF_SEND from earlier filters, essentially allowing all
 * such filtered syscalls to be executed by sending the response
 * SECCOMP_USER_NOTIF_FLAG_CONTINUE. Note that SECCOMP_RET_TRACE can equally
 * be overridden by SECCOMP_USER_NOTIF_FLAG_CONTINUE.
 *
 * The flag is spelled as an explicit shift, like the SECCOMP_FILTER_FLAG_*
 * values, because the BIT() helper is not available to userspace programs
 * that include this header.
 */
#define SECCOMP_USER_NOTIF_FLAG_CONTINUE (1UL << 0)
|
|
|
|
/**
 * struct seccomp_notif_resp - argument type of SECCOMP_IOCTL_NOTIF_SEND.
 * @id: 64-bit identifier.
 *      NOTE(review): presumably must match the id of the struct seccomp_notif
 *      being answered - confirm.
 * @val: signed 64-bit value.
 *       NOTE(review): presumably the syscall return value reported to the
 *       supervised task - confirm.
 * @error: signed 32-bit value.
 *         NOTE(review): presumably a negative errno reported to the
 *         supervised task - confirm.
 * @flags: bitmask of SECCOMP_USER_NOTIF_FLAG_* values
 *         (currently SECCOMP_USER_NOTIF_FLAG_CONTINUE).
 */
struct seccomp_notif_resp {
	__u64 id;
	__s64 val;
	__s32 error;
	__u32 flags;
};
|
|
|
|
/*
 * ioctl number helpers for the seccomp notification fd: each one forwards to
 * the matching _IO*() encoder with SECCOMP_IOC_MAGIC as the type character.
 */
#define SECCOMP_IOC_MAGIC		'!'
#define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
#define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
#define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
#define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)
|
|
|
|
/*
 * Flags for seccomp notification fd ioctl.
 *
 * RECV exchanges a struct seccomp_notif, SEND exchanges a
 * struct seccomp_notif_resp, and ID_VALID takes a __u64.
 */
#define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
#define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
						struct seccomp_notif_resp)
#define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
|
|
#endif /* _UAPI_LINUX_SECCOMP_H */
|