mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-18 13:16:12 +07:00
ef2c41cf38
This adds support for creating a process in a different cgroup than its parent. Callers can limit and account processes and threads right from the moment they are spawned: - A service manager can directly spawn new services into dedicated cgroups. - A process can be directly created in a frozen cgroup and will be frozen as well. - The initial accounting jitter experienced by process supervisors and daemons is eliminated with this. - Threaded applications or even thread implementations can choose to create a specific cgroup layout where each thread is spawned directly into a dedicated cgroup. This feature is limited to the unified hierarchy. Callers need to pass a directory file descriptor for the target cgroup. The caller can choose to pass an O_PATH file descriptor. All usual migration restrictions apply, i.e. there can be no processes in inner nodes. In general, creating a process directly in a target cgroup adheres to all migration restrictions. One of the biggest advantages of this feature is that CLONE_INTO_CGROUP does not need to grab the write side of the cgroup_threadgroup_rwsem. This global lock makes moving tasks/threads around super expensive. With clone3() this lock is avoided. Cc: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Li Zefan <lizefan@huawei.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: cgroups@vger.kernel.org Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com> Signed-off-by: Tejun Heo <tj@kernel.org>
149 lines
6.1 KiB
C
149 lines
6.1 KiB
C
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
|
#ifndef _UAPI_LINUX_SCHED_H
|
|
#define _UAPI_LINUX_SCHED_H
|
|
|
|
#include <linux/types.h>
|
|
|
|
/*
 * cloning flags:
 *
 * The low byte (CSIGNAL) is a mask rather than a flag; every CLONE_* value
 * in this list occupies exactly one distinct bit in bits 8..31.
 */
#define CSIGNAL              0x000000ff /* signal mask to be sent at exit */
#define CLONE_VM             0x00000100 /* set if VM shared between processes */
#define CLONE_FS             0x00000200 /* set if fs info shared between processes */
#define CLONE_FILES          0x00000400 /* set if open files shared between processes */
#define CLONE_SIGHAND        0x00000800 /* set if signal handlers and blocked signals shared */
#define CLONE_PIDFD          0x00001000 /* set if a pidfd should be placed in parent */
#define CLONE_PTRACE         0x00002000 /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK          0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT         0x00008000 /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD         0x00010000 /* Same thread group? */
#define CLONE_NEWNS          0x00020000 /* New mount namespace group */
#define CLONE_SYSVSEM        0x00040000 /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS         0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID  0x00100000 /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
#define CLONE_DETACHED       0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED       0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID   0x01000000 /* set the TID in the child */
#define CLONE_NEWCGROUP      0x02000000 /* New cgroup namespace */
#define CLONE_NEWUTS         0x04000000 /* New utsname namespace */
#define CLONE_NEWIPC         0x08000000 /* New ipc namespace */
#define CLONE_NEWUSER        0x10000000 /* New user namespace */
#define CLONE_NEWPID         0x20000000 /* New pid namespace */
#define CLONE_NEWNET         0x40000000 /* New network namespace */
#define CLONE_IO             0x80000000 /* Clone io context */

/*
 * Flags for the clone3() syscall.
 *
 * These values lie above bit 31, hence the ULL suffix: they do not fit in
 * a 32-bit flags word.
 */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP   0x200000000ULL /* Clone into a specific cgroup given the right permissions. */

/*
 * Cloning flags that intersect with CSIGNAL, and so can be used with the
 * unshare and clone3 syscalls only:
 */
#define CLONE_NEWTIME 0x00000080 /* New time namespace */

#ifndef __ASSEMBLY__
/**
 * struct clone_args - arguments for the clone3 syscall
 * @flags:        Flags for the new process as listed above.
 *                All flags are valid except for CSIGNAL and
 *                CLONE_DETACHED.
 * @pidfd:        If CLONE_PIDFD is set, a pidfd will be
 *                returned in this argument.
 * @child_tid:    If CLONE_CHILD_SETTID is set, the TID of the
 *                child process will be returned in the child's
 *                memory.
 * @parent_tid:   If CLONE_PARENT_SETTID is set, the TID of
 *                the child process will be returned in the
 *                parent's memory.
 * @exit_signal:  The exit_signal the parent process will be
 *                sent when the child exits.
 * @stack:        Specify the location of the stack for the
 *                child process.
 *                Note, @stack is expected to point to the
 *                lowest address. The stack direction will be
 *                determined by the kernel and set up
 *                appropriately based on @stack_size.
 * @stack_size:   The size of the stack for the child process.
 * @tls:          If CLONE_SETTLS is set, the tls descriptor
 *                is set to tls.
 * @set_tid:      Pointer to an array of type pid_t. The size
 *                of the array is defined using @set_tid_size.
 *                This array is used to select PIDs/TIDs for
 *                newly created processes. The first element in
 *                this defines the PID in the most nested PID
 *                namespace. Each additional element in the array
 *                defines the PID in the parent PID namespace of
 *                the original PID namespace. If the array has
 *                fewer entries than the number of currently
 *                nested PID namespaces only the PIDs in the
 *                corresponding namespaces are set.
 * @set_tid_size: This defines the size of the array referenced
 *                in @set_tid. This cannot be larger than the
 *                kernel's limit of nested PID namespaces.
 * @cgroup:       If CLONE_INTO_CGROUP is specified set this to
 *                a file descriptor for the cgroup.
 *
 * The structure is versioned by size and thus extensible.
 * New struct members must go at the end of the struct and
 * must be properly 64bit aligned.
 */
struct clone_args {
	__aligned_u64 flags;
	__aligned_u64 pidfd;
	__aligned_u64 child_tid;
	__aligned_u64 parent_tid;
	__aligned_u64 exit_signal;
	__aligned_u64 stack;
	__aligned_u64 stack_size;
	__aligned_u64 tls;
	__aligned_u64 set_tid;
	__aligned_u64 set_tid_size;
	__aligned_u64 cgroup;
};
#endif /* !__ASSEMBLY__ */

/*
 * Byte sizes of the successive published layouts of struct clone_args.
 * The structure is versioned by size, so each value identifies one layout.
 */
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */

/*
 * Scheduling policies
 *
 * The value 4 is intentionally left unassigned (see the SCHED_ISO note).
 */
#define SCHED_NORMAL   0
#define SCHED_FIFO     1
#define SCHED_RR       2
#define SCHED_BATCH    3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE     5
#define SCHED_DEADLINE 6

/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000

/*
 * For the sched_{set,get}attr() calls
 *
 * Each SCHED_FLAG_* below is a single distinct bit; the composite masks
 * that follow are built purely by ORing those bits together.
 */
#define SCHED_FLAG_RESET_ON_FORK  0x01
#define SCHED_FLAG_RECLAIM        0x02
#define SCHED_FLAG_DL_OVERRUN     0x04
#define SCHED_FLAG_KEEP_POLICY    0x08
#define SCHED_FLAG_KEEP_PARAMS    0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40

/* Both "keep" bits: KEEP_POLICY and KEEP_PARAMS. */
#define SCHED_FLAG_KEEP_ALL	(SCHED_FLAG_KEEP_POLICY | \
				 SCHED_FLAG_KEEP_PARAMS)

/* Both utilization clamp bits: UTIL_CLAMP_MIN and UTIL_CLAMP_MAX. */
#define SCHED_FLAG_UTIL_CLAMP	(SCHED_FLAG_UTIL_CLAMP_MIN | \
				 SCHED_FLAG_UTIL_CLAMP_MAX)

/* Union of every SCHED_FLAG_* bit defined above. */
#define SCHED_FLAG_ALL	(SCHED_FLAG_RESET_ON_FORK	| \
			 SCHED_FLAG_RECLAIM		| \
			 SCHED_FLAG_DL_OVERRUN		| \
			 SCHED_FLAG_KEEP_ALL		| \
			 SCHED_FLAG_UTIL_CLAMP)

#endif /* _UAPI_LINUX_SCHED_H */
|