mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-28 11:18:45 +07:00
ef2c41cf38
This adds support for creating a process in a different cgroup than its parent. Callers can limit and account processes and threads right from the moment they are spawned: - A service manager can directly spawn new services into dedicated cgroups. - A process can be directly created in a frozen cgroup and will be frozen as well. - The initial accounting jitter experienced by process supervisors and daemons is eliminated with this. - Threaded applications or even thread implementations can choose to create a specific cgroup layout where each thread is spawned directly into a dedicated cgroup. This feature is limited to the unified hierarchy. Callers need to pass a directory file descriptor for the target cgroup. The caller can choose to pass an O_PATH file descriptor. All usual migration restrictions apply, i.e. there can be no processes in inner nodes. In general, creating a process directly in a target cgroup adheres to all migration restrictions. One of the biggest advantages of this feature is that CLONE_INTO_CGROUP does not need to grab the write side of the cgroup_threadgroup_rwsem. This global lock makes moving tasks/threads around super expensive. With clone3() this lock is avoided. Cc: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Li Zefan <lizefan@huawei.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: cgroups@vger.kernel.org Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com> Signed-off-by: Tejun Heo <tj@kernel.org>
184 lines
4.8 KiB
C
184 lines
4.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_SCHED_TASK_H
|
|
#define _LINUX_SCHED_TASK_H
|
|
|
|
/*
|
|
* Interface between the scheduler and various task lifetime (fork()/exit())
|
|
* functionality:
|
|
*/
|
|
|
|
#include <linux/sched.h>
|
|
#include <linux/uaccess.h>
|
|
|
|
/* Forward declarations for types named by the declarations in this header. */
struct task_struct;
struct rusage;
union thread_union;
struct css_set;
|
|
|
|
/*
 * Mask covering all the bits taken by the old clone syscall, i.e. the low
 * 32 bits of a 64-bit flags word.
 */
#define CLONE_LEGACY_FLAGS	0xffffffffULL
|
|
|
|
struct kernel_clone_args {
|
|
u64 flags;
|
|
int __user *pidfd;
|
|
int __user *child_tid;
|
|
int __user *parent_tid;
|
|
int exit_signal;
|
|
unsigned long stack;
|
|
unsigned long stack_size;
|
|
unsigned long tls;
|
|
pid_t *set_tid;
|
|
/* Number of elements in *set_tid */
|
|
size_t set_tid_size;
|
|
int cgroup;
|
|
struct cgroup *cgrp;
|
|
struct css_set *cset;
|
|
};
|
|
|
|
/*
|
|
* This serializes "schedule()" and also protects
|
|
* the run-queue from deletions/modifications (but
|
|
* _adding_ to the beginning of the run-queue has
|
|
* a separate lock).
|
|
*/
|
|
extern rwlock_t tasklist_lock;
|
|
extern spinlock_t mmlist_lock;
|
|
|
|
extern union thread_union init_thread_union;
|
|
extern struct task_struct init_task;
|
|
|
|
#ifdef CONFIG_PROVE_RCU
/* Declared only for CONFIG_PROVE_RCU builds. */
int lockdep_tasklist_lock_is_held(void);
#endif /* CONFIG_PROVE_RCU */
|
|
|
|
extern asmlinkage void schedule_tail(struct task_struct *prev);
|
|
extern void init_idle(struct task_struct *idle, int cpu);
|
|
|
|
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
|
|
extern void sched_dead(struct task_struct *p);
|
|
|
|
void __noreturn do_task_dead(void);
|
|
|
|
extern void proc_caches_init(void);
|
|
|
|
extern void fork_init(void);
|
|
|
|
extern void release_task(struct task_struct * p);
|
|
|
|
#ifdef CONFIG_HAVE_COPY_THREAD_TLS
int copy_thread_tls(unsigned long, unsigned long, unsigned long,
		    struct task_struct *, unsigned long);
#else
int copy_thread(unsigned long, unsigned long, unsigned long,
		struct task_struct *);

/*
 * Fallback for architectures that haven't opted into copy_thread_tls.
 * Those get the tls argument via pt_regs, so the tls value passed via C
 * is ignored and the remaining arguments are forwarded to copy_thread().
 */
static inline int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
				  unsigned long arg, struct task_struct *p,
				  unsigned long tls)
{
	return copy_thread(clone_flags, sp, arg, p);
}
#endif

void flush_thread(void);
|
|
|
|
#ifdef CONFIG_HAVE_EXIT_THREAD
void exit_thread(struct task_struct *tsk);
#else
/* Empty stub used when CONFIG_HAVE_EXIT_THREAD is not set. */
static inline void exit_thread(struct task_struct *tsk) { }
#endif
|
|
extern void do_group_exit(int);
|
|
|
|
extern void exit_files(struct task_struct *);
|
|
extern void exit_itimers(struct signal_struct *);
|
|
|
|
extern long _do_fork(struct kernel_clone_args *kargs);
|
|
extern bool legacy_clone_args_valid(const struct kernel_clone_args *kargs);
|
|
extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
|
|
struct task_struct *fork_idle(int);
|
|
struct mm_struct *copy_init_mm(void);
|
|
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
|
|
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
|
|
|
|
extern void free_task(struct task_struct *tsk);
|
|
|
|
/*
 * sched_exec is called by processes performing an exec.
 *
 * Without CONFIG_SMP it expands to an empty statement.  The stub is
 * written as do { } while (0) rather than a bare "{}" so that
 * "sched_exec();" is exactly one statement and remains valid inside an
 * unbraced if/else.
 */
#ifdef CONFIG_SMP
extern void sched_exec(void);
#else
#define sched_exec()	do { } while (0)
#endif
|
|
|
|
static inline struct task_struct *get_task_struct(struct task_struct *t)
|
|
{
|
|
refcount_inc(&t->usage);
|
|
return t;
|
|
}
|
|
|
|
extern void __put_task_struct(struct task_struct *t);
|
|
|
|
static inline void put_task_struct(struct task_struct *t)
|
|
{
|
|
if (refcount_dec_and_test(&t->usage))
|
|
__put_task_struct(t);
|
|
}
|
|
|
|
void put_task_struct_rcu_user(struct task_struct *task);
|
|
|
|
/*
 * arch_task_struct_size is an int variable defined elsewhere when
 * CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT is set; otherwise it is simply
 * sizeof(struct task_struct).
 */
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
#else
#define arch_task_struct_size	(sizeof(struct task_struct))
#endif
|
|
|
|
#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST
|
|
/*
|
|
* If an architecture has not declared a thread_struct whitelist we
|
|
* must assume something there may need to be copied to userspace.
|
|
*/
|
|
static inline void arch_thread_struct_whitelist(unsigned long *offset,
|
|
unsigned long *size)
|
|
{
|
|
*offset = 0;
|
|
/* Handle dynamically sized thread_struct. */
|
|
*size = arch_task_struct_size - offsetof(struct task_struct, thread);
|
|
}
|
|
#endif
|
|
|
|
/*
 * Return t->stack_vm_area when CONFIG_VMAP_STACK is enabled, and NULL
 * otherwise.
 */
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
#ifdef CONFIG_VMAP_STACK
	return t->stack_vm_area;
#else
	return NULL;
#endif
}
|
|
|
|
/*
|
|
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
|
|
* subscriptions and synchronises with wait4(). Also used in procfs. Also
|
|
* pins the final release of task.io_context. Also protects ->cpuset and
|
|
* ->cgroup.subsys[]. And ->vfork_done.
|
|
*
|
|
* Nests both inside and outside of read_lock(&tasklist_lock).
|
|
* It must not be nested with write_lock_irq(&tasklist_lock),
|
|
* neither inside nor outside.
|
|
*/
|
|
static inline void task_lock(struct task_struct *p)
|
|
{
|
|
spin_lock(&p->alloc_lock);
|
|
}
|
|
|
|
static inline void task_unlock(struct task_struct *p)
|
|
{
|
|
spin_unlock(&p->alloc_lock);
|
|
}
|
|
|
|
#endif /* _LINUX_SCHED_TASK_H */
|