mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-14 05:46:24 +07:00
49cb2fc42c
The main motivation to add set_tid to clone3() is CRIU. To restore a process with the same PID/TID, CRIU currently uses /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to ns_last_pid and then (quickly) does a clone(). This works most of the time, but it is racy. It is also slow as it requires multiple syscalls. Extending clone3() to support *set_tid makes it possible to restore a process using CRIU without accessing /proc/sys/kernel/ns_last_pid and race free (as long as the desired PID/TID is available). This clone3() extension places the same restrictions (CAP_SYS_ADMIN) on clone3() with *set_tid as are currently in place for ns_last_pid. The original version of this change was using a single value for set_tid. At the 2019 LPC, after presenting set_tid, it was, however, decided to change set_tid to an array to enable setting the PID of a process in multiple PID namespaces at the same time. If a process is created in a PID namespace, it is possible to influence the PID inside and outside of the PID namespace. Details are also in the corresponding selftest. To create a process with the following PIDs (PID NS level: requested PID): level 0 (host): 31496; level 1: 42; level 2: 1. For that example the two newly introduced parameters to struct clone_args (set_tid and set_tid_size) would need to be: set_tid[0] = 1; set_tid[1] = 42; set_tid[2] = 31496; set_tid_size = 3; If only the PIDs of the two innermost nested PID namespaces should be defined, it would look like this: set_tid[0] = 1; set_tid[1] = 42; set_tid_size = 2; The PID of the newly created process would then be the next available free PID in the PID namespace level 0 (host) and 42 in the PID namespace at level 1, and the PID of the process in the innermost PID namespace would be 1. The set_tid array is used to specify the PID of a process starting from the innermost nested PID namespace up to set_tid_size PID namespaces. set_tid_size cannot be larger than the current PID namespace level.
Signed-off-by: Adrian Reber <areber@redhat.com> Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com> Reviewed-by: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Acked-by: Andrei Vagin <avagin@gmail.com> Link: https://lore.kernel.org/r/20191115123621.142252-1-areber@redhat.com Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
180 lines
4.8 KiB
C
180 lines
4.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_SCHED_TASK_H
|
|
#define _LINUX_SCHED_TASK_H
|
|
|
|
/*
|
|
* Interface between the scheduler and various task lifetime (fork()/exit())
|
|
* functionality:
|
|
*/
|
|
|
|
#include <linux/sched.h>
|
|
#include <linux/uaccess.h>
|
|
|
|
struct task_struct;
|
|
struct rusage;
|
|
union thread_union;
|
|
|
|
/* All the bits taken by the old clone syscall. */
|
|
#define CLONE_LEGACY_FLAGS 0xffffffffULL
|
|
|
|
struct kernel_clone_args {
|
|
u64 flags;
|
|
int __user *pidfd;
|
|
int __user *child_tid;
|
|
int __user *parent_tid;
|
|
int exit_signal;
|
|
unsigned long stack;
|
|
unsigned long stack_size;
|
|
unsigned long tls;
|
|
pid_t *set_tid;
|
|
/* Number of elements in *set_tid */
|
|
size_t set_tid_size;
|
|
};
|
|
|
|
/*
|
|
* This serializes "schedule()" and also protects
|
|
* the run-queue from deletions/modifications (but
|
|
* _adding_ to the beginning of the run-queue has
|
|
* a separate lock).
|
|
*/
|
|
extern rwlock_t tasklist_lock;
|
|
extern spinlock_t mmlist_lock;
|
|
|
|
extern union thread_union init_thread_union;
|
|
extern struct task_struct init_task;
|
|
|
|
#ifdef CONFIG_PROVE_RCU
|
|
extern int lockdep_tasklist_lock_is_held(void);
|
|
#endif /* #ifdef CONFIG_PROVE_RCU */
|
|
|
|
extern asmlinkage void schedule_tail(struct task_struct *prev);
|
|
extern void init_idle(struct task_struct *idle, int cpu);
|
|
|
|
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
|
|
extern void sched_dead(struct task_struct *p);
|
|
|
|
void __noreturn do_task_dead(void);
|
|
|
|
extern void proc_caches_init(void);
|
|
|
|
extern void fork_init(void);
|
|
|
|
extern void release_task(struct task_struct * p);
|
|
|
|
#ifdef CONFIG_HAVE_COPY_THREAD_TLS
extern int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
			   unsigned long arg, struct task_struct *p,
			   unsigned long tls);
#else
extern int copy_thread(unsigned long clone_flags, unsigned long sp,
		       unsigned long arg, struct task_struct *p);

/*
 * Fallback wrapper for architectures that have not opted into
 * copy_thread_tls: they get the tls argument via pt_regs, so the @tls
 * value passed via C is ignored and the call is forwarded to
 * copy_thread() unchanged.
 */
static inline int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
				  unsigned long arg, struct task_struct *p,
				  unsigned long tls)
{
	return copy_thread(clone_flags, sp, arg, p);
}
#endif
|
|
extern void flush_thread(void);
|
|
|
|
#ifdef CONFIG_HAVE_EXIT_THREAD
extern void exit_thread(struct task_struct *tsk);
#else
/* Empty stub used when CONFIG_HAVE_EXIT_THREAD is not set. */
static inline void exit_thread(struct task_struct *tsk) { }
#endif
|
|
extern void do_group_exit(int);
|
|
|
|
extern void exit_files(struct task_struct *);
|
|
extern void exit_itimers(struct signal_struct *);
|
|
|
|
extern long _do_fork(struct kernel_clone_args *kargs);
|
|
extern bool legacy_clone_args_valid(const struct kernel_clone_args *kargs);
|
|
extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
|
|
struct task_struct *fork_idle(int);
|
|
struct mm_struct *copy_init_mm(void);
|
|
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
|
|
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
|
|
|
|
extern void free_task(struct task_struct *tsk);
|
|
|
|
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
extern void sched_exec(void);
#else
/*
 * Without CONFIG_SMP this is a no-op.  It expands to a single empty
 * statement (rather than a bare "{}") so that "sched_exec();" parses like
 * a function call everywhere, including as the unbraced arm of an if/else.
 */
#define sched_exec()   do { } while (0)
#endif
|
|
|
|
static inline struct task_struct *get_task_struct(struct task_struct *t)
|
|
{
|
|
refcount_inc(&t->usage);
|
|
return t;
|
|
}
|
|
|
|
extern void __put_task_struct(struct task_struct *t);
|
|
|
|
static inline void put_task_struct(struct task_struct *t)
|
|
{
|
|
if (refcount_dec_and_test(&t->usage))
|
|
__put_task_struct(t);
|
|
}
|
|
|
|
void put_task_struct_rcu_user(struct task_struct *task);
|
|
|
|
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
|
|
extern int arch_task_struct_size __read_mostly;
|
|
#else
|
|
# define arch_task_struct_size (sizeof(struct task_struct))
|
|
#endif
|
|
|
|
#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST
|
|
/*
|
|
* If an architecture has not declared a thread_struct whitelist we
|
|
* must assume something there may need to be copied to userspace.
|
|
*/
|
|
static inline void arch_thread_struct_whitelist(unsigned long *offset,
|
|
unsigned long *size)
|
|
{
|
|
*offset = 0;
|
|
/* Handle dynamically sized thread_struct. */
|
|
*size = arch_task_struct_size - offsetof(struct task_struct, thread);
|
|
}
|
|
#endif
|
|
|
|
/*
 * Return t->stack_vm_area when CONFIG_VMAP_STACK is enabled; otherwise
 * always return NULL.
 */
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
#ifdef CONFIG_VMAP_STACK
	return t->stack_vm_area;
#else
	return NULL;
#endif
}
|
|
|
|
/*
|
|
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
|
|
* subscriptions and synchronises with wait4(). Also used in procfs. Also
|
|
* pins the final release of task.io_context. Also protects ->cpuset and
|
|
* ->cgroup.subsys[]. And ->vfork_done.
|
|
*
|
|
* Nests both inside and outside of read_lock(&tasklist_lock).
|
|
* It must not be nested with write_lock_irq(&tasklist_lock),
|
|
* neither inside nor outside.
|
|
*/
|
|
static inline void task_lock(struct task_struct *p)
|
|
{
|
|
spin_lock(&p->alloc_lock);
|
|
}
|
|
|
|
static inline void task_unlock(struct task_struct *p)
|
|
{
|
|
spin_unlock(&p->alloc_lock);
|
|
}
|
|
|
|
#endif /* _LINUX_SCHED_TASK_H */
|