mirror of https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-11-24 06:30:53 +07:00

Merge branch 'for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - threadgroup_lock got reorganized so that its users can pick the
   actual locking mechanism to use.  Its only user - cgroups - is
   updated to use a percpu_rwsem instead of per-process rwsem.  This
   makes things a bit lighter on hot paths and allows cgroups to
   perform and fail multi-task (a process) migrations atomically.
   Multi-task migrations are used in several places including the
   unified hierarchy.

 - Delegation rule and documentation added to unified hierarchy.  This
   will likely be the last interface update from the cgroup core side
   for unified hierarchy before lifting the devel mask.

 - Some groundwork for the pids controller which is scheduled to be
   merged in the coming devel cycle.

* 'for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: add delegation section to unified hierarchy documentation
  cgroup: require write perm on common ancestor when moving processes on the default hierarchy
  cgroup: separate out cgroup_procs_write_permission() from __cgroup_procs_write()
  kernfs: make kernfs_get_inode() public
  MAINTAINERS: add a cgroup core co-maintainer
  cgroup: fix uninitialised iterator in for_each_subsys_which
  cgroup: replace explicit ss_mask checking with for_each_subsys_which
  cgroup: use bitmask to filter for_each_subsys
  cgroup: add seq_file forward declaration for struct cftype
  cgroup: simplify threadgroup locking
  sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem
  sched, cgroup: reorganize threadgroup locking
  cgroup: switch to unsigned long for bitmasks
  cgroup: reorganize include/linux/cgroup.h
  cgroup: separate out include/linux/cgroup-defs.h
  cgroup: fix some comment typos

This commit is contained in: commit bbe179f88d
Documentation/cgroups/unified-hierarchy.txt

@@ -17,15 +17,18 @@ CONTENTS
 3. Structural Constraints
   3-1. Top-down
   3-2. No internal tasks
-4. Other Changes
-  4-1. [Un]populated Notification
-  4-2. Other Core Changes
-  4-3. Per-Controller Changes
-    4-3-1. blkio
-    4-3-2. cpuset
-    4-3-3. memory
-5. Planned Changes
-  5-1. CAP for resource control
+4. Delegation
+  4-1. Model of delegation
+  4-2. Common ancestor rule
+5. Other Changes
+  5-1. [Un]populated Notification
+  5-2. Other Core Changes
+  5-3. Per-Controller Changes
+    5-3-1. blkio
+    5-3-2. cpuset
+    5-3-3. memory
+6. Planned Changes
+  6-1. CAP for resource control


 1. Background
@@ -245,9 +248,72 @@ cgroup must create children and transfer all its tasks to the children
 before enabling controllers in its "cgroup.subtree_control" file.


-4. Other Changes
+4. Delegation

-4-1. [Un]populated Notification
+4-1. Model of delegation
+
+A cgroup can be delegated to a less privileged user by granting write
+access of the directory and its "cgroup.procs" file to the user.  Note
+that the resource control knobs in a given directory concern the
+resources of the parent and thus must not be delegated along with the
+directory.
+
+Once delegated, the user can build sub-hierarchy under the directory,
+organize processes as it sees fit and further distribute the resources
+it got from the parent.  The limits and other settings of all resource
+controllers are hierarchical and regardless of what happens in the
+delegated sub-hierarchy, nothing can escape the resource restrictions
+imposed by the parent.
+
+Currently, cgroup doesn't impose any restrictions on the number of
+cgroups in or nesting depth of a delegated sub-hierarchy; however,
+this may in the future be limited explicitly.
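As a concrete illustration of the delegation model above, here is a minimal
userspace sketch (editorial, not part of this commit; the path and ids are
hypothetical) that hands an existing cgroup to a delegatee exactly as the
text describes: write access on the directory and its "cgroup.procs", and
nothing else.

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Delegate @dir to @uid:@gid by granting write access on the directory
 * and its "cgroup.procs".  The resource control knobs are deliberately
 * left alone: they configure the parent's resources and must not be
 * delegated along with the directory.
 */
static int delegate_cgroup(const char *dir, uid_t uid, gid_t gid)
{
        char procs[4096];

        if (chown(dir, uid, gid))
                return -1;
        snprintf(procs, sizeof(procs), "%s/cgroup.procs", dir);
        return chown(procs, uid, gid);
}

int main(void)
{
        /* hypothetical delegation point and user */
        return delegate_cgroup("/sys/fs/cgroup/C0", 1000, 1000) ? 1 : 0;
}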
+
+4-2. Common ancestor rule
+
+On the unified hierarchy, to write to a "cgroup.procs" file, in
+addition to the usual write permission to the file and uid match, the
+writer must also have write access to the "cgroup.procs" file of the
+common ancestor of the source and destination cgroups.  This prevents
+delegatees from smuggling processes across disjoint sub-hierarchies.
+
+Let's say cgroups C0 and C1 have been delegated to user U0 who created
+C00, C01 under C0 and C10 under C1 as follows.
+
+ ~~~~~~~~~~~~~ - C0 - C00
+ ~ cgroup    ~      \ C01
+ ~ hierarchy ~
+ ~~~~~~~~~~~~~ - C1 - C10
+
+C0 and C1 are separate entities in terms of resource distribution
+regardless of their relative positions in the hierarchy.  The
+resources the processes under C0 are entitled to are controlled by
+C0's ancestors and may be completely different from C1.  It's clear
+that the intention of delegating C0 to U0 is allowing U0 to organize
+the processes under C0 and further control the distribution of C0's
+resources.
+
+On traditional hierarchies, if a task has write access to "tasks" or
+"cgroup.procs" file of a cgroup and its uid agrees with the target, it
+can move the target to the cgroup.  In the above example, U0 will not
+only be able to move processes in each sub-hierarchy but also across
+the two sub-hierarchies, effectively allowing it to violate the
+organizational and resource restrictions implied by the hierarchical
+structure above C0 and C1.
+
+On the unified hierarchy, let's say U0 wants to write the pid of a
+process which has a matching uid and is currently in C10 into
+"C00/cgroup.procs".  U0 obviously has write access to the file and
+migration permission on the process; however, the common ancestor of
+the source cgroup C10 and the destination cgroup C00 is above the
+points of delegation and U0 would not have write access to its
+"cgroup.procs" and thus be denied with -EACCES.
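A matching userspace sketch of the scenario above (again editorial, not from
the patch; paths and the pid are hypothetical): U0 writes a pid into
"C00/cgroup.procs".  On the unified hierarchy the write fails with EACCES
even though the file itself is writable, because the common ancestor's
"cgroup.procs" sits above the delegation points.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/* Try to move @pid into the cgroup rooted at @cgrp_dir. */
static int move_pid(const char *cgrp_dir, pid_t pid)
{
        char path[4096], buf[32];
        int fd, ret = 0;

        snprintf(path, sizeof(path), "%s/cgroup.procs", cgrp_dir);
        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -errno;
        snprintf(buf, sizeof(buf), "%d", (int)pid);
        if (write(fd, buf, strlen(buf)) < 0)
                ret = -errno;   /* -EACCES: common ancestor rule */
        close(fd);
        return ret;
}

int main(void)
{
        int ret = move_pid("/sys/fs/cgroup/C0/C00", 1234);      /* hypothetical */

        if (ret == -EACCES)
                fprintf(stderr, "denied by common ancestor rule\n");
        return ret ? 1 : 0;
}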
+
+
+5. Other Changes
+
+5-1. [Un]populated Notification

 cgroup users often need a way to determine when a cgroup's
 subhierarchy becomes empty so that it can be cleaned up.  cgroup
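The paragraph is truncated by the hunk here.  For context, a hedged sketch of
how a manager might consume such a notification; it assumes the
"cgroup.populated" file and the poll(2) semantics this document describes
elsewhere, neither of which is visible in this hunk, so treat the file name
and event type as assumptions.

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

/* Block until the populated state of the cgroup changes, then report it. */
static int wait_populated_change(const char *populated_path)
{
        char state;
        struct pollfd pfd;

        pfd.fd = open(populated_path, O_RDONLY);
        if (pfd.fd < 0)
                return -1;
        pfd.events = POLLPRI;           /* assumed: state changes raise POLLPRI */
        poll(&pfd, 1, -1);
        pread(pfd.fd, &state, 1, 0);
        printf("populated: %c\n", state);       /* '0' => safe to clean up */
        close(pfd.fd);
        return 0;
}

int main(void)
{
        /* hypothetical delegated cgroup */
        return wait_populated_change("/sys/fs/cgroup/C0/cgroup.populated");
}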
@@ -289,7 +355,7 @@ supported and the interface files "release_agent" and
 "notify_on_release" do not exist.


-4-2. Other Core Changes
+5-2. Other Core Changes

 - None of the mount options is allowed.

@@ -306,14 +372,14 @@ supported and the interface files "release_agent" and
 - The "cgroup.clone_children" file is removed.


-4-3. Per-Controller Changes
+5-3. Per-Controller Changes

-4-3-1. blkio
+5-3-1. blkio

 - blk-throttle becomes properly hierarchical.


-4-3-2. cpuset
+5-3-2. cpuset

 - Tasks are kept in empty cpusets after hotplug and take on the masks
   of the nearest non-empty ancestor, instead of being moved to it.

@@ -322,7 +388,7 @@ supported and the interface files "release_agent" and
   masks of the nearest non-empty ancestor.


-4-3-3. memory
+5-3-3. memory

 - use_hierarchy is on by default and the cgroup file for the flag is
   not created.

@@ -407,9 +473,9 @@ supported and the interface files "release_agent" and
 memory.low, memory.high, and memory.max will use the string "max" to
 indicate and set the highest possible value.

-5. Planned Changes
+6. Planned Changes

-5-1. CAP for resource control
+6-1. CAP for resource control

 Unified hierarchy will require one of the capabilities(7), which is
 yet to be decided, for all resource control related knobs.  Process
MAINTAINERS

@@ -2816,6 +2816,7 @@ F:	drivers/connector/
 CONTROL GROUP (CGROUP)
 M:	Tejun Heo <tj@kernel.org>
 M:	Li Zefan <lizefan@huawei.com>
+M:	Johannes Weiner <hannes@cmpxchg.org>
 L:	cgroups@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 S:	Maintained
fs/kernfs/kernfs-internal.h

@@ -76,7 +76,6 @@ extern struct kmem_cache *kernfs_node_cache;
 /*
  * inode.c
  */
-struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
 void kernfs_evict_inode(struct inode *inode);
 int kernfs_iop_permission(struct inode *inode, int mask);
 int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
include/linux/cgroup-defs.h (new file, 501 lines)

@@ -0,0 +1,501 @@
/*
 * linux/cgroup-defs.h - basic definitions for cgroup
 *
 * This file provides basic type and interface.  Include this file directly
 * only if necessary to avoid cyclic dependencies.
 */
#ifndef _LINUX_CGROUP_DEFS_H
#define _LINUX_CGROUP_DEFS_H

#include <linux/limits.h>
#include <linux/list.h>
#include <linux/idr.h>
#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>

#ifdef CONFIG_CGROUPS

struct cgroup;
struct cgroup_root;
struct cgroup_subsys;
struct cgroup_taskset;
struct kernfs_node;
struct kernfs_ops;
struct kernfs_open_file;
struct seq_file;

#define MAX_CGROUP_TYPE_NAMELEN 32
#define MAX_CGROUP_ROOT_NAMELEN 64
#define MAX_CFTYPE_NAME         64

/* define the enumeration of all cgroup subsystems */
#define SUBSYS(_x) _x ## _cgrp_id,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
        CGROUP_SUBSYS_COUNT,
};
#undef SUBSYS

/* bits in struct cgroup_subsys_state flags field */
enum {
        CSS_NO_REF      = (1 << 0), /* no reference counting for this css */
        CSS_ONLINE      = (1 << 1), /* between ->css_online() and ->css_offline() */
        CSS_RELEASED    = (1 << 2), /* refcnt reached zero, released */
};

/* bits in struct cgroup flags field */
enum {
        /* Control Group requires release notifications to userspace */
        CGRP_NOTIFY_ON_RELEASE,
        /*
         * Clone the parent's configuration when creating a new child
         * cpuset cgroup.  For historical reasons, this option can be
         * specified at mount time and thus is implemented here.
         */
        CGRP_CPUSET_CLONE_CHILDREN,
};

/* cgroup_root->flags */
enum {
        CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
        CGRP_ROOT_NOPREFIX      = (1 << 1), /* mounted subsystems have no named prefix */
        CGRP_ROOT_XATTR         = (1 << 2), /* supports extended attributes */
};

/* cftype->flags */
enum {
        CFTYPE_ONLY_ON_ROOT     = (1 << 0),     /* only create on root cgrp */
        CFTYPE_NOT_ON_ROOT      = (1 << 1),     /* don't create on root cgrp */
        CFTYPE_NO_PREFIX        = (1 << 3),     /* (DON'T USE FOR NEW FILES) no subsys prefix */

        /* internal flags, do not use outside cgroup core proper */
        __CFTYPE_ONLY_ON_DFL    = (1 << 16),    /* only on default hierarchy */
        __CFTYPE_NOT_ON_DFL     = (1 << 17),    /* not on default hierarchy */
};

/*
 * Per-subsystem/per-cgroup state maintained by the system.  This is the
 * fundamental structural building block that controllers deal with.
 *
 * Fields marked with "PI:" are public and immutable and may be accessed
 * directly without synchronization.
 */
struct cgroup_subsys_state {
        /* PI: the cgroup that this css is attached to */
        struct cgroup *cgroup;

        /* PI: the cgroup subsystem that this css is attached to */
        struct cgroup_subsys *ss;

        /* reference count - access via css_[try]get() and css_put() */
        struct percpu_ref refcnt;

        /* PI: the parent css */
        struct cgroup_subsys_state *parent;

        /* siblings list anchored at the parent's ->children */
        struct list_head sibling;
        struct list_head children;

        /*
         * PI: Subsys-unique ID.  0 is unused and root is always 1.  The
         * matching css can be looked up using css_from_id().
         */
        int id;

        unsigned int flags;

        /*
         * Monotonically increasing unique serial number which defines a
         * uniform order among all csses.  It's guaranteed that all
         * ->children lists are in the ascending order of ->serial_nr and
         * used to allow interrupting and resuming iterations.
         */
        u64 serial_nr;

        /* percpu_ref killing and RCU release */
        struct rcu_head rcu_head;
        struct work_struct destroy_work;
};

/*
 * A css_set is a structure holding pointers to a set of
 * cgroup_subsys_state objects. This saves space in the task struct
 * object and speeds up fork()/exit(), since a single inc/dec and a
 * list_add()/del() can bump the reference count on the entire cgroup
 * set for a task.
 */
struct css_set {
        /* Reference count */
        atomic_t refcount;

        /*
         * List running through all cgroup groups in the same hash
         * slot. Protected by css_set_lock
         */
        struct hlist_node hlist;

        /*
         * Lists running through all tasks using this cgroup group.
         * mg_tasks lists tasks which belong to this cset but are in the
         * process of being migrated out or in.  Protected by
         * css_set_rwsem, but, during migration, once tasks are moved to
         * mg_tasks, it can be read safely while holding cgroup_mutex.
         */
        struct list_head tasks;
        struct list_head mg_tasks;

        /*
         * List of cgrp_cset_links pointing at cgroups referenced from this
         * css_set.  Protected by css_set_lock.
         */
        struct list_head cgrp_links;

        /* the default cgroup associated with this css_set */
        struct cgroup *dfl_cgrp;

        /*
         * Set of subsystem states, one for each subsystem. This array is
         * immutable after creation apart from the init_css_set during
         * subsystem registration (at boot time).
         */
        struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];

        /*
         * List of csets participating in the on-going migration either as
         * source or destination.  Protected by cgroup_mutex.
         */
        struct list_head mg_preload_node;
        struct list_head mg_node;

        /*
         * If this cset is acting as the source of migration the following
         * two fields are set.  mg_src_cgrp is the source cgroup of the
         * on-going migration and mg_dst_cset is the destination cset the
         * target tasks on this cset should be migrated to.  Protected by
         * cgroup_mutex.
         */
        struct cgroup *mg_src_cgrp;
        struct css_set *mg_dst_cset;

        /*
         * On the default hierarhcy, ->subsys[ssid] may point to a css
         * attached to an ancestor instead of the cgroup this css_set is
         * associated with.  The following node is anchored at
         * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
         * iterate through all css's attached to a given cgroup.
         */
        struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];

        /* For RCU-protected deletion */
        struct rcu_head rcu_head;
};

struct cgroup {
        /* self css with NULL ->ss, points back to this cgroup */
        struct cgroup_subsys_state self;

        unsigned long flags;            /* "unsigned long" so bitops work */

        /*
         * idr allocated in-hierarchy ID.
         *
         * ID 0 is not used, the ID of the root cgroup is always 1, and a
         * new cgroup will be assigned with a smallest available ID.
         *
         * Allocating/Removing ID must be protected by cgroup_mutex.
         */
        int id;

        /*
         * If this cgroup contains any tasks, it contributes one to
         * populated_cnt.  All children with non-zero popuplated_cnt of
         * their own contribute one.  The count is zero iff there's no task
         * in this cgroup or its subtree.
         */
        int populated_cnt;

        struct kernfs_node *kn;         /* cgroup kernfs entry */
        struct kernfs_node *procs_kn;   /* kn for "cgroup.procs" */
        struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */

        /*
         * The bitmask of subsystems enabled on the child cgroups.
         * ->subtree_control is the one configured through
         * "cgroup.subtree_control" while ->child_subsys_mask is the
         * effective one which may have more subsystems enabled.
         * Controller knobs are made available iff it's enabled in
         * ->subtree_control.
         */
        unsigned int subtree_control;
        unsigned int child_subsys_mask;

        /* Private pointers for each registered subsystem */
        struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];

        struct cgroup_root *root;

        /*
         * List of cgrp_cset_links pointing at css_sets with tasks in this
         * cgroup.  Protected by css_set_lock.
         */
        struct list_head cset_links;

        /*
         * On the default hierarchy, a css_set for a cgroup with some
         * susbsys disabled will point to css's which are associated with
         * the closest ancestor which has the subsys enabled.  The
         * following lists all css_sets which point to this cgroup's css
         * for the given subsystem.
         */
        struct list_head e_csets[CGROUP_SUBSYS_COUNT];

        /*
         * list of pidlists, up to two for each namespace (one for procs, one
         * for tasks); created on demand.
         */
        struct list_head pidlists;
        struct mutex pidlist_mutex;

        /* used to wait for offlining of csses */
        wait_queue_head_t offline_waitq;

        /* used to schedule release agent */
        struct work_struct release_agent_work;
};

/*
 * A cgroup_root represents the root of a cgroup hierarchy, and may be
 * associated with a kernfs_root to form an active hierarchy.  This is
 * internal to cgroup core.  Don't access directly from controllers.
 */
struct cgroup_root {
        struct kernfs_root *kf_root;

        /* The bitmask of subsystems attached to this hierarchy */
        unsigned int subsys_mask;

        /* Unique id for this hierarchy. */
        int hierarchy_id;

        /* The root cgroup.  Root is destroyed on its release. */
        struct cgroup cgrp;

        /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
        atomic_t nr_cgrps;

        /* A list running through the active hierarchies */
        struct list_head root_list;

        /* Hierarchy-specific flags */
        unsigned int flags;

        /* IDs for cgroups in this hierarchy */
        struct idr cgroup_idr;

        /* The path to use for release notifications. */
        char release_agent_path[PATH_MAX];

        /* The name for this hierarchy - may be empty */
        char name[MAX_CGROUP_ROOT_NAMELEN];
};

/*
 * struct cftype: handler definitions for cgroup control files
 *
 * When reading/writing to a file:
 *      - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata
 *      - the 'cftype' of the file is file->f_path.dentry->d_fsdata
 */
struct cftype {
        /*
         * By convention, the name should begin with the name of the
         * subsystem, followed by a period.  Zero length string indicates
         * end of cftype array.
         */
        char name[MAX_CFTYPE_NAME];
        int private;
        /*
         * If not 0, file mode is set to this value, otherwise it will
         * be figured out automatically
         */
        umode_t mode;

        /*
         * The maximum length of string, excluding trailing nul, that can
         * be passed to write.  If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
         */
        size_t max_write_len;

        /* CFTYPE_* flags */
        unsigned int flags;

        /*
         * Fields used for internal bookkeeping.  Initialized automatically
         * during registration.
         */
        struct cgroup_subsys *ss;       /* NULL for cgroup core files */
        struct list_head node;          /* anchored at ss->cfts */
        struct kernfs_ops *kf_ops;

        /*
         * read_u64() is a shortcut for the common case of returning a
         * single integer. Use it in place of read()
         */
        u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
        /*
         * read_s64() is a signed version of read_u64()
         */
        s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);

        /* generic seq_file read interface */
        int (*seq_show)(struct seq_file *sf, void *v);

        /* optional ops, implement all or none */
        void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
        void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
        void (*seq_stop)(struct seq_file *sf, void *v);

        /*
         * write_u64() is a shortcut for the common case of accepting
         * a single integer (as parsed by simple_strtoull) from
         * userspace. Use in place of write(); return 0 or error.
         */
        int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
                         u64 val);
        /*
         * write_s64() is a signed version of write_u64()
         */
        int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
                         s64 val);

        /*
         * write() is the generic write callback which maps directly to
         * kernfs write operation and overrides all other operations.
         * Maximum write size is determined by ->max_write_len.  Use
         * of_css/cft() to access the associated css and cft.
         */
        ssize_t (*write)(struct kernfs_open_file *of,
                         char *buf, size_t nbytes, loff_t off);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lock_class_key lockdep_key;
#endif
};

/*
 * Control Group subsystem type.
 * See Documentation/cgroups/cgroups.txt for details
 */
struct cgroup_subsys {
        struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
        int (*css_online)(struct cgroup_subsys_state *css);
        void (*css_offline)(struct cgroup_subsys_state *css);
        void (*css_released)(struct cgroup_subsys_state *css);
        void (*css_free)(struct cgroup_subsys_state *css);
        void (*css_reset)(struct cgroup_subsys_state *css);
        void (*css_e_css_changed)(struct cgroup_subsys_state *css);

        int (*can_attach)(struct cgroup_subsys_state *css,
                          struct cgroup_taskset *tset);
        void (*cancel_attach)(struct cgroup_subsys_state *css,
                              struct cgroup_taskset *tset);
        void (*attach)(struct cgroup_subsys_state *css,
                       struct cgroup_taskset *tset);
        void (*fork)(struct task_struct *task);
        void (*exit)(struct cgroup_subsys_state *css,
                     struct cgroup_subsys_state *old_css,
                     struct task_struct *task);
        void (*bind)(struct cgroup_subsys_state *root_css);

        int disabled;
        int early_init;

        /*
         * If %false, this subsystem is properly hierarchical -
         * configuration, resource accounting and restriction on a parent
         * cgroup cover those of its children.  If %true, hierarchy support
         * is broken in some ways - some subsystems ignore hierarchy
         * completely while others are only implemented half-way.
         *
         * It's now disallowed to create nested cgroups if the subsystem is
         * broken and cgroup core will emit a warning message on such
         * cases.  Eventually, all subsystems will be made properly
         * hierarchical and this will go away.
         */
        bool broken_hierarchy;
        bool warned_broken_hierarchy;

        /* the following two fields are initialized automtically during boot */
        int id;
        const char *name;

        /* link to parent, protected by cgroup_lock() */
        struct cgroup_root *root;

        /* idr for css->id */
        struct idr css_idr;

        /*
         * List of cftypes.  Each entry is the first entry of an array
         * terminated by zero length name.
         */
        struct list_head cfts;

        /*
         * Base cftypes which are automatically registered.  The two can
         * point to the same array.
         */
        struct cftype *dfl_cftypes;     /* for the default hierarchy */
        struct cftype *legacy_cftypes;  /* for the legacy hierarchies */

        /*
         * A subsystem may depend on other subsystems.  When such subsystem
         * is enabled on a cgroup, the depended-upon subsystems are enabled
         * together if available.  Subsystems enabled due to dependency are
         * not visible to userland until explicitly enabled.  The following
         * specifies the mask of subsystems that this one depends on.
         */
        unsigned int depends_on;
};

extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;

/**
 * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
 * @tsk: target task
 *
 * Called from threadgroup_change_begin() and allows cgroup operations to
 * synchronize against threadgroup changes using a percpu_rw_semaphore.
 */
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
{
        percpu_down_read(&cgroup_threadgroup_rwsem);
}

/**
 * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
 * @tsk: target task
 *
 * Called from threadgroup_change_end().  Counterpart of
 * cgroup_threadcgroup_change_begin().
 */
static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
{
        percpu_up_read(&cgroup_threadgroup_rwsem);
}

#else   /* CONFIG_CGROUPS */

#define CGROUP_SUBSYS_COUNT 0

static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}

#endif  /* CONFIG_CGROUPS */

#endif  /* _LINUX_CGROUP_DEFS_H */
File diff suppressed because it is too large (include/linux/cgroup.h).
include/linux/init_task.h

@@ -25,13 +25,6 @@
 extern struct files_struct init_files;
 extern struct fs_struct init_fs;

-#ifdef CONFIG_CGROUPS
-#define INIT_GROUP_RWSEM(sig)						\
-	.group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
-#else
-#define INIT_GROUP_RWSEM(sig)
-#endif
-
 #ifdef CONFIG_CPUSETS
 #define INIT_CPUSET_SEQ(tsk)						\
 	.mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
@@ -55,7 +48,6 @@ extern struct fs_struct init_fs;
 	},								\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
-	INIT_GROUP_RWSEM(sig)						\
 }

 extern struct nsproxy init_nsproxy;
include/linux/kernfs.h

@@ -277,6 +277,7 @@ void kernfs_put(struct kernfs_node *kn);

 struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
 struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);
+struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);

 struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
				       unsigned int flags, void *priv);
@@ -352,6 +353,10 @@ static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
 static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
 { return NULL; }

+static inline struct inode *
+kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
+{ return NULL; }
+
 static inline struct kernfs_root *
 kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags,
		    void *priv)
include/linux/sched.h

@@ -58,6 +58,7 @@ struct sched_param {
 #include <linux/uidgid.h>
 #include <linux/gfp.h>
 #include <linux/magic.h>
+#include <linux/cgroup-defs.h>

 #include <asm/processor.h>

@@ -755,18 +756,6 @@ struct signal_struct {
 	unsigned audit_tty_log_passwd;
 	struct tty_audit_buf *tty_audit_buf;
 #endif
-#ifdef CONFIG_CGROUPS
-	/*
-	 * group_rwsem prevents new tasks from entering the threadgroup and
-	 * member tasks from exiting,a more specifically, setting of
-	 * PF_EXITING.  fork and exit paths are protected with this rwsem
-	 * using threadgroup_change_begin/end().  Users which require
-	 * threadgroup to remain stable should use threadgroup_[un]lock()
-	 * which also takes care of exec path.  Currently, cgroup is the
-	 * only user.
-	 */
-	struct rw_semaphore group_rwsem;
-#endif

 	oom_flags_t oom_flags;
 	short oom_score_adj;		/* OOM kill score adjustment */
@@ -2725,54 +2714,34 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
 	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
 }

-#ifdef CONFIG_CGROUPS
 /**
  * threadgroup_change_begin - mark the beginning of changes to a threadgroup
  * @tsk: task causing the changes
  *
  * All operations which modify a threadgroup - a new thread joining the
  * group, death of a member thread (the assertion of PF_EXITING) and
  * exec(2) dethreading the process and replacing the leader - are wrapped
  * by threadgroup_change_{begin|end}().  This is to provide a place which
  * subsystems needing threadgroup stability can hook into for
  * synchronization.
  */
 static inline void threadgroup_change_begin(struct task_struct *tsk)
 {
-	down_read(&tsk->signal->group_rwsem);
+	might_sleep();
+	cgroup_threadgroup_change_begin(tsk);
 }

 /**
  * threadgroup_change_end - mark the end of changes to a threadgroup
  * @tsk: task causing the changes
  *
  * See threadgroup_change_begin().
  */
 static inline void threadgroup_change_end(struct task_struct *tsk)
 {
-	up_read(&tsk->signal->group_rwsem);
+	cgroup_threadgroup_change_end(tsk);
 }

-/**
- * threadgroup_lock - lock threadgroup
- * @tsk: member task of the threadgroup to lock
- *
- * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
- * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
- * change ->group_leader/pid.  This is useful for cases where the threadgroup
- * needs to stay stable across blockable operations.
- *
- * fork and exit paths explicitly call threadgroup_change_{begin|end}() for
- * synchronization.  While held, no new task will be added to threadgroup
- * and no existing live task will have its PF_EXITING set.
- *
- * de_thread() does threadgroup_change_{begin|end}() when a non-leader
- * sub-thread becomes a new leader.
- */
-static inline void threadgroup_lock(struct task_struct *tsk)
-{
-	down_write(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_unlock - unlock threadgroup
- * @tsk: member task of the threadgroup to unlock
- *
- * Reverse threadgroup_lock().
- */
-static inline void threadgroup_unlock(struct task_struct *tsk)
-{
-	up_write(&tsk->signal->group_rwsem);
-}
-#else
-static inline void threadgroup_change_begin(struct task_struct *tsk) {}
-static inline void threadgroup_change_end(struct task_struct *tsk) {}
-static inline void threadgroup_lock(struct task_struct *tsk) {}
-static inline void threadgroup_unlock(struct task_struct *tsk) {}
-#endif

 #ifndef __HAVE_THREAD_FUNCTIONS

 #define task_thread_info(task)	((struct thread_info *)(task)->stack)
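This hunk is the heart of the series: the per-signal_struct rwsem goes away,
fork/exit hot paths take the global cgroup_threadgroup_rwsem for reading, and
process migration takes it for writing, which is what lets a whole process be
migrated (or fail) atomically.  A userspace analogue of the two roles, a
sketch only, with an ordinary pthread rwlock standing in for the percpu_rwsem
(which is far cheaper on the read side):

#include <pthread.h>
#include <stdio.h>

/* Stand-in for the global cgroup_threadgroup_rwsem. */
static pthread_rwlock_t threadgroup_rwsem = PTHREAD_RWLOCK_INITIALIZER;

/* Hot path (fork/exit): shared acquisition, many concurrent holders. */
static void thread_enters_or_exits_group(void)
{
        pthread_rwlock_rdlock(&threadgroup_rwsem);
        /* ... link or unlink the thread from its group ... */
        pthread_rwlock_unlock(&threadgroup_rwsem);
}

/* Migration path: exclusive acquisition freezes *all* threadgroups at
 * once, so every thread of the target process can be moved atomically. */
static void migrate_process(void)
{
        pthread_rwlock_wrlock(&threadgroup_rwsem);
        /* ... move every thread of the target process to the new cgroup ... */
        pthread_rwlock_unlock(&threadgroup_rwsem);
}

int main(void)
{
        thread_enters_or_exits_group();
        migrate_process();
        puts("ok");
        return 0;
}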
init/Kconfig

@@ -924,6 +924,7 @@ config NUMA_BALANCING_DEFAULT_ENABLED
 menuconfig CGROUPS
 	bool "Control Group support"
 	select KERNFS
+	select PERCPU_RWSEM
 	help
 	  This option adds support for grouping sets of processes together, for
 	  use with process control subsystems such as Cpusets, CFS, memory
kernel/cgroup.c (273 lines changed)
@@ -46,6 +46,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
@@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);

+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
 #define cgroup_assert_mutex_or_rcu_locked()				\
 	rcu_lockdep_assert(rcu_read_lock_held() ||			\
 			   lockdep_is_held(&cgroup_mutex),		\
@@ -156,7 +159,7 @@ static bool cgrp_dfl_root_visible;
 static bool cgroup_legacy_files_on_dfl;

 /* some controllers are not supported in the default hierarchy */
-static unsigned int cgrp_dfl_root_inhibit_ss_mask;
+static unsigned long cgrp_dfl_root_inhibit_ss_mask;

 /* The list of hierarchy roots */

@@ -175,18 +178,19 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
  */
 static u64 css_serial_nr_next = 1;

-/* This flag indicates whether tasks in the fork and exit paths should
- * check for fork/exit handlers to call. This avoids us having to do
- * extra work in the fork/exit path if none of the subsystems need to
- * be called.
+/*
+ * These bitmask flags indicate whether tasks in the fork and exit paths have
+ * fork/exit handlers to call. This avoids us having to do extra work in the
+ * fork/exit path to check which subsystems have fork/exit callbacks.
  */
-static int need_forkexit_callback __read_mostly;
+static unsigned long have_fork_callback __read_mostly;
+static unsigned long have_exit_callback __read_mostly;

 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];

 static int rebind_subsystems(struct cgroup_root *dst_root,
-			     unsigned int ss_mask);
+			     unsigned long ss_mask);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
		       bool visible);
@@ -261,7 +265,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
  * @cgrp: the cgroup of interest
  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
  *
- * Similar to cgroup_css() but returns the effctive css, which is defined
+ * Similar to cgroup_css() but returns the effective css, which is defined
  * as the matching css of the nearest ancestor including self which has @ss
  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
  * function is guaranteed to return non-NULL css.
@@ -409,6 +413,24 @@ static int notify_on_release(const struct cgroup *cgrp)
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)

+/**
+ * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ss_maskp: a pointer to the bitmask
+ *
+ * The block will only run for cases where the ssid-th bit (1 << ssid) of
+ * mask is set to 1.
+ */
+#define for_each_subsys_which(ss, ssid, ss_maskp)			\
+	if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */	\
+		(ssid) = 0;						\
+	else								\
+		for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT)	\
+			if (((ss) = cgroup_subsys[ssid]) && false)	\
+				break;					\
+			else
+
 /* iterate across the hierarchies */
 #define for_each_root(root)						\
	list_for_each_entry((root), &cgroup_roots, root_list)
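What for_each_subsys_which expands to is simply a filtered bit-scan over the
subsystem array; the odd leading if/else keeps the macro a single statement
and avoids a spurious gcc warning about ssid when CGROUP_SUBSYS_COUNT is 0
(the uninitialised-iterator fix in this series addresses exactly that case).
A standalone userspace sketch of the iteration semantics, with hypothetical
subsystem names:

#include <stdio.h>

#define SUBSYS_COUNT 4
static const char *const subsys_name[SUBSYS_COUNT] = {
        "cpu", "memory", "io", "pids",          /* hypothetical order */
};

int main(void)
{
        unsigned long ss_mask = (1UL << 1) | (1UL << 3); /* memory, pids */
        int ssid;

        /* Equivalent of for_each_subsys_which(ss, ssid, &ss_mask): visit
         * only the subsystems whose bit is set in the mask. */
        for (ssid = 0; ssid < SUBSYS_COUNT; ssid++) {
                if (!(ss_mask & (1UL << ssid)))
                        continue;
                printf("%s\n", subsys_name[ssid]);
        }
        return 0;
}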
@@ -882,7 +904,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 static void cgroup_free_root(struct cgroup_root *root)
 {
	if (root) {
-		/* hierarhcy ID shoulid already have been released */
+		/* hierarchy ID should already have been released */
		WARN_ON_ONCE(root->hierarchy_id);

		idr_destroy(&root->cgroup_idr);
@@ -998,7 +1020,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * update of a tasks cgroup pointer by cgroup_attach_task()
  */

-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 static const struct file_operations proc_cgroupstats_operations;

@@ -1068,11 +1090,11 @@ static void cgroup_put(struct cgroup *cgrp)
  * @subtree_control is to be applied to @cgrp.  The returned mask is always
  * a superset of @subtree_control and follows the usual hierarchy rules.
  */
-static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
-						  unsigned int subtree_control)
+static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+						   unsigned long subtree_control)
 {
	struct cgroup *parent = cgroup_parent(cgrp);
-	unsigned int cur_ss_mask = subtree_control;
+	unsigned long cur_ss_mask = subtree_control;
	struct cgroup_subsys *ss;
	int ssid;

@@ -1082,11 +1104,10 @@ static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
		return cur_ss_mask;

	while (true) {
-		unsigned int new_ss_mask = cur_ss_mask;
+		unsigned long new_ss_mask = cur_ss_mask;

-		for_each_subsys(ss, ssid)
-			if (cur_ss_mask & (1 << ssid))
-				new_ss_mask |= ss->depends_on;
+		for_each_subsys_which(ss, ssid, &cur_ss_mask)
+			new_ss_mask |= ss->depends_on;

		/*
		 * Mask out subsystems which aren't available.  This can
@@ -1200,7 +1221,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
  * @cgrp: target cgroup
  * @subsys_mask: mask of the subsystem ids whose files should be removed
  */
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
	struct cgroup_subsys *ss;
	int i;
@@ -1215,18 +1236,16 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
	}
 }

-static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root,
+			     unsigned long ss_mask)
 {
	struct cgroup_subsys *ss;
-	unsigned int tmp_ss_mask;
+	unsigned long tmp_ss_mask;
	int ssid, i, ret;

	lockdep_assert_held(&cgroup_mutex);

-	for_each_subsys(ss, ssid) {
-		if (!(ss_mask & (1 << ssid)))
-			continue;
-
+	for_each_subsys_which(ss, ssid, &ss_mask) {
		/* if @ss has non-root csses attached to it, can't move */
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
			return -EBUSY;
@@ -1253,7 +1272,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
		 * Just warn about it and continue.
		 */
		if (cgrp_dfl_root_visible) {
-			pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+			pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
				ret, ss_mask);
			pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
		}
@@ -1263,18 +1282,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
	 * Nothing can fail from this point on.  Remove files for the
	 * removed subsystems and rebind each subsystem.
	 */
-	for_each_subsys(ss, ssid)
-		if (ss_mask & (1 << ssid))
-			cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
+	for_each_subsys_which(ss, ssid, &ss_mask)
+		cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);

-	for_each_subsys(ss, ssid) {
+	for_each_subsys_which(ss, ssid, &ss_mask) {
		struct cgroup_root *src_root;
		struct cgroup_subsys_state *css;
		struct css_set *cset;

-		if (!(ss_mask & (1 << ssid)))
-			continue;
-
		src_root = ss->root;
		css = cgroup_css(&src_root->cgrp, ss);

@@ -1338,7 +1353,7 @@ static int cgroup_show_options(struct seq_file *seq,
 }

 struct cgroup_sb_opts {
-	unsigned int subsys_mask;
+	unsigned long subsys_mask;
	unsigned int flags;
	char *release_agent;
	bool cpuset_clone_children;
@@ -1351,7 +1366,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
	char *token, *o = data;
	bool all_ss = false, one_ss = false;
-	unsigned int mask = -1U;
+	unsigned long mask = -1UL;
	struct cgroup_subsys *ss;
	int nr_opts = 0;
	int i;
@@ -1495,7 +1510,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
	int ret = 0;
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	struct cgroup_sb_opts opts;
-	unsigned int added_mask, removed_mask;
+	unsigned long added_mask, removed_mask;

	if (root == &cgrp_dfl_root) {
		pr_err("remount is not allowed\n");
@@ -1641,7 +1656,7 @@ static void init_cgroup_root(struct cgroup_root *root,
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }

-static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 {
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
@@ -2052,9 +2067,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
	lockdep_assert_held(&css_set_rwsem);

	/*
-	 * We are synchronized through threadgroup_lock() against PF_EXITING
-	 * setting such that we can't race against cgroup_exit() changing the
-	 * css_set to init_css_set and dropping the old one.
+	 * We are synchronized through cgroup_threadgroup_rwsem against
+	 * PF_EXITING setting such that we can't race against cgroup_exit()
+	 * changing the css_set to init_css_set and dropping the old one.
	 */
	WARN_ON_ONCE(tsk->flags & PF_EXITING);
	old_cset = task_css_set(tsk);
@@ -2111,10 +2126,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
- * This function may be called without holding threadgroup_lock even if the
- * target is a process.  Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process.  Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
  */
 static void cgroup_migrate_add_src(struct css_set *src_cset,
				   struct cgroup *dst_cgrp,
@@ -2217,7 +2233,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
  * @threadgroup: whether @leader points to the whole process or a single task
  *
  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
- * process, the caller must be holding threadgroup_lock of @leader.  The
+ * process, the caller must be holding cgroup_threadgroup_rwsem.  The
  * caller is also responsible for invoking cgroup_migrate_add_src() and
  * cgroup_migrate_prepare_dst() on the targets before invoking this
  * function and following up with cgroup_migrate_finish().
@@ -2345,7 +2361,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
  * @leader: the task or the leader of the threadgroup to be attached
  * @threadgroup: attach the whole threadgroup?
  *
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
  */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
			      struct task_struct *leader, bool threadgroup)
@@ -2376,6 +2392,47 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
	return ret;
 }

+static int cgroup_procs_write_permission(struct task_struct *task,
+					 struct cgroup *dst_cgrp,
+					 struct kernfs_open_file *of)
+{
+	const struct cred *cred = current_cred();
+	const struct cred *tcred = get_task_cred(task);
+	int ret = 0;
+
+	/*
+	 * even if we're attaching all tasks in the thread group, we only
+	 * need to check permissions on one of them.
+	 */
+	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+	    !uid_eq(cred->euid, tcred->uid) &&
+	    !uid_eq(cred->euid, tcred->suid))
+		ret = -EACCES;
+
+	if (!ret && cgroup_on_dfl(dst_cgrp)) {
+		struct super_block *sb = of->file->f_path.dentry->d_sb;
+		struct cgroup *cgrp;
+		struct inode *inode;
+
+		down_read(&css_set_rwsem);
+		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+		up_read(&css_set_rwsem);
+
+		while (!cgroup_is_descendant(dst_cgrp, cgrp))
+			cgrp = cgroup_parent(cgrp);
+
+		ret = -ENOMEM;
+		inode = kernfs_get_inode(sb, cgrp->procs_kn);
+		if (inode) {
+			ret = inode_permission(inode, MAY_WRITE);
+			iput(inode);
+		}
+	}
+
+	put_cred(tcred);
+	return ret;
+}
+
 /*
  * Find the task_struct of the task to attach by vpid and pass it along to the
  * function to attach either it or all tasks in its threadgroup. Will lock
@@ -2385,7 +2442,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
				    size_t nbytes, loff_t off, bool threadgroup)
 {
	struct task_struct *tsk;
-	const struct cred *cred = current_cred(), *tcred;
	struct cgroup *cgrp;
	pid_t pid;
	int ret;
@@ -2397,29 +2453,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
	if (!cgrp)
		return -ENODEV;

-retry_find_task:
+	percpu_down_write(&cgroup_threadgroup_rwsem);
	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
-			rcu_read_unlock();
			ret = -ESRCH;
-			goto out_unlock_cgroup;
+			goto out_unlock_rcu;
		}
-		/*
-		 * even if we're attaching all tasks in the thread group, we
-		 * only need to check permissions on one of them.
-		 */
-		tcred = __task_cred(tsk);
-		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
-		    !uid_eq(cred->euid, tcred->uid) &&
-		    !uid_eq(cred->euid, tcred->suid)) {
-			rcu_read_unlock();
-			ret = -EACCES;
-			goto out_unlock_cgroup;
-		}
-	} else
+	} else {
		tsk = current;
+	}

	if (threadgroup)
		tsk = tsk->group_leader;
@@ -2431,35 +2475,23 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
	 */
	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
-		rcu_read_unlock();
-		goto out_unlock_cgroup;
+		goto out_unlock_rcu;
	}

	get_task_struct(tsk);
	rcu_read_unlock();

-	threadgroup_lock(tsk);
-	if (threadgroup) {
-		if (!thread_group_leader(tsk)) {
-			/*
-			 * a race with de_thread from another thread's exec()
-			 * may strip us of our leadership, if this happens,
-			 * there is no choice but to throw this task away and
-			 * try again; this is
-			 * "double-double-toil-and-trouble-check locking".
-			 */
-			threadgroup_unlock(tsk);
-			put_task_struct(tsk);
-			goto retry_find_task;
-		}
-	}
-
-	ret = cgroup_attach_task(cgrp, tsk, threadgroup);
-
-	threadgroup_unlock(tsk);
+	ret = cgroup_procs_write_permission(tsk, cgrp, of);
+	if (!ret)
+		ret = cgroup_attach_task(cgrp, tsk, threadgroup);

	put_task_struct(tsk);
-out_unlock_cgroup:
+	goto out_unlock_threadgroup;
+
+out_unlock_rcu:
+	rcu_read_unlock();
+out_unlock_threadgroup:
+	percpu_up_write(&cgroup_threadgroup_rwsem);
	cgroup_kn_unlock(of->kn);
	return ret ?: nbytes;
 }
@@ -2542,19 +2574,17 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
	return 0;
 }

-static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
 {
	struct cgroup_subsys *ss;
	bool printed = false;
	int ssid;

-	for_each_subsys(ss, ssid) {
-		if (ss_mask & (1 << ssid)) {
-			if (printed)
-				seq_putc(seq, ' ');
-			seq_printf(seq, "%s", ss->name);
-			printed = true;
-		}
+	for_each_subsys_which(ss, ssid, &ss_mask) {
+		if (printed)
+			seq_putc(seq, ' ');
+		seq_printf(seq, "%s", ss->name);
+		printed = true;
	}
	if (printed)
		seq_putc(seq, '\n');
@@ -2606,6 +2636,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)

	lockdep_assert_held(&cgroup_mutex);

+	percpu_down_write(&cgroup_threadgroup_rwsem);
+
	/* look up all csses currently attached to @cgrp's subtree */
	down_read(&css_set_rwsem);
	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
@@ -2661,17 +2693,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
			goto out_finish;
		last_task = task;

-		threadgroup_lock(task);
-		/* raced against de_thread() from another thread? */
-		if (!thread_group_leader(task)) {
-			threadgroup_unlock(task);
-			put_task_struct(task);
-			continue;
-		}
-
		ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);

-		threadgroup_unlock(task);
		put_task_struct(task);

		if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -2681,6 +2704,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)

 out_finish:
	cgroup_migrate_finish(&preloaded_csets);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
	return ret;
 }

@@ -2689,8 +2713,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
 {
-	unsigned int enable = 0, disable = 0;
-	unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
+	unsigned long enable = 0, disable = 0;
+	unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
	struct cgroup *cgrp, *child;
	struct cgroup_subsys *ss;
	char *tok;
@@ -2702,11 +2726,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
	 */
	buf = strstrip(buf);
	while ((tok = strsep(&buf, " "))) {
+		unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
+
		if (tok[0] == '\0')
			continue;
-		for_each_subsys(ss, ssid) {
-			if (ss->disabled || strcmp(tok + 1, ss->name) ||
-			    ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+		for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+			if (ss->disabled || strcmp(tok + 1, ss->name))
				continue;

			if (*tok == '+') {
@@ -2793,10 +2818,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
	 * still around.  In such cases, wait till it's gone using
	 * offline_waitq.
	 */
-	for_each_subsys(ss, ssid) {
-		if (!(css_enable & (1 << ssid)))
-			continue;
-
+	for_each_subsys_which(ss, ssid, &css_enable) {
		cgroup_for_each_live_child(child, cgrp) {
			DEFINE_WAIT(wait);

@@ -3087,7 +3109,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
		return ret;
	}

-	if (cft->seq_show == cgroup_populated_show)
+	if (cft->write == cgroup_procs_write)
+		cgrp->procs_kn = kn;
+	else if (cft->seq_show == cgroup_populated_show)
		cgrp->populated_kn = kn;
	return 0;
 }
@@ -4322,7 +4346,7 @@ static struct cftype cgroup_legacy_base_files[] = {
  *
  * On failure, no file is added.
  */
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
	struct cgroup_subsys *ss;
	int i, ret = 0;
@@ -4931,7 +4955,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
	 * init_css_set is in the subsystem's root cgroup. */
	init_css_set.subsys[ss->id] = css;

-	need_forkexit_callback |= ss->fork || ss->exit;
+	have_fork_callback |= (bool)ss->fork << ss->id;
+	have_exit_callback |= (bool)ss->exit << ss->id;

	/* At system boot, before all subsystems have been
	 * registered, no tasks have been forked, so we don't
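The replacement of the single need_forkexit_callback flag packs per-subsystem
state into bit positions: (bool)ss->fork << ss->id sets bit ss->id iff the
subsystem has a fork callback, which is exactly what for_each_subsys_which
consumes later in cgroup_post_fork() and cgroup_exit().  A tiny standalone
illustration of the bit-packing (hypothetical callback table):

#include <stdio.h>

struct subsys { const char *name; void (*fork_cb)(void); };

static void pids_fork(void) { }

int main(void)
{
        struct subsys table[3] = {
                { "cpu", NULL }, { "memory", NULL }, { "pids", pids_fork },
        };
        unsigned long have_fork_callback = 0;

        /* Bit id is set iff subsystem id has a fork callback. */
        for (int id = 0; id < 3; id++)
                have_fork_callback |=
                        (unsigned long)(table[id].fork_cb != NULL) << id;

        printf("mask = %#lx\n", have_fork_callback); /* 0x4: only "pids" */
        return 0;
}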
@@ -4989,6 +5014,7 @@ int __init cgroup_init(void)
	unsigned long key;
	int ssid, err;

+	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));

@@ -5241,11 +5267,8 @@ void cgroup_post_fork(struct task_struct *child)
	 * css_set; otherwise, @child might change state between ->fork()
	 * and addition to css_set.
	 */
-	if (need_forkexit_callback) {
-		for_each_subsys(ss, i)
-			if (ss->fork)
-				ss->fork(child);
-	}
+	for_each_subsys_which(ss, i, &have_fork_callback)
+		ss->fork(child);
 }

 /**
@@ -5289,16 +5312,12 @@ void cgroup_exit(struct task_struct *tsk)
	cset = task_css_set(tsk);
	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);

-	if (need_forkexit_callback) {
-		/* see cgroup_post_fork() for details */
-		for_each_subsys(ss, i) {
-			if (ss->exit) {
-				struct cgroup_subsys_state *old_css = cset->subsys[i];
-				struct cgroup_subsys_state *css = task_css(tsk, i);
-
-				ss->exit(css, old_css, tsk);
-			}
-		}
+	/* see cgroup_post_fork() for details */
+	for_each_subsys_which(ss, i, &have_exit_callback) {
+		struct cgroup_subsys_state *old_css = cset->subsys[i];
+		struct cgroup_subsys_state *css = task_css(tsk, i);
+
+		ss->exit(css, old_css, tsk);
	}

	if (put_cset)
kernel/fork.c

@@ -1141,10 +1141,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
	tty_audit_fork(sig);
	sched_autogroup_fork(sig);

-#ifdef CONFIG_CGROUPS
-	init_rwsem(&sig->group_rwsem);
-#endif
-
	sig->oom_score_adj = current->signal->oom_score_adj;
	sig->oom_score_adj_min = current->signal->oom_score_adj_min;