From 51bee5abeab2058ea5813c5615d6197a23dbf041 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 28 Jan 2019 17:00:13 +0100 Subject: [PATCH 1/6] cgroup/pids: turn cgroup_subsys->free() into cgroup_subsys->release() to fix the accounting The only user of cgroup_subsys->free() callback is pids_cgrp_subsys which needs pids_free() to uncharge the pid. However, ->free() is called from __put_task_struct()->cgroup_free() and this is too late. Even the trivial program which does for (;;) { int pid = fork(); assert(pid >= 0); if (pid) wait(NULL); else exit(0); } can run out of limits because release_task()->call_rcu(delayed_put_task_struct) implies an RCU gp after the task/pid goes away and before the final put(). Test-case: mkdir -p /tmp/CG mount -t cgroup2 none /tmp/CG echo '+pids' > /tmp/CG/cgroup.subtree_control mkdir /tmp/CG/PID echo 2 > /tmp/CG/PID/pids.max perl -e 'while ($p = fork) { wait; } $p // die "fork failed: $!\n"' & echo $! > /tmp/CG/PID/cgroup.procs Without this patch the forking process fails soon after migration. Rename cgroup_subsys->free() to cgroup_subsys->release() and move the callsite into the new helper, cgroup_release(), called by release_task() which actually frees the pid(s). Reported-by: Herton R. Krzesinski Reported-by: Jan Stancek Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 2 +- include/linux/cgroup.h | 2 ++ kernel/cgroup/cgroup.c | 15 +++++++++------ kernel/cgroup/pids.c | 4 ++-- kernel/exit.c | 1 + 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8fcbae1b8db0..120d1d40704b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -602,7 +602,7 @@ struct cgroup_subsys { void (*cancel_fork)(struct task_struct *task); void (*fork)(struct task_struct *task); void (*exit)(struct task_struct *task); - void (*free)(struct task_struct *task); + void (*release)(struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); bool early_init:1; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9968332cceed..81f58b4a5418 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -121,6 +121,7 @@ extern int cgroup_can_fork(struct task_struct *p); extern void cgroup_cancel_fork(struct task_struct *p); extern void cgroup_post_fork(struct task_struct *p); void cgroup_exit(struct task_struct *p); +void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); int cgroup_init_early(void); @@ -697,6 +698,7 @@ static inline int cgroup_can_fork(struct task_struct *p) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p) {} static inline void cgroup_post_fork(struct task_struct *p) {} static inline void cgroup_exit(struct task_struct *p) {} +static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f31bd61c9466..f4418371c83b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -197,7 +197,7 @@ static u64 css_serial_nr_next = 1; */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; -static u16 have_free_callback __read_mostly; +static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ @@ -5313,7 +5313,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; - have_free_callback |= (bool)ss->free << ss->id; + have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been @@ -5749,16 +5749,19 @@ void cgroup_exit(struct task_struct *tsk) } while_each_subsys_mask(); } -void cgroup_free(struct task_struct *task) +void cgroup_release(struct task_struct *task) { - struct css_set *cset = task_css_set(task); struct cgroup_subsys *ss; int ssid; - do_each_subsys_mask(ss, ssid, have_free_callback) { - ss->free(task); + do_each_subsys_mask(ss, ssid, have_release_callback) { + ss->release(task); } while_each_subsys_mask(); +} +void cgroup_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); put_css_set(cset); } diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 9829c67ebc0a..c9960baaa14f 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -247,7 +247,7 @@ static void pids_cancel_fork(struct task_struct *task) pids_uncharge(pids, 1); } -static void pids_free(struct task_struct *task) +static void pids_release(struct task_struct *task) { struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); @@ -342,7 +342,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .free = pids_free, + .release = pids_release, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, .threaded = true, diff --git a/kernel/exit.c b/kernel/exit.c index 3fb7be001964..c2b8443f30b4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -219,6 +219,7 @@ void release_task(struct task_struct *p) } write_unlock_irq(&tasklist_lock); + cgroup_release(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); From 6b3a6a132dfcb88ca88abe90c2bbb7ba0d382b4d Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Thu, 31 Jan 2019 15:22:19 +0100 Subject: [PATCH 2/6] MAINTAINERS: Update cgroup entry Fix wildcard patterns and add cgroup-v2 documentation. Signed-off-by: Andrea Parri Signed-off-by: Tejun Heo --- MAINTAINERS | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2d3c1918f1b0..a8b5584cd07b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3906,9 +3906,10 @@ M: Johannes Weiner L: cgroups@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git S: Maintained -F: Documentation/cgroup* +F: Documentation/admin-guide/cgroup-v2.rst +F: Documentation/cgroup-v1/ F: include/linux/cgroup* -F: kernel/cgroup* +F: kernel/cgroup/ CONTROL GROUP - CPUSET M: Li Zefan From 34b43446937e909230fb6cbef3f1bc04642688c9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 6 Feb 2019 16:59:00 -0800 Subject: [PATCH 3/6] Documentation: cgroup-v2: eliminate markup warnings Fix markup warnings in cgroup-v2.rst: Documentation/admin-guide/cgroup-v2.rst:1509: WARNING: Block quote ends without a blank line; unexpected unindent. Documentation/admin-guide/cgroup-v2.rst:1511: WARNING: Block quote ends without a blank line; unexpected unindent. Documentation/admin-guide/cgroup-v2.rst:1512: WARNING: Block quote ends without a blank line; unexpected unindent. Signed-off-by: Randy Dunlap Cc: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 7bf3f129c68b..61f8bbb0a1b2 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1503,7 +1503,7 @@ protected workload. The limits are only applied at the peer level in the hierarchy. This means that in the diagram below, only groups A, B, and C will influence each other, and -groups D and F will influence each other. Group G will influence nobody. +groups D and F will influence each other. Group G will influence nobody:: [root] / | \ From 05b71f6ffd182e3af3ac25ab811675d622d4ac2a Mon Sep 17 00:00:00 2001 From: Tibor Billes Date: Sat, 9 Feb 2019 16:05:33 +0100 Subject: [PATCH 4/6] cgroup: add documentation for pids.events file cgroup: add documentation for pids.events file Document the event counters file for pid cgroups. Signed-off-by: Tibor Billes Signed-off-by: Tejun Heo --- Documentation/cgroup-v1/pids.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/cgroup-v1/pids.txt b/Documentation/cgroup-v1/pids.txt index 1a078b5d281a..e105d708ccde 100644 --- a/Documentation/cgroup-v1/pids.txt +++ b/Documentation/cgroup-v1/pids.txt @@ -33,6 +33,9 @@ limit in the hierarchy is followed). pids.current tracks all child cgroup hierarchies, so parent/pids.current is a superset of parent/child/pids.current. +The pids.events file contains event counters: + - max: Number of times fork failed because limit was hit. + Example ------- From b4ff1b44bcd384d22fcbac6ebaf9cc0d33debe50 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 15 Feb 2019 11:01:31 -0800 Subject: [PATCH 5/6] cgroup, rstat: Don't flush subtree root unless necessary cgroup_rstat_cpu_pop_updated() is used to traverse the updated cgroups on flush. While it was only visiting updated ones in the subtree, it was visiting @root unconditionally. We can easily check whether @root is updated or not by looking at its ->updated_next just as with the cgroups in the subtree. * Remove the unnecessary cgroup_parent() test. The system root cgroup is never updated and thus its ->updated_next is always NULL. No need to test whether cgroup_parent() exists in addition to ->updated_next. * Terminate traverse if ->updated_next is NULL. This can only happen for subtree @root and there's no reason to visit it if it's not marked updated. This reduces cpu consumption when reading a lot of rstat backed files. In a micro benchmark reading stat from ~1600 cgroups, the sys time was lowered by >40%. Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d503d1a9007c..bb95a35e8c2d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -87,7 +87,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, struct cgroup *root, int cpu) { struct cgroup_rstat_cpu *rstatc; - struct cgroup *parent; if (pos == root) return NULL; @@ -115,8 +114,8 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * However, due to the way we traverse, @pos will be the first * child in most cases. The only exception is @root. */ - parent = cgroup_parent(pos); - if (parent && rstatc->updated_next) { + if (rstatc->updated_next) { + struct cgroup *parent = cgroup_parent(pos); struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); struct cgroup_rstat_cpu *nrstatc; struct cgroup **nextp; @@ -140,9 +139,12 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * updated stat. */ smp_mb(); + + return pos; } - return pos; + /* only happens for @root */ + return NULL; } /* see cgroup_rstat_flush() */ From 6a613d24effcb875271b8a1c510172e2d6eaaee8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 18 Feb 2019 15:28:11 +0900 Subject: [PATCH 6/6] cpuset: remove unused task_has_mempolicy() This is a remnant of commit 5f155f27cb7f ("mm, cpuset: always use seqlock when changing task's nodemask"). Signed-off-by: Masahiro Yamada Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 479743db6c37..72afd55f70c6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -203,19 +203,6 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) return css_cs(cs->css.parent); } -#ifdef CONFIG_NUMA -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return task->mempolicy; -} -#else -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return false; -} -#endif - - /* bits in struct cpuset flags field */ typedef enum { CS_ONLINE,