linux_dsm_epyc7002/mm/hugetlb_cgroup.c

617 lines
16 KiB
C
Raw Normal View History

/*
*
* Copyright IBM Corporation, 2012
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
*
* Cgroup v2
* Copyright (C) 2019 Red Hat, Inc.
* Author: Giuseppe Scrivano <gscrivan@redhat.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
*/
#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
enum hugetlb_memory_event {
HUGETLB_MAX,
HUGETLB_NR_MEMORY_EVENTS,
};
struct hugetlb_cgroup {
struct cgroup_subsys_state css;
/*
* the counter to account for hugepages from hugetlb.
*/
struct page_counter hugepage[HUGE_MAX_HSTATE];
atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
/* Handle for "hugetlb.events" */
struct cgroup_file events_file[HUGE_MAX_HSTATE];
/* Handle for "hugetlb.events.local" */
struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
};
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
#define hugetlb_cgroup_from_counter(counter, idx) \
container_of(counter, struct hugetlb_cgroup, hugepage[idx])
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}
static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
cgroup: clean up cgroup_subsys names and initialization cgroup_subsys is a bit messier than it needs to be. * The name of a subsys can be different from its internal identifier defined in cgroup_subsys.h. Most subsystems use the matching name but three - cpu, memory and perf_event - use different ones. * cgroup_subsys_id enums are postfixed with _subsys_id and each cgroup_subsys is postfixed with _subsys. cgroup.h is widely included throughout various subsystems, it doesn't and shouldn't have claim on such generic names which don't have any qualifier indicating that they belong to cgroup. * cgroup_subsys->subsys_id should always equal the matching cgroup_subsys_id enum; however, we require each controller to initialize it and then BUG if they don't match, which is a bit silly. This patch cleans up cgroup_subsys names and initialization by doing the followings. * cgroup_subsys_id enums are now postfixed with _cgrp_id, and each cgroup_subsys with _cgrp_subsys. * With the above, renaming subsys identifiers to match the userland visible names doesn't cause any naming conflicts. All non-matching identifiers are renamed to match the official names. cpu_cgroup -> cpu mem_cgroup -> memory perf -> perf_event * controllers no longer need to initialize ->subsys_id and ->name. They're generated in cgroup core and set automatically during boot. * Redundant cgroup_subsys declarations removed. * While updating BUG_ON()s in cgroup_init_early(), convert them to WARN()s. BUGging that early during boot is stupid - the kernel can't print anything, even through serial console and the trap handler doesn't even link stack frame properly for back-tracing. This patch doesn't introduce any behavior changes. v2: Rebased on top of fe1217c4f3f7 ("net: net_cls: move cgroupfs classid handling into core"). Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Neil Horman <nhorman@tuxdriver.com> Acked-by: "David S. Miller" <davem@davemloft.net> Acked-by: "Rafael J. Wysocki" <rjw@rjwysocki.net> Acked-by: Michal Hocko <mhocko@suse.cz> Acked-by: Peter Zijlstra <peterz@infradead.org> Acked-by: Aristeu Rozanski <aris@redhat.com> Acked-by: Ingo Molnar <mingo@redhat.com> Acked-by: Li Zefan <lizefan@huawei.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Balbir Singh <bsingharora@gmail.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Serge E. Hallyn <serue@us.ibm.com> Cc: Vivek Goyal <vgoyal@redhat.com> Cc: Thomas Graf <tgraf@suug.ch>
2014-02-08 22:36:58 +07:00
return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}
static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
return (h_cg == root_h_cgroup);
}
static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
return hugetlb_cgroup_from_css(h_cg->css.parent);
}
static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
int idx;
for (idx = 0; idx < hugetlb_max_hstate; idx++) {
if (page_counter_read(&h_cg->hugepage[idx]))
return true;
}
return false;
}
static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
struct hugetlb_cgroup *parent_h_cgroup)
{
int idx;
for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
struct page_counter *counter = &h_cgroup->hugepage[idx];
struct page_counter *parent = NULL;
unsigned long limit;
int ret;
if (parent_h_cgroup)
parent = &parent_h_cgroup->hugepage[idx];
page_counter_init(counter, parent);
limit = round_down(PAGE_COUNTER_MAX,
1 << huge_page_order(&hstates[idx]));
ret = page_counter_set_max(counter, limit);
VM_BUG_ON(ret);
}
}
cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup * in subsystem implementations for the following reasons. * With unified hierarchy, subsystems will be dynamically bound and unbound from cgroups and thus css's (cgroup_subsys_state) may be created and destroyed dynamically over the lifetime of a cgroup, which is different from the current state where all css's are allocated and destroyed together with the associated cgroup. This in turn means that cgroup_css() should be synchronized and may return NULL, making it more cumbersome to use. * Differing levels of per-subsystem granularity in the unified hierarchy means that the task and descendant iterators should behave differently depending on the specific subsystem the iteration is being performed for. * In majority of the cases, subsystems only care about its part in the cgroup hierarchy - ie. the hierarchy of css's. Subsystem methods often obtain the matching css pointer from the cgroup and don't bother with the cgroup pointer itself. Passing around css fits much better. This patch converts all cgroup_subsys methods to take @css instead of @cgroup. The conversions are mostly straight-forward. A few noteworthy changes are * ->css_alloc() now takes css of the parent cgroup rather than the pointer to the new cgroup as the css for the new cgroup doesn't exist yet. Knowing the parent css is enough for all the existing subsystems. * In kernel/cgroup.c::offline_css(), unnecessary open coded css dereference is replaced with local variable access. This patch shouldn't cause any behavior differences. v2: Unnecessary explicit cgrp->subsys[] deref in css_online() replaced with local variable @css as suggested by Li Zefan. Rebased on top of new for-3.12 which includes for-3.11-fixes so that ->css_free() invocation added by da0a12caff ("cgroup: fix a leak when percpu_ref_init() fails") is converted too. Suggested by Li Zefan. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> Acked-by: Michal Hocko <mhocko@suse.cz> Acked-by: Vivek Goyal <vgoyal@redhat.com> Acked-by: Aristeu Rozanski <aris@redhat.com> Acked-by: Daniel Wagner <daniel.wagner@bmw-carit.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Balbir Singh <bsingharora@gmail.com> Cc: Matt Helsley <matthltc@us.ibm.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Steven Rostedt <rostedt@goodmis.org>
2013-08-09 07:11:23 +07:00
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup * in subsystem implementations for the following reasons. * With unified hierarchy, subsystems will be dynamically bound and unbound from cgroups and thus css's (cgroup_subsys_state) may be created and destroyed dynamically over the lifetime of a cgroup, which is different from the current state where all css's are allocated and destroyed together with the associated cgroup. This in turn means that cgroup_css() should be synchronized and may return NULL, making it more cumbersome to use. * Differing levels of per-subsystem granularity in the unified hierarchy means that the task and descendant iterators should behave differently depending on the specific subsystem the iteration is being performed for. * In majority of the cases, subsystems only care about its part in the cgroup hierarchy - ie. the hierarchy of css's. Subsystem methods often obtain the matching css pointer from the cgroup and don't bother with the cgroup pointer itself. Passing around css fits much better. This patch converts all cgroup_subsys methods to take @css instead of @cgroup. The conversions are mostly straight-forward. A few noteworthy changes are * ->css_alloc() now takes css of the parent cgroup rather than the pointer to the new cgroup as the css for the new cgroup doesn't exist yet. Knowing the parent css is enough for all the existing subsystems. * In kernel/cgroup.c::offline_css(), unnecessary open coded css dereference is replaced with local variable access. This patch shouldn't cause any behavior differences. v2: Unnecessary explicit cgrp->subsys[] deref in css_online() replaced with local variable @css as suggested by Li Zefan. Rebased on top of new for-3.12 which includes for-3.11-fixes so that ->css_free() invocation added by da0a12caff ("cgroup: fix a leak when percpu_ref_init() fails") is converted too. Suggested by Li Zefan. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> Acked-by: Michal Hocko <mhocko@suse.cz> Acked-by: Vivek Goyal <vgoyal@redhat.com> Acked-by: Aristeu Rozanski <aris@redhat.com> Acked-by: Daniel Wagner <daniel.wagner@bmw-carit.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Balbir Singh <bsingharora@gmail.com> Cc: Matt Helsley <matthltc@us.ibm.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Steven Rostedt <rostedt@goodmis.org>
2013-08-09 07:11:23 +07:00
struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
struct hugetlb_cgroup *h_cgroup;
h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
if (!h_cgroup)
return ERR_PTR(-ENOMEM);
if (!parent_h_cgroup)
root_h_cgroup = h_cgroup;
hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
return &h_cgroup->css;
}
cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup * in subsystem implementations for the following reasons. * With unified hierarchy, subsystems will be dynamically bound and unbound from cgroups and thus css's (cgroup_subsys_state) may be created and destroyed dynamically over the lifetime of a cgroup, which is different from the current state where all css's are allocated and destroyed together with the associated cgroup. This in turn means that cgroup_css() should be synchronized and may return NULL, making it more cumbersome to use. * Differing levels of per-subsystem granularity in the unified hierarchy means that the task and descendant iterators should behave differently depending on the specific subsystem the iteration is being performed for. * In majority of the cases, subsystems only care about its part in the cgroup hierarchy - ie. the hierarchy of css's. Subsystem methods often obtain the matching css pointer from the cgroup and don't bother with the cgroup pointer itself. Passing around css fits much better. This patch converts all cgroup_subsys methods to take @css instead of @cgroup. The conversions are mostly straight-forward. A few noteworthy changes are * ->css_alloc() now takes css of the parent cgroup rather than the pointer to the new cgroup as the css for the new cgroup doesn't exist yet. Knowing the parent css is enough for all the existing subsystems. * In kernel/cgroup.c::offline_css(), unnecessary open coded css dereference is replaced with local variable access. This patch shouldn't cause any behavior differences. v2: Unnecessary explicit cgrp->subsys[] deref in css_online() replaced with local variable @css as suggested by Li Zefan. Rebased on top of new for-3.12 which includes for-3.11-fixes so that ->css_free() invocation added by da0a12caff ("cgroup: fix a leak when percpu_ref_init() fails") is converted too. Suggested by Li Zefan. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> Acked-by: Michal Hocko <mhocko@suse.cz> Acked-by: Vivek Goyal <vgoyal@redhat.com> Acked-by: Aristeu Rozanski <aris@redhat.com> Acked-by: Daniel Wagner <daniel.wagner@bmw-carit.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Balbir Singh <bsingharora@gmail.com> Cc: Matt Helsley <matthltc@us.ibm.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Steven Rostedt <rostedt@goodmis.org>
2013-08-09 07:11:23 +07:00
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct hugetlb_cgroup *h_cgroup;
cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup * in subsystem implementations for the following reasons. * With unified hierarchy, subsystems will be dynamically bound and unbound from cgroups and thus css's (cgroup_subsys_state) may be created and destroyed dynamically over the lifetime of a cgroup, which is different from the current state where all css's are allocated and destroyed together with the associated cgroup. This in turn means that cgroup_css() should be synchronized and may return NULL, making it more cumbersome to use. * Differing levels of per-subsystem granularity in the unified hierarchy means that the task and descendant iterators should behave differently depending on the specific subsystem the iteration is being performed for. * In majority of the cases, subsystems only care about its part in the cgroup hierarchy - ie. the hierarchy of css's. Subsystem methods often obtain the matching css pointer from the cgroup and don't bother with the cgroup pointer itself. Passing around css fits much better. This patch converts all cgroup_subsys methods to take @css instead of @cgroup. The conversions are mostly straight-forward. A few noteworthy changes are * ->css_alloc() now takes css of the parent cgroup rather than the pointer to the new cgroup as the css for the new cgroup doesn't exist yet. Knowing the parent css is enough for all the existing subsystems. * In kernel/cgroup.c::offline_css(), unnecessary open coded css dereference is replaced with local variable access. This patch shouldn't cause any behavior differences. v2: Unnecessary explicit cgrp->subsys[] deref in css_online() replaced with local variable @css as suggested by Li Zefan. Rebased on top of new for-3.12 which includes for-3.11-fixes so that ->css_free() invocation added by da0a12caff ("cgroup: fix a leak when percpu_ref_init() fails") is converted too. Suggested by Li Zefan. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> Acked-by: Michal Hocko <mhocko@suse.cz> Acked-by: Vivek Goyal <vgoyal@redhat.com> Acked-by: Aristeu Rozanski <aris@redhat.com> Acked-by: Daniel Wagner <daniel.wagner@bmw-carit.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Balbir Singh <bsingharora@gmail.com> Cc: Matt Helsley <matthltc@us.ibm.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Steven Rostedt <rostedt@goodmis.org>
2013-08-09 07:11:23 +07:00
h_cgroup = hugetlb_cgroup_from_css(css);
kfree(h_cgroup);
}
/*
* Should be called with hugetlb_lock held.
* Since we are holding hugetlb_lock, pages cannot get moved from
* active list or uncharged from the cgroup, So no need to get
* page reference and test for page active here. This function
* cannot fail.
*/
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
struct page *page)
{
unsigned int nr_pages;
struct page_counter *counter;
struct hugetlb_cgroup *page_hcg;
struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
page_hcg = hugetlb_cgroup_from_page(page);
/*
* We can have pages in active list without any cgroup
* ie, hugepage with less than 3 pages. We can safely
* ignore those pages.
*/
if (!page_hcg || page_hcg != h_cg)
goto out;
nr_pages = compound_nr(page);
if (!parent) {
parent = root_h_cgroup;
/* root has no limit */
page_counter_charge(&parent->hugepage[idx], nr_pages);
}
counter = &h_cg->hugepage[idx];
/* Take the pages off the local counter */
page_counter_cancel(counter, nr_pages);
set_hugetlb_cgroup(page, parent);
out:
return;
}
/*
* Force the hugetlb cgroup to empty the hugetlb resources by moving them to
* the parent cgroup.
*/
cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup * in subsystem implementations for the following reasons. * With unified hierarchy, subsystems will be dynamically bound and unbound from cgroups and thus css's (cgroup_subsys_state) may be created and destroyed dynamically over the lifetime of a cgroup, which is different from the current state where all css's are allocated and destroyed together with the associated cgroup. This in turn means that cgroup_css() should be synchronized and may return NULL, making it more cumbersome to use. * Differing levels of per-subsystem granularity in the unified hierarchy means that the task and descendant iterators should behave differently depending on the specific subsystem the iteration is being performed for. * In majority of the cases, subsystems only care about its part in the cgroup hierarchy - ie. the hierarchy of css's. Subsystem methods often obtain the matching css pointer from the cgroup and don't bother with the cgroup pointer itself. Passing around css fits much better. This patch converts all cgroup_subsys methods to take @css instead of @cgroup. The conversions are mostly straight-forward. A few noteworthy changes are * ->css_alloc() now takes css of the parent cgroup rather than the pointer to the new cgroup as the css for the new cgroup doesn't exist yet. Knowing the parent css is enough for all the existing subsystems. * In kernel/cgroup.c::offline_css(), unnecessary open coded css dereference is replaced with local variable access. This patch shouldn't cause any behavior differences. v2: Unnecessary explicit cgrp->subsys[] deref in css_online() replaced with local variable @css as suggested by Li Zefan. Rebased on top of new for-3.12 which includes for-3.11-fixes so that ->css_free() invocation added by da0a12caff ("cgroup: fix a leak when percpu_ref_init() fails") is converted too. Suggested by Li Zefan. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> Acked-by: Michal Hocko <mhocko@suse.cz> Acked-by: Vivek Goyal <vgoyal@redhat.com> Acked-by: Aristeu Rozanski <aris@redhat.com> Acked-by: Daniel Wagner <daniel.wagner@bmw-carit.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Balbir Singh <bsingharora@gmail.com> Cc: Matt Helsley <matthltc@us.ibm.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Steven Rostedt <rostedt@goodmis.org>
2013-08-09 07:11:23 +07:00
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup * in subsystem implementations for the following reasons. * With unified hierarchy, subsystems will be dynamically bound and unbound from cgroups and thus css's (cgroup_subsys_state) may be created and destroyed dynamically over the lifetime of a cgroup, which is different from the current state where all css's are allocated and destroyed together with the associated cgroup. This in turn means that cgroup_css() should be synchronized and may return NULL, making it more cumbersome to use. * Differing levels of per-subsystem granularity in the unified hierarchy means that the task and descendant iterators should behave differently depending on the specific subsystem the iteration is being performed for. * In majority of the cases, subsystems only care about its part in the cgroup hierarchy - ie. the hierarchy of css's. Subsystem methods often obtain the matching css pointer from the cgroup and don't bother with the cgroup pointer itself. Passing around css fits much better. This patch converts all cgroup_subsys methods to take @css instead of @cgroup. The conversions are mostly straight-forward. A few noteworthy changes are * ->css_alloc() now takes css of the parent cgroup rather than the pointer to the new cgroup as the css for the new cgroup doesn't exist yet. Knowing the parent css is enough for all the existing subsystems. * In kernel/cgroup.c::offline_css(), unnecessary open coded css dereference is replaced with local variable access. This patch shouldn't cause any behavior differences. v2: Unnecessary explicit cgrp->subsys[] deref in css_online() replaced with local variable @css as suggested by Li Zefan. Rebased on top of new for-3.12 which includes for-3.11-fixes so that ->css_free() invocation added by da0a12caff ("cgroup: fix a leak when percpu_ref_init() fails") is converted too. Suggested by Li Zefan. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> Acked-by: Michal Hocko <mhocko@suse.cz> Acked-by: Vivek Goyal <vgoyal@redhat.com> Acked-by: Aristeu Rozanski <aris@redhat.com> Acked-by: Daniel Wagner <daniel.wagner@bmw-carit.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Balbir Singh <bsingharora@gmail.com> Cc: Matt Helsley <matthltc@us.ibm.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Steven Rostedt <rostedt@goodmis.org>
2013-08-09 07:11:23 +07:00
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
struct hstate *h;
struct page *page;
int idx = 0;
do {
for_each_hstate(h) {
spin_lock(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
hugetlb_cgroup_move_parent(idx, h_cg, page);
spin_unlock(&hugetlb_lock);
idx++;
}
cond_resched();
} while (hugetlb_cgroup_have_usage(h_cg));
}
static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
enum hugetlb_memory_event event)
{
atomic_long_inc(&hugetlb->events_local[idx][event]);
cgroup_file_notify(&hugetlb->events_local_file[idx]);
do {
atomic_long_inc(&hugetlb->events[idx][event]);
cgroup_file_notify(&hugetlb->events_file[idx]);
} while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
!hugetlb_cgroup_is_root(hugetlb));
}
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
struct hugetlb_cgroup **ptr)
{
int ret = 0;
struct page_counter *counter;
struct hugetlb_cgroup *h_cg = NULL;
if (hugetlb_cgroup_disabled())
goto done;
/*
* We don't charge any cgroup if the compound page have less
* than 3 pages.
*/
if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
goto done;
again:
rcu_read_lock();
h_cg = hugetlb_cgroup_from_task(current);
if (!css_tryget(&h_cg->css)) {
rcu_read_unlock();
goto again;
}
rcu_read_unlock();
if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages,
&counter)) {
ret = -ENOMEM;
hugetlb_event(hugetlb_cgroup_from_counter(counter, idx), idx,
HUGETLB_MAX);
}
css_put(&h_cg->css);
done:
*ptr = h_cg;
return ret;
}
/* Should be called with hugetlb_lock held */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
struct page *page)
{
if (hugetlb_cgroup_disabled() || !h_cg)
return;
set_hugetlb_cgroup(page, h_cg);
return;
}
/*
* Should be called with hugetlb_lock held
*/
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
struct page *page)
{
struct hugetlb_cgroup *h_cg;
if (hugetlb_cgroup_disabled())
return;
lockdep_assert_held(&hugetlb_lock);
h_cg = hugetlb_cgroup_from_page(page);
if (unlikely(!h_cg))
return;
set_hugetlb_cgroup(page, NULL);
page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
return;
}
void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg)
{
if (hugetlb_cgroup_disabled() || !h_cg)
return;
if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
return;
page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
return;
}
enum {
RES_USAGE,
RES_LIMIT,
RES_MAX_USAGE,
RES_FAILCNT,
};
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct page_counter *counter;
cgroup: pass around cgroup_subsys_state instead of cgroup in file methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup. Please see the previous commit which converts the subsystem methods for rationale. This patch converts all cftype file operations to take @css instead of @cgroup. cftypes for the cgroup core files don't have their subsytem pointer set. These will automatically use the dummy_css added by the previous patch and can be converted the same way. Most subsystem conversions are straight forwards but there are some interesting ones. * freezer: update_if_frozen() is also converted to take @css instead of @cgroup for consistency. This will make the code look simpler too once iterators are converted to use css. * memory/vmpressure: mem_cgroup_from_css() needs to be exported to vmpressure while mem_cgroup_from_cont() can be made static. Updated accordingly. * cpu: cgroup_tg() doesn't have any user left. Removed. * cpuacct: cgroup_ca() doesn't have any user left. Removed. * hugetlb: hugetlb_cgroup_form_cgroup() doesn't have any user left. Removed. * net_cls: cgrp_cls_state() doesn't have any user left. Removed. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> Acked-by: Michal Hocko <mhocko@suse.cz> Acked-by: Vivek Goyal <vgoyal@redhat.com> Acked-by: Aristeu Rozanski <aris@redhat.com> Acked-by: Daniel Wagner <daniel.wagner@bmw-carit.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Balbir Singh <bsingharora@gmail.com> Cc: Matt Helsley <matthltc@us.ibm.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Steven Rostedt <rostedt@goodmis.org>
2013-08-09 07:11:24 +07:00
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
switch (MEMFILE_ATTR(cft->private)) {
case RES_USAGE:
return (u64)page_counter_read(counter) * PAGE_SIZE;
case RES_LIMIT:
return (u64)counter->max * PAGE_SIZE;
case RES_MAX_USAGE:
return (u64)counter->watermark * PAGE_SIZE;
case RES_FAILCNT:
return counter->failcnt;
default:
BUG();
}
}
static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
int idx;
u64 val;
struct cftype *cft = seq_cft(seq);
unsigned long limit;
struct page_counter *counter;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
idx = MEMFILE_IDX(cft->private);
counter = &h_cg->hugepage[idx];
limit = round_down(PAGE_COUNTER_MAX,
1 << huge_page_order(&hstates[idx]));
switch (MEMFILE_ATTR(cft->private)) {
case RES_USAGE:
val = (u64)page_counter_read(counter);
seq_printf(seq, "%llu\n", val * PAGE_SIZE);
break;
case RES_LIMIT:
val = (u64)counter->max;
if (val == limit)
seq_puts(seq, "max\n");
else
seq_printf(seq, "%llu\n", val * PAGE_SIZE);
break;
default:
BUG();
}
return 0;
}
static DEFINE_MUTEX(hugetlb_limit_mutex);
static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off,
const char *max)
{
int ret, idx;
unsigned long nr_pages;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
return -EINVAL;
buf = strstrip(buf);
ret = page_counter_memparse(buf, max, &nr_pages);
if (ret)
return ret;
idx = MEMFILE_IDX(of_cft(of)->private);
nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_LIMIT:
mutex_lock(&hugetlb_limit_mutex);
ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages);
mutex_unlock(&hugetlb_limit_mutex);
break;
default:
ret = -EINVAL;
break;
}
return ret ?: nbytes;
}
static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
}
static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
}
static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
int ret = 0;
struct page_counter *counter;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_MAX_USAGE:
page_counter_reset_watermark(counter);
break;
case RES_FAILCNT:
counter->failcnt = 0;
break;
default:
ret = -EINVAL;
break;
}
return ret ?: nbytes;
}
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
if (hsize >= (1UL << 30))
snprintf(buf, size, "%luGB", hsize >> 30);
else if (hsize >= (1UL << 20))
snprintf(buf, size, "%luMB", hsize >> 20);
else
snprintf(buf, size, "%luKB", hsize >> 10);
return buf;
}
static int __hugetlb_events_show(struct seq_file *seq, bool local)
{
int idx;
long max;
struct cftype *cft = seq_cft(seq);
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
idx = MEMFILE_IDX(cft->private);
if (local)
max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
else
max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);
seq_printf(seq, "max %lu\n", max);
return 0;
}
static int hugetlb_events_show(struct seq_file *seq, void *v)
{
return __hugetlb_events_show(seq, false);
}
static int hugetlb_events_local_show(struct seq_file *seq, void *v)
{
return __hugetlb_events_show(seq, true);
}
static void __init __hugetlb_cgroup_file_dfl_init(int idx)
{
char buf[32];
struct cftype *cft;
struct hstate *h = &hstates[idx];
/* format the size */
mem_fmt(buf, 32, huge_page_size(h));
/* Add the limit file */
cft = &h->cgroup_files_dfl[0];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
cft->seq_show = hugetlb_cgroup_read_u64_max;
cft->write = hugetlb_cgroup_write_dfl;
cft->flags = CFTYPE_NOT_ON_ROOT;
/* Add the current usage file */
cft = &h->cgroup_files_dfl[1];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
cft->seq_show = hugetlb_cgroup_read_u64_max;
cft->flags = CFTYPE_NOT_ON_ROOT;
/* Add the events file */
cft = &h->cgroup_files_dfl[2];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
cft->private = MEMFILE_PRIVATE(idx, 0);
cft->seq_show = hugetlb_events_show;
cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]),
cft->flags = CFTYPE_NOT_ON_ROOT;
/* Add the events.local file */
cft = &h->cgroup_files_dfl[3];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf);
cft->private = MEMFILE_PRIVATE(idx, 0);
cft->seq_show = hugetlb_events_local_show;
cft->file_offset = offsetof(struct hugetlb_cgroup,
events_local_file[idx]),
cft->flags = CFTYPE_NOT_ON_ROOT;
/* NULL terminate the last cft */
cft = &h->cgroup_files_dfl[4];
memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
h->cgroup_files_dfl));
}
static void __init __hugetlb_cgroup_file_legacy_init(int idx)
{
char buf[32];
struct cftype *cft;
struct hstate *h = &hstates[idx];
/* format the size */
mem_fmt(buf, 32, huge_page_size(h));
/* Add the limit file */
cft = &h->cgroup_files_legacy[0];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
cft->read_u64 = hugetlb_cgroup_read_u64;
cft->write = hugetlb_cgroup_write_legacy;
/* Add the usage file */
cft = &h->cgroup_files_legacy[1];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
cft->read_u64 = hugetlb_cgroup_read_u64;
/* Add the MAX usage file */
cft = &h->cgroup_files_legacy[2];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
cft->write = hugetlb_cgroup_reset;
cft->read_u64 = hugetlb_cgroup_read_u64;
/* Add the failcntfile */
cft = &h->cgroup_files_legacy[3];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
cft->write = hugetlb_cgroup_reset;
cft->read_u64 = hugetlb_cgroup_read_u64;
/* NULL terminate the last cft */
cft = &h->cgroup_files_legacy[4];
memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
h->cgroup_files_legacy));
}
static void __init __hugetlb_cgroup_file_init(int idx)
{
__hugetlb_cgroup_file_dfl_init(idx);
__hugetlb_cgroup_file_legacy_init(idx);
}
void __init hugetlb_cgroup_file_init(void)
{
struct hstate *h;
for_each_hstate(h) {
/*
* Add cgroup control files only if the huge page consists
* of more than two normal pages. This is because we use
mm: make compound_head() robust Hugh has pointed that compound_head() call can be unsafe in some context. There's one example: CPU0 CPU1 isolate_migratepages_block() page_count() compound_head() !!PageTail() == true put_page() tail->first_page = NULL head = tail->first_page alloc_pages(__GFP_COMP) prep_compound_page() tail->first_page = head __SetPageTail(p); !!PageTail() == true <head == NULL dereferencing> The race is pure theoretical. I don't it's possible to trigger it in practice. But who knows. We can fix the race by changing how encode PageTail() and compound_head() within struct page to be able to update them in one shot. The patch introduces page->compound_head into third double word block in front of compound_dtor and compound_order. Bit 0 encodes PageTail() and the rest bits are pointer to head page if bit zero is set. The patch moves page->pmd_huge_pte out of word, just in case if an architecture defines pgtable_t into something what can have the bit 0 set. hugetlb_cgroup uses page->lru.next in the second tail page to store pointer struct hugetlb_cgroup. The patch switch it to use page->private in the second tail page instead. The space is free since ->first_page is removed from the union. The patch also opens possibility to remove HUGETLB_CGROUP_MIN_ORDER limitation, since there's now space in first tail page to store struct hugetlb_cgroup pointer. But that's out of scope of the patch. That means page->compound_head shares storage space with: - page->lru.next; - page->next; - page->rcu_head.next; That's too long list to be absolutely sure, but looks like nobody uses bit 0 of the word. page->rcu_head.next guaranteed[1] to have bit 0 clean as long as we use call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu(). But future call_rcu_lazy() is not allowed as it makes use of the bit and we can get false positive PageTail(). [1] http://lkml.kernel.org/g/20150827163634.GD4029@linux.vnet.ibm.com Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Andrea Arcangeli <aarcange@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: David Rientjes <rientjes@google.com> Cc: Vlastimil Babka <vbabka@suse.cz> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-07 07:29:54 +07:00
* page[2].private for storing cgroup details.
*/
if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
__hugetlb_cgroup_file_init(hstate_index(h));
}
}
/*
* hugetlb_lock will make sure a parallel cgroup rmdir won't happen
* when we migrate hugepages
*/
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
struct hugetlb_cgroup *h_cg;
struct hstate *h = page_hstate(oldhpage);
if (hugetlb_cgroup_disabled())
return;
VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
spin_lock(&hugetlb_lock);
h_cg = hugetlb_cgroup_from_page(oldhpage);
set_hugetlb_cgroup(oldhpage, NULL);
/* move the h_cg details to new cgroup */
set_hugetlb_cgroup(newhpage, h_cg);
list_move(&newhpage->lru, &h->hugepage_activelist);
spin_unlock(&hugetlb_lock);
return;
}
static struct cftype hugetlb_files[] = {
{} /* terminate */
};
cgroup: clean up cgroup_subsys names and initialization cgroup_subsys is a bit messier than it needs to be. * The name of a subsys can be different from its internal identifier defined in cgroup_subsys.h. Most subsystems use the matching name but three - cpu, memory and perf_event - use different ones. * cgroup_subsys_id enums are postfixed with _subsys_id and each cgroup_subsys is postfixed with _subsys. cgroup.h is widely included throughout various subsystems, it doesn't and shouldn't have claim on such generic names which don't have any qualifier indicating that they belong to cgroup. * cgroup_subsys->subsys_id should always equal the matching cgroup_subsys_id enum; however, we require each controller to initialize it and then BUG if they don't match, which is a bit silly. This patch cleans up cgroup_subsys names and initialization by doing the followings. * cgroup_subsys_id enums are now postfixed with _cgrp_id, and each cgroup_subsys with _cgrp_subsys. * With the above, renaming subsys identifiers to match the userland visible names doesn't cause any naming conflicts. All non-matching identifiers are renamed to match the official names. cpu_cgroup -> cpu mem_cgroup -> memory perf -> perf_event * controllers no longer need to initialize ->subsys_id and ->name. They're generated in cgroup core and set automatically during boot. * Redundant cgroup_subsys declarations removed. * While updating BUG_ON()s in cgroup_init_early(), convert them to WARN()s. BUGging that early during boot is stupid - the kernel can't print anything, even through serial console and the trap handler doesn't even link stack frame properly for back-tracing. This patch doesn't introduce any behavior changes. v2: Rebased on top of fe1217c4f3f7 ("net: net_cls: move cgroupfs classid handling into core"). Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Neil Horman <nhorman@tuxdriver.com> Acked-by: "David S. Miller" <davem@davemloft.net> Acked-by: "Rafael J. Wysocki" <rjw@rjwysocki.net> Acked-by: Michal Hocko <mhocko@suse.cz> Acked-by: Peter Zijlstra <peterz@infradead.org> Acked-by: Aristeu Rozanski <aris@redhat.com> Acked-by: Ingo Molnar <mingo@redhat.com> Acked-by: Li Zefan <lizefan@huawei.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Balbir Singh <bsingharora@gmail.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Serge E. Hallyn <serue@us.ibm.com> Cc: Vivek Goyal <vgoyal@redhat.com> Cc: Thomas Graf <tgraf@suug.ch>
2014-02-08 22:36:58 +07:00
struct cgroup_subsys hugetlb_cgrp_subsys = {
.css_alloc = hugetlb_cgroup_css_alloc,
.css_offline = hugetlb_cgroup_css_offline,
.css_free = hugetlb_cgroup_css_free,
.dfl_cftypes = hugetlb_files,
.legacy_cftypes = hugetlb_files,
};