2008-02-07 15:13:50 +07:00
|
|
|
/* memcontrol.h - Memory Controller
|
|
|
|
*
|
|
|
|
* Copyright IBM Corporation, 2007
|
|
|
|
* Author Balbir Singh <balbir@linux.vnet.ibm.com>
|
|
|
|
*
|
2008-02-07 15:13:51 +07:00
|
|
|
* Copyright 2007 OpenVZ SWsoft Inc
|
|
|
|
* Author: Pavel Emelianov <xemul@openvz.org>
|
|
|
|
*
|
2008-02-07 15:13:50 +07:00
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _LINUX_MEMCONTROL_H
|
|
|
|
#define _LINUX_MEMCONTROL_H
|
2009-01-08 09:08:02 +07:00
|
|
|
#include <linux/cgroup.h>
|
2011-05-27 06:25:38 +07:00
|
|
|
#include <linux/vm_event_item.h>
|
|
|
|
|
2008-02-07 15:13:51 +07:00
|
|
|
struct mem_cgroup;
|
|
|
|
struct page_cgroup;
|
2008-02-07 15:13:59 +07:00
|
|
|
struct page;
|
|
|
|
struct mm_struct;
|
2008-02-07 15:13:51 +07:00
|
|
|
|
2011-01-14 06:47:37 +07:00
|
|
|
/* Stats that can be updated by kernel. */
|
|
|
|
enum mem_cgroup_page_stat_item {
|
|
|
|
MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
|
|
|
|
};
|
|
|
|
|
2010-05-25 04:32:40 +07:00
|
|
|
extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
|
|
|
|
struct list_head *dst,
|
|
|
|
unsigned long *scanned, int order,
|
2011-11-01 07:06:47 +07:00
|
|
|
isolate_mode_t mode,
|
|
|
|
struct zone *z,
|
2010-05-25 04:32:40 +07:00
|
|
|
struct mem_cgroup *mem_cont,
|
|
|
|
int active, int file);
|
|
|
|
|
2008-03-05 05:28:39 +07:00
|
|
|
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
2009-01-08 09:08:10 +07:00
|
|
|
/*
|
|
|
|
* All "charge" functions with gfp_mask should use GFP_KERNEL or
|
|
|
|
* (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't
|
|
|
|
* alloc memory but reclaims memory from all available zones. So, "where I want
|
|
|
|
* memory from" bits of gfp_mask has no meaning. So any bits of that field is
|
|
|
|
* available but adding a rule is better. charge functions' gfp_mask should
|
|
|
|
* be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous
|
|
|
|
* codes.
|
|
|
|
* (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
|
|
|
|
*/
|
2008-02-07 15:13:51 +07:00
|
|
|
|
2009-01-08 09:07:48 +07:00
|
|
|
extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
|
2008-02-07 15:14:02 +07:00
|
|
|
gfp_t gfp_mask);
|
2009-01-08 09:07:48 +07:00
|
|
|
/* for swap handling */
|
2009-01-08 09:08:00 +07:00
|
|
|
extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
|
|
|
|
struct page *page, gfp_t mask, struct mem_cgroup **ptr);
|
2009-01-08 09:07:48 +07:00
|
|
|
extern void mem_cgroup_commit_charge_swapin(struct page *page,
|
|
|
|
struct mem_cgroup *ptr);
|
|
|
|
extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr);
|
|
|
|
|
2008-03-05 05:29:08 +07:00
|
|
|
extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
|
|
|
|
gfp_t gfp_mask);
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 09:08:01 +07:00
|
|
|
extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
|
|
|
|
extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru);
|
2011-03-23 06:32:53 +07:00
|
|
|
extern void mem_cgroup_rotate_reclaimable_page(struct page *page);
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 09:08:01 +07:00
|
|
|
extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
|
|
|
|
extern void mem_cgroup_del_lru(struct page *page);
|
|
|
|
extern void mem_cgroup_move_lists(struct page *page,
|
|
|
|
enum lru_list from, enum lru_list to);
|
2009-12-16 07:47:03 +07:00
|
|
|
|
|
|
|
/* For coalescing uncharge for reducing memcg' overhead*/
|
|
|
|
extern void mem_cgroup_uncharge_start(void);
|
|
|
|
extern void mem_cgroup_uncharge_end(void);
|
|
|
|
|
2008-02-07 15:14:41 +07:00
|
|
|
extern void mem_cgroup_uncharge_page(struct page *page);
|
memcg: remove refcnt from page_cgroup
memcg: performance improvements
Patch Description
1/5 ... remove refcnt fron page_cgroup patch (shmem handling is fixed)
2/5 ... swapcache handling patch
3/5 ... add helper function for shmem's memory reclaim patch
4/5 ... optimize by likely/unlikely ppatch
5/5 ... remove redundunt check patch (shmem handling is fixed.)
Unix bench result.
== 2.6.26-rc2-mm1 + memory resource controller
Execl Throughput 2915.4 lps (29.6 secs, 3 samples)
C Compiler Throughput 1019.3 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5796.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1097.7 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 565.3 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1022128.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 544057.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 346481.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 319325.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 148788.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 99051.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2058917.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1606109.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 854789.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 126145.2 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 2915.4 678.0
File Copy 1024 bufsize 2000 maxblocks 3960.0 346481.0 875.0
File Copy 256 bufsize 500 maxblocks 1655.0 99051.0 598.5
File Copy 4096 bufsize 8000 maxblocks 5800.0 854789.0 1473.8
Shell Scripts (8 concurrent) 6.0 1097.7 1829.5
=========
FINAL SCORE 991.3
== 2.6.26-rc2-mm1 + this set ==
Execl Throughput 3012.9 lps (29.9 secs, 3 samples)
C Compiler Throughput 981.0 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5872.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1120.3 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 578.0 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1003993.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 550452.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 347159.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 314644.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 151852.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 101000.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2033256.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1611814.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 847979.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 128148.7 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 3012.9 700.7
File Copy 1024 bufsize 2000 maxblocks 3960.0 347159.0 876.7
File Copy 256 bufsize 500 maxblocks 1655.0 101000.0 610.3
File Copy 4096 bufsize 8000 maxblocks 5800.0 847979.0 1462.0
Shell Scripts (8 concurrent) 6.0 1120.3 1867.2
=========
FINAL SCORE 1004.6
This patch:
Remove refcnt from page_cgroup().
After this,
* A page is charged only when !page_mapped() && no page_cgroup is assigned.
* Anon page is newly mapped.
* File page is added to mapping->tree.
* A page is uncharged only when
* Anon page is fully unmapped.
* File page is removed from LRU.
There is no change in behavior from user's view.
This patch also removes unnecessary calls in rmap.c which was used only for
refcnt mangement.
[akpm@linux-foundation.org: fix warning]
[hugh@veritas.com: fix shmem_unuse_inode charging]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-25 15:47:14 +07:00
|
|
|
extern void mem_cgroup_uncharge_cache_page(struct page *page);
|
2008-07-25 15:47:15 +07:00
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask);
|
|
|
|
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
|
2008-02-07 15:14:03 +07:00
|
|
|
|
2009-12-16 18:19:59 +07:00
|
|
|
extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
|
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead adds an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 15:00:16 +07:00
|
|
|
extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
|
2011-06-16 05:08:13 +07:00
|
|
|
extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm);
|
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead adds an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 15:00:16 +07:00
|
|
|
|
2009-01-08 09:08:07 +07:00
|
|
|
static inline
|
|
|
|
int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
|
|
|
|
{
|
2011-11-03 03:38:15 +07:00
|
|
|
struct mem_cgroup *memcg;
|
2009-01-08 09:08:07 +07:00
|
|
|
rcu_read_lock();
|
2011-11-03 03:38:15 +07:00
|
|
|
memcg = mem_cgroup_from_task(rcu_dereference((mm)->owner));
|
2009-01-08 09:08:07 +07:00
|
|
|
rcu_read_unlock();
|
2011-11-03 03:38:15 +07:00
|
|
|
return cgroup == memcg;
|
2009-01-08 09:08:07 +07:00
|
|
|
}
|
2008-02-07 15:13:53 +07:00
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
|
2009-12-16 18:19:59 +07:00
|
|
|
|
2008-07-25 15:47:10 +07:00
|
|
|
extern int
|
memcg: fix mis-accounting of file mapped racy with migration
FILE_MAPPED per memcg of migrated file cache is not properly updated,
because our hook in page_add_file_rmap() can't know to which memcg
FILE_MAPPED should be counted.
Basically, this patch is for fixing the bug but includes some big changes
to fix up other messes.
Now, at migrating mapped file, events happen in following sequence.
1. allocate a new page.
2. get memcg of an old page.
3. charge ageinst a new page before migration. But at this point,
no changes to new page's page_cgroup, no commit for the charge.
(IOW, PCG_USED bit is not set.)
4. page migration replaces radix-tree, old-page and new-page.
5. page migration remaps the new page if the old page was mapped.
6. Here, the new page is unlocked.
7. memcg commits the charge for newpage, Mark the new page's page_cgroup
as PCG_USED.
Because "commit" happens after page-remap, we can count FILE_MAPPED
at "5", because we should avoid to trust page_cgroup->mem_cgroup.
if PCG_USED bit is unset.
(Note: memcg's LRU removal code does that but LRU-isolation logic is used
for helping it. When we overwrite page_cgroup->mem_cgroup, page_cgroup is
not on LRU or page_cgroup->mem_cgroup is NULL.)
We can lose file_mapped accounting information at 5 because FILE_MAPPED
is updated only when mapcount changes 0->1. So we should catch it.
BTW, historically, above implemntation comes from migration-failure
of anonymous page. Because we charge both of old page and new page
with mapcount=0, we can't catch
- the page is really freed before remap.
- migration fails but it's freed before remap
or .....corner cases.
New migration sequence with memcg is:
1. allocate a new page.
2. mark PageCgroupMigration to the old page.
3. charge against a new page onto the old page's memcg. (here, new page's pc
is marked as PageCgroupUsed.)
4. page migration replaces radix-tree, page table, etc...
5. At remapping, new page's page_cgroup is now makrked as "USED"
We can catch 0->1 event and FILE_MAPPED will be properly updated.
And we can catch SWAPOUT event after unlock this and freeing this
page by unmap() can be caught.
7. Clear PageCgroupMigration of the old page.
So, FILE_MAPPED will be correctly updated.
Then, for what MIGRATION flag is ?
Without it, at migration failure, we may have to charge old page again
because it may be fully unmapped. "charge" means that we have to dive into
memory reclaim or something complated. So, it's better to avoid
charge it again. Before this patch, __commit_charge() was working for
both of the old/new page and fixed up all. But this technique has some
racy condtion around FILE_MAPPED and SWAPOUT etc...
Now, the kernel use MIGRATION flag and don't uncharge old page until
the end of migration.
I hope this change will make memcg's page migration much simpler. This
page migration has caused several troubles. Worth to add a flag for
simplification.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-27 04:42:46 +07:00
|
|
|
mem_cgroup_prepare_migration(struct page *page,
|
2011-03-23 06:30:52 +07:00
|
|
|
struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask);
|
2011-11-03 03:38:15 +07:00
|
|
|
extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
|
2011-01-14 06:47:43 +07:00
|
|
|
struct page *oldpage, struct page *newpage, bool migration_ok);
|
2008-02-07 15:14:10 +07:00
|
|
|
|
2008-02-07 15:14:32 +07:00
|
|
|
/*
|
|
|
|
* For memory reclaim.
|
|
|
|
*/
|
2011-11-03 03:38:23 +07:00
|
|
|
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg,
|
|
|
|
struct zone *zone);
|
|
|
|
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg,
|
|
|
|
struct zone *zone);
|
2011-05-27 06:25:33 +07:00
|
|
|
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
|
2011-05-27 06:25:36 +07:00
|
|
|
unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
|
memcg: consolidate memory cgroup lru stat functions
In mm/memcontrol.c, there are many lru stat functions as..
mem_cgroup_zone_nr_lru_pages
mem_cgroup_node_nr_file_lru_pages
mem_cgroup_nr_file_lru_pages
mem_cgroup_node_nr_anon_lru_pages
mem_cgroup_nr_anon_lru_pages
mem_cgroup_node_nr_unevictable_lru_pages
mem_cgroup_nr_unevictable_lru_pages
mem_cgroup_node_nr_lru_pages
mem_cgroup_nr_lru_pages
mem_cgroup_get_local_zonestat
Some of them are under #ifdef MAX_NUMNODES >1 and others are not.
This seems bad. This patch consolidates all functions into
mem_cgroup_zone_nr_lru_pages()
mem_cgroup_node_nr_lru_pages()
mem_cgroup_nr_lru_pages()
For these functions, "which LRU?" information is passed by a mask.
example:
mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON))
And I added some macro as ALL_LRU, ALL_LRU_FILE, ALL_LRU_ANON.
example:
mem_cgroup_nr_lru_pages(mem, ALL_LRU)
BTW, considering layout of NUMA memory placement of counters, this patch seems
to be better.
Now, when we gather all LRU information, we scan in following orer
for_each_lru -> for_each_node -> for_each_zone.
This means we'll touch cache lines in different node in turn.
After patch, we'll scan
for_each_node -> for_each_zone -> for_each_lru(mask)
Then, we'll gather information in the same cacheline at once.
[akpm@linux-foundation.org: fix warnigns, build error]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-07-27 06:08:22 +07:00
|
|
|
int nid, int zid, unsigned int lrumask);
|
2009-01-08 09:08:20 +07:00
|
|
|
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
|
|
|
|
struct zone *zone);
|
|
|
|
struct zone_reclaim_stat*
|
|
|
|
mem_cgroup_get_reclaim_stat_from_page(struct page *page);
|
2009-04-03 06:57:39 +07:00
|
|
|
extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
|
|
|
|
struct task_struct *p);
|
2008-02-07 15:14:32 +07:00
|
|
|
|
2009-01-08 09:07:57 +07:00
|
|
|
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
|
|
|
extern int do_swap_account;
|
|
|
|
#endif
|
2009-01-08 09:08:02 +07:00
|
|
|
|
|
|
|
static inline bool mem_cgroup_disabled(void)
|
|
|
|
{
|
|
|
|
if (mem_cgroup_subsys.disabled)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2011-01-14 06:47:37 +07:00
|
|
|
void mem_cgroup_update_page_stat(struct page *page,
|
|
|
|
enum mem_cgroup_page_stat_item idx,
|
|
|
|
int val);
|
|
|
|
|
|
|
|
static inline void mem_cgroup_inc_page_stat(struct page *page,
|
|
|
|
enum mem_cgroup_page_stat_item idx)
|
|
|
|
{
|
|
|
|
mem_cgroup_update_page_stat(page, idx, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_dec_page_stat(struct page *page,
|
|
|
|
enum mem_cgroup_page_stat_item idx)
|
|
|
|
{
|
|
|
|
mem_cgroup_update_page_stat(page, idx, -1);
|
|
|
|
}
|
|
|
|
|
2009-09-24 05:56:39 +07:00
|
|
|
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
2011-05-27 06:25:25 +07:00
|
|
|
gfp_t gfp_mask,
|
|
|
|
unsigned long *total_scanned);
|
2011-11-03 03:38:15 +07:00
|
|
|
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg);
|
oom: badness heuristic rewrite
This a complete rewrite of the oom killer's badness() heuristic which is
used to determine which task to kill in oom conditions. The goal is to
make it as simple and predictable as possible so the results are better
understood and we end up killing the task which will lead to the most
memory freeing while still respecting the fine-tuning from userspace.
Instead of basing the heuristic on mm->total_vm for each task, the task's
rss and swap space is used instead. This is a better indication of the
amount of memory that will be freeable if the oom killed task is chosen
and subsequently exits. This helps specifically in cases where KDE or
GNOME is chosen for oom kill on desktop systems instead of a memory
hogging task.
The baseline for the heuristic is a proportion of memory that each task is
currently using in memory plus swap compared to the amount of "allowable"
memory. "Allowable," in this sense, means the system-wide resources for
unconstrained oom conditions, the set of mempolicy nodes, the mems
attached to current's cpuset, or a memory controller's limit. The
proportion is given on a scale of 0 (never kill) to 1000 (always kill),
roughly meaning that if a task has a badness() score of 500 that the task
consumes approximately 50% of allowable memory resident in RAM or in swap
space.
The proportion is always relative to the amount of "allowable" memory and
not the total amount of RAM systemwide so that mempolicies and cpusets may
operate in isolation; they shall not need to know the true size of the
machine on which they are running if they are bound to a specific set of
nodes or mems, respectively.
Root tasks are given 3% extra memory just like __vm_enough_memory()
provides in LSMs. In the event of two tasks consuming similar amounts of
memory, it is generally better to save root's task.
Because of the change in the badness() heuristic's baseline, it is also
necessary to introduce a new user interface to tune it. It's not possible
to redefine the meaning of /proc/pid/oom_adj with a new scale since the
ABI cannot be changed for backward compatability. Instead, a new tunable,
/proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may
be used to polarize the heuristic such that certain tasks are never
considered for oom kill while others may always be considered. The value
is added directly into the badness() score so a value of -500, for
example, means to discount 50% of its memory consumption in comparison to
other tasks either on the system, bound to the mempolicy, in the cpuset,
or sharing the same memory controller.
/proc/pid/oom_adj is changed so that its meaning is rescaled into the
units used by /proc/pid/oom_score_adj, and vice versa. Changing one of
these per-task tunables will rescale the value of the other to an
equivalent meaning. Although /proc/pid/oom_adj was originally defined as
a bitshift on the badness score, it now shares the same linear growth as
/proc/pid/oom_score_adj but with different granularity. This is required
so the ABI is not broken with userspace applications and allows oom_adj to
be deprecated for future removal.
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-08-10 07:19:46 +07:00
|
|
|
|
2011-05-27 06:25:38 +07:00
|
|
|
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
|
2011-01-21 05:44:24 +07:00
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail);
|
|
|
|
#endif
|
|
|
|
|
2011-03-24 06:42:25 +07:00
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
bool mem_cgroup_bad_page_check(struct page *page);
|
|
|
|
void mem_cgroup_print_bad_page(struct page *page);
|
|
|
|
#endif
|
2008-10-19 10:28:16 +07:00
|
|
|
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
|
2009-01-08 09:07:48 +07:00
|
|
|
struct mem_cgroup;
|
|
|
|
|
|
|
|
static inline int mem_cgroup_newpage_charge(struct page *page,
|
2008-03-05 05:29:08 +07:00
|
|
|
struct mm_struct *mm, gfp_t gfp_mask)
|
2008-02-07 15:13:53 +07:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-03-05 05:29:08 +07:00
|
|
|
static inline int mem_cgroup_cache_charge(struct page *page,
|
|
|
|
struct mm_struct *mm, gfp_t gfp_mask)
|
2008-02-07 15:13:53 +07:00
|
|
|
{
|
2008-03-05 05:29:08 +07:00
|
|
|
return 0;
|
2008-02-07 15:13:53 +07:00
|
|
|
}
|
|
|
|
|
2009-01-08 09:08:00 +07:00
|
|
|
static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
|
|
|
|
struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr)
|
2009-01-08 09:07:48 +07:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_commit_charge_swapin(struct page *page,
|
|
|
|
struct mem_cgroup *ptr)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2009-12-16 07:47:03 +07:00
|
|
|
static inline void mem_cgroup_uncharge_start(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_uncharge_end(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2008-02-07 15:13:53 +07:00
|
|
|
static inline void mem_cgroup_uncharge_page(struct page *page)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
memcg: remove refcnt from page_cgroup
memcg: performance improvements
Patch Description
1/5 ... remove refcnt fron page_cgroup patch (shmem handling is fixed)
2/5 ... swapcache handling patch
3/5 ... add helper function for shmem's memory reclaim patch
4/5 ... optimize by likely/unlikely ppatch
5/5 ... remove redundunt check patch (shmem handling is fixed.)
Unix bench result.
== 2.6.26-rc2-mm1 + memory resource controller
Execl Throughput 2915.4 lps (29.6 secs, 3 samples)
C Compiler Throughput 1019.3 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5796.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1097.7 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 565.3 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1022128.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 544057.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 346481.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 319325.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 148788.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 99051.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2058917.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1606109.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 854789.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 126145.2 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 2915.4 678.0
File Copy 1024 bufsize 2000 maxblocks 3960.0 346481.0 875.0
File Copy 256 bufsize 500 maxblocks 1655.0 99051.0 598.5
File Copy 4096 bufsize 8000 maxblocks 5800.0 854789.0 1473.8
Shell Scripts (8 concurrent) 6.0 1097.7 1829.5
=========
FINAL SCORE 991.3
== 2.6.26-rc2-mm1 + this set ==
Execl Throughput 3012.9 lps (29.9 secs, 3 samples)
C Compiler Throughput 981.0 lpm (60.0 secs, 3 samples)
Shell Scripts (1 concurrent) 5872.0 lpm (60.0 secs, 3 samples)
Shell Scripts (8 concurrent) 1120.3 lpm (60.0 secs, 3 samples)
Shell Scripts (16 concurrent) 578.0 lpm (60.0 secs, 3 samples)
File Read 1024 bufsize 2000 maxblocks 1003993.0 KBps (30.0 secs, 3 samples)
File Write 1024 bufsize 2000 maxblocks 550452.0 KBps (30.0 secs, 3 samples)
File Copy 1024 bufsize 2000 maxblocks 347159.0 KBps (30.0 secs, 3 samples)
File Read 256 bufsize 500 maxblocks 314644.0 KBps (30.0 secs, 3 samples)
File Write 256 bufsize 500 maxblocks 151852.0 KBps (30.0 secs, 3 samples)
File Copy 256 bufsize 500 maxblocks 101000.0 KBps (30.0 secs, 3 samples)
File Read 4096 bufsize 8000 maxblocks 2033256.0 KBps (30.0 secs, 3 samples)
File Write 4096 bufsize 8000 maxblocks 1611814.0 KBps (30.0 secs, 3 samples)
File Copy 4096 bufsize 8000 maxblocks 847979.0 KBps (30.0 secs, 3 samples)
Dc: sqrt(2) to 99 decimal places 128148.7 lpm (30.0 secs, 3 samples)
INDEX VALUES
TEST BASELINE RESULT INDEX
Execl Throughput 43.0 3012.9 700.7
File Copy 1024 bufsize 2000 maxblocks 3960.0 347159.0 876.7
File Copy 256 bufsize 500 maxblocks 1655.0 101000.0 610.3
File Copy 4096 bufsize 8000 maxblocks 5800.0 847979.0 1462.0
Shell Scripts (8 concurrent) 6.0 1120.3 1867.2
=========
FINAL SCORE 1004.6
This patch:
Remove refcnt from page_cgroup().
After this,
* A page is charged only when !page_mapped() && no page_cgroup is assigned.
* Anon page is newly mapped.
* File page is added to mapping->tree.
* A page is uncharged only when
* Anon page is fully unmapped.
* File page is removed from LRU.
There is no change in behavior from user's view.
This patch also removes unnecessary calls in rmap.c which was used only for
refcnt mangement.
[akpm@linux-foundation.org: fix warning]
[hugh@veritas.com: fix shmem_unuse_inode charging]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-25 15:47:14 +07:00
|
|
|
static inline void mem_cgroup_uncharge_cache_page(struct page *page)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 09:08:01 +07:00
|
|
|
static inline void mem_cgroup_add_lru_list(struct page *page, int lru)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_del_lru_list(struct page *page, int lru)
|
|
|
|
{
|
|
|
|
return ;
|
|
|
|
}
|
|
|
|
|
2011-04-15 05:21:52 +07:00
|
|
|
static inline void mem_cgroup_rotate_reclaimable_page(struct page *page)
|
2011-03-23 06:32:53 +07:00
|
|
|
{
|
|
|
|
return ;
|
|
|
|
}
|
|
|
|
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 09:08:01 +07:00
|
|
|
static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru)
|
|
|
|
{
|
|
|
|
return ;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_del_lru(struct page *page)
|
|
|
|
{
|
|
|
|
return ;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to)
|
2008-02-07 15:13:56 +07:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2009-12-16 18:19:59 +07:00
|
|
|
static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2011-06-16 05:08:13 +07:00
|
|
|
static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static inline int mm_match_cgroup(struct mm_struct *mm,
|
|
|
|
struct mem_cgroup *memcg)
|
2008-02-07 15:14:01 +07:00
|
|
|
{
|
2008-02-09 15:10:15 +07:00
|
|
|
return 1;
|
2008-02-07 15:14:01 +07:00
|
|
|
}
|
|
|
|
|
2008-02-07 15:14:06 +07:00
|
|
|
static inline int task_in_mem_cgroup(struct task_struct *task,
|
2011-11-03 03:38:15 +07:00
|
|
|
const struct mem_cgroup *memcg)
|
2008-02-07 15:14:06 +07:00
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static inline struct cgroup_subsys_state
|
|
|
|
*mem_cgroup_css(struct mem_cgroup *memcg)
|
2009-12-16 18:19:59 +07:00
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2008-07-25 15:47:10 +07:00
|
|
|
static inline int
|
memcg: fix mis-accounting of file mapped racy with migration
FILE_MAPPED per memcg of migrated file cache is not properly updated,
because our hook in page_add_file_rmap() can't know to which memcg
FILE_MAPPED should be counted.
Basically, this patch is for fixing the bug but includes some big changes
to fix up other messes.
Now, at migrating mapped file, events happen in following sequence.
1. allocate a new page.
2. get memcg of an old page.
3. charge ageinst a new page before migration. But at this point,
no changes to new page's page_cgroup, no commit for the charge.
(IOW, PCG_USED bit is not set.)
4. page migration replaces radix-tree, old-page and new-page.
5. page migration remaps the new page if the old page was mapped.
6. Here, the new page is unlocked.
7. memcg commits the charge for newpage, Mark the new page's page_cgroup
as PCG_USED.
Because "commit" happens after page-remap, we can count FILE_MAPPED
at "5", because we should avoid to trust page_cgroup->mem_cgroup.
if PCG_USED bit is unset.
(Note: memcg's LRU removal code does that but LRU-isolation logic is used
for helping it. When we overwrite page_cgroup->mem_cgroup, page_cgroup is
not on LRU or page_cgroup->mem_cgroup is NULL.)
We can lose file_mapped accounting information at 5 because FILE_MAPPED
is updated only when mapcount changes 0->1. So we should catch it.
BTW, historically, above implemntation comes from migration-failure
of anonymous page. Because we charge both of old page and new page
with mapcount=0, we can't catch
- the page is really freed before remap.
- migration fails but it's freed before remap
or .....corner cases.
New migration sequence with memcg is:
1. allocate a new page.
2. mark PageCgroupMigration to the old page.
3. charge against a new page onto the old page's memcg. (here, new page's pc
is marked as PageCgroupUsed.)
4. page migration replaces radix-tree, page table, etc...
5. At remapping, new page's page_cgroup is now makrked as "USED"
We can catch 0->1 event and FILE_MAPPED will be properly updated.
And we can catch SWAPOUT event after unlock this and freeing this
page by unmap() can be caught.
7. Clear PageCgroupMigration of the old page.
So, FILE_MAPPED will be correctly updated.
Then, for what MIGRATION flag is ?
Without it, at migration failure, we may have to charge old page again
because it may be fully unmapped. "charge" means that we have to dive into
memory reclaim or something complated. So, it's better to avoid
charge it again. Before this patch, __commit_charge() was working for
both of the old/new page and fixed up all. But this technique has some
racy condtion around FILE_MAPPED and SWAPOUT etc...
Now, the kernel use MIGRATION flag and don't uncharge old page until
the end of migration.
I hope this change will make memcg's page migration much simpler. This
page migration has caused several troubles. Worth to add a flag for
simplification.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-27 04:42:46 +07:00
|
|
|
mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
|
2011-03-23 06:30:52 +07:00
|
|
|
struct mem_cgroup **ptr, gfp_t gfp_mask)
|
2008-02-07 15:14:10 +07:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
|
2011-01-14 06:47:43 +07:00
|
|
|
struct page *oldpage, struct page *newpage, bool migration_ok)
|
2008-02-07 15:14:10 +07:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg)
|
2008-02-07 15:14:34 +07:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *memcg,
|
2008-02-07 15:14:34 +07:00
|
|
|
int priority)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *memcg,
|
2008-02-07 15:14:34 +07:00
|
|
|
int priority)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2009-01-08 09:08:02 +07:00
|
|
|
static inline bool mem_cgroup_disabled(void)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
2009-01-08 09:08:08 +07:00
|
|
|
|
2009-01-08 09:08:18 +07:00
|
|
|
static inline int
|
2011-11-03 03:38:23 +07:00
|
|
|
mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
|
2009-01-08 09:08:18 +07:00
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2009-06-17 05:32:28 +07:00
|
|
|
static inline int
|
2011-11-03 03:38:23 +07:00
|
|
|
mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
|
2009-06-17 05:32:28 +07:00
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2009-01-08 09:08:19 +07:00
|
|
|
static inline unsigned long
|
memcg: consolidate memory cgroup lru stat functions
In mm/memcontrol.c, there are many lru stat functions as..
mem_cgroup_zone_nr_lru_pages
mem_cgroup_node_nr_file_lru_pages
mem_cgroup_nr_file_lru_pages
mem_cgroup_node_nr_anon_lru_pages
mem_cgroup_nr_anon_lru_pages
mem_cgroup_node_nr_unevictable_lru_pages
mem_cgroup_nr_unevictable_lru_pages
mem_cgroup_node_nr_lru_pages
mem_cgroup_nr_lru_pages
mem_cgroup_get_local_zonestat
Some of them are under #ifdef MAX_NUMNODES >1 and others are not.
This seems bad. This patch consolidates all functions into
mem_cgroup_zone_nr_lru_pages()
mem_cgroup_node_nr_lru_pages()
mem_cgroup_nr_lru_pages()
For these functions, "which LRU?" information is passed by a mask.
example:
mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON))
And I added some macro as ALL_LRU, ALL_LRU_FILE, ALL_LRU_ANON.
example:
mem_cgroup_nr_lru_pages(mem, ALL_LRU)
BTW, considering layout of NUMA memory placement of counters, this patch seems
to be better.
Now, when we gather all LRU information, we scan in following orer
for_each_lru -> for_each_node -> for_each_zone.
This means we'll touch cache lines in different node in turn.
After patch, we'll scan
for_each_node -> for_each_zone -> for_each_lru(mask)
Then, we'll gather information in the same cacheline at once.
[akpm@linux-foundation.org: fix warnigns, build error]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-07-27 06:08:22 +07:00
|
|
|
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
|
|
|
|
unsigned int lru_mask)
|
2009-01-08 09:08:19 +07:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2009-01-08 09:08:20 +07:00
|
|
|
static inline struct zone_reclaim_stat*
|
|
|
|
mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct zone_reclaim_stat*
|
|
|
|
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2009-04-03 06:57:39 +07:00
|
|
|
static inline void
|
|
|
|
mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2011-01-14 06:47:37 +07:00
|
|
|
static inline void mem_cgroup_inc_page_stat(struct page *page,
|
|
|
|
enum mem_cgroup_page_stat_item idx)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mem_cgroup_dec_page_stat(struct page *page,
|
|
|
|
enum mem_cgroup_page_stat_item idx)
|
2009-06-18 06:26:34 +07:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2009-09-24 05:56:39 +07:00
|
|
|
static inline
|
|
|
|
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
2011-05-27 06:25:25 +07:00
|
|
|
gfp_t gfp_mask,
|
|
|
|
unsigned long *total_scanned)
|
2009-09-24 05:56:39 +07:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
oom: badness heuristic rewrite
This a complete rewrite of the oom killer's badness() heuristic which is
used to determine which task to kill in oom conditions. The goal is to
make it as simple and predictable as possible so the results are better
understood and we end up killing the task which will lead to the most
memory freeing while still respecting the fine-tuning from userspace.
Instead of basing the heuristic on mm->total_vm for each task, the task's
rss and swap space is used instead. This is a better indication of the
amount of memory that will be freeable if the oom killed task is chosen
and subsequently exits. This helps specifically in cases where KDE or
GNOME is chosen for oom kill on desktop systems instead of a memory
hogging task.
The baseline for the heuristic is a proportion of memory that each task is
currently using in memory plus swap compared to the amount of "allowable"
memory. "Allowable," in this sense, means the system-wide resources for
unconstrained oom conditions, the set of mempolicy nodes, the mems
attached to current's cpuset, or a memory controller's limit. The
proportion is given on a scale of 0 (never kill) to 1000 (always kill),
roughly meaning that if a task has a badness() score of 500 that the task
consumes approximately 50% of allowable memory resident in RAM or in swap
space.
The proportion is always relative to the amount of "allowable" memory and
not the total amount of RAM systemwide so that mempolicies and cpusets may
operate in isolation; they shall not need to know the true size of the
machine on which they are running if they are bound to a specific set of
nodes or mems, respectively.
Root tasks are given 3% extra memory just like __vm_enough_memory()
provides in LSMs. In the event of two tasks consuming similar amounts of
memory, it is generally better to save root's task.
Because of the change in the badness() heuristic's baseline, it is also
necessary to introduce a new user interface to tune it. It's not possible
to redefine the meaning of /proc/pid/oom_adj with a new scale since the
ABI cannot be changed for backward compatability. Instead, a new tunable,
/proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may
be used to polarize the heuristic such that certain tasks are never
considered for oom kill while others may always be considered. The value
is added directly into the badness() score so a value of -500, for
example, means to discount 50% of its memory consumption in comparison to
other tasks either on the system, bound to the mempolicy, in the cpuset,
or sharing the same memory controller.
/proc/pid/oom_adj is changed so that its meaning is rescaled into the
units used by /proc/pid/oom_score_adj, and vice versa. Changing one of
these per-task tunables will rescale the value of the other to an
equivalent meaning. Although /proc/pid/oom_adj was originally defined as
a bitshift on the badness score, it now shares the same linear growth as
/proc/pid/oom_score_adj but with different granularity. This is required
so the ABI is not broken with userspace applications and allows oom_adj to
be deprecated for future removal.
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-08-10 07:19:46 +07:00
|
|
|
static inline
|
2011-11-03 03:38:15 +07:00
|
|
|
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
|
oom: badness heuristic rewrite
This a complete rewrite of the oom killer's badness() heuristic which is
used to determine which task to kill in oom conditions. The goal is to
make it as simple and predictable as possible so the results are better
understood and we end up killing the task which will lead to the most
memory freeing while still respecting the fine-tuning from userspace.
Instead of basing the heuristic on mm->total_vm for each task, the task's
rss and swap space is used instead. This is a better indication of the
amount of memory that will be freeable if the oom killed task is chosen
and subsequently exits. This helps specifically in cases where KDE or
GNOME is chosen for oom kill on desktop systems instead of a memory
hogging task.
The baseline for the heuristic is a proportion of memory that each task is
currently using in memory plus swap compared to the amount of "allowable"
memory. "Allowable," in this sense, means the system-wide resources for
unconstrained oom conditions, the set of mempolicy nodes, the mems
attached to current's cpuset, or a memory controller's limit. The
proportion is given on a scale of 0 (never kill) to 1000 (always kill),
roughly meaning that if a task has a badness() score of 500 that the task
consumes approximately 50% of allowable memory resident in RAM or in swap
space.
The proportion is always relative to the amount of "allowable" memory and
not the total amount of RAM systemwide so that mempolicies and cpusets may
operate in isolation; they shall not need to know the true size of the
machine on which they are running if they are bound to a specific set of
nodes or mems, respectively.
Root tasks are given 3% extra memory just like __vm_enough_memory()
provides in LSMs. In the event of two tasks consuming similar amounts of
memory, it is generally better to save root's task.
Because of the change in the badness() heuristic's baseline, it is also
necessary to introduce a new user interface to tune it. It's not possible
to redefine the meaning of /proc/pid/oom_adj with a new scale since the
ABI cannot be changed for backward compatability. Instead, a new tunable,
/proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may
be used to polarize the heuristic such that certain tasks are never
considered for oom kill while others may always be considered. The value
is added directly into the badness() score so a value of -500, for
example, means to discount 50% of its memory consumption in comparison to
other tasks either on the system, bound to the mempolicy, in the cpuset,
or sharing the same memory controller.
/proc/pid/oom_adj is changed so that its meaning is rescaled into the
units used by /proc/pid/oom_score_adj, and vice versa. Changing one of
these per-task tunables will rescale the value of the other to an
equivalent meaning. Although /proc/pid/oom_adj was originally defined as
a bitshift on the badness score, it now shares the same linear growth as
/proc/pid/oom_score_adj but with different granularity. This is required
so the ABI is not broken with userspace applications and allows oom_adj to
be deprecated for future removal.
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-08-10 07:19:46 +07:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-01-21 05:44:24 +07:00
|
|
|
static inline void mem_cgroup_split_huge_fixup(struct page *head,
|
|
|
|
struct page *tail)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2011-05-27 06:25:38 +07:00
|
|
|
static inline
|
|
|
|
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
|
|
|
|
{
|
|
|
|
}
|
2008-02-07 15:13:51 +07:00
|
|
|
#endif /* CONFIG_CGROUP_MEM_CONT */
|
|
|
|
|
2011-03-24 06:42:25 +07:00
|
|
|
#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
|
|
|
|
static inline bool
|
|
|
|
mem_cgroup_bad_page_check(struct page *page)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
mem_cgroup_print_bad_page(struct page *page)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2008-02-07 15:13:50 +07:00
|
|
|
#endif /* _LINUX_MEMCONTROL_H */
|
|
|
|
|