2019-05-27 13:55:06 +07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2008-02-07 15:13:50 +07:00
|
|
|
/* memcontrol.c - Memory Controller
|
|
|
|
*
|
|
|
|
* Copyright IBM Corporation, 2007
|
|
|
|
* Author Balbir Singh <balbir@linux.vnet.ibm.com>
|
|
|
|
*
|
2008-02-07 15:13:51 +07:00
|
|
|
* Copyright 2007 OpenVZ SWsoft Inc
|
|
|
|
* Author: Pavel Emelianov <xemul@openvz.org>
|
|
|
|
*
|
2010-03-11 06:22:24 +07:00
|
|
|
* Memory thresholds
|
|
|
|
* Copyright (C) 2009 Nokia Corporation
|
|
|
|
* Author: Kirill A. Shutemov
|
|
|
|
*
|
2012-12-19 05:21:56 +07:00
|
|
|
* Kernel Memory Controller
|
|
|
|
* Copyright (C) 2012 Parallels Inc. and Google Inc.
|
|
|
|
* Authors: Glauber Costa and Suleiman Souhlal
|
|
|
|
*
|
2015-04-15 05:44:51 +07:00
|
|
|
* Native page reclaim
|
|
|
|
* Charge lifetime sanitation
|
|
|
|
* Lockless page tracking & accounting
|
|
|
|
* Unified hierarchy configuration model
|
|
|
|
* Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
|
2008-02-07 15:13:50 +07:00
|
|
|
*/
|
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
#include <linux/page_counter.h>
|
2008-02-07 15:13:50 +07:00
|
|
|
#include <linux/memcontrol.h>
|
|
|
|
#include <linux/cgroup.h>
|
2019-08-28 21:19:53 +07:00
|
|
|
#include <linux/pagewalk.h>
|
2017-02-09 00:51:29 +07:00
|
|
|
#include <linux/sched/mm.h>
|
2017-02-25 05:59:36 +07:00
|
|
|
#include <linux/shmem_fs.h>
|
2010-03-11 06:22:14 +07:00
|
|
|
#include <linux/hugetlb.h>
|
2009-01-08 09:07:56 +07:00
|
|
|
#include <linux/pagemap.h>
|
2019-03-06 06:48:09 +07:00
|
|
|
#include <linux/vm_event_item.h>
|
2008-02-07 15:14:24 +07:00
|
|
|
#include <linux/smp.h>
|
2008-02-07 15:13:53 +07:00
|
|
|
#include <linux/page-flags.h>
|
2008-02-07 15:13:56 +07:00
|
|
|
#include <linux/backing-dev.h>
|
2008-02-07 15:13:53 +07:00
|
|
|
#include <linux/bit_spinlock.h>
|
|
|
|
#include <linux/rcupdate.h>
|
2009-04-03 06:57:39 +07:00
|
|
|
#include <linux/limits.h>
|
2011-05-27 03:00:52 +07:00
|
|
|
#include <linux/export.h>
|
2009-01-08 09:08:00 +07:00
|
|
|
#include <linux/mutex.h>
|
2013-09-25 05:27:40 +07:00
|
|
|
#include <linux/rbtree.h>
|
2008-04-29 15:00:19 +07:00
|
|
|
#include <linux/slab.h>
|
2008-02-07 15:13:56 +07:00
|
|
|
#include <linux/swap.h>
|
2010-03-11 06:22:17 +07:00
|
|
|
#include <linux/swapops.h>
|
2008-02-07 15:13:56 +07:00
|
|
|
#include <linux/spinlock.h>
|
2010-03-11 06:22:24 +07:00
|
|
|
#include <linux/eventfd.h>
|
2013-11-23 06:20:42 +07:00
|
|
|
#include <linux/poll.h>
|
2010-03-11 06:22:24 +07:00
|
|
|
#include <linux/sort.h>
|
2008-02-07 15:13:56 +07:00
|
|
|
#include <linux/fs.h>
|
2008-02-07 15:14:25 +07:00
|
|
|
#include <linux/seq_file.h>
|
memcg: add memory.pressure_level events
With this patch userland applications that want to maintain the
interactivity/memory allocation cost can use the pressure level
notifications. The levels are defined like this:
The "low" level means that the system is reclaiming memory for new
allocations. Monitoring this reclaiming activity might be useful for
maintaining cache level. Upon notification, the program (typically
"Activity Manager") might analyze vmstat and act in advance (i.e.
prematurely shutdown unimportant services).
The "medium" level means that the system is experiencing medium memory
pressure, the system might be making swap, paging out active file
caches, etc. Upon this event applications may decide to further analyze
vmstat/zoneinfo/memcg or internal memory usage statistics and free any
resources that can be easily reconstructed or re-read from a disk.
The "critical" level means that the system is actively thrashing, it is
about to out of memory (OOM) or even the in-kernel OOM killer is on its
way to trigger. Applications should do whatever they can to help the
system. It might be too late to consult with vmstat or any other
statistics, so it's advisable to take an immediate action.
The events are propagated upward until the event is handled, i.e. the
events are not pass-through. Here is what this means: for example you
have three cgroups: A->B->C. Now you set up an event listener on
cgroups A, B and C, and suppose group C experiences some pressure. In
this situation, only group C will receive the notification, i.e. groups
A and B will not receive it. This is done to avoid excessive
"broadcasting" of messages, which disturbs the system and which is
especially bad if we are low on memory or thrashing. So, organize the
cgroups wisely, or propagate the events manually (or, ask us to
implement the pass-through events, explaining why would you need them.)
Performance wise, the memory pressure notifications feature itself is
lightweight and does not require much of bookkeeping, in contrast to the
rest of memcg features. Unfortunately, as of current memcg
implementation, pages accounting is an inseparable part and cannot be
turned off. The good news is that there are some efforts[1] to improve
the situation; plus, implementing the same, fully API-compatible[2]
interface for CONFIG_MEMCG=n case (e.g. embedded) is also a viable
option, so it will not require any changes on the userland side.
[1] http://permalink.gmane.org/gmane.linux.kernel.cgroups/6291
[2] http://lkml.org/lkml/2013/2/21/454
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix CONFIG_CGROPUPS=n warnings]
Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
Acked-by: Kirill A. Shutemov <kirill@shutemov.name>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Leonid Moiseichuk <leonid.moiseichuk@nokia.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: John Stultz <john.stultz@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-30 05:08:31 +07:00
|
|
|
#include <linux/vmpressure.h>
|
2008-10-19 10:26:14 +07:00
|
|
|
#include <linux/mm_inline.h>
|
2014-12-11 06:44:55 +07:00
|
|
|
#include <linux/swap_cgroup.h>
|
2009-12-16 07:47:08 +07:00
|
|
|
#include <linux/cpu.h>
|
2010-08-11 08:03:00 +07:00
|
|
|
#include <linux/oom.h>
|
2013-11-01 06:34:14 +07:00
|
|
|
#include <linux/lockdep.h>
|
2013-11-23 06:20:42 +07:00
|
|
|
#include <linux/file.h>
|
memcg: punt high overage reclaim to return-to-userland path
Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped. If a process performs only
speculative allocations, it can blow way past the high limit. This is
actually easily reproducible by simply doing "find /". slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.
This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path. If try_charge() detects that high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.
As long as kernel doesn't have a run-away allocation spree, this should
provide enough protection while making kmemcg behave more consistently.
It also has the following benefits.
- All over-high reclaims can use GFP_KERNEL regardless of the specific
gfp mask in use, e.g. GFP_NOFS, when the limit was breached.
- It copes with prio inversion. Previously, a low-prio task with
small memory.high might perform over-high reclaim with a bunch of
locks held. If a higher prio task needed any of these locks, it
would have to wait until the low prio task finished reclaim and
released the locks. By handing over-high reclaim to the task exit
path this issue can be avoided.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:11 +07:00
|
|
|
#include <linux/tracehook.h>
|
mm, memcg: throttle allocators when failing reclaim over memory.high
We're trying to use memory.high to limit workloads, but have found that
containment can frequently fail completely and cause OOM situations
outside of the cgroup. This happens especially with swap space -- either
when none is configured, or swap is full. These failures often also don't
have enough warning to allow one to react, whether for a human or for a
daemon monitoring PSI.
Here is output from a simple program showing how long it takes in usec
(column 2) to allocate a megabyte of anonymous memory (column 1) when a
cgroup is already beyond its memory high setting, and no swap is
available:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1035
96 1038
97 1000
98 1036
99 1048
100 1590
101 1968
102 1776
103 1863
104 1757
105 1921
106 1893
107 1760
108 1748
109 1843
110 1716
111 1924
112 1776
113 1831
114 1766
115 1836
116 1588
117 1912
118 1802
119 1857
120 1731
[...]
[System OOM in 2-3 seconds]
The delay does go up extremely marginally past the 100MB memory.high
threshold, as now we spend time scanning before returning to usermode, but
it's nowhere near enough to contain growth. It also doesn't get worse the
more pages you have, since it only considers nr_pages.
The current situation goes against both the expectations of users of
memory.high, and our intentions as cgroup v2 developers. In
cgroup-v2.txt, we claim that we will throttle and only under "extreme
conditions" will memory.high protection be breached. Likewise, cgroup v2
users generally also expect that memory.high should throttle workloads as
they exceed their high threshold. However, as seen above, this isn't
always how it works in practice -- even on banal setups like those with no
swap, or where swap has become exhausted, we can end up with memory.high
being breached and us having no weapons left in our arsenal to combat
runaway growth with, since reclaim is futile.
It's also hard for system monitoring software or users to tell how bad the
situation is, as "high" events for the memcg may in some cases be benign,
and in others be catastrophic. The current status quo is that we fail
containment in a way that doesn't provide any advance warning that things
are about to go horribly wrong (for example, we are about to invoke the
kernel OOM killer).
This patch introduces explicit throttling when reclaim is failing to keep
memcg size contained at the memory.high setting. It does so by applying
an exponential delay curve derived from the memcg's overage compared to
memory.high. In the normal case where the memcg is either below or only
marginally over its memory.high setting, no throttling will be performed.
This composes well with system health monitoring and remediation, as these
allocator delays are factored into PSI's memory pressure calculations.
This both creates a mechanism system administrators or applications
consuming the PSI interface to trivially see that the memcg in question is
struggling and use that to make more reasonable decisions, and permits
them enough time to act. Either of these can act with significantly more
nuance than that we can provide using the system OOM killer.
This is a similar idea to memory.oom_control in cgroup v1 which would put
the cgroup to sleep if the threshold was violated, but it's also
significantly improved as it results in visible memory pressure, and also
doesn't schedule indefinitely, which previously made tracing and other
introspection difficult (ie. it's clamped at 2*HZ per allocation through
MEMCG_MAX_HIGH_DELAY_JIFFIES).
Contrast the previous results with a kernel with this patch:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1002
96 1000
97 1002
98 1003
99 1000
100 1043
101 84724
102 330628
103 610511
104 1016265
105 1503969
106 2391692
107 2872061
108 3248003
109 4791904
110 5759832
111 6912509
112 8127818
113 9472203
114 12287622
115 12480079
116 14144008
117 15808029
118 16384500
119 16383242
120 16384979
[...]
As you can see, in the normal case, memory allocation takes around 1000
usec. However, as we exceed our memory.high, things start to increase
exponentially, but fairly leniently at first. Our first megabyte over
memory.high takes us 0.16 seconds, then the next is 0.46 seconds, then the
next is almost an entire second. This gets worse until we reach our
eventual 2*HZ clamp per batch, resulting in 16 seconds per megabyte.
However, this is still making forward progress, so permits tracing or
further analysis with programs like GDB.
We use an exponential curve for our delay penalty for a few reasons:
1. We run mem_cgroup_handle_over_high to potentially do reclaim after
we've already performed allocations, which means that temporarily
going over memory.high by a small amount may be perfectly legitimate,
even for compliant workloads. We don't want to unduly penalise such
cases.
2. An exponential curve (as opposed to a static or linear delay) allows
ramping up memory pressure stats more gradually, which can be useful
to work out that you have set memory.high too low, without destroying
application performance entirely.
This patch expands on earlier work by Johannes Weiner. Thanks!
[akpm@linux-foundation.org: fix max() warning]
[akpm@linux-foundation.org: fix __udivdi3 ref on 32-bit]
[akpm@linux-foundation.org: fix it even more]
[chris@chrisdown.name: fix 64-bit divide even more]
Link: http://lkml.kernel.org/r/20190723180700.GA29459@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-24 05:34:55 +07:00
|
|
|
#include <linux/psi.h>
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
#include <linux/seq_buf.h>
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 09:08:01 +07:00
|
|
|
#include "internal.h"
|
2011-12-12 04:47:04 +07:00
|
|
|
#include <net/sock.h>
|
2012-10-09 06:33:10 +07:00
|
|
|
#include <net/ip.h>
|
2013-11-13 06:08:22 +07:00
|
|
|
#include "slab.h"
|
2008-02-07 15:13:50 +07:00
|
|
|
|
2016-12-25 02:46:01 +07:00
|
|
|
#include <linux/uaccess.h>
|
2008-02-07 15:13:59 +07:00
|
|
|
|
2010-08-10 07:19:57 +07:00
|
|
|
#include <trace/events/vmscan.h>
|
|
|
|
|
2014-02-08 22:36:58 +07:00
|
|
|
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
|
|
|
|
EXPORT_SYMBOL(memory_cgrp_subsys);
|
2012-12-13 04:51:57 +07:00
|
|
|
|
2016-01-15 06:20:56 +07:00
|
|
|
struct mem_cgroup *root_mem_cgroup __read_mostly;
|
|
|
|
|
mm: kmem: prepare remote memcg charging infra for interrupt contexts
Remote memcg charging API uses current->active_memcg to store the
currently active memory cgroup, which overwrites the memory cgroup of the
current process. It works well for normal contexts, but doesn't work for
interrupt contexts: indeed, if an interrupt occurs during the execution of
a section with an active memcg set, all allocations inside the interrupt
will be charged to the active memcg set (given that we'll enable
accounting for allocations from an interrupt context). But because the
interrupt might have no relation to the active memcg set outside, it's
obviously wrong from the accounting prospective.
To resolve this problem, let's add a global percpu int_active_memcg
variable, which will be used to store an active memory cgroup which will
be used from interrupt contexts. set_active_memcg() will transparently
use current->active_memcg or int_active_memcg depending on the context.
To make the read part simple and transparent for the caller, let's
introduce two new functions:
- struct mem_cgroup *active_memcg(void),
- struct mem_cgroup *get_active_memcg(void).
They are returning the active memcg if it's set, hiding all implementation
details: where to get it depending on the current context.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Link: http://lkml.kernel.org/r/20200827225843.1270629-4-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-18 06:13:50 +07:00
|
|
|
/* Active memory cgroup to use from an interrupt context */
|
|
|
|
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
|
|
|
|
|
2016-01-15 06:21:29 +07:00
|
|
|
/* Socket memory accounting disabled? */
|
|
|
|
static bool cgroup_memory_nosocket;
|
|
|
|
|
2016-01-21 06:02:38 +07:00
|
|
|
/* Kernel memory accounting disabled? */
|
|
|
|
static bool cgroup_memory_nokmem;
|
|
|
|
|
2015-02-12 06:26:36 +07:00
|
|
|
/* Whether the swap controller is active */
|
2012-08-01 06:43:02 +07:00
|
|
|
#ifdef CONFIG_MEMCG_SWAP
|
2020-06-04 06:02:11 +07:00
|
|
|
bool cgroup_memory_noswap __read_mostly;
|
2009-01-08 09:07:57 +07:00
|
|
|
#else
|
2020-06-04 06:02:11 +07:00
|
|
|
#define cgroup_memory_noswap 1
|
2020-06-04 06:02:14 +07:00
|
|
|
#endif
|
2009-01-08 09:07:57 +07:00
|
|
|
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 23:06:56 +07:00
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
|
|
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
|
|
|
|
#endif
|
|
|
|
|
2016-01-15 06:21:23 +07:00
|
|
|
/* Whether legacy memory+swap accounting is active */
|
|
|
|
static bool do_memsw_account(void)
|
|
|
|
{
|
2020-06-04 06:02:11 +07:00
|
|
|
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
|
2016-01-15 06:21:23 +07:00
|
|
|
}
|
|
|
|
|
2012-05-30 05:06:56 +07:00
|
|
|
#define THRESHOLDS_EVENTS_TARGET 128
|
|
|
|
#define SOFTLIMIT_EVENTS_TARGET 1024
|
2011-03-24 06:42:37 +07:00
|
|
|
|
2013-09-25 05:27:40 +07:00
|
|
|
/*
|
|
|
|
* Cgroups above their limits are maintained in a RB-Tree, independent of
|
|
|
|
* their hierarchy representation
|
|
|
|
*/
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
struct mem_cgroup_tree_per_node {
|
2013-09-25 05:27:40 +07:00
|
|
|
struct rb_root rb_root;
|
2017-09-09 06:15:21 +07:00
|
|
|
struct rb_node *rb_rightmost;
|
2013-09-25 05:27:40 +07:00
|
|
|
spinlock_t lock;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct mem_cgroup_tree {
|
|
|
|
struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
|
|
|
|
|
2010-05-27 04:42:36 +07:00
|
|
|
/* for OOM */
|
|
|
|
struct mem_cgroup_eventfd_list {
|
|
|
|
struct list_head list;
|
|
|
|
struct eventfd_ctx *eventfd;
|
|
|
|
};
|
2010-03-11 06:22:24 +07:00
|
|
|
|
2013-11-23 06:20:42 +07:00
|
|
|
/*
|
|
|
|
* cgroup_event represents events which userspace want to receive.
|
|
|
|
*/
|
2013-11-23 06:20:44 +07:00
|
|
|
struct mem_cgroup_event {
|
2013-11-23 06:20:42 +07:00
|
|
|
/*
|
2013-11-23 06:20:43 +07:00
|
|
|
* memcg which the event belongs to.
|
2013-11-23 06:20:42 +07:00
|
|
|
*/
|
2013-11-23 06:20:43 +07:00
|
|
|
struct mem_cgroup *memcg;
|
2013-11-23 06:20:42 +07:00
|
|
|
/*
|
|
|
|
* eventfd to signal userspace about the event.
|
|
|
|
*/
|
|
|
|
struct eventfd_ctx *eventfd;
|
|
|
|
/*
|
|
|
|
* Each of these stored in a list by the cgroup.
|
|
|
|
*/
|
|
|
|
struct list_head list;
|
2013-11-23 06:20:43 +07:00
|
|
|
/*
|
|
|
|
* register_event() callback will be used to add new userspace
|
|
|
|
* waiter for changes related to this event. Use eventfd_signal()
|
|
|
|
* on eventfd to send notification to userspace.
|
|
|
|
*/
|
2013-11-23 06:20:43 +07:00
|
|
|
int (*register_event)(struct mem_cgroup *memcg,
|
2013-11-23 06:20:43 +07:00
|
|
|
struct eventfd_ctx *eventfd, const char *args);
|
2013-11-23 06:20:43 +07:00
|
|
|
/*
|
|
|
|
* unregister_event() callback will be called when userspace closes
|
|
|
|
* the eventfd or on cgroup removing. This callback must be set,
|
|
|
|
* if you want provide notification functionality.
|
|
|
|
*/
|
2013-11-23 06:20:43 +07:00
|
|
|
void (*unregister_event)(struct mem_cgroup *memcg,
|
2013-11-23 06:20:43 +07:00
|
|
|
struct eventfd_ctx *eventfd);
|
2013-11-23 06:20:42 +07:00
|
|
|
/*
|
|
|
|
* All fields below needed to unregister event when
|
|
|
|
* userspace closes eventfd.
|
|
|
|
*/
|
|
|
|
poll_table pt;
|
|
|
|
wait_queue_head_t *wqh;
|
2017-06-20 17:06:13 +07:00
|
|
|
wait_queue_entry_t wait;
|
2013-11-23 06:20:42 +07:00
|
|
|
struct work_struct remove;
|
|
|
|
};
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
|
|
|
|
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
|
2010-03-11 06:22:24 +07:00
|
|
|
|
2010-03-11 06:22:13 +07:00
|
|
|
/* Stuffs for move charges at task migration. */
|
|
|
|
/*
|
2015-02-12 06:26:09 +07:00
|
|
|
* Types of charges to be moved.
|
2010-03-11 06:22:13 +07:00
|
|
|
*/
|
2015-02-12 06:26:09 +07:00
|
|
|
#define MOVE_ANON 0x1U
|
|
|
|
#define MOVE_FILE 0x2U
|
|
|
|
#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
|
2010-03-11 06:22:13 +07:00
|
|
|
|
2010-03-11 06:22:14 +07:00
|
|
|
/* "mc" and its members are protected by cgroup_mutex */
|
|
|
|
static struct move_charge_struct {
|
2010-11-25 03:57:06 +07:00
|
|
|
spinlock_t lock; /* for from, to */
|
2016-04-22 06:09:02 +07:00
|
|
|
struct mm_struct *mm;
|
2010-03-11 06:22:14 +07:00
|
|
|
struct mem_cgroup *from;
|
|
|
|
struct mem_cgroup *to;
|
2015-02-12 06:26:09 +07:00
|
|
|
unsigned long flags;
|
2010-03-11 06:22:14 +07:00
|
|
|
unsigned long precharge;
|
2010-03-11 06:22:15 +07:00
|
|
|
unsigned long moved_charge;
|
2010-03-11 06:22:18 +07:00
|
|
|
unsigned long moved_swap;
|
2010-03-11 06:22:16 +07:00
|
|
|
struct task_struct *moving_task; /* a task moving charges */
|
|
|
|
wait_queue_head_t waitq; /* a waitq for other context */
|
|
|
|
} mc = {
|
2010-08-11 08:02:58 +07:00
|
|
|
.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
|
2010-03-11 06:22:16 +07:00
|
|
|
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
|
|
|
|
};
|
2010-03-11 06:22:14 +07:00
|
|
|
|
2009-09-24 05:56:39 +07:00
|
|
|
/*
|
|
|
|
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
|
|
|
|
* limit reclaim to prevent infinite loops, if they ever occur.
|
|
|
|
*/
|
2012-05-30 05:06:56 +07:00
|
|
|
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
|
2013-09-25 05:27:40 +07:00
|
|
|
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
|
2009-09-24 05:56:39 +07:00
|
|
|
|
2009-01-08 09:08:00 +07:00
|
|
|
/* for encoding cft->private value on file */
|
2012-12-19 05:21:45 +07:00
|
|
|
enum res_type {
|
|
|
|
_MEM,
|
|
|
|
_MEMSWAP,
|
|
|
|
_OOM_TYPE,
|
2012-12-19 05:21:47 +07:00
|
|
|
_KMEM,
|
2016-01-21 06:02:44 +07:00
|
|
|
_TCP,
|
2012-12-19 05:21:45 +07:00
|
|
|
};
|
|
|
|
|
2012-05-30 05:06:56 +07:00
|
|
|
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
|
|
|
|
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
|
2009-01-08 09:08:00 +07:00
|
|
|
#define MEMFILE_ATTR(val) ((val) & 0xffff)
|
2010-05-27 04:42:36 +07:00
|
|
|
/* Used for OOM nofiier */
|
|
|
|
#define OOM_CONTROL (0)
|
2009-01-08 09:08:00 +07:00
|
|
|
|
2018-08-18 05:47:33 +07:00
|
|
|
/*
|
|
|
|
* Iteration constructs for visiting all cgroups (under a tree). If
|
|
|
|
* loops are exited prematurely (break), mem_cgroup_iter_break() must
|
|
|
|
* be used for reference counting.
|
|
|
|
*/
|
|
|
|
#define for_each_mem_cgroup_tree(iter, root) \
|
|
|
|
for (iter = mem_cgroup_iter(root, NULL, NULL); \
|
|
|
|
iter != NULL; \
|
|
|
|
iter = mem_cgroup_iter(root, iter, NULL))
|
|
|
|
|
|
|
|
#define for_each_mem_cgroup(iter) \
|
|
|
|
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
|
|
|
|
iter != NULL; \
|
|
|
|
iter = mem_cgroup_iter(NULL, iter, NULL))
|
|
|
|
|
2019-03-06 06:46:47 +07:00
|
|
|
static inline bool should_force_charge(void)
|
|
|
|
{
|
|
|
|
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
|
|
|
|
(current->flags & PF_EXITING);
|
|
|
|
}
|
|
|
|
|
memcg: add memory.pressure_level events
With this patch userland applications that want to maintain the
interactivity/memory allocation cost can use the pressure level
notifications. The levels are defined like this:
The "low" level means that the system is reclaiming memory for new
allocations. Monitoring this reclaiming activity might be useful for
maintaining cache level. Upon notification, the program (typically
"Activity Manager") might analyze vmstat and act in advance (i.e.
prematurely shutdown unimportant services).
The "medium" level means that the system is experiencing medium memory
pressure, the system might be making swap, paging out active file
caches, etc. Upon this event applications may decide to further analyze
vmstat/zoneinfo/memcg or internal memory usage statistics and free any
resources that can be easily reconstructed or re-read from a disk.
The "critical" level means that the system is actively thrashing, it is
about to out of memory (OOM) or even the in-kernel OOM killer is on its
way to trigger. Applications should do whatever they can to help the
system. It might be too late to consult with vmstat or any other
statistics, so it's advisable to take an immediate action.
The events are propagated upward until the event is handled, i.e. the
events are not pass-through. Here is what this means: for example you
have three cgroups: A->B->C. Now you set up an event listener on
cgroups A, B and C, and suppose group C experiences some pressure. In
this situation, only group C will receive the notification, i.e. groups
A and B will not receive it. This is done to avoid excessive
"broadcasting" of messages, which disturbs the system and which is
especially bad if we are low on memory or thrashing. So, organize the
cgroups wisely, or propagate the events manually (or, ask us to
implement the pass-through events, explaining why would you need them.)
Performance wise, the memory pressure notifications feature itself is
lightweight and does not require much of bookkeeping, in contrast to the
rest of memcg features. Unfortunately, as of current memcg
implementation, pages accounting is an inseparable part and cannot be
turned off. The good news is that there are some efforts[1] to improve
the situation; plus, implementing the same, fully API-compatible[2]
interface for CONFIG_MEMCG=n case (e.g. embedded) is also a viable
option, so it will not require any changes on the userland side.
[1] http://permalink.gmane.org/gmane.linux.kernel.cgroups/6291
[2] http://lkml.org/lkml/2013/2/21/454
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix CONFIG_CGROPUPS=n warnings]
Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
Acked-by: Kirill A. Shutemov <kirill@shutemov.name>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Leonid Moiseichuk <leonid.moiseichuk@nokia.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: John Stultz <john.stultz@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-30 05:08:31 +07:00
|
|
|
/* Some nice accessors for the vmpressure. */
|
|
|
|
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
if (!memcg)
|
|
|
|
memcg = root_mem_cgroup;
|
|
|
|
return &memcg->vmpressure;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
|
|
|
|
{
|
|
|
|
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
|
|
|
|
}
|
|
|
|
|
2018-08-18 05:47:25 +07:00
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
2020-08-07 13:20:49 +07:00
|
|
|
extern spinlock_t css_set_lock;
|
|
|
|
|
|
|
|
static void obj_cgroup_release(struct percpu_ref *ref)
|
|
|
|
{
|
|
|
|
struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
unsigned int nr_bytes;
|
|
|
|
unsigned int nr_pages;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* At this point all allocated objects are freed, and
|
|
|
|
* objcg->nr_charged_bytes can't have an arbitrary byte value.
|
|
|
|
* However, it can be PAGE_SIZE or (x * PAGE_SIZE).
|
|
|
|
*
|
|
|
|
* The following sequence can lead to it:
|
|
|
|
* 1) CPU0: objcg == stock->cached_objcg
|
|
|
|
* 2) CPU1: we do a small allocation (e.g. 92 bytes),
|
|
|
|
* PAGE_SIZE bytes are charged
|
|
|
|
* 3) CPU1: a process from another memcg is allocating something,
|
|
|
|
* the stock if flushed,
|
|
|
|
* objcg->nr_charged_bytes = PAGE_SIZE - 92
|
|
|
|
* 5) CPU0: we do release this object,
|
|
|
|
* 92 bytes are added to stock->nr_bytes
|
|
|
|
* 6) CPU0: stock is flushed,
|
|
|
|
* 92 bytes are added to objcg->nr_charged_bytes
|
|
|
|
*
|
|
|
|
* In the result, nr_charged_bytes == PAGE_SIZE.
|
|
|
|
* This page will be uncharged in obj_cgroup_release().
|
|
|
|
*/
|
|
|
|
nr_bytes = atomic_read(&objcg->nr_charged_bytes);
|
|
|
|
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
|
|
|
|
nr_pages = nr_bytes >> PAGE_SHIFT;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&css_set_lock, flags);
|
|
|
|
memcg = obj_cgroup_memcg(objcg);
|
|
|
|
if (nr_pages)
|
|
|
|
__memcg_kmem_uncharge(memcg, nr_pages);
|
|
|
|
list_del(&objcg->list);
|
|
|
|
mem_cgroup_put(memcg);
|
|
|
|
spin_unlock_irqrestore(&css_set_lock, flags);
|
|
|
|
|
|
|
|
percpu_ref_exit(ref);
|
|
|
|
kfree_rcu(objcg, rcu);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct obj_cgroup *obj_cgroup_alloc(void)
|
|
|
|
{
|
|
|
|
struct obj_cgroup *objcg;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
|
|
|
|
if (!objcg)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
|
|
|
|
GFP_KERNEL);
|
|
|
|
if (ret) {
|
|
|
|
kfree(objcg);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
INIT_LIST_HEAD(&objcg->list);
|
|
|
|
return objcg;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
|
|
|
|
struct mem_cgroup *parent)
|
|
|
|
{
|
|
|
|
struct obj_cgroup *objcg, *iter;
|
|
|
|
|
|
|
|
objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
|
|
|
|
|
|
|
|
spin_lock_irq(&css_set_lock);
|
|
|
|
|
|
|
|
/* Move active objcg to the parent's list */
|
|
|
|
xchg(&objcg->memcg, parent);
|
|
|
|
css_get(&parent->css);
|
|
|
|
list_add(&objcg->list, &parent->objcg_list);
|
|
|
|
|
|
|
|
/* Move already reparented objcgs to the parent's list */
|
|
|
|
list_for_each_entry(iter, &memcg->objcg_list, list) {
|
|
|
|
css_get(&parent->css);
|
|
|
|
xchg(&iter->memcg, parent);
|
|
|
|
css_put(&memcg->css);
|
|
|
|
}
|
|
|
|
list_splice(&memcg->objcg_list, &parent->objcg_list);
|
|
|
|
|
|
|
|
spin_unlock_irq(&css_set_lock);
|
|
|
|
|
|
|
|
percpu_ref_kill(&objcg->refcnt);
|
|
|
|
}
|
|
|
|
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:38 +07:00
|
|
|
/*
|
2020-08-07 13:21:10 +07:00
|
|
|
* This will be used as a shrinker list's index.
|
2013-09-23 15:56:47 +07:00
|
|
|
* The main reason for not using cgroup id for this:
|
|
|
|
* this works better in sparse environments, where we have a lot of memcgs,
|
|
|
|
* but only a few kmem-limited. Or also, if we have, for instance, 200
|
|
|
|
* memcgs, and none but the 200th is kmem-limited, we'd have to have a
|
|
|
|
* 200 entry array for that.
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:38 +07:00
|
|
|
*
|
2015-02-13 05:58:57 +07:00
|
|
|
* The current size of the caches array is stored in memcg_nr_cache_ids. It
|
|
|
|
* will double each time we have to increase it.
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:38 +07:00
|
|
|
*/
|
2015-02-13 05:58:57 +07:00
|
|
|
static DEFINE_IDA(memcg_cache_ida);
|
|
|
|
int memcg_nr_cache_ids;
|
2012-12-19 05:23:01 +07:00
|
|
|
|
2015-02-13 05:59:01 +07:00
|
|
|
/* Protects memcg_nr_cache_ids */
|
|
|
|
static DECLARE_RWSEM(memcg_cache_ids_sem);
|
|
|
|
|
|
|
|
void memcg_get_cache_ids(void)
|
|
|
|
{
|
|
|
|
down_read(&memcg_cache_ids_sem);
|
|
|
|
}
|
|
|
|
|
|
|
|
void memcg_put_cache_ids(void)
|
|
|
|
{
|
|
|
|
up_read(&memcg_cache_ids_sem);
|
|
|
|
}
|
|
|
|
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:38 +07:00
|
|
|
/*
|
|
|
|
* MIN_SIZE is different than 1, because we would like to avoid going through
|
|
|
|
* the alloc/free process all the time. In a small machine, 4 kmem-limited
|
|
|
|
* cgroups is a reasonable guess. In the future, it could be a parameter or
|
|
|
|
* tunable, but that is strictly not necessary.
|
|
|
|
*
|
2013-09-23 15:56:47 +07:00
|
|
|
* MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:38 +07:00
|
|
|
* this constant directly from cgroup, but it is understandable that this is
|
|
|
|
* better kept as an internal representation in cgroup.c. In any case, the
|
2013-09-23 15:56:47 +07:00
|
|
|
* cgrp_id space is not getting any smaller, and we don't have to necessarily
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:38 +07:00
|
|
|
* increase ours as well if it increases.
|
|
|
|
*/
|
|
|
|
#define MEMCG_CACHES_MIN_SIZE 4
|
2013-09-23 15:56:47 +07:00
|
|
|
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:38 +07:00
|
|
|
|
2012-12-19 05:22:40 +07:00
|
|
|
/*
|
|
|
|
* A lot of the calls to the cache allocation functions are expected to be
|
2020-08-07 13:21:17 +07:00
|
|
|
* inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
|
2012-12-19 05:22:40 +07:00
|
|
|
* conditional to this static branch, we'll have to allow modules that does
|
|
|
|
* kmem_cache_alloc and the such to see this symbol as well
|
|
|
|
*/
|
2016-01-15 06:21:34 +07:00
|
|
|
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
|
2012-12-19 05:22:40 +07:00
|
|
|
EXPORT_SYMBOL(memcg_kmem_enabled_key);
|
2019-09-24 05:38:12 +07:00
|
|
|
#endif
|
2017-02-23 06:41:36 +07:00
|
|
|
|
mm, memcg: assign memcg-aware shrinkers bitmap to memcg
Imagine a big node with many cpus, memory cgroups and containers. Let
we have 200 containers, every container has 10 mounts, and 10 cgroups.
All container tasks don't touch foreign containers mounts. If there is
intensive pages write, and global reclaim happens, a writing task has to
iterate over all memcgs to shrink slab, before it's able to go to
shrink_page_list().
Iteration over all the memcg slabs is very expensive: the task has to
visit 200 * 10 = 2000 shrinkers for every memcg, and since there are
2000 memcgs, the total calls are 2000 * 2000 = 4000000.
So, the shrinker makes 4 million do_shrink_slab() calls just to try to
isolate SWAP_CLUSTER_MAX pages in one of the actively writing memcg via
shrink_page_list(). I've observed a node spending almost 100% in
kernel, making useless iteration over already shrinked slab.
This patch adds bitmap of memcg-aware shrinkers to memcg. The size of
the bitmap depends on bitmap_nr_ids, and during memcg life it's
maintained to be enough to fit bitmap_nr_ids shrinkers. Every bit in
the map is related to corresponding shrinker id.
Next patches will maintain set bit only for really charged memcg. This
will allow shrink_slab() to increase its performance in significant way.
See the last patch for the numbers.
[ktkhai@virtuozzo.com: v9]
Link: http://lkml.kernel.org/r/153112549031.4097.3576147070498769979.stgit@localhost.localdomain
[ktkhai@virtuozzo.com: add comment to mem_cgroup_css_online()]
Link: http://lkml.kernel.org/r/521f9e5f-c436-b388-fe83-4dc870bfb489@virtuozzo.com
Link: http://lkml.kernel.org/r/153063056619.1818.12550500883688681076.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sahitya Tummala <stummala@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:47:37 +07:00
|
|
|
static int memcg_shrinker_map_size;
|
|
|
|
static DEFINE_MUTEX(memcg_shrinker_map_mutex);
|
|
|
|
|
|
|
|
static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
|
|
|
|
{
|
|
|
|
kvfree(container_of(head, struct memcg_shrinker_map, rcu));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
|
|
|
|
int size, int old_size)
|
|
|
|
{
|
|
|
|
struct memcg_shrinker_map *new, *old;
|
|
|
|
int nid;
|
|
|
|
|
|
|
|
lockdep_assert_held(&memcg_shrinker_map_mutex);
|
|
|
|
|
|
|
|
for_each_node(nid) {
|
|
|
|
old = rcu_dereference_protected(
|
|
|
|
mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
|
|
|
|
/* Not yet online memcg */
|
|
|
|
if (!old)
|
|
|
|
return 0;
|
|
|
|
|
2020-04-02 11:06:33 +07:00
|
|
|
new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
|
mm, memcg: assign memcg-aware shrinkers bitmap to memcg
Imagine a big node with many cpus, memory cgroups and containers. Let
we have 200 containers, every container has 10 mounts, and 10 cgroups.
All container tasks don't touch foreign containers mounts. If there is
intensive pages write, and global reclaim happens, a writing task has to
iterate over all memcgs to shrink slab, before it's able to go to
shrink_page_list().
Iteration over all the memcg slabs is very expensive: the task has to
visit 200 * 10 = 2000 shrinkers for every memcg, and since there are
2000 memcgs, the total calls are 2000 * 2000 = 4000000.
So, the shrinker makes 4 million do_shrink_slab() calls just to try to
isolate SWAP_CLUSTER_MAX pages in one of the actively writing memcg via
shrink_page_list(). I've observed a node spending almost 100% in
kernel, making useless iteration over already shrinked slab.
This patch adds bitmap of memcg-aware shrinkers to memcg. The size of
the bitmap depends on bitmap_nr_ids, and during memcg life it's
maintained to be enough to fit bitmap_nr_ids shrinkers. Every bit in
the map is related to corresponding shrinker id.
Next patches will maintain set bit only for really charged memcg. This
will allow shrink_slab() to increase its performance in significant way.
See the last patch for the numbers.
[ktkhai@virtuozzo.com: v9]
Link: http://lkml.kernel.org/r/153112549031.4097.3576147070498769979.stgit@localhost.localdomain
[ktkhai@virtuozzo.com: add comment to mem_cgroup_css_online()]
Link: http://lkml.kernel.org/r/521f9e5f-c436-b388-fe83-4dc870bfb489@virtuozzo.com
Link: http://lkml.kernel.org/r/153063056619.1818.12550500883688681076.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sahitya Tummala <stummala@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:47:37 +07:00
|
|
|
if (!new)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/* Set all old bits, clear all new bits */
|
|
|
|
memset(new->map, (int)0xff, old_size);
|
|
|
|
memset((void *)new->map + old_size, 0, size - old_size);
|
|
|
|
|
|
|
|
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
|
|
|
|
call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
struct mem_cgroup_per_node *pn;
|
|
|
|
struct memcg_shrinker_map *map;
|
|
|
|
int nid;
|
|
|
|
|
|
|
|
if (mem_cgroup_is_root(memcg))
|
|
|
|
return;
|
|
|
|
|
|
|
|
for_each_node(nid) {
|
|
|
|
pn = mem_cgroup_nodeinfo(memcg, nid);
|
|
|
|
map = rcu_dereference_protected(pn->shrinker_map, true);
|
|
|
|
if (map)
|
|
|
|
kvfree(map);
|
|
|
|
rcu_assign_pointer(pn->shrinker_map, NULL);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
struct memcg_shrinker_map *map;
|
|
|
|
int nid, size, ret = 0;
|
|
|
|
|
|
|
|
if (mem_cgroup_is_root(memcg))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
mutex_lock(&memcg_shrinker_map_mutex);
|
|
|
|
size = memcg_shrinker_map_size;
|
|
|
|
for_each_node(nid) {
|
2020-04-02 11:06:33 +07:00
|
|
|
map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
|
mm, memcg: assign memcg-aware shrinkers bitmap to memcg
Imagine a big node with many cpus, memory cgroups and containers. Let
we have 200 containers, every container has 10 mounts, and 10 cgroups.
All container tasks don't touch foreign containers mounts. If there is
intensive pages write, and global reclaim happens, a writing task has to
iterate over all memcgs to shrink slab, before it's able to go to
shrink_page_list().
Iteration over all the memcg slabs is very expensive: the task has to
visit 200 * 10 = 2000 shrinkers for every memcg, and since there are
2000 memcgs, the total calls are 2000 * 2000 = 4000000.
So, the shrinker makes 4 million do_shrink_slab() calls just to try to
isolate SWAP_CLUSTER_MAX pages in one of the actively writing memcg via
shrink_page_list(). I've observed a node spending almost 100% in
kernel, making useless iteration over already shrinked slab.
This patch adds bitmap of memcg-aware shrinkers to memcg. The size of
the bitmap depends on bitmap_nr_ids, and during memcg life it's
maintained to be enough to fit bitmap_nr_ids shrinkers. Every bit in
the map is related to corresponding shrinker id.
Next patches will maintain set bit only for really charged memcg. This
will allow shrink_slab() to increase its performance in significant way.
See the last patch for the numbers.
[ktkhai@virtuozzo.com: v9]
Link: http://lkml.kernel.org/r/153112549031.4097.3576147070498769979.stgit@localhost.localdomain
[ktkhai@virtuozzo.com: add comment to mem_cgroup_css_online()]
Link: http://lkml.kernel.org/r/521f9e5f-c436-b388-fe83-4dc870bfb489@virtuozzo.com
Link: http://lkml.kernel.org/r/153063056619.1818.12550500883688681076.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sahitya Tummala <stummala@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:47:37 +07:00
|
|
|
if (!map) {
|
|
|
|
memcg_free_shrinker_maps(memcg);
|
|
|
|
ret = -ENOMEM;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
|
|
|
|
}
|
|
|
|
mutex_unlock(&memcg_shrinker_map_mutex);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int memcg_expand_shrinker_maps(int new_id)
|
|
|
|
{
|
|
|
|
int size, old_size, ret = 0;
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
|
|
|
size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
|
|
|
|
old_size = memcg_shrinker_map_size;
|
|
|
|
if (size <= old_size)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
mutex_lock(&memcg_shrinker_map_mutex);
|
|
|
|
if (!root_mem_cgroup)
|
|
|
|
goto unlock;
|
|
|
|
|
|
|
|
for_each_mem_cgroup(memcg) {
|
|
|
|
if (mem_cgroup_is_root(memcg))
|
|
|
|
continue;
|
|
|
|
ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
|
2020-02-21 11:04:18 +07:00
|
|
|
if (ret) {
|
|
|
|
mem_cgroup_iter_break(NULL, memcg);
|
mm, memcg: assign memcg-aware shrinkers bitmap to memcg
Imagine a big node with many cpus, memory cgroups and containers. Let
we have 200 containers, every container has 10 mounts, and 10 cgroups.
All container tasks don't touch foreign containers mounts. If there is
intensive pages write, and global reclaim happens, a writing task has to
iterate over all memcgs to shrink slab, before it's able to go to
shrink_page_list().
Iteration over all the memcg slabs is very expensive: the task has to
visit 200 * 10 = 2000 shrinkers for every memcg, and since there are
2000 memcgs, the total calls are 2000 * 2000 = 4000000.
So, the shrinker makes 4 million do_shrink_slab() calls just to try to
isolate SWAP_CLUSTER_MAX pages in one of the actively writing memcg via
shrink_page_list(). I've observed a node spending almost 100% in
kernel, making useless iteration over already shrinked slab.
This patch adds bitmap of memcg-aware shrinkers to memcg. The size of
the bitmap depends on bitmap_nr_ids, and during memcg life it's
maintained to be enough to fit bitmap_nr_ids shrinkers. Every bit in
the map is related to corresponding shrinker id.
Next patches will maintain set bit only for really charged memcg. This
will allow shrink_slab() to increase its performance in significant way.
See the last patch for the numbers.
[ktkhai@virtuozzo.com: v9]
Link: http://lkml.kernel.org/r/153112549031.4097.3576147070498769979.stgit@localhost.localdomain
[ktkhai@virtuozzo.com: add comment to mem_cgroup_css_online()]
Link: http://lkml.kernel.org/r/521f9e5f-c436-b388-fe83-4dc870bfb489@virtuozzo.com
Link: http://lkml.kernel.org/r/153063056619.1818.12550500883688681076.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sahitya Tummala <stummala@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:47:37 +07:00
|
|
|
goto unlock;
|
2020-02-21 11:04:18 +07:00
|
|
|
}
|
mm, memcg: assign memcg-aware shrinkers bitmap to memcg
Imagine a big node with many cpus, memory cgroups and containers. Let
we have 200 containers, every container has 10 mounts, and 10 cgroups.
All container tasks don't touch foreign containers mounts. If there is
intensive pages write, and global reclaim happens, a writing task has to
iterate over all memcgs to shrink slab, before it's able to go to
shrink_page_list().
Iteration over all the memcg slabs is very expensive: the task has to
visit 200 * 10 = 2000 shrinkers for every memcg, and since there are
2000 memcgs, the total calls are 2000 * 2000 = 4000000.
So, the shrinker makes 4 million do_shrink_slab() calls just to try to
isolate SWAP_CLUSTER_MAX pages in one of the actively writing memcg via
shrink_page_list(). I've observed a node spending almost 100% in
kernel, making useless iteration over already shrinked slab.
This patch adds bitmap of memcg-aware shrinkers to memcg. The size of
the bitmap depends on bitmap_nr_ids, and during memcg life it's
maintained to be enough to fit bitmap_nr_ids shrinkers. Every bit in
the map is related to corresponding shrinker id.
Next patches will maintain set bit only for really charged memcg. This
will allow shrink_slab() to increase its performance in significant way.
See the last patch for the numbers.
[ktkhai@virtuozzo.com: v9]
Link: http://lkml.kernel.org/r/153112549031.4097.3576147070498769979.stgit@localhost.localdomain
[ktkhai@virtuozzo.com: add comment to mem_cgroup_css_online()]
Link: http://lkml.kernel.org/r/521f9e5f-c436-b388-fe83-4dc870bfb489@virtuozzo.com
Link: http://lkml.kernel.org/r/153063056619.1818.12550500883688681076.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sahitya Tummala <stummala@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:47:37 +07:00
|
|
|
}
|
|
|
|
unlock:
|
|
|
|
if (!ret)
|
|
|
|
memcg_shrinker_map_size = size;
|
|
|
|
mutex_unlock(&memcg_shrinker_map_mutex);
|
|
|
|
return ret;
|
|
|
|
}
|
2018-08-18 05:48:10 +07:00
|
|
|
|
|
|
|
void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
|
|
|
|
{
|
|
|
|
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
|
|
|
|
struct memcg_shrinker_map *map;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
|
mm/vmscan.c: clear shrinker bit if there are no objects related to memcg
To avoid further unneed calls of do_shrink_slab() for shrinkers, which
already do not have any charged objects in a memcg, their bits have to
be cleared.
This patch introduces a lockless mechanism to do that without races
without parallel list lru add. After do_shrink_slab() returns
SHRINK_EMPTY the first time, we clear the bit and call it once again.
Then we restore the bit, if the new return value is different.
Note, that single smp_mb__after_atomic() in shrink_slab_memcg() covers
two situations:
1)list_lru_add() shrink_slab_memcg
list_add_tail() for_each_set_bit() <--- read bit
do_shrink_slab() <--- missed list update (no barrier)
<MB> <MB>
set_bit() do_shrink_slab() <--- seen list update
This situation, when the first do_shrink_slab() sees set bit, but it
doesn't see list update (i.e., race with the first element queueing), is
rare. So we don't add <MB> before the first call of do_shrink_slab()
instead of this to do not slow down generic case. Also, it's need the
second call as seen in below in (2).
2)list_lru_add() shrink_slab_memcg()
list_add_tail() ...
set_bit() ...
... for_each_set_bit()
do_shrink_slab() do_shrink_slab()
clear_bit() ...
... ...
list_lru_add() ...
list_add_tail() clear_bit()
<MB> <MB>
set_bit() do_shrink_slab()
The barriers guarantee that the second do_shrink_slab() in the right
side task sees list update if really cleared the bit. This case is
drawn in the code comment.
[Results/performance of the patchset]
After the whole patchset applied the below test shows signify increase
of performance:
$echo 1 > /sys/fs/cgroup/memory/memory.use_hierarchy
$mkdir /sys/fs/cgroup/memory/ct
$echo 4000M > /sys/fs/cgroup/memory/ct/memory.kmem.limit_in_bytes
$for i in `seq 0 4000`; do mkdir /sys/fs/cgroup/memory/ct/$i;
echo $$ > /sys/fs/cgroup/memory/ct/$i/cgroup.procs;
mkdir -p s/$i; mount -t tmpfs $i s/$i;
touch s/$i/file; done
Then, 5 sequential calls of drop caches:
$time echo 3 > /proc/sys/vm/drop_caches
1)Before:
0.00user 13.78system 0:13.78elapsed 99%CPU
0.00user 5.59system 0:05.60elapsed 99%CPU
0.00user 5.48system 0:05.48elapsed 99%CPU
0.00user 8.35system 0:08.35elapsed 99%CPU
0.00user 8.34system 0:08.35elapsed 99%CPU
2)After
0.00user 1.10system 0:01.10elapsed 99%CPU
0.00user 0.00system 0:00.01elapsed 64%CPU
0.00user 0.01system 0:00.01elapsed 82%CPU
0.00user 0.00system 0:00.01elapsed 64%CPU
0.00user 0.01system 0:00.01elapsed 82%CPU
The results show the performance increases at least in 548 times.
Shakeel Butt tested this patchset with fork-bomb on his configuration:
> I created 255 memcgs, 255 ext4 mounts and made each memcg create a
> file containing few KiBs on corresponding mount. Then in a separate
> memcg of 200 MiB limit ran a fork-bomb.
>
> I ran the "perf record -ag -- sleep 60" and below are the results:
>
> Without the patch series:
> Samples: 4M of event 'cycles', Event count (approx.): 3279403076005
> + 36.40% fb.sh [kernel.kallsyms] [k] shrink_slab
> + 18.97% fb.sh [kernel.kallsyms] [k] list_lru_count_one
> + 6.75% fb.sh [kernel.kallsyms] [k] super_cache_count
> + 0.49% fb.sh [kernel.kallsyms] [k] down_read_trylock
> + 0.44% fb.sh [kernel.kallsyms] [k] mem_cgroup_iter
> + 0.27% fb.sh [kernel.kallsyms] [k] up_read
> + 0.21% fb.sh [kernel.kallsyms] [k] osq_lock
> + 0.13% fb.sh [kernel.kallsyms] [k] shmem_unused_huge_count
> + 0.08% fb.sh [kernel.kallsyms] [k] shrink_node_memcg
> + 0.08% fb.sh [kernel.kallsyms] [k] shrink_node
>
> With the patch series:
> Samples: 4M of event 'cycles', Event count (approx.): 2756866824946
> + 47.49% fb.sh [kernel.kallsyms] [k] down_read_trylock
> + 30.72% fb.sh [kernel.kallsyms] [k] up_read
> + 9.51% fb.sh [kernel.kallsyms] [k] mem_cgroup_iter
> + 1.69% fb.sh [kernel.kallsyms] [k] shrink_node_memcg
> + 1.35% fb.sh [kernel.kallsyms] [k] mem_cgroup_protected
> + 1.05% fb.sh [kernel.kallsyms] [k] queued_spin_lock_slowpath
> + 0.85% fb.sh [kernel.kallsyms] [k] _raw_spin_lock
> + 0.78% fb.sh [kernel.kallsyms] [k] lruvec_lru_size
> + 0.57% fb.sh [kernel.kallsyms] [k] shrink_node
> + 0.54% fb.sh [kernel.kallsyms] [k] queue_work_on
> + 0.46% fb.sh [kernel.kallsyms] [k] shrink_slab_memcg
[ktkhai@virtuozzo.com: v9]
Link: http://lkml.kernel.org/r/153112561772.4097.11011071937553113003.stgit@localhost.localdomain
Link: http://lkml.kernel.org/r/153063070859.1818.11870882950920963480.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sahitya Tummala <stummala@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:48:25 +07:00
|
|
|
/* Pairs with smp mb in shrink_slab() */
|
|
|
|
smp_mb__before_atomic();
|
2018-08-18 05:48:10 +07:00
|
|
|
set_bit(shrinker_id, map->map);
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-05-28 07:00:02 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_css_from_page - css of the memcg associated with a page
|
|
|
|
* @page: page of interest
|
|
|
|
*
|
|
|
|
* If memcg is bound to the default hierarchy, css of the memcg associated
|
|
|
|
* with @page is returned. The returned css remains associated with @page
|
|
|
|
* until it is released.
|
|
|
|
*
|
|
|
|
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
|
|
|
|
* is returned.
|
|
|
|
*/
|
|
|
|
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
|
|
|
memcg = page->mem_cgroup;
|
|
|
|
|
2015-09-18 22:56:28 +07:00
|
|
|
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
2015-05-28 07:00:02 +07:00
|
|
|
memcg = root_mem_cgroup;
|
|
|
|
|
|
|
|
return &memcg->css;
|
|
|
|
}
|
|
|
|
|
memcg: add page_cgroup_ino helper
This patchset introduces a new user API for tracking user memory pages
that have not been used for a given period of time. The purpose of this
is to provide the userspace with the means of tracking a workload's
working set, i.e. the set of pages that are actively used by the
workload. Knowing the working set size can be useful for partitioning the
system more efficiently, e.g. by tuning memory cgroup limits
appropriately, or for job placement within a compute cluster.
==== USE CASES ====
The unified cgroup hierarchy has memory.low and memory.high knobs, which
are defined as the low and high boundaries for the workload working set
size. However, the working set size of a workload may be unknown or
change in time. With this patch set, one can periodically estimate the
amount of memory unused by each cgroup and tune their memory.low and
memory.high parameters accordingly, therefore optimizing the overall
memory utilization.
Another use case is balancing workloads within a compute cluster. Knowing
how much memory is not really used by a workload unit may help take a more
optimal decision when considering migrating the unit to another node
within the cluster.
Also, as noted by Minchan, this would be useful for per-process reclaim
(https://lwn.net/Articles/545668/). With idle tracking, we could reclaim idle
pages only by smart user memory manager.
==== USER API ====
The user API consists of two new files:
* /sys/kernel/mm/page_idle/bitmap. This file implements a bitmap where each
bit corresponds to a page, indexed by PFN. When the bit is set, the
corresponding page is idle. A page is considered idle if it has not been
accessed since it was marked idle. To mark a page idle one should set the
bit corresponding to the page by writing to the file. A value written to the
file is OR-ed with the current bitmap value. Only user memory pages can be
marked idle, for other page types input is silently ignored. Writing to this
file beyond max PFN results in the ENXIO error. Only available when
CONFIG_IDLE_PAGE_TRACKING is set.
This file can be used to estimate the amount of pages that are not
used by a particular workload as follows:
1. mark all pages of interest idle by setting corresponding bits in the
/sys/kernel/mm/page_idle/bitmap
2. wait until the workload accesses its working set
3. read /sys/kernel/mm/page_idle/bitmap and count the number of bits set
* /proc/kpagecgroup. This file contains a 64-bit inode number of the
memory cgroup each page is charged to, indexed by PFN. Only available when
CONFIG_MEMCG is set.
This file can be used to find all pages (including unmapped file pages)
accounted to a particular cgroup. Using /sys/kernel/mm/page_idle/bitmap, one
can then estimate the cgroup working set size.
For an example of using these files for estimating the amount of unused
memory pages per each memory cgroup, please see the script attached
below.
==== REASONING ====
The reason to introduce the new user API instead of using
/proc/PID/{clear_refs,smaps} is that the latter has two serious
drawbacks:
- it does not count unmapped file pages
- it affects the reclaimer logic
The new API attempts to overcome them both. For more details on how it
is achieved, please see the comment to patch 6.
==== PATCHSET STRUCTURE ====
The patch set is organized as follows:
- patch 1 adds page_cgroup_ino() helper for the sake of
/proc/kpagecgroup and patches 2-3 do related cleanup
- patch 4 adds /proc/kpagecgroup, which reports cgroup ino each page is
charged to
- patch 5 introduces a new mmu notifier callback, clear_young, which is
a lightweight version of clear_flush_young; it is used in patch 6
- patch 6 implements the idle page tracking feature, including the
userspace API, /sys/kernel/mm/page_idle/bitmap
- patch 7 exports idle flag via /proc/kpageflags
==== SIMILAR WORKS ====
Originally, the patch for tracking idle memory was proposed back in 2011
by Michel Lespinasse (see http://lwn.net/Articles/459269/). The main
difference between Michel's patch and this one is that Michel implemented
a kernel space daemon for estimating idle memory size per cgroup while
this patch only provides the userspace with the minimal API for doing the
job, leaving the rest up to the userspace. However, they both share the
same idea of Idle/Young page flags to avoid affecting the reclaimer logic.
==== PERFORMANCE EVALUATION ====
SPECjvm2008 (https://www.spec.org/jvm2008/) was used to evaluate the
performance impact introduced by this patch set. Three runs were carried
out:
- base: kernel without the patch
- patched: patched kernel, the feature is not used
- patched-active: patched kernel, 1 minute-period daemon is used for
tracking idle memory
For tracking idle memory, idlememstat utility was used:
https://github.com/locker/idlememstat
testcase base patched patched-active
compiler 537.40 ( 0.00)% 532.26 (-0.96)% 538.31 ( 0.17)%
compress 305.47 ( 0.00)% 301.08 (-1.44)% 300.71 (-1.56)%
crypto 284.32 ( 0.00)% 282.21 (-0.74)% 284.87 ( 0.19)%
derby 411.05 ( 0.00)% 413.44 ( 0.58)% 412.07 ( 0.25)%
mpegaudio 189.96 ( 0.00)% 190.87 ( 0.48)% 189.42 (-0.28)%
scimark.large 46.85 ( 0.00)% 46.41 (-0.94)% 47.83 ( 2.09)%
scimark.small 412.91 ( 0.00)% 415.41 ( 0.61)% 421.17 ( 2.00)%
serial 204.23 ( 0.00)% 213.46 ( 4.52)% 203.17 (-0.52)%
startup 36.76 ( 0.00)% 35.49 (-3.45)% 35.64 (-3.05)%
sunflow 115.34 ( 0.00)% 115.08 (-0.23)% 117.37 ( 1.76)%
xml 620.55 ( 0.00)% 619.95 (-0.10)% 620.39 (-0.03)%
composite 211.50 ( 0.00)% 211.15 (-0.17)% 211.67 ( 0.08)%
time idlememstat:
17.20user 65.16system 2:15:23elapsed 1%CPU (0avgtext+0avgdata 8476maxresident)k
448inputs+40outputs (1major+36052minor)pagefaults 0swaps
==== SCRIPT FOR COUNTING IDLE PAGES PER CGROUP ====
#! /usr/bin/python
#
import os
import stat
import errno
import struct
CGROUP_MOUNT = "/sys/fs/cgroup/memory"
BUFSIZE = 8 * 1024 # must be multiple of 8
def get_hugepage_size():
with open("/proc/meminfo", "r") as f:
for s in f:
k, v = s.split(":")
if k == "Hugepagesize":
return int(v.split()[0]) * 1024
PAGE_SIZE = os.sysconf("SC_PAGE_SIZE")
HUGEPAGE_SIZE = get_hugepage_size()
def set_idle():
f = open("/sys/kernel/mm/page_idle/bitmap", "wb", BUFSIZE)
while True:
try:
f.write(struct.pack("Q", pow(2, 64) - 1))
except IOError as err:
if err.errno == errno.ENXIO:
break
raise
f.close()
def count_idle():
f_flags = open("/proc/kpageflags", "rb", BUFSIZE)
f_cgroup = open("/proc/kpagecgroup", "rb", BUFSIZE)
with open("/sys/kernel/mm/page_idle/bitmap", "rb", BUFSIZE) as f:
while f.read(BUFSIZE): pass # update idle flag
idlememsz = {}
while True:
s1, s2 = f_flags.read(8), f_cgroup.read(8)
if not s1 or not s2:
break
flags, = struct.unpack('Q', s1)
cgino, = struct.unpack('Q', s2)
unevictable = (flags >> 18) & 1
huge = (flags >> 22) & 1
idle = (flags >> 25) & 1
if idle and not unevictable:
idlememsz[cgino] = idlememsz.get(cgino, 0) + \
(HUGEPAGE_SIZE if huge else PAGE_SIZE)
f_flags.close()
f_cgroup.close()
return idlememsz
if __name__ == "__main__":
print "Setting the idle flag for each page..."
set_idle()
raw_input("Wait until the workload accesses its working set, "
"then press Enter")
print "Counting idle pages..."
idlememsz = count_idle()
for dir, subdirs, files in os.walk(CGROUP_MOUNT):
ino = os.stat(dir)[stat.ST_INO]
print dir + ": " + str(idlememsz.get(ino, 0) / 1024) + " kB"
==== END SCRIPT ====
This patch (of 8):
Add page_cgroup_ino() helper to memcg.
This function returns the inode number of the closest online ancestor of
the memory cgroup a page is charged to. It is required for exporting
information about which page is charged to which cgroup to userspace,
which will be introduced by a following patch.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Reviewed-by: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-10 05:35:28 +07:00
|
|
|
/**
|
|
|
|
* page_cgroup_ino - return inode number of the memcg a page is charged to
|
|
|
|
* @page: the page
|
|
|
|
*
|
|
|
|
* Look up the closest online ancestor of the memory cgroup @page is charged to
|
|
|
|
* and return its inode number or 0 if @page is not charged to any cgroup. It
|
|
|
|
* is safe to call this function without holding a reference to @page.
|
|
|
|
*
|
|
|
|
* Note, this function is inherently racy, because there is nothing to prevent
|
|
|
|
* the cgroup inode from getting torn down and potentially reallocated a moment
|
|
|
|
* after page_cgroup_ino() returns, so it only should be used by callers that
|
|
|
|
* do not care (such as procfs interfaces).
|
|
|
|
*/
|
|
|
|
ino_t page_cgroup_ino(struct page *page)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
unsigned long ino = 0;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
2020-08-07 13:21:10 +07:00
|
|
|
memcg = page->mem_cgroup;
|
2020-08-07 13:20:52 +07:00
|
|
|
|
2020-08-07 13:21:10 +07:00
|
|
|
/*
|
|
|
|
* The lowest bit set means that memcg isn't a valid
|
|
|
|
* memcg pointer, but a obj_cgroups pointer.
|
|
|
|
* In this case the page is shared and doesn't belong
|
|
|
|
* to any specific memory cgroup.
|
|
|
|
*/
|
|
|
|
if ((unsigned long) memcg & 0x1UL)
|
|
|
|
memcg = NULL;
|
2020-08-07 13:20:52 +07:00
|
|
|
|
memcg: add page_cgroup_ino helper
This patchset introduces a new user API for tracking user memory pages
that have not been used for a given period of time. The purpose of this
is to provide the userspace with the means of tracking a workload's
working set, i.e. the set of pages that are actively used by the
workload. Knowing the working set size can be useful for partitioning the
system more efficiently, e.g. by tuning memory cgroup limits
appropriately, or for job placement within a compute cluster.
==== USE CASES ====
The unified cgroup hierarchy has memory.low and memory.high knobs, which
are defined as the low and high boundaries for the workload working set
size. However, the working set size of a workload may be unknown or
change in time. With this patch set, one can periodically estimate the
amount of memory unused by each cgroup and tune their memory.low and
memory.high parameters accordingly, therefore optimizing the overall
memory utilization.
Another use case is balancing workloads within a compute cluster. Knowing
how much memory is not really used by a workload unit may help take a more
optimal decision when considering migrating the unit to another node
within the cluster.
Also, as noted by Minchan, this would be useful for per-process reclaim
(https://lwn.net/Articles/545668/). With idle tracking, we could reclaim idle
pages only by smart user memory manager.
==== USER API ====
The user API consists of two new files:
* /sys/kernel/mm/page_idle/bitmap. This file implements a bitmap where each
bit corresponds to a page, indexed by PFN. When the bit is set, the
corresponding page is idle. A page is considered idle if it has not been
accessed since it was marked idle. To mark a page idle one should set the
bit corresponding to the page by writing to the file. A value written to the
file is OR-ed with the current bitmap value. Only user memory pages can be
marked idle, for other page types input is silently ignored. Writing to this
file beyond max PFN results in the ENXIO error. Only available when
CONFIG_IDLE_PAGE_TRACKING is set.
This file can be used to estimate the amount of pages that are not
used by a particular workload as follows:
1. mark all pages of interest idle by setting corresponding bits in the
/sys/kernel/mm/page_idle/bitmap
2. wait until the workload accesses its working set
3. read /sys/kernel/mm/page_idle/bitmap and count the number of bits set
* /proc/kpagecgroup. This file contains a 64-bit inode number of the
memory cgroup each page is charged to, indexed by PFN. Only available when
CONFIG_MEMCG is set.
This file can be used to find all pages (including unmapped file pages)
accounted to a particular cgroup. Using /sys/kernel/mm/page_idle/bitmap, one
can then estimate the cgroup working set size.
For an example of using these files for estimating the amount of unused
memory pages per each memory cgroup, please see the script attached
below.
==== REASONING ====
The reason to introduce the new user API instead of using
/proc/PID/{clear_refs,smaps} is that the latter has two serious
drawbacks:
- it does not count unmapped file pages
- it affects the reclaimer logic
The new API attempts to overcome them both. For more details on how it
is achieved, please see the comment to patch 6.
==== PATCHSET STRUCTURE ====
The patch set is organized as follows:
- patch 1 adds page_cgroup_ino() helper for the sake of
/proc/kpagecgroup and patches 2-3 do related cleanup
- patch 4 adds /proc/kpagecgroup, which reports cgroup ino each page is
charged to
- patch 5 introduces a new mmu notifier callback, clear_young, which is
a lightweight version of clear_flush_young; it is used in patch 6
- patch 6 implements the idle page tracking feature, including the
userspace API, /sys/kernel/mm/page_idle/bitmap
- patch 7 exports idle flag via /proc/kpageflags
==== SIMILAR WORKS ====
Originally, the patch for tracking idle memory was proposed back in 2011
by Michel Lespinasse (see http://lwn.net/Articles/459269/). The main
difference between Michel's patch and this one is that Michel implemented
a kernel space daemon for estimating idle memory size per cgroup while
this patch only provides the userspace with the minimal API for doing the
job, leaving the rest up to the userspace. However, they both share the
same idea of Idle/Young page flags to avoid affecting the reclaimer logic.
==== PERFORMANCE EVALUATION ====
SPECjvm2008 (https://www.spec.org/jvm2008/) was used to evaluate the
performance impact introduced by this patch set. Three runs were carried
out:
- base: kernel without the patch
- patched: patched kernel, the feature is not used
- patched-active: patched kernel, 1 minute-period daemon is used for
tracking idle memory
For tracking idle memory, idlememstat utility was used:
https://github.com/locker/idlememstat
testcase base patched patched-active
compiler 537.40 ( 0.00)% 532.26 (-0.96)% 538.31 ( 0.17)%
compress 305.47 ( 0.00)% 301.08 (-1.44)% 300.71 (-1.56)%
crypto 284.32 ( 0.00)% 282.21 (-0.74)% 284.87 ( 0.19)%
derby 411.05 ( 0.00)% 413.44 ( 0.58)% 412.07 ( 0.25)%
mpegaudio 189.96 ( 0.00)% 190.87 ( 0.48)% 189.42 (-0.28)%
scimark.large 46.85 ( 0.00)% 46.41 (-0.94)% 47.83 ( 2.09)%
scimark.small 412.91 ( 0.00)% 415.41 ( 0.61)% 421.17 ( 2.00)%
serial 204.23 ( 0.00)% 213.46 ( 4.52)% 203.17 (-0.52)%
startup 36.76 ( 0.00)% 35.49 (-3.45)% 35.64 (-3.05)%
sunflow 115.34 ( 0.00)% 115.08 (-0.23)% 117.37 ( 1.76)%
xml 620.55 ( 0.00)% 619.95 (-0.10)% 620.39 (-0.03)%
composite 211.50 ( 0.00)% 211.15 (-0.17)% 211.67 ( 0.08)%
time idlememstat:
17.20user 65.16system 2:15:23elapsed 1%CPU (0avgtext+0avgdata 8476maxresident)k
448inputs+40outputs (1major+36052minor)pagefaults 0swaps
==== SCRIPT FOR COUNTING IDLE PAGES PER CGROUP ====
#! /usr/bin/python
#
import os
import stat
import errno
import struct
CGROUP_MOUNT = "/sys/fs/cgroup/memory"
BUFSIZE = 8 * 1024 # must be multiple of 8
def get_hugepage_size():
with open("/proc/meminfo", "r") as f:
for s in f:
k, v = s.split(":")
if k == "Hugepagesize":
return int(v.split()[0]) * 1024
PAGE_SIZE = os.sysconf("SC_PAGE_SIZE")
HUGEPAGE_SIZE = get_hugepage_size()
def set_idle():
f = open("/sys/kernel/mm/page_idle/bitmap", "wb", BUFSIZE)
while True:
try:
f.write(struct.pack("Q", pow(2, 64) - 1))
except IOError as err:
if err.errno == errno.ENXIO:
break
raise
f.close()
def count_idle():
f_flags = open("/proc/kpageflags", "rb", BUFSIZE)
f_cgroup = open("/proc/kpagecgroup", "rb", BUFSIZE)
with open("/sys/kernel/mm/page_idle/bitmap", "rb", BUFSIZE) as f:
while f.read(BUFSIZE): pass # update idle flag
idlememsz = {}
while True:
s1, s2 = f_flags.read(8), f_cgroup.read(8)
if not s1 or not s2:
break
flags, = struct.unpack('Q', s1)
cgino, = struct.unpack('Q', s2)
unevictable = (flags >> 18) & 1
huge = (flags >> 22) & 1
idle = (flags >> 25) & 1
if idle and not unevictable:
idlememsz[cgino] = idlememsz.get(cgino, 0) + \
(HUGEPAGE_SIZE if huge else PAGE_SIZE)
f_flags.close()
f_cgroup.close()
return idlememsz
if __name__ == "__main__":
print "Setting the idle flag for each page..."
set_idle()
raw_input("Wait until the workload accesses its working set, "
"then press Enter")
print "Counting idle pages..."
idlememsz = count_idle()
for dir, subdirs, files in os.walk(CGROUP_MOUNT):
ino = os.stat(dir)[stat.ST_INO]
print dir + ": " + str(idlememsz.get(ino, 0) / 1024) + " kB"
==== END SCRIPT ====
This patch (of 8):
Add page_cgroup_ino() helper to memcg.
This function returns the inode number of the closest online ancestor of
the memory cgroup a page is charged to. It is required for exporting
information about which page is charged to which cgroup to userspace,
which will be introduced by a following patch.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Reviewed-by: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-10 05:35:28 +07:00
|
|
|
while (memcg && !(memcg->css.flags & CSS_ONLINE))
|
|
|
|
memcg = parent_mem_cgroup(memcg);
|
|
|
|
if (memcg)
|
|
|
|
ino = cgroup_ino(memcg->css.cgroup);
|
|
|
|
rcu_read_unlock();
|
|
|
|
return ino;
|
|
|
|
}
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
static struct mem_cgroup_per_node *
|
|
|
|
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
|
2009-09-24 05:56:37 +07:00
|
|
|
{
|
2011-03-24 06:42:27 +07:00
|
|
|
int nid = page_to_nid(page);
|
2009-09-24 05:56:37 +07:00
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
return memcg->nodeinfo[nid];
|
2009-09-24 05:56:37 +07:00
|
|
|
}
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
static struct mem_cgroup_tree_per_node *
|
|
|
|
soft_limit_tree_node(int nid)
|
2013-09-25 05:27:40 +07:00
|
|
|
{
|
2016-07-29 05:46:05 +07:00
|
|
|
return soft_limit_tree.rb_tree_per_node[nid];
|
2013-09-25 05:27:40 +07:00
|
|
|
}
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
static struct mem_cgroup_tree_per_node *
|
2013-09-25 05:27:40 +07:00
|
|
|
soft_limit_tree_from_page(struct page *page)
|
|
|
|
{
|
|
|
|
int nid = page_to_nid(page);
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
return soft_limit_tree.rb_tree_per_node[nid];
|
2013-09-25 05:27:40 +07:00
|
|
|
}
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
|
|
|
|
struct mem_cgroup_tree_per_node *mctz,
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
unsigned long new_usage_in_excess)
|
2013-09-25 05:27:40 +07:00
|
|
|
{
|
|
|
|
struct rb_node **p = &mctz->rb_root.rb_node;
|
|
|
|
struct rb_node *parent = NULL;
|
2016-07-29 05:46:05 +07:00
|
|
|
struct mem_cgroup_per_node *mz_node;
|
2017-09-09 06:15:21 +07:00
|
|
|
bool rightmost = true;
|
2013-09-25 05:27:40 +07:00
|
|
|
|
|
|
|
if (mz->on_tree)
|
|
|
|
return;
|
|
|
|
|
|
|
|
mz->usage_in_excess = new_usage_in_excess;
|
|
|
|
if (!mz->usage_in_excess)
|
|
|
|
return;
|
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
2016-07-29 05:46:05 +07:00
|
|
|
mz_node = rb_entry(parent, struct mem_cgroup_per_node,
|
2013-09-25 05:27:40 +07:00
|
|
|
tree_node);
|
2017-09-09 06:15:21 +07:00
|
|
|
if (mz->usage_in_excess < mz_node->usage_in_excess) {
|
2013-09-25 05:27:40 +07:00
|
|
|
p = &(*p)->rb_left;
|
2017-09-09 06:15:21 +07:00
|
|
|
rightmost = false;
|
|
|
|
}
|
|
|
|
|
2013-09-25 05:27:40 +07:00
|
|
|
/*
|
|
|
|
* We can't avoid mem cgroups that are over their soft
|
|
|
|
* limit by the same amount
|
|
|
|
*/
|
|
|
|
else if (mz->usage_in_excess >= mz_node->usage_in_excess)
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
}
|
2017-09-09 06:15:21 +07:00
|
|
|
|
|
|
|
if (rightmost)
|
|
|
|
mctz->rb_rightmost = &mz->tree_node;
|
|
|
|
|
2013-09-25 05:27:40 +07:00
|
|
|
rb_link_node(&mz->tree_node, parent, p);
|
|
|
|
rb_insert_color(&mz->tree_node, &mctz->rb_root);
|
|
|
|
mz->on_tree = true;
|
|
|
|
}
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
|
|
|
|
struct mem_cgroup_tree_per_node *mctz)
|
2013-09-25 05:27:40 +07:00
|
|
|
{
|
|
|
|
if (!mz->on_tree)
|
|
|
|
return;
|
2017-09-09 06:15:21 +07:00
|
|
|
|
|
|
|
if (&mz->tree_node == mctz->rb_rightmost)
|
|
|
|
mctz->rb_rightmost = rb_prev(&mz->tree_node);
|
|
|
|
|
2013-09-25 05:27:40 +07:00
|
|
|
rb_erase(&mz->tree_node, &mctz->rb_root);
|
|
|
|
mz->on_tree = false;
|
|
|
|
}
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
|
|
|
|
struct mem_cgroup_tree_per_node *mctz)
|
2013-09-25 05:27:40 +07:00
|
|
|
{
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&mctz->lock, flags);
|
2014-06-07 04:38:21 +07:00
|
|
|
__mem_cgroup_remove_exceeded(mz, mctz);
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
spin_unlock_irqrestore(&mctz->lock, flags);
|
2013-09-25 05:27:40 +07:00
|
|
|
}
|
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
unsigned long nr_pages = page_counter_read(&memcg->memory);
|
2015-04-16 06:14:08 +07:00
|
|
|
unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
unsigned long excess = 0;
|
|
|
|
|
|
|
|
if (nr_pages > soft_limit)
|
|
|
|
excess = nr_pages - soft_limit;
|
|
|
|
|
|
|
|
return excess;
|
|
|
|
}
|
2013-09-25 05:27:40 +07:00
|
|
|
|
|
|
|
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
|
|
|
|
{
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
unsigned long excess;
|
2016-07-29 05:46:05 +07:00
|
|
|
struct mem_cgroup_per_node *mz;
|
|
|
|
struct mem_cgroup_tree_per_node *mctz;
|
2013-09-25 05:27:40 +07:00
|
|
|
|
2014-06-07 04:38:20 +07:00
|
|
|
mctz = soft_limit_tree_from_page(page);
|
mm/cgroup: avoid panic when init with low memory
The system may panic when initialisation is done when almost all the
memory is assigned to the huge pages using the kernel command line
parameter hugepage=xxxx. Panic may occur like this:
Unable to handle kernel paging request for data at address 0x00000000
Faulting instruction address: 0xc000000000302b88
Oops: Kernel access of bad area, sig: 11 [#1]
SMP NR_CPUS=2048 [ 0.082424] NUMA
pSeries
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.9.0-15-generic #16-Ubuntu
task: c00000021ed01600 task.stack: c00000010d108000
NIP: c000000000302b88 LR: c000000000270e04 CTR: c00000000016cfd0
REGS: c00000010d10b2c0 TRAP: 0300 Not tainted (4.9.0-15-generic)
MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE>[ 0.082770] CR: 28424422 XER: 00000000
CFAR: c0000000003d28b8 DAR: 0000000000000000 DSISR: 40000000 SOFTE: 1
GPR00: c000000000270e04 c00000010d10b540 c00000000141a300 c00000010fff6300
GPR04: 0000000000000000 00000000026012c0 c00000010d10b630 0000000487ab0000
GPR08: 000000010ee90000 c000000001454fd8 0000000000000000 0000000000000000
GPR12: 0000000000004400 c00000000fb80000 00000000026012c0 00000000026012c0
GPR16: 00000000026012c0 0000000000000000 0000000000000000 0000000000000002
GPR20: 000000000000000c 0000000000000000 0000000000000000 00000000024200c0
GPR24: c0000000016eef48 0000000000000000 c00000010fff7d00 00000000026012c0
GPR28: 0000000000000000 c00000010fff7d00 c00000010fff6300 c00000010d10b6d0
NIP mem_cgroup_soft_limit_reclaim+0xf8/0x4f0
LR do_try_to_free_pages+0x1b4/0x450
Call Trace:
do_try_to_free_pages+0x1b4/0x450
try_to_free_pages+0xf8/0x270
__alloc_pages_nodemask+0x7a8/0xff0
new_slab+0x104/0x8e0
___slab_alloc+0x620/0x700
__slab_alloc+0x34/0x60
kmem_cache_alloc_node_trace+0xdc/0x310
mem_cgroup_init+0x158/0x1c8
do_one_initcall+0x68/0x1d0
kernel_init_freeable+0x278/0x360
kernel_init+0x24/0x170
ret_from_kernel_thread+0x5c/0x74
Instruction dump:
eb81ffe0 eba1ffe8 ebc1fff0 ebe1fff8 4e800020 3d230001 e9499a42 3d220004
3929acd8 794a1f24 7d295214 eac90100 <e9360000> 2fa90000 419eff74 3b200000
---[ end trace 342f5208b00d01b6 ]---
This is a chicken and egg issue where the kernel try to get free memory
when allocating per node data in mem_cgroup_init(), but in that path
mem_cgroup_soft_limit_reclaim() is called which assumes that these data
are allocated.
As mem_cgroup_soft_limit_reclaim() is best effort, it should return when
these data are not yet allocated.
This patch also fixes potential null pointer access in
mem_cgroup_remove_from_trees() and mem_cgroup_update_tree().
Link: http://lkml.kernel.org/r/1487856999-16581-2-git-send-email-ldufour@linux.vnet.ibm.com
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-03-10 07:17:06 +07:00
|
|
|
if (!mctz)
|
|
|
|
return;
|
2013-09-25 05:27:40 +07:00
|
|
|
/*
|
|
|
|
* Necessary to update all ancestors when hierarchy is used.
|
|
|
|
* because their event counter is not touched.
|
|
|
|
*/
|
|
|
|
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
|
2016-07-29 05:46:05 +07:00
|
|
|
mz = mem_cgroup_page_nodeinfo(memcg, page);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
excess = soft_limit_excess(memcg);
|
2013-09-25 05:27:40 +07:00
|
|
|
/*
|
|
|
|
* We have to update the tree if mz is on RB-tree or
|
|
|
|
* mem is over its softlimit.
|
|
|
|
*/
|
|
|
|
if (excess || mz->on_tree) {
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&mctz->lock, flags);
|
2013-09-25 05:27:40 +07:00
|
|
|
/* if on-tree, remove it */
|
|
|
|
if (mz->on_tree)
|
2014-06-07 04:38:21 +07:00
|
|
|
__mem_cgroup_remove_exceeded(mz, mctz);
|
2013-09-25 05:27:40 +07:00
|
|
|
/*
|
|
|
|
* Insert again. mz->usage_in_excess will be updated.
|
|
|
|
* If excess is 0, no tree ops.
|
|
|
|
*/
|
2014-06-07 04:38:21 +07:00
|
|
|
__mem_cgroup_insert_exceeded(mz, mctz, excess);
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
spin_unlock_irqrestore(&mctz->lock, flags);
|
2013-09-25 05:27:40 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
|
|
|
|
{
|
2016-07-29 05:46:05 +07:00
|
|
|
struct mem_cgroup_tree_per_node *mctz;
|
|
|
|
struct mem_cgroup_per_node *mz;
|
|
|
|
int nid;
|
2013-09-25 05:27:40 +07:00
|
|
|
|
2014-06-07 04:38:20 +07:00
|
|
|
for_each_node(nid) {
|
2016-07-29 05:46:05 +07:00
|
|
|
mz = mem_cgroup_nodeinfo(memcg, nid);
|
|
|
|
mctz = soft_limit_tree_node(nid);
|
mm/cgroup: avoid panic when init with low memory
The system may panic when initialisation is done when almost all the
memory is assigned to the huge pages using the kernel command line
parameter hugepage=xxxx. Panic may occur like this:
Unable to handle kernel paging request for data at address 0x00000000
Faulting instruction address: 0xc000000000302b88
Oops: Kernel access of bad area, sig: 11 [#1]
SMP NR_CPUS=2048 [ 0.082424] NUMA
pSeries
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.9.0-15-generic #16-Ubuntu
task: c00000021ed01600 task.stack: c00000010d108000
NIP: c000000000302b88 LR: c000000000270e04 CTR: c00000000016cfd0
REGS: c00000010d10b2c0 TRAP: 0300 Not tainted (4.9.0-15-generic)
MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE>[ 0.082770] CR: 28424422 XER: 00000000
CFAR: c0000000003d28b8 DAR: 0000000000000000 DSISR: 40000000 SOFTE: 1
GPR00: c000000000270e04 c00000010d10b540 c00000000141a300 c00000010fff6300
GPR04: 0000000000000000 00000000026012c0 c00000010d10b630 0000000487ab0000
GPR08: 000000010ee90000 c000000001454fd8 0000000000000000 0000000000000000
GPR12: 0000000000004400 c00000000fb80000 00000000026012c0 00000000026012c0
GPR16: 00000000026012c0 0000000000000000 0000000000000000 0000000000000002
GPR20: 000000000000000c 0000000000000000 0000000000000000 00000000024200c0
GPR24: c0000000016eef48 0000000000000000 c00000010fff7d00 00000000026012c0
GPR28: 0000000000000000 c00000010fff7d00 c00000010fff6300 c00000010d10b6d0
NIP mem_cgroup_soft_limit_reclaim+0xf8/0x4f0
LR do_try_to_free_pages+0x1b4/0x450
Call Trace:
do_try_to_free_pages+0x1b4/0x450
try_to_free_pages+0xf8/0x270
__alloc_pages_nodemask+0x7a8/0xff0
new_slab+0x104/0x8e0
___slab_alloc+0x620/0x700
__slab_alloc+0x34/0x60
kmem_cache_alloc_node_trace+0xdc/0x310
mem_cgroup_init+0x158/0x1c8
do_one_initcall+0x68/0x1d0
kernel_init_freeable+0x278/0x360
kernel_init+0x24/0x170
ret_from_kernel_thread+0x5c/0x74
Instruction dump:
eb81ffe0 eba1ffe8 ebc1fff0 ebe1fff8 4e800020 3d230001 e9499a42 3d220004
3929acd8 794a1f24 7d295214 eac90100 <e9360000> 2fa90000 419eff74 3b200000
---[ end trace 342f5208b00d01b6 ]---
This is a chicken and egg issue where the kernel try to get free memory
when allocating per node data in mem_cgroup_init(), but in that path
mem_cgroup_soft_limit_reclaim() is called which assumes that these data
are allocated.
As mem_cgroup_soft_limit_reclaim() is best effort, it should return when
these data are not yet allocated.
This patch also fixes potential null pointer access in
mem_cgroup_remove_from_trees() and mem_cgroup_update_tree().
Link: http://lkml.kernel.org/r/1487856999-16581-2-git-send-email-ldufour@linux.vnet.ibm.com
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-03-10 07:17:06 +07:00
|
|
|
if (mctz)
|
|
|
|
mem_cgroup_remove_exceeded(mz, mctz);
|
2013-09-25 05:27:40 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
static struct mem_cgroup_per_node *
|
|
|
|
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
|
2013-09-25 05:27:40 +07:00
|
|
|
{
|
2016-07-29 05:46:05 +07:00
|
|
|
struct mem_cgroup_per_node *mz;
|
2013-09-25 05:27:40 +07:00
|
|
|
|
|
|
|
retry:
|
|
|
|
mz = NULL;
|
2017-09-09 06:15:21 +07:00
|
|
|
if (!mctz->rb_rightmost)
|
2013-09-25 05:27:40 +07:00
|
|
|
goto done; /* Nothing to reclaim from */
|
|
|
|
|
2017-09-09 06:15:21 +07:00
|
|
|
mz = rb_entry(mctz->rb_rightmost,
|
|
|
|
struct mem_cgroup_per_node, tree_node);
|
2013-09-25 05:27:40 +07:00
|
|
|
/*
|
|
|
|
* Remove the node now but someone else can add it back,
|
|
|
|
* we will to add it back at the end of reclaim to its correct
|
|
|
|
* position in the tree.
|
|
|
|
*/
|
2014-06-07 04:38:21 +07:00
|
|
|
__mem_cgroup_remove_exceeded(mz, mctz);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
if (!soft_limit_excess(mz->memcg) ||
|
2020-04-02 11:07:10 +07:00
|
|
|
!css_tryget(&mz->memcg->css))
|
2013-09-25 05:27:40 +07:00
|
|
|
goto retry;
|
|
|
|
done:
|
|
|
|
return mz;
|
|
|
|
}
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
static struct mem_cgroup_per_node *
|
|
|
|
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
|
2013-09-25 05:27:40 +07:00
|
|
|
{
|
2016-07-29 05:46:05 +07:00
|
|
|
struct mem_cgroup_per_node *mz;
|
2013-09-25 05:27:40 +07:00
|
|
|
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
spin_lock_irq(&mctz->lock);
|
2013-09-25 05:27:40 +07:00
|
|
|
mz = __mem_cgroup_largest_soft_limit_node(mctz);
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
spin_unlock_irq(&mctz->lock);
|
2013-09-25 05:27:40 +07:00
|
|
|
return mz;
|
|
|
|
}
|
|
|
|
|
2019-05-15 05:47:09 +07:00
|
|
|
/**
|
|
|
|
* __mod_memcg_state - update cgroup memory statistics
|
|
|
|
* @memcg: the memory cgroup
|
|
|
|
* @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
|
|
|
|
* @val: delta to add to the counter, can be negative
|
|
|
|
*/
|
|
|
|
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
|
|
|
|
{
|
2020-08-07 13:20:35 +07:00
|
|
|
long x, threshold = MEMCG_CHARGE_BATCH;
|
2019-05-15 05:47:09 +07:00
|
|
|
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
|
2020-08-12 08:30:21 +07:00
|
|
|
if (memcg_stat_item_in_bytes(idx))
|
2020-08-07 13:20:35 +07:00
|
|
|
threshold <<= PAGE_SHIFT;
|
|
|
|
|
2019-05-15 05:47:09 +07:00
|
|
|
x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
|
2020-08-07 13:20:35 +07:00
|
|
|
if (unlikely(abs(x) > threshold)) {
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
struct mem_cgroup *mi;
|
|
|
|
|
2019-07-17 06:26:06 +07:00
|
|
|
/*
|
|
|
|
* Batch local counters to keep them in sync with
|
|
|
|
* the hierarchical ones.
|
|
|
|
*/
|
|
|
|
__this_cpu_add(memcg->vmstats_local->stat[idx], x);
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
|
|
|
|
atomic_long_add(x, &mi->vmstats[idx]);
|
2019-05-15 05:47:09 +07:00
|
|
|
x = 0;
|
|
|
|
}
|
|
|
|
__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
static struct mem_cgroup_per_node *
|
|
|
|
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *parent;
|
|
|
|
|
|
|
|
parent = parent_mem_cgroup(pn->memcg);
|
|
|
|
if (!parent)
|
|
|
|
return NULL;
|
|
|
|
return mem_cgroup_nodeinfo(parent, nid);
|
|
|
|
}
|
|
|
|
|
mm: memcg: factor out memcg- and lruvec-level changes out of __mod_lruvec_state()
Patch series "The new cgroup slab memory controller", v7.
The patchset moves the accounting from the page level to the object level.
It allows to share slab pages between memory cgroups. This leads to a
significant win in the slab utilization (up to 45%) and the corresponding
drop in the total kernel memory footprint. The reduced number of
unmovable slab pages should also have a positive effect on the memory
fragmentation.
The patchset makes the slab accounting code simpler: there is no more need
in the complicated dynamic creation and destruction of per-cgroup slab
caches, all memory cgroups use a global set of shared slab caches. The
lifetime of slab caches is not more connected to the lifetime of memory
cgroups.
The more precise accounting does require more CPU, however in practice the
difference seems to be negligible. We've been using the new slab
controller in Facebook production for several months with different
workloads and haven't seen any noticeable regressions. What we've seen
were memory savings in order of 1 GB per host (it varied heavily depending
on the actual workload, size of RAM, number of CPUs, memory pressure,
etc).
The third version of the patchset added yet another step towards the
simplification of the code: sharing of slab caches between accounted and
non-accounted allocations. It comes with significant upsides (most
noticeable, a complete elimination of dynamic slab caches creation) but
not without some regression risks, so this change sits on top of the
patchset and is not completely merged in. So in the unlikely event of a
noticeable performance regression it can be reverted separately.
The slab memory accounting works in exactly the same way for SLAB and
SLUB. With both allocators the new controller shows significant memory
savings, with SLUB the difference is bigger. On my 16-core desktop
machine running Fedora 32 the size of the slab memory measured after the
start of the system was lower by 58% and 38% with SLUB and SLAB
correspondingly.
As an estimation of a potential CPU overhead, below are results of
slab_bulk_test01 test, kindly provided by Jesper D. Brouer. He also
helped with the evaluation of results.
The test can be found here: https://github.com/netoptimizer/prototype-kernel/
The smallest number in each row should be picked for a comparison.
SLUB-patched - bulk-API
- SLUB-patched : bulk_quick_reuse objects=1 : 187 - 90 - 224 cycles(tsc)
- SLUB-patched : bulk_quick_reuse objects=2 : 110 - 53 - 133 cycles(tsc)
- SLUB-patched : bulk_quick_reuse objects=3 : 88 - 95 - 42 cycles(tsc)
- SLUB-patched : bulk_quick_reuse objects=4 : 91 - 85 - 36 cycles(tsc)
- SLUB-patched : bulk_quick_reuse objects=8 : 32 - 66 - 32 cycles(tsc)
SLUB-original - bulk-API
- SLUB-original: bulk_quick_reuse objects=1 : 87 - 87 - 142 cycles(tsc)
- SLUB-original: bulk_quick_reuse objects=2 : 52 - 53 - 53 cycles(tsc)
- SLUB-original: bulk_quick_reuse objects=3 : 42 - 42 - 91 cycles(tsc)
- SLUB-original: bulk_quick_reuse objects=4 : 91 - 37 - 37 cycles(tsc)
- SLUB-original: bulk_quick_reuse objects=8 : 31 - 79 - 76 cycles(tsc)
SLAB-patched - bulk-API
- SLAB-patched : bulk_quick_reuse objects=1 : 67 - 67 - 140 cycles(tsc)
- SLAB-patched : bulk_quick_reuse objects=2 : 55 - 46 - 46 cycles(tsc)
- SLAB-patched : bulk_quick_reuse objects=3 : 93 - 94 - 39 cycles(tsc)
- SLAB-patched : bulk_quick_reuse objects=4 : 35 - 88 - 85 cycles(tsc)
- SLAB-patched : bulk_quick_reuse objects=8 : 30 - 30 - 30 cycles(tsc)
SLAB-original- bulk-API
- SLAB-original: bulk_quick_reuse objects=1 : 143 - 136 - 67 cycles(tsc)
- SLAB-original: bulk_quick_reuse objects=2 : 45 - 46 - 46 cycles(tsc)
- SLAB-original: bulk_quick_reuse objects=3 : 38 - 39 - 39 cycles(tsc)
- SLAB-original: bulk_quick_reuse objects=4 : 35 - 87 - 87 cycles(tsc)
- SLAB-original: bulk_quick_reuse objects=8 : 29 - 66 - 30 cycles(tsc)
This patch (of 19):
To convert memcg and lruvec slab counters to bytes there must be a way to
change these counters without touching node counters. Factor out
__mod_memcg_lruvec_state() out of __mod_lruvec_state().
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/20200623174037.3951353-1-guro@fb.com
Link: http://lkml.kernel.org/r/20200623174037.3951353-2-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 13:20:32 +07:00
|
|
|
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
|
|
|
|
int val)
|
2019-05-15 05:47:09 +07:00
|
|
|
{
|
|
|
|
struct mem_cgroup_per_node *pn;
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
struct mem_cgroup *memcg;
|
2020-08-07 13:20:35 +07:00
|
|
|
long x, threshold = MEMCG_CHARGE_BATCH;
|
2019-05-15 05:47:09 +07:00
|
|
|
|
|
|
|
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
memcg = pn->memcg;
|
2019-05-15 05:47:09 +07:00
|
|
|
|
|
|
|
/* Update memcg */
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
__mod_memcg_state(memcg, idx, val);
|
2019-05-15 05:47:09 +07:00
|
|
|
|
mm, memcg: partially revert "mm/memcontrol.c: keep local VM counters in sync with the hierarchical ones"
Commit 766a4c19d880 ("mm/memcontrol.c: keep local VM counters in sync
with the hierarchical ones") effectively decreased the precision of
per-memcg vmstats_local and per-memcg-per-node lruvec percpu counters.
That's good for displaying in memory.stat, but brings a serious
regression into the reclaim process.
One issue I've discovered and debugged is the following:
lruvec_lru_size() can return 0 instead of the actual number of pages in
the lru list, preventing the kernel to reclaim last remaining pages.
Result is yet another dying memory cgroups flooding. The opposite is
also happening: scanning an empty lru list is the waste of cpu time.
Also, inactive_list_is_low() can return incorrect values, preventing the
active lru from being scanned and freed. It can fail both because the
size of active and inactive lists are inaccurate, and because the number
of workingset refaults isn't precise. In other words, the result is
pretty random.
I'm not sure, if using the approximate number of slab pages in
count_shadow_number() is acceptable, but issues described above are
enough to partially revert the patch.
Let's keep per-memcg vmstat_local batched (they are only used for
displaying stats to the userspace), but keep lruvec stats precise. This
change fixes the dead memcg flooding on my setup.
Link: http://lkml.kernel.org/r/20190817004726.2530670-1-guro@fb.com
Fixes: 766a4c19d880 ("mm/memcontrol.c: keep local VM counters in sync with the hierarchical ones")
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-08-31 06:04:39 +07:00
|
|
|
/* Update lruvec */
|
|
|
|
__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
|
|
|
|
|
2020-08-07 13:20:35 +07:00
|
|
|
if (vmstat_item_in_bytes(idx))
|
|
|
|
threshold <<= PAGE_SHIFT;
|
|
|
|
|
2019-05-15 05:47:09 +07:00
|
|
|
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
|
2020-08-07 13:20:35 +07:00
|
|
|
if (unlikely(abs(x) > threshold)) {
|
mm: memcg: factor out memcg- and lruvec-level changes out of __mod_lruvec_state()
Patch series "The new cgroup slab memory controller", v7.
The patchset moves the accounting from the page level to the object level.
It allows to share slab pages between memory cgroups. This leads to a
significant win in the slab utilization (up to 45%) and the corresponding
drop in the total kernel memory footprint. The reduced number of
unmovable slab pages should also have a positive effect on the memory
fragmentation.
The patchset makes the slab accounting code simpler: there is no more need
in the complicated dynamic creation and destruction of per-cgroup slab
caches, all memory cgroups use a global set of shared slab caches. The
lifetime of slab caches is not more connected to the lifetime of memory
cgroups.
The more precise accounting does require more CPU, however in practice the
difference seems to be negligible. We've been using the new slab
controller in Facebook production for several months with different
workloads and haven't seen any noticeable regressions. What we've seen
were memory savings in order of 1 GB per host (it varied heavily depending
on the actual workload, size of RAM, number of CPUs, memory pressure,
etc).
The third version of the patchset added yet another step towards the
simplification of the code: sharing of slab caches between accounted and
non-accounted allocations. It comes with significant upsides (most
noticeable, a complete elimination of dynamic slab caches creation) but
not without some regression risks, so this change sits on top of the
patchset and is not completely merged in. So in the unlikely event of a
noticeable performance regression it can be reverted separately.
The slab memory accounting works in exactly the same way for SLAB and
SLUB. With both allocators the new controller shows significant memory
savings, with SLUB the difference is bigger. On my 16-core desktop
machine running Fedora 32 the size of the slab memory measured after the
start of the system was lower by 58% and 38% with SLUB and SLAB
correspondingly.
As an estimation of a potential CPU overhead, below are results of
slab_bulk_test01 test, kindly provided by Jesper D. Brouer. He also
helped with the evaluation of results.
The test can be found here: https://github.com/netoptimizer/prototype-kernel/
The smallest number in each row should be picked for a comparison.
SLUB-patched - bulk-API
- SLUB-patched : bulk_quick_reuse objects=1 : 187 - 90 - 224 cycles(tsc)
- SLUB-patched : bulk_quick_reuse objects=2 : 110 - 53 - 133 cycles(tsc)
- SLUB-patched : bulk_quick_reuse objects=3 : 88 - 95 - 42 cycles(tsc)
- SLUB-patched : bulk_quick_reuse objects=4 : 91 - 85 - 36 cycles(tsc)
- SLUB-patched : bulk_quick_reuse objects=8 : 32 - 66 - 32 cycles(tsc)
SLUB-original - bulk-API
- SLUB-original: bulk_quick_reuse objects=1 : 87 - 87 - 142 cycles(tsc)
- SLUB-original: bulk_quick_reuse objects=2 : 52 - 53 - 53 cycles(tsc)
- SLUB-original: bulk_quick_reuse objects=3 : 42 - 42 - 91 cycles(tsc)
- SLUB-original: bulk_quick_reuse objects=4 : 91 - 37 - 37 cycles(tsc)
- SLUB-original: bulk_quick_reuse objects=8 : 31 - 79 - 76 cycles(tsc)
SLAB-patched - bulk-API
- SLAB-patched : bulk_quick_reuse objects=1 : 67 - 67 - 140 cycles(tsc)
- SLAB-patched : bulk_quick_reuse objects=2 : 55 - 46 - 46 cycles(tsc)
- SLAB-patched : bulk_quick_reuse objects=3 : 93 - 94 - 39 cycles(tsc)
- SLAB-patched : bulk_quick_reuse objects=4 : 35 - 88 - 85 cycles(tsc)
- SLAB-patched : bulk_quick_reuse objects=8 : 30 - 30 - 30 cycles(tsc)
SLAB-original- bulk-API
- SLAB-original: bulk_quick_reuse objects=1 : 143 - 136 - 67 cycles(tsc)
- SLAB-original: bulk_quick_reuse objects=2 : 45 - 46 - 46 cycles(tsc)
- SLAB-original: bulk_quick_reuse objects=3 : 38 - 39 - 39 cycles(tsc)
- SLAB-original: bulk_quick_reuse objects=4 : 35 - 87 - 87 cycles(tsc)
- SLAB-original: bulk_quick_reuse objects=8 : 29 - 66 - 30 cycles(tsc)
This patch (of 19):
To convert memcg and lruvec slab counters to bytes there must be a way to
change these counters without touching node counters. Factor out
__mod_memcg_lruvec_state() out of __mod_lruvec_state().
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/20200623174037.3951353-1-guro@fb.com
Link: http://lkml.kernel.org/r/20200623174037.3951353-2-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 13:20:32 +07:00
|
|
|
pg_data_t *pgdat = lruvec_pgdat(lruvec);
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
struct mem_cgroup_per_node *pi;
|
|
|
|
|
|
|
|
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
|
|
|
|
atomic_long_add(x, &pi->lruvec_stat[idx]);
|
2019-05-15 05:47:09 +07:00
|
|
|
x = 0;
|
|
|
|
}
|
|
|
|
__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
|
|
|
|
}
|
|
|
|
|
mm: memcg: factor out memcg- and lruvec-level changes out of __mod_lruvec_state()
Patch series "The new cgroup slab memory controller", v7.
The patchset moves the accounting from the page level to the object level.
It allows to share slab pages between memory cgroups. This leads to a
significant win in the slab utilization (up to 45%) and the corresponding
drop in the total kernel memory footprint. The reduced number of
unmovable slab pages should also have a positive effect on the memory
fragmentation.
The patchset makes the slab accounting code simpler: there is no more need
in the complicated dynamic creation and destruction of per-cgroup slab
caches, all memory cgroups use a global set of shared slab caches. The
lifetime of slab caches is not more connected to the lifetime of memory
cgroups.
The more precise accounting does require more CPU, however in practice the
difference seems to be negligible. We've been using the new slab
controller in Facebook production for several months with different
workloads and haven't seen any noticeable regressions. What we've seen
were memory savings in order of 1 GB per host (it varied heavily depending
on the actual workload, size of RAM, number of CPUs, memory pressure,
etc).
The third version of the patchset added yet another step towards the
simplification of the code: sharing of slab caches between accounted and
non-accounted allocations. It comes with significant upsides (most
noticeable, a complete elimination of dynamic slab caches creation) but
not without some regression risks, so this change sits on top of the
patchset and is not completely merged in. So in the unlikely event of a
noticeable performance regression it can be reverted separately.
The slab memory accounting works in exactly the same way for SLAB and
SLUB. With both allocators the new controller shows significant memory
savings, with SLUB the difference is bigger. On my 16-core desktop
machine running Fedora 32 the size of the slab memory measured after the
start of the system was lower by 58% and 38% with SLUB and SLAB
correspondingly.
As an estimation of a potential CPU overhead, below are results of
slab_bulk_test01 test, kindly provided by Jesper D. Brouer. He also
helped with the evaluation of results.
The test can be found here: https://github.com/netoptimizer/prototype-kernel/
The smallest number in each row should be picked for a comparison.
SLUB-patched - bulk-API
- SLUB-patched : bulk_quick_reuse objects=1 : 187 - 90 - 224 cycles(tsc)
- SLUB-patched : bulk_quick_reuse objects=2 : 110 - 53 - 133 cycles(tsc)
- SLUB-patched : bulk_quick_reuse objects=3 : 88 - 95 - 42 cycles(tsc)
- SLUB-patched : bulk_quick_reuse objects=4 : 91 - 85 - 36 cycles(tsc)
- SLUB-patched : bulk_quick_reuse objects=8 : 32 - 66 - 32 cycles(tsc)
SLUB-original - bulk-API
- SLUB-original: bulk_quick_reuse objects=1 : 87 - 87 - 142 cycles(tsc)
- SLUB-original: bulk_quick_reuse objects=2 : 52 - 53 - 53 cycles(tsc)
- SLUB-original: bulk_quick_reuse objects=3 : 42 - 42 - 91 cycles(tsc)
- SLUB-original: bulk_quick_reuse objects=4 : 91 - 37 - 37 cycles(tsc)
- SLUB-original: bulk_quick_reuse objects=8 : 31 - 79 - 76 cycles(tsc)
SLAB-patched - bulk-API
- SLAB-patched : bulk_quick_reuse objects=1 : 67 - 67 - 140 cycles(tsc)
- SLAB-patched : bulk_quick_reuse objects=2 : 55 - 46 - 46 cycles(tsc)
- SLAB-patched : bulk_quick_reuse objects=3 : 93 - 94 - 39 cycles(tsc)
- SLAB-patched : bulk_quick_reuse objects=4 : 35 - 88 - 85 cycles(tsc)
- SLAB-patched : bulk_quick_reuse objects=8 : 30 - 30 - 30 cycles(tsc)
SLAB-original- bulk-API
- SLAB-original: bulk_quick_reuse objects=1 : 143 - 136 - 67 cycles(tsc)
- SLAB-original: bulk_quick_reuse objects=2 : 45 - 46 - 46 cycles(tsc)
- SLAB-original: bulk_quick_reuse objects=3 : 38 - 39 - 39 cycles(tsc)
- SLAB-original: bulk_quick_reuse objects=4 : 35 - 87 - 87 cycles(tsc)
- SLAB-original: bulk_quick_reuse objects=8 : 29 - 66 - 30 cycles(tsc)
This patch (of 19):
To convert memcg and lruvec slab counters to bytes there must be a way to
change these counters without touching node counters. Factor out
__mod_memcg_lruvec_state() out of __mod_lruvec_state().
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/20200623174037.3951353-1-guro@fb.com
Link: http://lkml.kernel.org/r/20200623174037.3951353-2-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 13:20:32 +07:00
|
|
|
/**
|
|
|
|
* __mod_lruvec_state - update lruvec memory statistics
|
|
|
|
* @lruvec: the lruvec
|
|
|
|
* @idx: the stat item
|
|
|
|
* @val: delta to add to the counter, can be negative
|
|
|
|
*
|
|
|
|
* The lruvec is the intersection of the NUMA node and a cgroup. This
|
|
|
|
* function updates the all three counters that are affected by a
|
|
|
|
* change of state at this level: per-node, per-cgroup, per-lruvec.
|
|
|
|
*/
|
|
|
|
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
|
|
|
|
int val)
|
|
|
|
{
|
|
|
|
/* Update node */
|
|
|
|
__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
|
|
|
|
|
|
|
|
/* Update memcg and lruvec */
|
|
|
|
if (!mem_cgroup_disabled())
|
|
|
|
__mod_memcg_lruvec_state(lruvec, idx, val);
|
|
|
|
}
|
|
|
|
|
2019-08-14 05:37:41 +07:00
|
|
|
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
|
|
|
|
{
|
2020-04-02 11:06:36 +07:00
|
|
|
pg_data_t *pgdat = page_pgdat(virt_to_page(p));
|
2019-08-14 05:37:41 +07:00
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
struct lruvec *lruvec;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
2020-04-02 11:06:36 +07:00
|
|
|
memcg = mem_cgroup_from_obj(p);
|
2019-08-14 05:37:41 +07:00
|
|
|
|
2020-11-22 13:17:12 +07:00
|
|
|
/*
|
|
|
|
* Untracked pages have no memcg, no lruvec. Update only the
|
|
|
|
* node. If we reparent the slab objects to the root memcg,
|
|
|
|
* when we free the slab object, we need to update the per-memcg
|
|
|
|
* vmstats to keep it correct for the root memcg.
|
|
|
|
*/
|
|
|
|
if (!memcg) {
|
2019-08-14 05:37:41 +07:00
|
|
|
__mod_node_page_state(pgdat, idx, val);
|
|
|
|
} else {
|
2019-12-01 08:55:34 +07:00
|
|
|
lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
2019-08-14 05:37:41 +07:00
|
|
|
__mod_lruvec_state(lruvec, idx, val);
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2020-03-29 09:17:25 +07:00
|
|
|
void mod_memcg_obj_state(void *p, int idx, int val)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
memcg = mem_cgroup_from_obj(p);
|
|
|
|
if (memcg)
|
|
|
|
mod_memcg_state(memcg, idx, val);
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2019-05-15 05:47:09 +07:00
|
|
|
/**
|
|
|
|
* __count_memcg_events - account VM events in a cgroup
|
|
|
|
* @memcg: the memory cgroup
|
|
|
|
* @idx: the event item
|
|
|
|
* @count: the number of events that occured
|
|
|
|
*/
|
|
|
|
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
|
|
|
|
unsigned long count)
|
|
|
|
{
|
|
|
|
unsigned long x;
|
|
|
|
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
|
|
|
|
x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
|
|
|
|
if (unlikely(x > MEMCG_CHARGE_BATCH)) {
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
struct mem_cgroup *mi;
|
|
|
|
|
2019-07-17 06:26:06 +07:00
|
|
|
/*
|
|
|
|
* Batch local counters to keep them in sync with
|
|
|
|
* the hierarchical ones.
|
|
|
|
*/
|
|
|
|
__this_cpu_add(memcg->vmstats_local->events[idx], x);
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
|
|
|
|
atomic_long_add(x, &mi->vmevents[idx]);
|
2019-05-15 05:47:09 +07:00
|
|
|
x = 0;
|
|
|
|
}
|
|
|
|
__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
|
2011-03-24 06:42:37 +07:00
|
|
|
{
|
2019-05-15 05:46:57 +07:00
|
|
|
return atomic_long_read(&memcg->vmevents[event]);
|
2011-03-24 06:42:37 +07:00
|
|
|
}
|
|
|
|
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
|
|
|
|
{
|
mm: memcontrol: don't batch updates of local VM stats and events
The kernel test robot noticed a 26% will-it-scale pagefault regression
from commit 42a300353577 ("mm: memcontrol: fix recursive statistics
correctness & scalabilty"). This appears to be caused by bouncing the
additional cachelines from the new hierarchical statistics counters.
We can fix this by getting rid of the batched local counters instead.
Originally, there were *only* group-local counters, and they were fully
maintained per cpu. A reader of a stats file high up in the cgroup tree
would have to walk the entire subtree and collect each level's per-cpu
counters to get the recursive view. This was prohibitively expensive,
and so we switched to per-cpu batched updates of the local counters
during a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting"), reducing the complexity from nr_subgroups *
nr_cpus to nr_subgroups.
With growing machines and cgroup trees, the tree walk itself became too
expensive for monitoring top-level groups, and this is when the culprit
patch added hierarchy counters on each cgroup level. When the per-cpu
batch size would be reached, both the local and the hierarchy counters
would get batch-updated from the per-cpu delta simultaneously.
This makes local and hierarchical counter reads blazingly fast, but it
unfortunately makes the write-side too cache line intense.
Since local counter reads were never a problem - we only centralized
them to accelerate the hierarchy walk - and use of the local counters
are becoming rarer due to replacement with hierarchical views (ongoing
rework in the page reclaim and workingset code), we can make those local
counters unbatched per-cpu counters again.
The scheme will then be as such:
when a memcg statistic changes, the writer will:
- update the local counter (per-cpu)
- update the batch counter (per-cpu). If the batch is full:
- spill the batch into the group's atomic_t
- spill the batch into all ancestors' atomic_ts
- empty out the batch counter (per-cpu)
when a local memcg counter is read, the reader will:
- collect the local counter from all cpus
when a hiearchy memcg counter is read, the reader will:
- read the atomic_t
We might be able to simplify this further and make the recursive
counters unbatched per-cpu counters as well (batch upward propagation,
but leave per-cpu collection to the readers), but that will require a
more in-depth analysis and testing of all the callsites. Deal with the
immediate regression for now.
Link: http://lkml.kernel.org/r/20190521151647.GB2870@cmpxchg.org
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: kernel test robot <rong.a.chen@intel.com>
Tested-by: kernel test robot <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-14 05:55:46 +07:00
|
|
|
long x = 0;
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
for_each_possible_cpu(cpu)
|
|
|
|
x += per_cpu(memcg->vmstats_local->events[event], cpu);
|
|
|
|
return x;
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
|
2013-05-08 06:18:09 +07:00
|
|
|
struct page *page,
|
2020-06-04 06:01:31 +07:00
|
|
|
int nr_pages)
|
2008-02-07 15:14:24 +07:00
|
|
|
{
|
2011-01-21 05:44:23 +07:00
|
|
|
/* pagein of a big page is an event. So, ignore page size */
|
|
|
|
if (nr_pages > 0)
|
2018-02-01 07:16:37 +07:00
|
|
|
__count_memcg_events(memcg, PGPGIN, 1);
|
2011-02-02 06:52:45 +07:00
|
|
|
else {
|
2018-02-01 07:16:37 +07:00
|
|
|
__count_memcg_events(memcg, PGPGOUT, 1);
|
2011-02-02 06:52:45 +07:00
|
|
|
nr_pages = -nr_pages; /* for event */
|
|
|
|
}
|
2011-01-21 05:44:23 +07:00
|
|
|
|
2019-05-15 05:46:57 +07:00
|
|
|
__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
|
2008-02-07 15:14:31 +07:00
|
|
|
}
|
|
|
|
|
2012-01-13 08:18:23 +07:00
|
|
|
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
|
|
|
|
enum mem_cgroup_events_target target)
|
2011-03-24 06:42:38 +07:00
|
|
|
{
|
|
|
|
unsigned long val, next;
|
|
|
|
|
2019-05-15 05:46:57 +07:00
|
|
|
val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
|
|
|
|
next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
|
2011-03-24 06:42:38 +07:00
|
|
|
/* from time_after() in jiffies.h */
|
2017-07-11 05:48:53 +07:00
|
|
|
if ((long)(next - val) < 0) {
|
2012-01-13 08:18:23 +07:00
|
|
|
switch (target) {
|
|
|
|
case MEM_CGROUP_TARGET_THRESH:
|
|
|
|
next = val + THRESHOLDS_EVENTS_TARGET;
|
|
|
|
break;
|
2013-09-25 05:27:40 +07:00
|
|
|
case MEM_CGROUP_TARGET_SOFTLIMIT:
|
|
|
|
next = val + SOFTLIMIT_EVENTS_TARGET;
|
|
|
|
break;
|
2012-01-13 08:18:23 +07:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
2019-05-15 05:46:57 +07:00
|
|
|
__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
|
2012-01-13 08:18:23 +07:00
|
|
|
return true;
|
2011-03-24 06:42:38 +07:00
|
|
|
}
|
2012-01-13 08:18:23 +07:00
|
|
|
return false;
|
2010-03-11 06:22:31 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check events in order.
|
|
|
|
*
|
|
|
|
*/
|
2011-11-03 03:38:15 +07:00
|
|
|
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
|
2010-03-11 06:22:31 +07:00
|
|
|
{
|
|
|
|
/* threshold event is triggered in finer grain than soft limit */
|
2012-01-13 08:18:23 +07:00
|
|
|
if (unlikely(mem_cgroup_event_ratelimit(memcg,
|
|
|
|
MEM_CGROUP_TARGET_THRESH))) {
|
2013-09-25 05:27:40 +07:00
|
|
|
bool do_softlimit;
|
2012-01-13 08:18:23 +07:00
|
|
|
|
2013-09-25 05:27:40 +07:00
|
|
|
do_softlimit = mem_cgroup_event_ratelimit(memcg,
|
|
|
|
MEM_CGROUP_TARGET_SOFTLIMIT);
|
2011-11-03 03:38:15 +07:00
|
|
|
mem_cgroup_threshold(memcg);
|
2013-09-25 05:27:40 +07:00
|
|
|
if (unlikely(do_softlimit))
|
|
|
|
mem_cgroup_update_tree(memcg, page);
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
}
|
2010-03-11 06:22:31 +07:00
|
|
|
}
|
|
|
|
|
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead adds an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 15:00:16 +07:00
|
|
|
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
|
2008-02-07 15:13:51 +07:00
|
|
|
{
|
mm owner: fix race between swapoff and exit
There's a race between mm->owner assignment and swapoff, more easily
seen when task slab poisoning is turned on. The condition occurs when
try_to_unuse() runs in parallel with an exiting task. A similar race
can occur with callers of get_task_mm(), such as /proc/<pid>/<mmstats>
or ptrace or page migration.
CPU0 CPU1
try_to_unuse
looks at mm = task0->mm
increments mm->mm_users
task 0 exits
mm->owner needs to be updated, but no
new owner is found (mm_users > 1, but
no other task has task->mm = task0->mm)
mm_update_next_owner() leaves
mmput(mm) decrements mm->mm_users
task0 freed
dereferencing mm->owner fails
The fix is to notify the subsystem via mm_owner_changed callback(),
if no new owner is found, by specifying the new task as NULL.
Jiri Slaby:
mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(), but
must be set after that, so as not to pass NULL as old owner causing oops.
Daisuke Nishimura:
mm_update_next_owner() may set mm->owner to NULL, but mem_cgroup_from_task()
and its callers need to take account of this situation to avoid oops.
Hugh Dickins:
Lockdep warning and hang below exec_mmap() when testing these patches.
exit_mm() up_reads mmap_sem before calling mm_update_next_owner(),
so exec_mmap() now needs to do the same. And with that repositioning,
there's now no point in mm_need_new_owner() allowing for NULL mm.
Reported-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-09-29 05:09:31 +07:00
|
|
|
/*
|
|
|
|
* mm_update_next_owner() may clear mm->owner to NULL
|
|
|
|
* if it races with swapoff, page migration, etc.
|
|
|
|
* So this can be called with p == NULL.
|
|
|
|
*/
|
|
|
|
if (unlikely(!p))
|
|
|
|
return NULL;
|
|
|
|
|
2014-02-08 22:36:58 +07:00
|
|
|
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
|
2008-02-07 15:13:51 +07:00
|
|
|
}
|
2015-09-09 05:01:02 +07:00
|
|
|
EXPORT_SYMBOL(mem_cgroup_from_task);
|
2008-02-07 15:13:51 +07:00
|
|
|
|
fs: fsnotify: account fsnotify metadata to kmemcg
Patch series "Directed kmem charging", v8.
The Linux kernel's memory cgroup allows limiting the memory usage of the
jobs running on the system to provide isolation between the jobs. All
the kernel memory allocated in the context of the job and marked with
__GFP_ACCOUNT will also be included in the memory usage and be limited
by the job's limit.
The kernel memory can only be charged to the memcg of the process in
whose context kernel memory was allocated. However there are cases
where the allocated kernel memory should be charged to the memcg
different from the current processes's memcg. This patch series
contains two such concrete use-cases i.e. fsnotify and buffer_head.
The fsnotify event objects can consume a lot of system memory for large
or unlimited queues if there is either no or slow listener. The events
are allocated in the context of the event producer. However they should
be charged to the event consumer. Similarly the buffer_head objects can
be allocated in a memcg different from the memcg of the page for which
buffer_head objects are being allocated.
To solve this issue, this patch series introduces mechanism to charge
kernel memory to a given memcg. In case of fsnotify events, the memcg
of the consumer can be used for charging and for buffer_head, the memcg
of the page can be charged. For directed charging, the caller can use
the scope API memalloc_[un]use_memcg() to specify the memcg to charge
for all the __GFP_ACCOUNT allocations within the scope.
This patch (of 2):
A lot of memory can be consumed by the events generated for the huge or
unlimited queues if there is either no or slow listener. This can cause
system level memory pressure or OOMs. So, it's better to account the
fsnotify kmem caches to the memcg of the listener.
However the listener can be in a different memcg than the memcg of the
producer and these allocations happen in the context of the event
producer. This patch introduces remote memcg charging API which the
producer can use to charge the allocations to the memcg of the listener.
There are seven fsnotify kmem caches and among them allocations from
dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and
inotify_inode_mark_cachep happens in the context of syscall from the
listener. So, SLAB_ACCOUNT is enough for these caches.
The objects from fsnotify_mark_connector_cachep are not accounted as
they are small compared to the notification mark or events and it is
unclear whom to account connector to since it is shared by all events
attached to the inode.
The allocations from the event caches happen in the context of the event
producer. For such caches we will need to remote charge the allocations
to the listener's memcg. Thus we save the memcg reference in the
fsnotify_group structure of the listener.
This patch has also moved the members of fsnotify_group to keep the size
same, at least for 64 bit build, even with additional member by filling
the holes.
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it]
Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:46:39 +07:00
|
|
|
/**
|
|
|
|
* get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
|
|
|
|
* @mm: mm from which memcg should be extracted. It can be NULL.
|
|
|
|
*
|
|
|
|
* Obtain a reference on mm->memcg and returns it if successful. Otherwise
|
|
|
|
* root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
|
|
|
|
* returned.
|
|
|
|
*/
|
|
|
|
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
|
2009-01-08 09:08:33 +07:00
|
|
|
{
|
fs: fsnotify: account fsnotify metadata to kmemcg
Patch series "Directed kmem charging", v8.
The Linux kernel's memory cgroup allows limiting the memory usage of the
jobs running on the system to provide isolation between the jobs. All
the kernel memory allocated in the context of the job and marked with
__GFP_ACCOUNT will also be included in the memory usage and be limited
by the job's limit.
The kernel memory can only be charged to the memcg of the process in
whose context kernel memory was allocated. However there are cases
where the allocated kernel memory should be charged to the memcg
different from the current processes's memcg. This patch series
contains two such concrete use-cases i.e. fsnotify and buffer_head.
The fsnotify event objects can consume a lot of system memory for large
or unlimited queues if there is either no or slow listener. The events
are allocated in the context of the event producer. However they should
be charged to the event consumer. Similarly the buffer_head objects can
be allocated in a memcg different from the memcg of the page for which
buffer_head objects are being allocated.
To solve this issue, this patch series introduces mechanism to charge
kernel memory to a given memcg. In case of fsnotify events, the memcg
of the consumer can be used for charging and for buffer_head, the memcg
of the page can be charged. For directed charging, the caller can use
the scope API memalloc_[un]use_memcg() to specify the memcg to charge
for all the __GFP_ACCOUNT allocations within the scope.
This patch (of 2):
A lot of memory can be consumed by the events generated for the huge or
unlimited queues if there is either no or slow listener. This can cause
system level memory pressure or OOMs. So, it's better to account the
fsnotify kmem caches to the memcg of the listener.
However the listener can be in a different memcg than the memcg of the
producer and these allocations happen in the context of the event
producer. This patch introduces remote memcg charging API which the
producer can use to charge the allocations to the memcg of the listener.
There are seven fsnotify kmem caches and among them allocations from
dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and
inotify_inode_mark_cachep happens in the context of syscall from the
listener. So, SLAB_ACCOUNT is enough for these caches.
The objects from fsnotify_mark_connector_cachep are not accounted as
they are small compared to the notification mark or events and it is
unclear whom to account connector to since it is shared by all events
attached to the inode.
The allocations from the event caches happen in the context of the event
producer. For such caches we will need to remote charge the allocations
to the listener's memcg. Thus we save the memcg reference in the
fsnotify_group structure of the listener.
This patch has also moved the members of fsnotify_group to keep the size
same, at least for 64 bit build, even with additional member by filling
the holes.
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it]
Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:46:39 +07:00
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return NULL;
|
memcg: fix OOM killer under memcg
This patch tries to fix OOM Killer problems caused by hierarchy.
Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to
kill a task in memcg.
But, when hierarchy is used, it's broken and correct task cannot
be killed. For example, in following cgroup
/groupA/ hierarchy=1, limit=1G,
01 nolimit
02 nolimit
All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to
groupA's 1Gbytes but OOM Killer just kills tasks in groupA.
This patch provides makes the bad process be selected from all tasks
under hierarchy. BTW, currently, oom_jiffies is updated against groupA
in above case. oom_jiffies of tree should be updated.
To see how oom_jiffies is used, please check mem_cgroup_oom_called()
callers.
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: const fix]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 06:57:38 +07:00
|
|
|
|
2009-01-08 09:08:33 +07:00
|
|
|
rcu_read_lock();
|
|
|
|
do {
|
2014-05-23 01:54:19 +07:00
|
|
|
/*
|
|
|
|
* Page cache insertions can happen withou an
|
|
|
|
* actual mm context, e.g. during disk probing
|
|
|
|
* on boot, loopback IO, acct() writes etc.
|
|
|
|
*/
|
|
|
|
if (unlikely(!mm))
|
2014-04-08 05:37:43 +07:00
|
|
|
memcg = root_mem_cgroup;
|
2014-05-23 01:54:19 +07:00
|
|
|
else {
|
|
|
|
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
|
|
|
|
if (unlikely(!memcg))
|
|
|
|
memcg = root_mem_cgroup;
|
|
|
|
}
|
2019-11-16 08:34:43 +07:00
|
|
|
} while (!css_tryget(&memcg->css));
|
2009-01-08 09:08:33 +07:00
|
|
|
rcu_read_unlock();
|
2011-11-03 03:38:15 +07:00
|
|
|
return memcg;
|
2009-01-08 09:08:33 +07:00
|
|
|
}
|
fs: fsnotify: account fsnotify metadata to kmemcg
Patch series "Directed kmem charging", v8.
The Linux kernel's memory cgroup allows limiting the memory usage of the
jobs running on the system to provide isolation between the jobs. All
the kernel memory allocated in the context of the job and marked with
__GFP_ACCOUNT will also be included in the memory usage and be limited
by the job's limit.
The kernel memory can only be charged to the memcg of the process in
whose context kernel memory was allocated. However there are cases
where the allocated kernel memory should be charged to the memcg
different from the current processes's memcg. This patch series
contains two such concrete use-cases i.e. fsnotify and buffer_head.
The fsnotify event objects can consume a lot of system memory for large
or unlimited queues if there is either no or slow listener. The events
are allocated in the context of the event producer. However they should
be charged to the event consumer. Similarly the buffer_head objects can
be allocated in a memcg different from the memcg of the page for which
buffer_head objects are being allocated.
To solve this issue, this patch series introduces mechanism to charge
kernel memory to a given memcg. In case of fsnotify events, the memcg
of the consumer can be used for charging and for buffer_head, the memcg
of the page can be charged. For directed charging, the caller can use
the scope API memalloc_[un]use_memcg() to specify the memcg to charge
for all the __GFP_ACCOUNT allocations within the scope.
This patch (of 2):
A lot of memory can be consumed by the events generated for the huge or
unlimited queues if there is either no or slow listener. This can cause
system level memory pressure or OOMs. So, it's better to account the
fsnotify kmem caches to the memcg of the listener.
However the listener can be in a different memcg than the memcg of the
producer and these allocations happen in the context of the event
producer. This patch introduces remote memcg charging API which the
producer can use to charge the allocations to the memcg of the listener.
There are seven fsnotify kmem caches and among them allocations from
dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and
inotify_inode_mark_cachep happens in the context of syscall from the
listener. So, SLAB_ACCOUNT is enough for these caches.
The objects from fsnotify_mark_connector_cachep are not accounted as
they are small compared to the notification mark or events and it is
unclear whom to account connector to since it is shared by all events
attached to the inode.
The allocations from the event caches happen in the context of the event
producer. For such caches we will need to remote charge the allocations
to the listener's memcg. Thus we save the memcg reference in the
fsnotify_group structure of the listener.
This patch has also moved the members of fsnotify_group to keep the size
same, at least for 64 bit build, even with additional member by filling
the holes.
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it]
Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:46:39 +07:00
|
|
|
EXPORT_SYMBOL(get_mem_cgroup_from_mm);
|
|
|
|
|
2018-08-18 05:46:44 +07:00
|
|
|
/**
|
|
|
|
* get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
|
|
|
|
* @page: page from which memcg should be extracted.
|
|
|
|
*
|
|
|
|
* Obtain a reference on page->memcg and returns it if successful. Otherwise
|
|
|
|
* root_mem_cgroup is returned.
|
|
|
|
*/
|
|
|
|
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = page->mem_cgroup;
|
|
|
|
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
2020-04-02 11:07:10 +07:00
|
|
|
/* Page should not get uncharged and freed memcg under us. */
|
|
|
|
if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
|
2018-08-18 05:46:44 +07:00
|
|
|
memcg = root_mem_cgroup;
|
|
|
|
rcu_read_unlock();
|
|
|
|
return memcg;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(get_mem_cgroup_from_page);
|
|
|
|
|
mm: kmem: prepare remote memcg charging infra for interrupt contexts
Remote memcg charging API uses current->active_memcg to store the
currently active memory cgroup, which overwrites the memory cgroup of the
current process. It works well for normal contexts, but doesn't work for
interrupt contexts: indeed, if an interrupt occurs during the execution of
a section with an active memcg set, all allocations inside the interrupt
will be charged to the active memcg set (given that we'll enable
accounting for allocations from an interrupt context). But because the
interrupt might have no relation to the active memcg set outside, it's
obviously wrong from the accounting prospective.
To resolve this problem, let's add a global percpu int_active_memcg
variable, which will be used to store an active memory cgroup which will
be used from interrupt contexts. set_active_memcg() will transparently
use current->active_memcg or int_active_memcg depending on the context.
To make the read part simple and transparent for the caller, let's
introduce two new functions:
- struct mem_cgroup *active_memcg(void),
- struct mem_cgroup *get_active_memcg(void).
They are returning the active memcg if it's set, hiding all implementation
details: where to get it depending on the current context.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Link: http://lkml.kernel.org/r/20200827225843.1270629-4-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-18 06:13:50 +07:00
|
|
|
static __always_inline struct mem_cgroup *active_memcg(void)
|
fs: fsnotify: account fsnotify metadata to kmemcg
Patch series "Directed kmem charging", v8.
The Linux kernel's memory cgroup allows limiting the memory usage of the
jobs running on the system to provide isolation between the jobs. All
the kernel memory allocated in the context of the job and marked with
__GFP_ACCOUNT will also be included in the memory usage and be limited
by the job's limit.
The kernel memory can only be charged to the memcg of the process in
whose context kernel memory was allocated. However there are cases
where the allocated kernel memory should be charged to the memcg
different from the current processes's memcg. This patch series
contains two such concrete use-cases i.e. fsnotify and buffer_head.
The fsnotify event objects can consume a lot of system memory for large
or unlimited queues if there is either no or slow listener. The events
are allocated in the context of the event producer. However they should
be charged to the event consumer. Similarly the buffer_head objects can
be allocated in a memcg different from the memcg of the page for which
buffer_head objects are being allocated.
To solve this issue, this patch series introduces mechanism to charge
kernel memory to a given memcg. In case of fsnotify events, the memcg
of the consumer can be used for charging and for buffer_head, the memcg
of the page can be charged. For directed charging, the caller can use
the scope API memalloc_[un]use_memcg() to specify the memcg to charge
for all the __GFP_ACCOUNT allocations within the scope.
This patch (of 2):
A lot of memory can be consumed by the events generated for the huge or
unlimited queues if there is either no or slow listener. This can cause
system level memory pressure or OOMs. So, it's better to account the
fsnotify kmem caches to the memcg of the listener.
However the listener can be in a different memcg than the memcg of the
producer and these allocations happen in the context of the event
producer. This patch introduces remote memcg charging API which the
producer can use to charge the allocations to the memcg of the listener.
There are seven fsnotify kmem caches and among them allocations from
dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and
inotify_inode_mark_cachep happens in the context of syscall from the
listener. So, SLAB_ACCOUNT is enough for these caches.
The objects from fsnotify_mark_connector_cachep are not accounted as
they are small compared to the notification mark or events and it is
unclear whom to account connector to since it is shared by all events
attached to the inode.
The allocations from the event caches happen in the context of the event
producer. For such caches we will need to remote charge the allocations
to the listener's memcg. Thus we save the memcg reference in the
fsnotify_group structure of the listener.
This patch has also moved the members of fsnotify_group to keep the size
same, at least for 64 bit build, even with additional member by filling
the holes.
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it]
Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:46:39 +07:00
|
|
|
{
|
mm: kmem: prepare remote memcg charging infra for interrupt contexts
Remote memcg charging API uses current->active_memcg to store the
currently active memory cgroup, which overwrites the memory cgroup of the
current process. It works well for normal contexts, but doesn't work for
interrupt contexts: indeed, if an interrupt occurs during the execution of
a section with an active memcg set, all allocations inside the interrupt
will be charged to the active memcg set (given that we'll enable
accounting for allocations from an interrupt context). But because the
interrupt might have no relation to the active memcg set outside, it's
obviously wrong from the accounting prospective.
To resolve this problem, let's add a global percpu int_active_memcg
variable, which will be used to store an active memory cgroup which will
be used from interrupt contexts. set_active_memcg() will transparently
use current->active_memcg or int_active_memcg depending on the context.
To make the read part simple and transparent for the caller, let's
introduce two new functions:
- struct mem_cgroup *active_memcg(void),
- struct mem_cgroup *get_active_memcg(void).
They are returning the active memcg if it's set, hiding all implementation
details: where to get it depending on the current context.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Link: http://lkml.kernel.org/r/20200827225843.1270629-4-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-18 06:13:50 +07:00
|
|
|
if (in_interrupt())
|
|
|
|
return this_cpu_read(int_active_memcg);
|
|
|
|
else
|
|
|
|
return current->active_memcg;
|
|
|
|
}
|
2020-10-18 06:13:44 +07:00
|
|
|
|
mm: kmem: prepare remote memcg charging infra for interrupt contexts
Remote memcg charging API uses current->active_memcg to store the
currently active memory cgroup, which overwrites the memory cgroup of the
current process. It works well for normal contexts, but doesn't work for
interrupt contexts: indeed, if an interrupt occurs during the execution of
a section with an active memcg set, all allocations inside the interrupt
will be charged to the active memcg set (given that we'll enable
accounting for allocations from an interrupt context). But because the
interrupt might have no relation to the active memcg set outside, it's
obviously wrong from the accounting prospective.
To resolve this problem, let's add a global percpu int_active_memcg
variable, which will be used to store an active memory cgroup which will
be used from interrupt contexts. set_active_memcg() will transparently
use current->active_memcg or int_active_memcg depending on the context.
To make the read part simple and transparent for the caller, let's
introduce two new functions:
- struct mem_cgroup *active_memcg(void),
- struct mem_cgroup *get_active_memcg(void).
They are returning the active memcg if it's set, hiding all implementation
details: where to get it depending on the current context.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Link: http://lkml.kernel.org/r/20200827225843.1270629-4-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-18 06:13:50 +07:00
|
|
|
static __always_inline struct mem_cgroup *get_active_memcg(void)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
fs: fsnotify: account fsnotify metadata to kmemcg
Patch series "Directed kmem charging", v8.
The Linux kernel's memory cgroup allows limiting the memory usage of the
jobs running on the system to provide isolation between the jobs. All
the kernel memory allocated in the context of the job and marked with
__GFP_ACCOUNT will also be included in the memory usage and be limited
by the job's limit.
The kernel memory can only be charged to the memcg of the process in
whose context kernel memory was allocated. However there are cases
where the allocated kernel memory should be charged to the memcg
different from the current processes's memcg. This patch series
contains two such concrete use-cases i.e. fsnotify and buffer_head.
The fsnotify event objects can consume a lot of system memory for large
or unlimited queues if there is either no or slow listener. The events
are allocated in the context of the event producer. However they should
be charged to the event consumer. Similarly the buffer_head objects can
be allocated in a memcg different from the memcg of the page for which
buffer_head objects are being allocated.
To solve this issue, this patch series introduces mechanism to charge
kernel memory to a given memcg. In case of fsnotify events, the memcg
of the consumer can be used for charging and for buffer_head, the memcg
of the page can be charged. For directed charging, the caller can use
the scope API memalloc_[un]use_memcg() to specify the memcg to charge
for all the __GFP_ACCOUNT allocations within the scope.
This patch (of 2):
A lot of memory can be consumed by the events generated for the huge or
unlimited queues if there is either no or slow listener. This can cause
system level memory pressure or OOMs. So, it's better to account the
fsnotify kmem caches to the memcg of the listener.
However the listener can be in a different memcg than the memcg of the
producer and these allocations happen in the context of the event
producer. This patch introduces remote memcg charging API which the
producer can use to charge the allocations to the memcg of the listener.
There are seven fsnotify kmem caches and among them allocations from
dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and
inotify_inode_mark_cachep happens in the context of syscall from the
listener. So, SLAB_ACCOUNT is enough for these caches.
The objects from fsnotify_mark_connector_cachep are not accounted as
they are small compared to the notification mark or events and it is
unclear whom to account connector to since it is shared by all events
attached to the inode.
The allocations from the event caches happen in the context of the event
producer. For such caches we will need to remote charge the allocations
to the listener's memcg. Thus we save the memcg reference in the
fsnotify_group structure of the listener.
This patch has also moved the members of fsnotify_group to keep the size
same, at least for 64 bit build, even with additional member by filling
the holes.
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it]
Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:46:39 +07:00
|
|
|
|
mm: kmem: prepare remote memcg charging infra for interrupt contexts
Remote memcg charging API uses current->active_memcg to store the
currently active memory cgroup, which overwrites the memory cgroup of the
current process. It works well for normal contexts, but doesn't work for
interrupt contexts: indeed, if an interrupt occurs during the execution of
a section with an active memcg set, all allocations inside the interrupt
will be charged to the active memcg set (given that we'll enable
accounting for allocations from an interrupt context). But because the
interrupt might have no relation to the active memcg set outside, it's
obviously wrong from the accounting prospective.
To resolve this problem, let's add a global percpu int_active_memcg
variable, which will be used to store an active memory cgroup which will
be used from interrupt contexts. set_active_memcg() will transparently
use current->active_memcg or int_active_memcg depending on the context.
To make the read part simple and transparent for the caller, let's
introduce two new functions:
- struct mem_cgroup *active_memcg(void),
- struct mem_cgroup *get_active_memcg(void).
They are returning the active memcg if it's set, hiding all implementation
details: where to get it depending on the current context.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Link: http://lkml.kernel.org/r/20200827225843.1270629-4-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-18 06:13:50 +07:00
|
|
|
rcu_read_lock();
|
|
|
|
memcg = active_memcg();
|
|
|
|
if (memcg) {
|
2020-04-02 11:07:10 +07:00
|
|
|
/* current->active_memcg must hold a ref. */
|
mm: kmem: prepare remote memcg charging infra for interrupt contexts
Remote memcg charging API uses current->active_memcg to store the
currently active memory cgroup, which overwrites the memory cgroup of the
current process. It works well for normal contexts, but doesn't work for
interrupt contexts: indeed, if an interrupt occurs during the execution of
a section with an active memcg set, all allocations inside the interrupt
will be charged to the active memcg set (given that we'll enable
accounting for allocations from an interrupt context). But because the
interrupt might have no relation to the active memcg set outside, it's
obviously wrong from the accounting prospective.
To resolve this problem, let's add a global percpu int_active_memcg
variable, which will be used to store an active memory cgroup which will
be used from interrupt contexts. set_active_memcg() will transparently
use current->active_memcg or int_active_memcg depending on the context.
To make the read part simple and transparent for the caller, let's
introduce two new functions:
- struct mem_cgroup *active_memcg(void),
- struct mem_cgroup *get_active_memcg(void).
They are returning the active memcg if it's set, hiding all implementation
details: where to get it depending on the current context.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Link: http://lkml.kernel.org/r/20200827225843.1270629-4-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-18 06:13:50 +07:00
|
|
|
if (WARN_ON_ONCE(!css_tryget(&memcg->css)))
|
2020-04-02 11:07:10 +07:00
|
|
|
memcg = root_mem_cgroup;
|
|
|
|
else
|
fs: fsnotify: account fsnotify metadata to kmemcg
Patch series "Directed kmem charging", v8.
The Linux kernel's memory cgroup allows limiting the memory usage of the
jobs running on the system to provide isolation between the jobs. All
the kernel memory allocated in the context of the job and marked with
__GFP_ACCOUNT will also be included in the memory usage and be limited
by the job's limit.
The kernel memory can only be charged to the memcg of the process in
whose context kernel memory was allocated. However there are cases
where the allocated kernel memory should be charged to the memcg
different from the current processes's memcg. This patch series
contains two such concrete use-cases i.e. fsnotify and buffer_head.
The fsnotify event objects can consume a lot of system memory for large
or unlimited queues if there is either no or slow listener. The events
are allocated in the context of the event producer. However they should
be charged to the event consumer. Similarly the buffer_head objects can
be allocated in a memcg different from the memcg of the page for which
buffer_head objects are being allocated.
To solve this issue, this patch series introduces mechanism to charge
kernel memory to a given memcg. In case of fsnotify events, the memcg
of the consumer can be used for charging and for buffer_head, the memcg
of the page can be charged. For directed charging, the caller can use
the scope API memalloc_[un]use_memcg() to specify the memcg to charge
for all the __GFP_ACCOUNT allocations within the scope.
This patch (of 2):
A lot of memory can be consumed by the events generated for the huge or
unlimited queues if there is either no or slow listener. This can cause
system level memory pressure or OOMs. So, it's better to account the
fsnotify kmem caches to the memcg of the listener.
However the listener can be in a different memcg than the memcg of the
producer and these allocations happen in the context of the event
producer. This patch introduces remote memcg charging API which the
producer can use to charge the allocations to the memcg of the listener.
There are seven fsnotify kmem caches and among them allocations from
dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and
inotify_inode_mark_cachep happens in the context of syscall from the
listener. So, SLAB_ACCOUNT is enough for these caches.
The objects from fsnotify_mark_connector_cachep are not accounted as
they are small compared to the notification mark or events and it is
unclear whom to account connector to since it is shared by all events
attached to the inode.
The allocations from the event caches happen in the context of the event
producer. For such caches we will need to remote charge the allocations
to the listener's memcg. Thus we save the memcg reference in the
fsnotify_group structure of the listener.
This patch has also moved the members of fsnotify_group to keep the size
same, at least for 64 bit build, even with additional member by filling
the holes.
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it]
Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:46:39 +07:00
|
|
|
memcg = current->active_memcg;
|
|
|
|
}
|
mm: kmem: prepare remote memcg charging infra for interrupt contexts
Remote memcg charging API uses current->active_memcg to store the
currently active memory cgroup, which overwrites the memory cgroup of the
current process. It works well for normal contexts, but doesn't work for
interrupt contexts: indeed, if an interrupt occurs during the execution of
a section with an active memcg set, all allocations inside the interrupt
will be charged to the active memcg set (given that we'll enable
accounting for allocations from an interrupt context). But because the
interrupt might have no relation to the active memcg set outside, it's
obviously wrong from the accounting prospective.
To resolve this problem, let's add a global percpu int_active_memcg
variable, which will be used to store an active memory cgroup which will
be used from interrupt contexts. set_active_memcg() will transparently
use current->active_memcg or int_active_memcg depending on the context.
To make the read part simple and transparent for the caller, let's
introduce two new functions:
- struct mem_cgroup *active_memcg(void),
- struct mem_cgroup *get_active_memcg(void).
They are returning the active memcg if it's set, hiding all implementation
details: where to get it depending on the current context.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Link: http://lkml.kernel.org/r/20200827225843.1270629-4-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-18 06:13:50 +07:00
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return memcg;
|
|
|
|
}
|
|
|
|
|
2020-10-18 06:13:53 +07:00
|
|
|
static __always_inline bool memcg_kmem_bypass(void)
|
|
|
|
{
|
|
|
|
/* Allow remote memcg charging from any context. */
|
|
|
|
if (unlikely(active_memcg()))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* Memcg to charge can't be determined. */
|
|
|
|
if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
mm: kmem: prepare remote memcg charging infra for interrupt contexts
Remote memcg charging API uses current->active_memcg to store the
currently active memory cgroup, which overwrites the memory cgroup of the
current process. It works well for normal contexts, but doesn't work for
interrupt contexts: indeed, if an interrupt occurs during the execution of
a section with an active memcg set, all allocations inside the interrupt
will be charged to the active memcg set (given that we'll enable
accounting for allocations from an interrupt context). But because the
interrupt might have no relation to the active memcg set outside, it's
obviously wrong from the accounting prospective.
To resolve this problem, let's add a global percpu int_active_memcg
variable, which will be used to store an active memory cgroup which will
be used from interrupt contexts. set_active_memcg() will transparently
use current->active_memcg or int_active_memcg depending on the context.
To make the read part simple and transparent for the caller, let's
introduce two new functions:
- struct mem_cgroup *active_memcg(void),
- struct mem_cgroup *get_active_memcg(void).
They are returning the active memcg if it's set, hiding all implementation
details: where to get it depending on the current context.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Link: http://lkml.kernel.org/r/20200827225843.1270629-4-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-18 06:13:50 +07:00
|
|
|
/**
|
|
|
|
* If active memcg is set, do not fallback to current->mm->memcg.
|
|
|
|
*/
|
|
|
|
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
|
|
|
|
{
|
|
|
|
if (memcg_kmem_bypass())
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
if (unlikely(active_memcg()))
|
|
|
|
return get_active_memcg();
|
|
|
|
|
fs: fsnotify: account fsnotify metadata to kmemcg
Patch series "Directed kmem charging", v8.
The Linux kernel's memory cgroup allows limiting the memory usage of the
jobs running on the system to provide isolation between the jobs. All
the kernel memory allocated in the context of the job and marked with
__GFP_ACCOUNT will also be included in the memory usage and be limited
by the job's limit.
The kernel memory can only be charged to the memcg of the process in
whose context kernel memory was allocated. However there are cases
where the allocated kernel memory should be charged to the memcg
different from the current processes's memcg. This patch series
contains two such concrete use-cases i.e. fsnotify and buffer_head.
The fsnotify event objects can consume a lot of system memory for large
or unlimited queues if there is either no or slow listener. The events
are allocated in the context of the event producer. However they should
be charged to the event consumer. Similarly the buffer_head objects can
be allocated in a memcg different from the memcg of the page for which
buffer_head objects are being allocated.
To solve this issue, this patch series introduces mechanism to charge
kernel memory to a given memcg. In case of fsnotify events, the memcg
of the consumer can be used for charging and for buffer_head, the memcg
of the page can be charged. For directed charging, the caller can use
the scope API memalloc_[un]use_memcg() to specify the memcg to charge
for all the __GFP_ACCOUNT allocations within the scope.
This patch (of 2):
A lot of memory can be consumed by the events generated for the huge or
unlimited queues if there is either no or slow listener. This can cause
system level memory pressure or OOMs. So, it's better to account the
fsnotify kmem caches to the memcg of the listener.
However the listener can be in a different memcg than the memcg of the
producer and these allocations happen in the context of the event
producer. This patch introduces remote memcg charging API which the
producer can use to charge the allocations to the memcg of the listener.
There are seven fsnotify kmem caches and among them allocations from
dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and
inotify_inode_mark_cachep happens in the context of syscall from the
listener. So, SLAB_ACCOUNT is enough for these caches.
The objects from fsnotify_mark_connector_cachep are not accounted as
they are small compared to the notification mark or events and it is
unclear whom to account connector to since it is shared by all events
attached to the inode.
The allocations from the event caches happen in the context of the event
producer. For such caches we will need to remote charge the allocations
to the listener's memcg. Thus we save the memcg reference in the
fsnotify_group structure of the listener.
This patch has also moved the members of fsnotify_group to keep the size
same, at least for 64 bit build, even with additional member by filling
the holes.
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it]
Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:46:39 +07:00
|
|
|
return get_mem_cgroup_from_mm(current->mm);
|
|
|
|
}
|
2009-01-08 09:08:33 +07:00
|
|
|
|
2012-01-13 08:17:59 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_iter - iterate over memory cgroup hierarchy
|
|
|
|
* @root: hierarchy root
|
|
|
|
* @prev: previously returned memcg, NULL on first invocation
|
|
|
|
* @reclaim: cookie for shared reclaim walks, NULL for full walks
|
|
|
|
*
|
|
|
|
* Returns references to children of the hierarchy below @root, or
|
|
|
|
* @root itself, or %NULL after a full round-trip.
|
|
|
|
*
|
|
|
|
* Caller must pass the return value in @prev on subsequent
|
|
|
|
* invocations for reference counting, or use mem_cgroup_iter_break()
|
|
|
|
* to cancel a hierarchy walk before the round-trip is complete.
|
|
|
|
*
|
2020-10-14 06:52:45 +07:00
|
|
|
* Reclaimers can specify a node in @reclaim to divide up the memcgs
|
|
|
|
* in the hierarchy among all concurrent reclaimers operating on the
|
|
|
|
* same node.
|
2012-01-13 08:17:59 +07:00
|
|
|
*/
|
2013-09-25 05:27:37 +07:00
|
|
|
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
|
2012-01-13 08:17:59 +07:00
|
|
|
struct mem_cgroup *prev,
|
2013-09-25 05:27:37 +07:00
|
|
|
struct mem_cgroup_reclaim_cookie *reclaim)
|
2009-04-03 06:57:35 +07:00
|
|
|
{
|
treewide: Remove uninitialized_var() usage
Using uninitialized_var() is dangerous as it papers over real bugs[1]
(or can in the future), and suppresses unrelated compiler warnings
(e.g. "unused variable"). If the compiler thinks it is uninitialized,
either simply initialize the variable or make compiler changes.
In preparation for removing[2] the[3] macro[4], remove all remaining
needless uses with the following script:
git grep '\buninitialized_var\b' | cut -d: -f1 | sort -u | \
xargs perl -pi -e \
's/\buninitialized_var\(([^\)]+)\)/\1/g;
s:\s*/\* (GCC be quiet|to make compiler happy) \*/$::g;'
drivers/video/fbdev/riva/riva_hw.c was manually tweaked to avoid
pathological white-space.
No outstanding warnings were found building allmodconfig with GCC 9.3.0
for x86_64, i386, arm64, arm, powerpc, powerpc64le, s390x, mips, sparc64,
alpha, and m68k.
[1] https://lore.kernel.org/lkml/20200603174714.192027-1-glider@google.com/
[2] https://lore.kernel.org/lkml/CA+55aFw+Vbj0i=1TGqCR5vQkCzWJ0QxK6CernOU6eedsudAixw@mail.gmail.com/
[3] https://lore.kernel.org/lkml/CA+55aFwgbgqhbp1fkxvRKEpzyR5J8n1vKT1VZdz9knmPuXhOeg@mail.gmail.com/
[4] https://lore.kernel.org/lkml/CA+55aFz2500WfbKXAx8s67wrm9=yVJu65TpLgN_ybYNv0VEOKA@mail.gmail.com/
Reviewed-by: Leon Romanovsky <leonro@mellanox.com> # drivers/infiniband and mlx4/mlx5
Acked-by: Jason Gunthorpe <jgg@mellanox.com> # IB
Acked-by: Kalle Valo <kvalo@codeaurora.org> # wireless drivers
Reviewed-by: Chao Yu <yuchao0@huawei.com> # erofs
Signed-off-by: Kees Cook <keescook@chromium.org>
2020-06-04 03:09:38 +07:00
|
|
|
struct mem_cgroup_reclaim_iter *iter;
|
2014-12-11 06:42:39 +07:00
|
|
|
struct cgroup_subsys_state *css = NULL;
|
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:
Memory control groups are currently bolted onto the side of
traditional memory management in places where better integration would
be preferrable. To reclaim memory, for example, memory control groups
maintain their own LRU list and reclaim strategy aside from the global
per-zone LRU list reclaim. But an extra list head for each existing
page frame is expensive and maintaining it requires additional code.
This patchset disables the global per-zone LRU lists on memory cgroup
configurations and converts all its users to operate on the per-memory
cgroup lists instead. As LRU pages are then exclusively on one list,
this saves two list pointers for each page frame in the system:
page_cgroup array size with 4G physical memory
vanilla: allocated 31457280 bytes of page_cgroup
patched: allocated 15728640 bytes of page_cgroup
At the same time, system performance for various workloads is
unaffected:
100G sparse file cat, 4G physical memory, 10 runs, to test for code
bloat in the traditional LRU handling and kswapd & direct reclaim
paths, without/with the memory controller configured in
vanilla: 71.603(0.207) seconds
patched: 71.640(0.156) seconds
vanilla: 79.558(0.288) seconds
patched: 77.233(0.147) seconds
100G sparse file cat in 1G memory cgroup, 10 runs, to test for code
bloat in the traditional memory cgroup LRU handling and reclaim path
vanilla: 96.844(0.281) seconds
patched: 94.454(0.311) seconds
4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M
swap on SSD, 10 runs, to test for regressions in kswapd & direct
reclaim using per-memcg LRU lists with multiple memcgs and multiple
allocators within each memcg
vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]
16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M
swap on SSD, 10 runs, to test for regressions in hierarchical memcg
setups
vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]
This patch:
There are currently two different implementations of iterating over a
memory cgroup hierarchy tree.
Consolidate them into one worker function and base the convenience
looping-macros on top of it.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:48 +07:00
|
|
|
struct mem_cgroup *memcg = NULL;
|
2014-12-11 06:42:39 +07:00
|
|
|
struct mem_cgroup *pos = NULL;
|
2010-10-28 05:33:42 +07:00
|
|
|
|
2013-09-25 05:27:37 +07:00
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return NULL;
|
2012-01-13 08:17:59 +07:00
|
|
|
|
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:
Memory control groups are currently bolted onto the side of
traditional memory management in places where better integration would
be preferrable. To reclaim memory, for example, memory control groups
maintain their own LRU list and reclaim strategy aside from the global
per-zone LRU list reclaim. But an extra list head for each existing
page frame is expensive and maintaining it requires additional code.
This patchset disables the global per-zone LRU lists on memory cgroup
configurations and converts all its users to operate on the per-memory
cgroup lists instead. As LRU pages are then exclusively on one list,
this saves two list pointers for each page frame in the system:
page_cgroup array size with 4G physical memory
vanilla: allocated 31457280 bytes of page_cgroup
patched: allocated 15728640 bytes of page_cgroup
At the same time, system performance for various workloads is
unaffected:
100G sparse file cat, 4G physical memory, 10 runs, to test for code
bloat in the traditional LRU handling and kswapd & direct reclaim
paths, without/with the memory controller configured in
vanilla: 71.603(0.207) seconds
patched: 71.640(0.156) seconds
vanilla: 79.558(0.288) seconds
patched: 77.233(0.147) seconds
100G sparse file cat in 1G memory cgroup, 10 runs, to test for code
bloat in the traditional memory cgroup LRU handling and reclaim path
vanilla: 96.844(0.281) seconds
patched: 94.454(0.311) seconds
4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M
swap on SSD, 10 runs, to test for regressions in kswapd & direct
reclaim using per-memcg LRU lists with multiple memcgs and multiple
allocators within each memcg
vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]
16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M
swap on SSD, 10 runs, to test for regressions in hierarchical memcg
setups
vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]
This patch:
There are currently two different implementations of iterating over a
memory cgroup hierarchy tree.
Consolidate them into one worker function and base the convenience
looping-macros on top of it.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:48 +07:00
|
|
|
if (!root)
|
|
|
|
root = root_mem_cgroup;
|
2010-10-28 05:33:41 +07:00
|
|
|
|
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:
Memory control groups are currently bolted onto the side of
traditional memory management in places where better integration would
be preferrable. To reclaim memory, for example, memory control groups
maintain their own LRU list and reclaim strategy aside from the global
per-zone LRU list reclaim. But an extra list head for each existing
page frame is expensive and maintaining it requires additional code.
This patchset disables the global per-zone LRU lists on memory cgroup
configurations and converts all its users to operate on the per-memory
cgroup lists instead. As LRU pages are then exclusively on one list,
this saves two list pointers for each page frame in the system:
page_cgroup array size with 4G physical memory
vanilla: allocated 31457280 bytes of page_cgroup
patched: allocated 15728640 bytes of page_cgroup
At the same time, system performance for various workloads is
unaffected:
100G sparse file cat, 4G physical memory, 10 runs, to test for code
bloat in the traditional LRU handling and kswapd & direct reclaim
paths, without/with the memory controller configured in
vanilla: 71.603(0.207) seconds
patched: 71.640(0.156) seconds
vanilla: 79.558(0.288) seconds
patched: 77.233(0.147) seconds
100G sparse file cat in 1G memory cgroup, 10 runs, to test for code
bloat in the traditional memory cgroup LRU handling and reclaim path
vanilla: 96.844(0.281) seconds
patched: 94.454(0.311) seconds
4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M
swap on SSD, 10 runs, to test for regressions in kswapd & direct
reclaim using per-memcg LRU lists with multiple memcgs and multiple
allocators within each memcg
vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]
16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M
swap on SSD, 10 runs, to test for regressions in hierarchical memcg
setups
vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]
This patch:
There are currently two different implementations of iterating over a
memory cgroup hierarchy tree.
Consolidate them into one worker function and base the convenience
looping-macros on top of it.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:48 +07:00
|
|
|
if (prev && !reclaim)
|
2014-12-11 06:42:39 +07:00
|
|
|
pos = prev;
|
2009-04-03 06:57:35 +07:00
|
|
|
|
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:
Memory control groups are currently bolted onto the side of
traditional memory management in places where better integration would
be preferrable. To reclaim memory, for example, memory control groups
maintain their own LRU list and reclaim strategy aside from the global
per-zone LRU list reclaim. But an extra list head for each existing
page frame is expensive and maintaining it requires additional code.
This patchset disables the global per-zone LRU lists on memory cgroup
configurations and converts all its users to operate on the per-memory
cgroup lists instead. As LRU pages are then exclusively on one list,
this saves two list pointers for each page frame in the system:
page_cgroup array size with 4G physical memory
vanilla: allocated 31457280 bytes of page_cgroup
patched: allocated 15728640 bytes of page_cgroup
At the same time, system performance for various workloads is
unaffected:
100G sparse file cat, 4G physical memory, 10 runs, to test for code
bloat in the traditional LRU handling and kswapd & direct reclaim
paths, without/with the memory controller configured in
vanilla: 71.603(0.207) seconds
patched: 71.640(0.156) seconds
vanilla: 79.558(0.288) seconds
patched: 77.233(0.147) seconds
100G sparse file cat in 1G memory cgroup, 10 runs, to test for code
bloat in the traditional memory cgroup LRU handling and reclaim path
vanilla: 96.844(0.281) seconds
patched: 94.454(0.311) seconds
4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M
swap on SSD, 10 runs, to test for regressions in kswapd & direct
reclaim using per-memcg LRU lists with multiple memcgs and multiple
allocators within each memcg
vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]
16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M
swap on SSD, 10 runs, to test for regressions in hierarchical memcg
setups
vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]
This patch:
There are currently two different implementations of iterating over a
memory cgroup hierarchy tree.
Consolidate them into one worker function and base the convenience
looping-macros on top of it.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:48 +07:00
|
|
|
if (!root->use_hierarchy && root != root_mem_cgroup) {
|
|
|
|
if (prev)
|
2014-12-11 06:42:39 +07:00
|
|
|
goto out;
|
2013-09-25 05:27:37 +07:00
|
|
|
return root;
|
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:
Memory control groups are currently bolted onto the side of
traditional memory management in places where better integration would
be preferrable. To reclaim memory, for example, memory control groups
maintain their own LRU list and reclaim strategy aside from the global
per-zone LRU list reclaim. But an extra list head for each existing
page frame is expensive and maintaining it requires additional code.
This patchset disables the global per-zone LRU lists on memory cgroup
configurations and converts all its users to operate on the per-memory
cgroup lists instead. As LRU pages are then exclusively on one list,
this saves two list pointers for each page frame in the system:
page_cgroup array size with 4G physical memory
vanilla: allocated 31457280 bytes of page_cgroup
patched: allocated 15728640 bytes of page_cgroup
At the same time, system performance for various workloads is
unaffected:
100G sparse file cat, 4G physical memory, 10 runs, to test for code
bloat in the traditional LRU handling and kswapd & direct reclaim
paths, without/with the memory controller configured in
vanilla: 71.603(0.207) seconds
patched: 71.640(0.156) seconds
vanilla: 79.558(0.288) seconds
patched: 77.233(0.147) seconds
100G sparse file cat in 1G memory cgroup, 10 runs, to test for code
bloat in the traditional memory cgroup LRU handling and reclaim path
vanilla: 96.844(0.281) seconds
patched: 94.454(0.311) seconds
4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M
swap on SSD, 10 runs, to test for regressions in kswapd & direct
reclaim using per-memcg LRU lists with multiple memcgs and multiple
allocators within each memcg
vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]
16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M
swap on SSD, 10 runs, to test for regressions in hierarchical memcg
setups
vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]
This patch:
There are currently two different implementations of iterating over a
memory cgroup hierarchy tree.
Consolidate them into one worker function and base the convenience
looping-macros on top of it.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:48 +07:00
|
|
|
}
|
2009-04-03 06:57:35 +07:00
|
|
|
|
2013-04-30 05:07:15 +07:00
|
|
|
rcu_read_lock();
|
2013-04-30 05:07:17 +07:00
|
|
|
|
2014-12-11 06:42:39 +07:00
|
|
|
if (reclaim) {
|
2016-07-29 05:46:05 +07:00
|
|
|
struct mem_cgroup_per_node *mz;
|
2014-12-11 06:42:39 +07:00
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
|
2019-12-01 08:50:03 +07:00
|
|
|
iter = &mz->iter;
|
2014-12-11 06:42:39 +07:00
|
|
|
|
|
|
|
if (prev && reclaim->generation != iter->generation)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2015-12-30 05:54:10 +07:00
|
|
|
while (1) {
|
2015-04-16 06:14:08 +07:00
|
|
|
pos = READ_ONCE(iter->position);
|
2015-12-30 05:54:10 +07:00
|
|
|
if (!pos || css_tryget(&pos->css))
|
|
|
|
break;
|
2014-12-11 06:42:39 +07:00
|
|
|
/*
|
2015-12-30 05:54:10 +07:00
|
|
|
* css reference reached zero, so iter->position will
|
|
|
|
* be cleared by ->css_released. However, we should not
|
|
|
|
* rely on this happening soon, because ->css_released
|
|
|
|
* is called from a work queue, and by busy-waiting we
|
|
|
|
* might block it. So we clear iter->position right
|
|
|
|
* away.
|
2014-12-11 06:42:39 +07:00
|
|
|
*/
|
2015-12-30 05:54:10 +07:00
|
|
|
(void)cmpxchg(&iter->position, pos, NULL);
|
|
|
|
}
|
2014-12-11 06:42:39 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
if (pos)
|
|
|
|
css = &pos->css;
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
css = css_next_descendant_pre(css, &root->css);
|
|
|
|
if (!css) {
|
|
|
|
/*
|
|
|
|
* Reclaimers share the hierarchy walk, and a
|
|
|
|
* new one might jump in right at the end of
|
|
|
|
* the hierarchy - make sure they see at least
|
|
|
|
* one group and restart from the beginning.
|
|
|
|
*/
|
|
|
|
if (!prev)
|
|
|
|
continue;
|
|
|
|
break;
|
mm: memcg: per-priority per-zone hierarchy scan generations
Memory cgroup limit reclaim currently picks one memory cgroup out of the
target hierarchy, remembers it as the last scanned child, and reclaims
all zones in it with decreasing priority levels.
The new hierarchy reclaim code will pick memory cgroups from the same
hierarchy concurrently from different zones and priority levels, it
becomes necessary that hierarchy roots not only remember the last
scanned child, but do so for each zone and priority level.
Until now, we reclaimed memcgs like this:
mem = mem_cgroup_iter(root)
for each priority level:
for each zone in zonelist:
reclaim(mem, zone)
But subsequent patches will move the memcg iteration inside the loop
over the zones:
for each priority level:
for each zone in zonelist:
mem = mem_cgroup_iter(root)
reclaim(mem, zone)
And to keep with the original scan order - memcg -> priority -> zone -
the last scanned memcg has to be remembered per zone and per priority
level.
Furthermore, global reclaim will be switched to the hierarchy walk as
well. Different from limit reclaim, which can just recheck the limit
after some reclaim progress, its target is to scan all memcgs for the
desired zone pages, proportional to the memcg size, and so reliably
detecting a full hierarchy round-trip will become crucial.
Currently, the code relies on one reclaimer encountering the same memcg
twice, but that is error-prone with concurrent reclaimers. Instead, use
a generation counter that is increased every time the child with the
highest ID has been visited, so that reclaimers can stop when the
generation changes.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:55 +07:00
|
|
|
}
|
2010-10-28 05:33:41 +07:00
|
|
|
|
2014-12-11 06:42:39 +07:00
|
|
|
/*
|
|
|
|
* Verify the css and acquire a reference. The root
|
|
|
|
* is provided by the caller, so we know it's alive
|
|
|
|
* and kicking, and don't take an extra reference.
|
|
|
|
*/
|
|
|
|
memcg = mem_cgroup_from_css(css);
|
2009-04-03 06:57:35 +07:00
|
|
|
|
2014-12-11 06:42:39 +07:00
|
|
|
if (css == &root->css)
|
|
|
|
break;
|
2009-04-03 06:57:35 +07:00
|
|
|
|
2016-01-21 06:02:53 +07:00
|
|
|
if (css_tryget(css))
|
|
|
|
break;
|
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:
Memory control groups are currently bolted onto the side of
traditional memory management in places where better integration would
be preferrable. To reclaim memory, for example, memory control groups
maintain their own LRU list and reclaim strategy aside from the global
per-zone LRU list reclaim. But an extra list head for each existing
page frame is expensive and maintaining it requires additional code.
This patchset disables the global per-zone LRU lists on memory cgroup
configurations and converts all its users to operate on the per-memory
cgroup lists instead. As LRU pages are then exclusively on one list,
this saves two list pointers for each page frame in the system:
page_cgroup array size with 4G physical memory
vanilla: allocated 31457280 bytes of page_cgroup
patched: allocated 15728640 bytes of page_cgroup
At the same time, system performance for various workloads is
unaffected:
100G sparse file cat, 4G physical memory, 10 runs, to test for code
bloat in the traditional LRU handling and kswapd & direct reclaim
paths, without/with the memory controller configured in
vanilla: 71.603(0.207) seconds
patched: 71.640(0.156) seconds
vanilla: 79.558(0.288) seconds
patched: 77.233(0.147) seconds
100G sparse file cat in 1G memory cgroup, 10 runs, to test for code
bloat in the traditional memory cgroup LRU handling and reclaim path
vanilla: 96.844(0.281) seconds
patched: 94.454(0.311) seconds
4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M
swap on SSD, 10 runs, to test for regressions in kswapd & direct
reclaim using per-memcg LRU lists with multiple memcgs and multiple
allocators within each memcg
vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]
16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M
swap on SSD, 10 runs, to test for regressions in hierarchical memcg
setups
vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]
This patch:
There are currently two different implementations of iterating over a
memory cgroup hierarchy tree.
Consolidate them into one worker function and base the convenience
looping-macros on top of it.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:48 +07:00
|
|
|
|
2014-12-11 06:42:39 +07:00
|
|
|
memcg = NULL;
|
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:
Memory control groups are currently bolted onto the side of
traditional memory management in places where better integration would
be preferrable. To reclaim memory, for example, memory control groups
maintain their own LRU list and reclaim strategy aside from the global
per-zone LRU list reclaim. But an extra list head for each existing
page frame is expensive and maintaining it requires additional code.
This patchset disables the global per-zone LRU lists on memory cgroup
configurations and converts all its users to operate on the per-memory
cgroup lists instead. As LRU pages are then exclusively on one list,
this saves two list pointers for each page frame in the system:
page_cgroup array size with 4G physical memory
vanilla: allocated 31457280 bytes of page_cgroup
patched: allocated 15728640 bytes of page_cgroup
At the same time, system performance for various workloads is
unaffected:
100G sparse file cat, 4G physical memory, 10 runs, to test for code
bloat in the traditional LRU handling and kswapd & direct reclaim
paths, without/with the memory controller configured in
vanilla: 71.603(0.207) seconds
patched: 71.640(0.156) seconds
vanilla: 79.558(0.288) seconds
patched: 77.233(0.147) seconds
100G sparse file cat in 1G memory cgroup, 10 runs, to test for code
bloat in the traditional memory cgroup LRU handling and reclaim path
vanilla: 96.844(0.281) seconds
patched: 94.454(0.311) seconds
4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M
swap on SSD, 10 runs, to test for regressions in kswapd & direct
reclaim using per-memcg LRU lists with multiple memcgs and multiple
allocators within each memcg
vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]
16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M
swap on SSD, 10 runs, to test for regressions in hierarchical memcg
setups
vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]
This patch:
There are currently two different implementations of iterating over a
memory cgroup hierarchy tree.
Consolidate them into one worker function and base the convenience
looping-macros on top of it.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:48 +07:00
|
|
|
}
|
2014-12-11 06:42:39 +07:00
|
|
|
|
|
|
|
if (reclaim) {
|
|
|
|
/*
|
2015-12-30 05:54:10 +07:00
|
|
|
* The position could have already been updated by a competing
|
|
|
|
* thread, so check that the value hasn't changed since we read
|
|
|
|
* it to avoid reclaiming from the same cgroup twice.
|
2014-12-11 06:42:39 +07:00
|
|
|
*/
|
2015-12-30 05:54:10 +07:00
|
|
|
(void)cmpxchg(&iter->position, pos, memcg);
|
|
|
|
|
2014-12-11 06:42:39 +07:00
|
|
|
if (pos)
|
|
|
|
css_put(&pos->css);
|
|
|
|
|
|
|
|
if (!memcg)
|
|
|
|
iter->generation++;
|
|
|
|
else if (!prev)
|
|
|
|
reclaim->generation = iter->generation;
|
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:
Memory control groups are currently bolted onto the side of
traditional memory management in places where better integration would
be preferrable. To reclaim memory, for example, memory control groups
maintain their own LRU list and reclaim strategy aside from the global
per-zone LRU list reclaim. But an extra list head for each existing
page frame is expensive and maintaining it requires additional code.
This patchset disables the global per-zone LRU lists on memory cgroup
configurations and converts all its users to operate on the per-memory
cgroup lists instead. As LRU pages are then exclusively on one list,
this saves two list pointers for each page frame in the system:
page_cgroup array size with 4G physical memory
vanilla: allocated 31457280 bytes of page_cgroup
patched: allocated 15728640 bytes of page_cgroup
At the same time, system performance for various workloads is
unaffected:
100G sparse file cat, 4G physical memory, 10 runs, to test for code
bloat in the traditional LRU handling and kswapd & direct reclaim
paths, without/with the memory controller configured in
vanilla: 71.603(0.207) seconds
patched: 71.640(0.156) seconds
vanilla: 79.558(0.288) seconds
patched: 77.233(0.147) seconds
100G sparse file cat in 1G memory cgroup, 10 runs, to test for code
bloat in the traditional memory cgroup LRU handling and reclaim path
vanilla: 96.844(0.281) seconds
patched: 94.454(0.311) seconds
4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M
swap on SSD, 10 runs, to test for regressions in kswapd & direct
reclaim using per-memcg LRU lists with multiple memcgs and multiple
allocators within each memcg
vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]
16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M
swap on SSD, 10 runs, to test for regressions in hierarchical memcg
setups
vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]
This patch:
There are currently two different implementations of iterating over a
memory cgroup hierarchy tree.
Consolidate them into one worker function and base the convenience
looping-macros on top of it.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:48 +07:00
|
|
|
}
|
2014-12-11 06:42:39 +07:00
|
|
|
|
2013-04-30 05:07:15 +07:00
|
|
|
out_unlock:
|
|
|
|
rcu_read_unlock();
|
2014-12-11 06:42:39 +07:00
|
|
|
out:
|
2013-04-30 05:07:14 +07:00
|
|
|
if (prev && prev != root)
|
|
|
|
css_put(&prev->css);
|
|
|
|
|
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:
Memory control groups are currently bolted onto the side of
traditional memory management in places where better integration would
be preferrable. To reclaim memory, for example, memory control groups
maintain their own LRU list and reclaim strategy aside from the global
per-zone LRU list reclaim. But an extra list head for each existing
page frame is expensive and maintaining it requires additional code.
This patchset disables the global per-zone LRU lists on memory cgroup
configurations and converts all its users to operate on the per-memory
cgroup lists instead. As LRU pages are then exclusively on one list,
this saves two list pointers for each page frame in the system:
page_cgroup array size with 4G physical memory
vanilla: allocated 31457280 bytes of page_cgroup
patched: allocated 15728640 bytes of page_cgroup
At the same time, system performance for various workloads is
unaffected:
100G sparse file cat, 4G physical memory, 10 runs, to test for code
bloat in the traditional LRU handling and kswapd & direct reclaim
paths, without/with the memory controller configured in
vanilla: 71.603(0.207) seconds
patched: 71.640(0.156) seconds
vanilla: 79.558(0.288) seconds
patched: 77.233(0.147) seconds
100G sparse file cat in 1G memory cgroup, 10 runs, to test for code
bloat in the traditional memory cgroup LRU handling and reclaim path
vanilla: 96.844(0.281) seconds
patched: 94.454(0.311) seconds
4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M
swap on SSD, 10 runs, to test for regressions in kswapd & direct
reclaim using per-memcg LRU lists with multiple memcgs and multiple
allocators within each memcg
vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]
16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M
swap on SSD, 10 runs, to test for regressions in hierarchical memcg
setups
vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]
This patch:
There are currently two different implementations of iterating over a
memory cgroup hierarchy tree.
Consolidate them into one worker function and base the convenience
looping-macros on top of it.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:48 +07:00
|
|
|
return memcg;
|
2009-04-03 06:57:35 +07:00
|
|
|
}
|
2010-10-28 05:33:41 +07:00
|
|
|
|
2012-01-13 08:17:59 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_iter_break - abort a hierarchy walk prematurely
|
|
|
|
* @root: hierarchy root
|
|
|
|
* @prev: last visited hierarchy member as returned by mem_cgroup_iter()
|
|
|
|
*/
|
|
|
|
void mem_cgroup_iter_break(struct mem_cgroup *root,
|
|
|
|
struct mem_cgroup *prev)
|
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:
Memory control groups are currently bolted onto the side of
traditional memory management in places where better integration would
be preferrable. To reclaim memory, for example, memory control groups
maintain their own LRU list and reclaim strategy aside from the global
per-zone LRU list reclaim. But an extra list head for each existing
page frame is expensive and maintaining it requires additional code.
This patchset disables the global per-zone LRU lists on memory cgroup
configurations and converts all its users to operate on the per-memory
cgroup lists instead. As LRU pages are then exclusively on one list,
this saves two list pointers for each page frame in the system:
page_cgroup array size with 4G physical memory
vanilla: allocated 31457280 bytes of page_cgroup
patched: allocated 15728640 bytes of page_cgroup
At the same time, system performance for various workloads is
unaffected:
100G sparse file cat, 4G physical memory, 10 runs, to test for code
bloat in the traditional LRU handling and kswapd & direct reclaim
paths, without/with the memory controller configured in
vanilla: 71.603(0.207) seconds
patched: 71.640(0.156) seconds
vanilla: 79.558(0.288) seconds
patched: 77.233(0.147) seconds
100G sparse file cat in 1G memory cgroup, 10 runs, to test for code
bloat in the traditional memory cgroup LRU handling and reclaim path
vanilla: 96.844(0.281) seconds
patched: 94.454(0.311) seconds
4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M
swap on SSD, 10 runs, to test for regressions in kswapd & direct
reclaim using per-memcg LRU lists with multiple memcgs and multiple
allocators within each memcg
vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]
16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M
swap on SSD, 10 runs, to test for regressions in hierarchical memcg
setups
vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]
This patch:
There are currently two different implementations of iterating over a
memory cgroup hierarchy tree.
Consolidate them into one worker function and base the convenience
looping-macros on top of it.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:48 +07:00
|
|
|
{
|
|
|
|
if (!root)
|
|
|
|
root = root_mem_cgroup;
|
|
|
|
if (prev && prev != root)
|
|
|
|
css_put(&prev->css);
|
|
|
|
}
|
2010-10-28 05:33:41 +07:00
|
|
|
|
mm/memcontrol.c: fix use after free in mem_cgroup_iter()
This patch is sent to report an use after free in mem_cgroup_iter()
after merging commit be2657752e9e ("mm: memcg: fix use after free in
mem_cgroup_iter()").
I work with android kernel tree (4.9 & 4.14), and commit be2657752e9e
("mm: memcg: fix use after free in mem_cgroup_iter()") has been merged
to the trees. However, I can still observe use after free issues
addressed in the commit be2657752e9e. (on low-end devices, a few times
this month)
backtrace:
css_tryget <- crash here
mem_cgroup_iter
shrink_node
shrink_zones
do_try_to_free_pages
try_to_free_pages
__perform_reclaim
__alloc_pages_direct_reclaim
__alloc_pages_slowpath
__alloc_pages_nodemask
To debug, I poisoned mem_cgroup before freeing it:
static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->stat);
+ /* poison memcg before freeing it */
+ memset(memcg, 0x78, sizeof(struct mem_cgroup));
kfree(memcg);
}
The coredump shows the position=0xdbbc2a00 is freed.
(gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
$13 = {position = 0xdbbc2a00, generation = 0x2efd}
0xdbbc2a00: 0xdbbc2e00 0x00000000 0xdbbc2800 0x00000100
0xdbbc2a10: 0x00000200 0x78787878 0x00026218 0x00000000
0xdbbc2a20: 0xdcad6000 0x00000001 0x78787800 0x00000000
0xdbbc2a30: 0x78780000 0x00000000 0x0068fb84 0x78787878
0xdbbc2a40: 0x78787878 0x78787878 0x78787878 0xe3fa5cc0
0xdbbc2a50: 0x78787878 0x78787878 0x00000000 0x00000000
0xdbbc2a60: 0x00000000 0x00000000 0x00000000 0x00000000
0xdbbc2a70: 0x00000000 0x00000000 0x00000000 0x00000000
0xdbbc2a80: 0x00000000 0x00000000 0x00000000 0x00000000
0xdbbc2a90: 0x00000001 0x00000000 0x00000000 0x00100000
0xdbbc2aa0: 0x00000001 0xdbbc2ac8 0x00000000 0x00000000
0xdbbc2ab0: 0x00000000 0x00000000 0x00000000 0x00000000
0xdbbc2ac0: 0x00000000 0x00000000 0xe5b02618 0x00001000
0xdbbc2ad0: 0x00000000 0x78787878 0x78787878 0x78787878
0xdbbc2ae0: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2af0: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b00: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b10: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b20: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b30: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b40: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b50: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b60: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b70: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b80: 0x78787878 0x78787878 0x00000000 0x78787878
0xdbbc2b90: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2ba0: 0x78787878 0x78787878 0x78787878 0x78787878
In the reclaim path, try_to_free_pages() does not setup
sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
shrink_node().
In mem_cgroup_iter(), root is set to root_mem_cgroup because
sc->target_mem_cgroup is NULL. It is possible to assign a memcg to
root_mem_cgroup.nodeinfo.iter in mem_cgroup_iter().
try_to_free_pages
struct scan_control sc = {...}, target_mem_cgroup is 0x0;
do_try_to_free_pages
shrink_zones
shrink_node
mem_cgroup *root = sc->target_mem_cgroup;
memcg = mem_cgroup_iter(root, NULL, &reclaim);
mem_cgroup_iter()
if (!root)
root = root_mem_cgroup;
...
css = css_next_descendant_pre(css, &root->css);
memcg = mem_cgroup_from_css(css);
cmpxchg(&iter->position, pos, memcg);
My device uses memcg non-hierarchical mode. When we release a memcg:
invalidate_reclaim_iterators() reaches only dead_memcg and its parents.
If non-hierarchical mode is used, invalidate_reclaim_iterators() never
reaches root_mem_cgroup.
static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
struct mem_cgroup *memcg = dead_memcg;
for (; memcg; memcg = parent_mem_cgroup(memcg)
...
}
So the use after free scenario looks like:
CPU1 CPU2
try_to_free_pages
do_try_to_free_pages
shrink_zones
shrink_node
mem_cgroup_iter()
if (!root)
root = root_mem_cgroup;
...
css = css_next_descendant_pre(css, &root->css);
memcg = mem_cgroup_from_css(css);
cmpxchg(&iter->position, pos, memcg);
invalidate_reclaim_iterators(memcg);
...
__mem_cgroup_free()
kfree(memcg);
try_to_free_pages
do_try_to_free_pages
shrink_zones
shrink_node
mem_cgroup_iter()
if (!root)
root = root_mem_cgroup;
...
mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
iter = &mz->iter[reclaim->priority];
pos = READ_ONCE(iter->position);
css_tryget(&pos->css) <- use after free
To avoid this, we should also invalidate root_mem_cgroup.nodeinfo.iter
in invalidate_reclaim_iterators().
[cai@lca.pw: fix -Wparentheses compilation warning]
Link: http://lkml.kernel.org/r/1564580753-17531-1-git-send-email-cai@lca.pw
Link: http://lkml.kernel.org/r/20190730015729.4406-1-miles.chen@mediatek.com
Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
Signed-off-by: Miles Chen <miles.chen@mediatek.com>
Signed-off-by: Qian Cai <cai@lca.pw>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-08-14 05:37:28 +07:00
|
|
|
static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
|
|
|
|
struct mem_cgroup *dead_memcg)
|
2015-12-30 05:54:10 +07:00
|
|
|
{
|
|
|
|
struct mem_cgroup_reclaim_iter *iter;
|
2016-07-29 05:46:05 +07:00
|
|
|
struct mem_cgroup_per_node *mz;
|
|
|
|
int nid;
|
2015-12-30 05:54:10 +07:00
|
|
|
|
mm/memcontrol.c: fix use after free in mem_cgroup_iter()
This patch is sent to report an use after free in mem_cgroup_iter()
after merging commit be2657752e9e ("mm: memcg: fix use after free in
mem_cgroup_iter()").
I work with android kernel tree (4.9 & 4.14), and commit be2657752e9e
("mm: memcg: fix use after free in mem_cgroup_iter()") has been merged
to the trees. However, I can still observe use after free issues
addressed in the commit be2657752e9e. (on low-end devices, a few times
this month)
backtrace:
css_tryget <- crash here
mem_cgroup_iter
shrink_node
shrink_zones
do_try_to_free_pages
try_to_free_pages
__perform_reclaim
__alloc_pages_direct_reclaim
__alloc_pages_slowpath
__alloc_pages_nodemask
To debug, I poisoned mem_cgroup before freeing it:
static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->stat);
+ /* poison memcg before freeing it */
+ memset(memcg, 0x78, sizeof(struct mem_cgroup));
kfree(memcg);
}
The coredump shows the position=0xdbbc2a00 is freed.
(gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
$13 = {position = 0xdbbc2a00, generation = 0x2efd}
0xdbbc2a00: 0xdbbc2e00 0x00000000 0xdbbc2800 0x00000100
0xdbbc2a10: 0x00000200 0x78787878 0x00026218 0x00000000
0xdbbc2a20: 0xdcad6000 0x00000001 0x78787800 0x00000000
0xdbbc2a30: 0x78780000 0x00000000 0x0068fb84 0x78787878
0xdbbc2a40: 0x78787878 0x78787878 0x78787878 0xe3fa5cc0
0xdbbc2a50: 0x78787878 0x78787878 0x00000000 0x00000000
0xdbbc2a60: 0x00000000 0x00000000 0x00000000 0x00000000
0xdbbc2a70: 0x00000000 0x00000000 0x00000000 0x00000000
0xdbbc2a80: 0x00000000 0x00000000 0x00000000 0x00000000
0xdbbc2a90: 0x00000001 0x00000000 0x00000000 0x00100000
0xdbbc2aa0: 0x00000001 0xdbbc2ac8 0x00000000 0x00000000
0xdbbc2ab0: 0x00000000 0x00000000 0x00000000 0x00000000
0xdbbc2ac0: 0x00000000 0x00000000 0xe5b02618 0x00001000
0xdbbc2ad0: 0x00000000 0x78787878 0x78787878 0x78787878
0xdbbc2ae0: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2af0: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b00: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b10: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b20: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b30: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b40: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b50: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b60: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b70: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b80: 0x78787878 0x78787878 0x00000000 0x78787878
0xdbbc2b90: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2ba0: 0x78787878 0x78787878 0x78787878 0x78787878
In the reclaim path, try_to_free_pages() does not setup
sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
shrink_node().
In mem_cgroup_iter(), root is set to root_mem_cgroup because
sc->target_mem_cgroup is NULL. It is possible to assign a memcg to
root_mem_cgroup.nodeinfo.iter in mem_cgroup_iter().
try_to_free_pages
struct scan_control sc = {...}, target_mem_cgroup is 0x0;
do_try_to_free_pages
shrink_zones
shrink_node
mem_cgroup *root = sc->target_mem_cgroup;
memcg = mem_cgroup_iter(root, NULL, &reclaim);
mem_cgroup_iter()
if (!root)
root = root_mem_cgroup;
...
css = css_next_descendant_pre(css, &root->css);
memcg = mem_cgroup_from_css(css);
cmpxchg(&iter->position, pos, memcg);
My device uses memcg non-hierarchical mode. When we release a memcg:
invalidate_reclaim_iterators() reaches only dead_memcg and its parents.
If non-hierarchical mode is used, invalidate_reclaim_iterators() never
reaches root_mem_cgroup.
static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
struct mem_cgroup *memcg = dead_memcg;
for (; memcg; memcg = parent_mem_cgroup(memcg)
...
}
So the use after free scenario looks like:
CPU1 CPU2
try_to_free_pages
do_try_to_free_pages
shrink_zones
shrink_node
mem_cgroup_iter()
if (!root)
root = root_mem_cgroup;
...
css = css_next_descendant_pre(css, &root->css);
memcg = mem_cgroup_from_css(css);
cmpxchg(&iter->position, pos, memcg);
invalidate_reclaim_iterators(memcg);
...
__mem_cgroup_free()
kfree(memcg);
try_to_free_pages
do_try_to_free_pages
shrink_zones
shrink_node
mem_cgroup_iter()
if (!root)
root = root_mem_cgroup;
...
mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
iter = &mz->iter[reclaim->priority];
pos = READ_ONCE(iter->position);
css_tryget(&pos->css) <- use after free
To avoid this, we should also invalidate root_mem_cgroup.nodeinfo.iter
in invalidate_reclaim_iterators().
[cai@lca.pw: fix -Wparentheses compilation warning]
Link: http://lkml.kernel.org/r/1564580753-17531-1-git-send-email-cai@lca.pw
Link: http://lkml.kernel.org/r/20190730015729.4406-1-miles.chen@mediatek.com
Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
Signed-off-by: Miles Chen <miles.chen@mediatek.com>
Signed-off-by: Qian Cai <cai@lca.pw>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-08-14 05:37:28 +07:00
|
|
|
for_each_node(nid) {
|
|
|
|
mz = mem_cgroup_nodeinfo(from, nid);
|
2019-12-01 08:50:03 +07:00
|
|
|
iter = &mz->iter;
|
|
|
|
cmpxchg(&iter->position, dead_memcg, NULL);
|
2015-12-30 05:54:10 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
mm/memcontrol.c: fix use after free in mem_cgroup_iter()
This patch is sent to report an use after free in mem_cgroup_iter()
after merging commit be2657752e9e ("mm: memcg: fix use after free in
mem_cgroup_iter()").
I work with android kernel tree (4.9 & 4.14), and commit be2657752e9e
("mm: memcg: fix use after free in mem_cgroup_iter()") has been merged
to the trees. However, I can still observe use after free issues
addressed in the commit be2657752e9e. (on low-end devices, a few times
this month)
backtrace:
css_tryget <- crash here
mem_cgroup_iter
shrink_node
shrink_zones
do_try_to_free_pages
try_to_free_pages
__perform_reclaim
__alloc_pages_direct_reclaim
__alloc_pages_slowpath
__alloc_pages_nodemask
To debug, I poisoned mem_cgroup before freeing it:
static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->stat);
+ /* poison memcg before freeing it */
+ memset(memcg, 0x78, sizeof(struct mem_cgroup));
kfree(memcg);
}
The coredump shows the position=0xdbbc2a00 is freed.
(gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
$13 = {position = 0xdbbc2a00, generation = 0x2efd}
0xdbbc2a00: 0xdbbc2e00 0x00000000 0xdbbc2800 0x00000100
0xdbbc2a10: 0x00000200 0x78787878 0x00026218 0x00000000
0xdbbc2a20: 0xdcad6000 0x00000001 0x78787800 0x00000000
0xdbbc2a30: 0x78780000 0x00000000 0x0068fb84 0x78787878
0xdbbc2a40: 0x78787878 0x78787878 0x78787878 0xe3fa5cc0
0xdbbc2a50: 0x78787878 0x78787878 0x00000000 0x00000000
0xdbbc2a60: 0x00000000 0x00000000 0x00000000 0x00000000
0xdbbc2a70: 0x00000000 0x00000000 0x00000000 0x00000000
0xdbbc2a80: 0x00000000 0x00000000 0x00000000 0x00000000
0xdbbc2a90: 0x00000001 0x00000000 0x00000000 0x00100000
0xdbbc2aa0: 0x00000001 0xdbbc2ac8 0x00000000 0x00000000
0xdbbc2ab0: 0x00000000 0x00000000 0x00000000 0x00000000
0xdbbc2ac0: 0x00000000 0x00000000 0xe5b02618 0x00001000
0xdbbc2ad0: 0x00000000 0x78787878 0x78787878 0x78787878
0xdbbc2ae0: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2af0: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b00: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b10: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b20: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b30: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b40: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b50: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b60: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b70: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2b80: 0x78787878 0x78787878 0x00000000 0x78787878
0xdbbc2b90: 0x78787878 0x78787878 0x78787878 0x78787878
0xdbbc2ba0: 0x78787878 0x78787878 0x78787878 0x78787878
In the reclaim path, try_to_free_pages() does not setup
sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
shrink_node().
In mem_cgroup_iter(), root is set to root_mem_cgroup because
sc->target_mem_cgroup is NULL. It is possible to assign a memcg to
root_mem_cgroup.nodeinfo.iter in mem_cgroup_iter().
try_to_free_pages
struct scan_control sc = {...}, target_mem_cgroup is 0x0;
do_try_to_free_pages
shrink_zones
shrink_node
mem_cgroup *root = sc->target_mem_cgroup;
memcg = mem_cgroup_iter(root, NULL, &reclaim);
mem_cgroup_iter()
if (!root)
root = root_mem_cgroup;
...
css = css_next_descendant_pre(css, &root->css);
memcg = mem_cgroup_from_css(css);
cmpxchg(&iter->position, pos, memcg);
My device uses memcg non-hierarchical mode. When we release a memcg:
invalidate_reclaim_iterators() reaches only dead_memcg and its parents.
If non-hierarchical mode is used, invalidate_reclaim_iterators() never
reaches root_mem_cgroup.
static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
struct mem_cgroup *memcg = dead_memcg;
for (; memcg; memcg = parent_mem_cgroup(memcg)
...
}
So the use after free scenario looks like:
CPU1 CPU2
try_to_free_pages
do_try_to_free_pages
shrink_zones
shrink_node
mem_cgroup_iter()
if (!root)
root = root_mem_cgroup;
...
css = css_next_descendant_pre(css, &root->css);
memcg = mem_cgroup_from_css(css);
cmpxchg(&iter->position, pos, memcg);
invalidate_reclaim_iterators(memcg);
...
__mem_cgroup_free()
kfree(memcg);
try_to_free_pages
do_try_to_free_pages
shrink_zones
shrink_node
mem_cgroup_iter()
if (!root)
root = root_mem_cgroup;
...
mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
iter = &mz->iter[reclaim->priority];
pos = READ_ONCE(iter->position);
css_tryget(&pos->css) <- use after free
To avoid this, we should also invalidate root_mem_cgroup.nodeinfo.iter
in invalidate_reclaim_iterators().
[cai@lca.pw: fix -Wparentheses compilation warning]
Link: http://lkml.kernel.org/r/1564580753-17531-1-git-send-email-cai@lca.pw
Link: http://lkml.kernel.org/r/20190730015729.4406-1-miles.chen@mediatek.com
Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
Signed-off-by: Miles Chen <miles.chen@mediatek.com>
Signed-off-by: Qian Cai <cai@lca.pw>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-08-14 05:37:28 +07:00
|
|
|
static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = dead_memcg;
|
|
|
|
struct mem_cgroup *last;
|
|
|
|
|
|
|
|
do {
|
|
|
|
__invalidate_reclaim_iterators(memcg, dead_memcg);
|
|
|
|
last = memcg;
|
|
|
|
} while ((memcg = parent_mem_cgroup(memcg)));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When cgruop1 non-hierarchy mode is used,
|
|
|
|
* parent_mem_cgroup() does not walk all the way up to the
|
|
|
|
* cgroup root (root_mem_cgroup). So we have to handle
|
|
|
|
* dead_memcg from cgroup root separately.
|
|
|
|
*/
|
|
|
|
if (last != root_mem_cgroup)
|
|
|
|
__invalidate_reclaim_iterators(root_mem_cgroup,
|
|
|
|
dead_memcg);
|
|
|
|
}
|
|
|
|
|
2016-10-08 06:57:23 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
|
|
|
|
* @memcg: hierarchy root
|
|
|
|
* @fn: function to call for each task
|
|
|
|
* @arg: argument passed to @fn
|
|
|
|
*
|
|
|
|
* This function iterates over tasks attached to @memcg or to any of its
|
|
|
|
* descendants and calls @fn for each task. If @fn returns a non-zero
|
|
|
|
* value, the function breaks the iteration loop and returns the value.
|
|
|
|
* Otherwise, it will iterate over all tasks and return 0.
|
|
|
|
*
|
|
|
|
* This function must not be called for the root memory cgroup.
|
|
|
|
*/
|
|
|
|
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
|
|
|
|
int (*fn)(struct task_struct *, void *), void *arg)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *iter;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
BUG_ON(memcg == root_mem_cgroup);
|
|
|
|
|
|
|
|
for_each_mem_cgroup_tree(iter, memcg) {
|
|
|
|
struct css_task_iter it;
|
|
|
|
struct task_struct *task;
|
|
|
|
|
2019-07-12 11:00:20 +07:00
|
|
|
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
|
2016-10-08 06:57:23 +07:00
|
|
|
while (!ret && (task = css_task_iter_next(&it)))
|
|
|
|
ret = fn(task, arg);
|
|
|
|
css_task_iter_end(&it);
|
|
|
|
if (ret) {
|
|
|
|
mem_cgroup_iter_break(memcg, iter);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-01-13 08:18:15 +07:00
|
|
|
/**
|
2014-12-11 06:43:43 +07:00
|
|
|
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
|
2012-01-13 08:18:15 +07:00
|
|
|
* @page: the page
|
2018-02-07 06:42:16 +07:00
|
|
|
* @pgdat: pgdat of the page
|
2014-12-11 06:43:43 +07:00
|
|
|
*
|
2020-06-04 06:02:27 +07:00
|
|
|
* This function relies on page->mem_cgroup being stable - see the
|
|
|
|
* access rules in commit_charge().
|
2012-01-13 08:18:15 +07:00
|
|
|
*/
|
2016-07-29 05:45:31 +07:00
|
|
|
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 09:08:01 +07:00
|
|
|
{
|
2016-07-29 05:46:05 +07:00
|
|
|
struct mem_cgroup_per_node *mz;
|
2012-01-13 08:18:15 +07:00
|
|
|
struct mem_cgroup *memcg;
|
memcg: fix hotplugged memory zone oops
When MEMCG is configured on (even when it's disabled by boot option),
when adding or removing a page to/from its lru list, the zone pointer
used for stats updates is nowadays taken from the struct lruvec. (On
many configurations, calculating zone from page is slower.)
But we have no code to update all the lruvecs (per zone, per memcg) when
a memory node is hotadded. Here's an extract from the oops which
results when running numactl to bind a program to a newly onlined node:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
IP: __mod_zone_page_state+0x9/0x60
Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
Call Trace:
__pagevec_lru_add_fn+0xdf/0x140
pagevec_lru_move_fn+0xb1/0x100
__pagevec_lru_add+0x1c/0x30
lru_add_drain_cpu+0xa3/0x130
lru_add_drain+0x2f/0x40
...
The natural solution might be to use a memcg callback whenever memory is
hotadded; but that solution has not been scoped out, and it happens that
we do have an easy location at which to update lruvec->zone. The lruvec
pointer is discovered either by mem_cgroup_zone_lruvec() or by
mem_cgroup_page_lruvec(), and both of those do know the right zone.
So check and set lruvec->zone in those; and remove the inadequate
attempt to set lruvec->zone from lruvec_init(), which is called before
NODE_DATA(node) has been allocated in such cases.
Ah, there was one exceptionr. For no particularly good reason,
mem_cgroup_force_empty_list() has its own code for deciding lruvec.
Change it to use the standard mem_cgroup_zone_lruvec() and
mem_cgroup_get_lru_size() too. In fact it was already safe against such
an oops (the lru lists in danger could only be empty), but we're better
proofed against future changes this way.
I've marked this for stable (3.6) since we introduced the problem in 3.5
(now closed to stable); but I have no idea if this is the only fix
needed to get memory hotadd working with memcg in 3.6, and received no
answer when I enquired twice before.
Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-11-17 05:14:54 +07:00
|
|
|
struct lruvec *lruvec;
|
2008-02-07 15:14:31 +07:00
|
|
|
|
memcg: fix hotplugged memory zone oops
When MEMCG is configured on (even when it's disabled by boot option),
when adding or removing a page to/from its lru list, the zone pointer
used for stats updates is nowadays taken from the struct lruvec. (On
many configurations, calculating zone from page is slower.)
But we have no code to update all the lruvecs (per zone, per memcg) when
a memory node is hotadded. Here's an extract from the oops which
results when running numactl to bind a program to a newly onlined node:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
IP: __mod_zone_page_state+0x9/0x60
Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
Call Trace:
__pagevec_lru_add_fn+0xdf/0x140
pagevec_lru_move_fn+0xb1/0x100
__pagevec_lru_add+0x1c/0x30
lru_add_drain_cpu+0xa3/0x130
lru_add_drain+0x2f/0x40
...
The natural solution might be to use a memcg callback whenever memory is
hotadded; but that solution has not been scoped out, and it happens that
we do have an easy location at which to update lruvec->zone. The lruvec
pointer is discovered either by mem_cgroup_zone_lruvec() or by
mem_cgroup_page_lruvec(), and both of those do know the right zone.
So check and set lruvec->zone in those; and remove the inadequate
attempt to set lruvec->zone from lruvec_init(), which is called before
NODE_DATA(node) has been allocated in such cases.
Ah, there was one exceptionr. For no particularly good reason,
mem_cgroup_force_empty_list() has its own code for deciding lruvec.
Change it to use the standard mem_cgroup_zone_lruvec() and
mem_cgroup_get_lru_size() too. In fact it was already safe against such
an oops (the lru lists in danger could only be empty), but we're better
proofed against future changes this way.
I've marked this for stable (3.6) since we introduced the problem in 3.5
(now closed to stable); but I have no idea if this is the only fix
needed to get memory hotadd working with memcg in 3.6, and received no
answer when I enquired twice before.
Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-11-17 05:14:54 +07:00
|
|
|
if (mem_cgroup_disabled()) {
|
2019-12-01 08:55:34 +07:00
|
|
|
lruvec = &pgdat->__lruvec;
|
memcg: fix hotplugged memory zone oops
When MEMCG is configured on (even when it's disabled by boot option),
when adding or removing a page to/from its lru list, the zone pointer
used for stats updates is nowadays taken from the struct lruvec. (On
many configurations, calculating zone from page is slower.)
But we have no code to update all the lruvecs (per zone, per memcg) when
a memory node is hotadded. Here's an extract from the oops which
results when running numactl to bind a program to a newly onlined node:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
IP: __mod_zone_page_state+0x9/0x60
Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
Call Trace:
__pagevec_lru_add_fn+0xdf/0x140
pagevec_lru_move_fn+0xb1/0x100
__pagevec_lru_add+0x1c/0x30
lru_add_drain_cpu+0xa3/0x130
lru_add_drain+0x2f/0x40
...
The natural solution might be to use a memcg callback whenever memory is
hotadded; but that solution has not been scoped out, and it happens that
we do have an easy location at which to update lruvec->zone. The lruvec
pointer is discovered either by mem_cgroup_zone_lruvec() or by
mem_cgroup_page_lruvec(), and both of those do know the right zone.
So check and set lruvec->zone in those; and remove the inadequate
attempt to set lruvec->zone from lruvec_init(), which is called before
NODE_DATA(node) has been allocated in such cases.
Ah, there was one exceptionr. For no particularly good reason,
mem_cgroup_force_empty_list() has its own code for deciding lruvec.
Change it to use the standard mem_cgroup_zone_lruvec() and
mem_cgroup_get_lru_size() too. In fact it was already safe against such
an oops (the lru lists in danger could only be empty), but we're better
proofed against future changes this way.
I've marked this for stable (3.6) since we introduced the problem in 3.5
(now closed to stable); but I have no idea if this is the only fix
needed to get memory hotadd working with memcg in 3.6, and received no
answer when I enquired twice before.
Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-11-17 05:14:54 +07:00
|
|
|
goto out;
|
|
|
|
}
|
2012-01-13 08:18:15 +07:00
|
|
|
|
2014-12-11 06:44:52 +07:00
|
|
|
memcg = page->mem_cgroup;
|
memcg: fix GPF when cgroup removal races with last exit
When moving tasks from old memcg (with move_charge_at_immigrate on new
memcg), followed by removal of old memcg, hit General Protection Fault in
mem_cgroup_lru_del_list() (called from release_pages called from
free_pages_and_swap_cache from tlb_flush_mmu from tlb_finish_mmu from
exit_mmap from mmput from exit_mm from do_exit).
Somewhat reproducible, takes a few hours: the old struct mem_cgroup has
been freed and poisoned by SLAB_DEBUG, but mem_cgroup_lru_del_list() is
still trying to update its stats, and take page off lru before freeing.
A task, or a charge, or a page on lru: each secures a memcg against
removal. In this case, the last task has been moved out of the old memcg,
and it is exiting: anonymous pages are uncharged one by one from the
memcg, as they are zapped from its pagetables, so the charge gets down to
0; but the pages themselves are queued in an mmu_gather for freeing.
Most of those pages will be on lru (and force_empty is careful to
lru_add_drain_all, to add pages from pagevec to lru first), but not
necessarily all: perhaps some have been isolated for page reclaim, perhaps
some isolated for other reasons. So, force_empty may find no task, no
charge and no page on lru, and let the removal proceed.
There would still be no problem if these pages were immediately freed; but
typically (and the put_page_testzero protocol demands it) they have to be
added back to lru before they are found freeable, then removed from lru
and freed. We don't see the issue when adding, because the
mem_cgroup_iter() loops keep their own reference to the memcg being
scanned; but when it comes to mem_cgroup_lru_del_list().
I believe this was not an issue in v3.2: there, PageCgroupAcctLRU and
PageCgroupUsed flags were used (like a trick with mirrors) to deflect view
of pc->mem_cgroup to the stable root_mem_cgroup when neither set.
38c5d72f3ebe ("memcg: simplify LRU handling by new rule") mercifully
removed those convolutions, but left this General Protection Fault.
But it's surprisingly easy to restore the old behaviour: just check
PageCgroupUsed in mem_cgroup_lru_add_list() (which decides on which lruvec
to add), and reset pc to root_mem_cgroup if page is uncharged. A risky
change? just going back to how it worked before; testing, and an audit of
uses of pc->mem_cgroup, show no problem.
And there's a nice bonus: with mem_cgroup_lru_add_list() itself making
sure that an uncharged page goes to root lru, mem_cgroup_reset_owner() no
longer has any purpose, and we can safely revert 4e5f01c2b9b9 ("memcg:
clear pc->mem_cgroup if necessary").
Calling update_page_reclaim_stat() after add_page_to_lru_list() in swap.c
is not strictly necessary: the lru_lock there, with RCU before memcg
structures are freed, makes mem_cgroup_get_reclaim_stat_from_page safe
without that; but it seems cleaner to rely on one dependency less.
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-06 05:59:18 +07:00
|
|
|
/*
|
2014-12-11 06:43:43 +07:00
|
|
|
* Swapcache readahead pages are added to the LRU - and
|
2014-12-11 06:44:02 +07:00
|
|
|
* possibly migrated - before they are charged.
|
memcg: fix GPF when cgroup removal races with last exit
When moving tasks from old memcg (with move_charge_at_immigrate on new
memcg), followed by removal of old memcg, hit General Protection Fault in
mem_cgroup_lru_del_list() (called from release_pages called from
free_pages_and_swap_cache from tlb_flush_mmu from tlb_finish_mmu from
exit_mmap from mmput from exit_mm from do_exit).
Somewhat reproducible, takes a few hours: the old struct mem_cgroup has
been freed and poisoned by SLAB_DEBUG, but mem_cgroup_lru_del_list() is
still trying to update its stats, and take page off lru before freeing.
A task, or a charge, or a page on lru: each secures a memcg against
removal. In this case, the last task has been moved out of the old memcg,
and it is exiting: anonymous pages are uncharged one by one from the
memcg, as they are zapped from its pagetables, so the charge gets down to
0; but the pages themselves are queued in an mmu_gather for freeing.
Most of those pages will be on lru (and force_empty is careful to
lru_add_drain_all, to add pages from pagevec to lru first), but not
necessarily all: perhaps some have been isolated for page reclaim, perhaps
some isolated for other reasons. So, force_empty may find no task, no
charge and no page on lru, and let the removal proceed.
There would still be no problem if these pages were immediately freed; but
typically (and the put_page_testzero protocol demands it) they have to be
added back to lru before they are found freeable, then removed from lru
and freed. We don't see the issue when adding, because the
mem_cgroup_iter() loops keep their own reference to the memcg being
scanned; but when it comes to mem_cgroup_lru_del_list().
I believe this was not an issue in v3.2: there, PageCgroupAcctLRU and
PageCgroupUsed flags were used (like a trick with mirrors) to deflect view
of pc->mem_cgroup to the stable root_mem_cgroup when neither set.
38c5d72f3ebe ("memcg: simplify LRU handling by new rule") mercifully
removed those convolutions, but left this General Protection Fault.
But it's surprisingly easy to restore the old behaviour: just check
PageCgroupUsed in mem_cgroup_lru_add_list() (which decides on which lruvec
to add), and reset pc to root_mem_cgroup if page is uncharged. A risky
change? just going back to how it worked before; testing, and an audit of
uses of pc->mem_cgroup, show no problem.
And there's a nice bonus: with mem_cgroup_lru_add_list() itself making
sure that an uncharged page goes to root lru, mem_cgroup_reset_owner() no
longer has any purpose, and we can safely revert 4e5f01c2b9b9 ("memcg:
clear pc->mem_cgroup if necessary").
Calling update_page_reclaim_stat() after add_page_to_lru_list() in swap.c
is not strictly necessary: the lru_lock there, with RCU before memcg
structures are freed, makes mem_cgroup_get_reclaim_stat_from_page safe
without that; but it seems cleaner to rely on one dependency less.
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-06 05:59:18 +07:00
|
|
|
*/
|
2014-12-11 06:44:02 +07:00
|
|
|
if (!memcg)
|
|
|
|
memcg = root_mem_cgroup;
|
memcg: fix GPF when cgroup removal races with last exit
When moving tasks from old memcg (with move_charge_at_immigrate on new
memcg), followed by removal of old memcg, hit General Protection Fault in
mem_cgroup_lru_del_list() (called from release_pages called from
free_pages_and_swap_cache from tlb_flush_mmu from tlb_finish_mmu from
exit_mmap from mmput from exit_mm from do_exit).
Somewhat reproducible, takes a few hours: the old struct mem_cgroup has
been freed and poisoned by SLAB_DEBUG, but mem_cgroup_lru_del_list() is
still trying to update its stats, and take page off lru before freeing.
A task, or a charge, or a page on lru: each secures a memcg against
removal. In this case, the last task has been moved out of the old memcg,
and it is exiting: anonymous pages are uncharged one by one from the
memcg, as they are zapped from its pagetables, so the charge gets down to
0; but the pages themselves are queued in an mmu_gather for freeing.
Most of those pages will be on lru (and force_empty is careful to
lru_add_drain_all, to add pages from pagevec to lru first), but not
necessarily all: perhaps some have been isolated for page reclaim, perhaps
some isolated for other reasons. So, force_empty may find no task, no
charge and no page on lru, and let the removal proceed.
There would still be no problem if these pages were immediately freed; but
typically (and the put_page_testzero protocol demands it) they have to be
added back to lru before they are found freeable, then removed from lru
and freed. We don't see the issue when adding, because the
mem_cgroup_iter() loops keep their own reference to the memcg being
scanned; but when it comes to mem_cgroup_lru_del_list().
I believe this was not an issue in v3.2: there, PageCgroupAcctLRU and
PageCgroupUsed flags were used (like a trick with mirrors) to deflect view
of pc->mem_cgroup to the stable root_mem_cgroup when neither set.
38c5d72f3ebe ("memcg: simplify LRU handling by new rule") mercifully
removed those convolutions, but left this General Protection Fault.
But it's surprisingly easy to restore the old behaviour: just check
PageCgroupUsed in mem_cgroup_lru_add_list() (which decides on which lruvec
to add), and reset pc to root_mem_cgroup if page is uncharged. A risky
change? just going back to how it worked before; testing, and an audit of
uses of pc->mem_cgroup, show no problem.
And there's a nice bonus: with mem_cgroup_lru_add_list() itself making
sure that an uncharged page goes to root lru, mem_cgroup_reset_owner() no
longer has any purpose, and we can safely revert 4e5f01c2b9b9 ("memcg:
clear pc->mem_cgroup if necessary").
Calling update_page_reclaim_stat() after add_page_to_lru_list() in swap.c
is not strictly necessary: the lru_lock there, with RCU before memcg
structures are freed, makes mem_cgroup_get_reclaim_stat_from_page safe
without that; but it seems cleaner to rely on one dependency less.
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-06 05:59:18 +07:00
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
mz = mem_cgroup_page_nodeinfo(memcg, page);
|
memcg: fix hotplugged memory zone oops
When MEMCG is configured on (even when it's disabled by boot option),
when adding or removing a page to/from its lru list, the zone pointer
used for stats updates is nowadays taken from the struct lruvec. (On
many configurations, calculating zone from page is slower.)
But we have no code to update all the lruvecs (per zone, per memcg) when
a memory node is hotadded. Here's an extract from the oops which
results when running numactl to bind a program to a newly onlined node:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
IP: __mod_zone_page_state+0x9/0x60
Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
Call Trace:
__pagevec_lru_add_fn+0xdf/0x140
pagevec_lru_move_fn+0xb1/0x100
__pagevec_lru_add+0x1c/0x30
lru_add_drain_cpu+0xa3/0x130
lru_add_drain+0x2f/0x40
...
The natural solution might be to use a memcg callback whenever memory is
hotadded; but that solution has not been scoped out, and it happens that
we do have an easy location at which to update lruvec->zone. The lruvec
pointer is discovered either by mem_cgroup_zone_lruvec() or by
mem_cgroup_page_lruvec(), and both of those do know the right zone.
So check and set lruvec->zone in those; and remove the inadequate
attempt to set lruvec->zone from lruvec_init(), which is called before
NODE_DATA(node) has been allocated in such cases.
Ah, there was one exceptionr. For no particularly good reason,
mem_cgroup_force_empty_list() has its own code for deciding lruvec.
Change it to use the standard mem_cgroup_zone_lruvec() and
mem_cgroup_get_lru_size() too. In fact it was already safe against such
an oops (the lru lists in danger could only be empty), but we're better
proofed against future changes this way.
I've marked this for stable (3.6) since we introduced the problem in 3.5
(now closed to stable); but I have no idea if this is the only fix
needed to get memory hotadd working with memcg in 3.6, and received no
answer when I enquired twice before.
Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-11-17 05:14:54 +07:00
|
|
|
lruvec = &mz->lruvec;
|
|
|
|
out:
|
|
|
|
/*
|
|
|
|
* Since a node can be onlined after the mem_cgroup was created,
|
|
|
|
* we have to be prepared to initialize lruvec->zone here;
|
|
|
|
* and if offlined then reonlined, we need to reinitialize it.
|
|
|
|
*/
|
2016-07-29 05:45:31 +07:00
|
|
|
if (unlikely(lruvec->pgdat != pgdat))
|
|
|
|
lruvec->pgdat = pgdat;
|
memcg: fix hotplugged memory zone oops
When MEMCG is configured on (even when it's disabled by boot option),
when adding or removing a page to/from its lru list, the zone pointer
used for stats updates is nowadays taken from the struct lruvec. (On
many configurations, calculating zone from page is slower.)
But we have no code to update all the lruvecs (per zone, per memcg) when
a memory node is hotadded. Here's an extract from the oops which
results when running numactl to bind a program to a newly onlined node:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
IP: __mod_zone_page_state+0x9/0x60
Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
Call Trace:
__pagevec_lru_add_fn+0xdf/0x140
pagevec_lru_move_fn+0xb1/0x100
__pagevec_lru_add+0x1c/0x30
lru_add_drain_cpu+0xa3/0x130
lru_add_drain+0x2f/0x40
...
The natural solution might be to use a memcg callback whenever memory is
hotadded; but that solution has not been scoped out, and it happens that
we do have an easy location at which to update lruvec->zone. The lruvec
pointer is discovered either by mem_cgroup_zone_lruvec() or by
mem_cgroup_page_lruvec(), and both of those do know the right zone.
So check and set lruvec->zone in those; and remove the inadequate
attempt to set lruvec->zone from lruvec_init(), which is called before
NODE_DATA(node) has been allocated in such cases.
Ah, there was one exceptionr. For no particularly good reason,
mem_cgroup_force_empty_list() has its own code for deciding lruvec.
Change it to use the standard mem_cgroup_zone_lruvec() and
mem_cgroup_get_lru_size() too. In fact it was already safe against such
an oops (the lru lists in danger could only be empty), but we're better
proofed against future changes this way.
I've marked this for stable (3.6) since we introduced the problem in 3.5
(now closed to stable); but I have no idea if this is the only fix
needed to get memory hotadd working with memcg in 3.6, and received no
answer when I enquired twice before.
Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-11-17 05:14:54 +07:00
|
|
|
return lruvec;
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 09:08:01 +07:00
|
|
|
}
|
2008-10-19 10:26:14 +07:00
|
|
|
|
2012-01-13 08:18:15 +07:00
|
|
|
/**
|
2012-05-30 05:07:09 +07:00
|
|
|
* mem_cgroup_update_lru_size - account for adding or removing an lru page
|
|
|
|
* @lruvec: mem_cgroup per zone lru vector
|
|
|
|
* @lru: index of lru list the page is sitting on
|
2017-01-11 07:58:04 +07:00
|
|
|
* @zid: zone id of the accounted pages
|
2012-05-30 05:07:09 +07:00
|
|
|
* @nr_pages: positive when adding or negative when removing
|
2012-01-13 08:18:15 +07:00
|
|
|
*
|
2016-05-20 07:12:35 +07:00
|
|
|
* This function must be called under lru_lock, just before a page is added
|
|
|
|
* to or just after a page is removed from an lru list (that ordering being
|
|
|
|
* so as to allow it to check that lru_size 0 is consistent with list_empty).
|
2011-03-23 06:32:53 +07:00
|
|
|
*/
|
2012-05-30 05:07:09 +07:00
|
|
|
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
|
2017-01-11 07:58:04 +07:00
|
|
|
int zid, int nr_pages)
|
2011-03-23 06:32:53 +07:00
|
|
|
{
|
2016-07-29 05:46:05 +07:00
|
|
|
struct mem_cgroup_per_node *mz;
|
2012-05-30 05:07:09 +07:00
|
|
|
unsigned long *lru_size;
|
2016-05-20 07:12:35 +07:00
|
|
|
long size;
|
2011-03-23 06:32:53 +07:00
|
|
|
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
|
2017-01-11 07:58:04 +07:00
|
|
|
lru_size = &mz->lru_zone_size[zid][lru];
|
2016-05-20 07:12:35 +07:00
|
|
|
|
|
|
|
if (nr_pages < 0)
|
|
|
|
*lru_size += nr_pages;
|
|
|
|
|
|
|
|
size = *lru_size;
|
2017-01-11 07:58:04 +07:00
|
|
|
if (WARN_ONCE(size < 0,
|
|
|
|
"%s(%p, %d, %d): lru_size %ld\n",
|
|
|
|
__func__, lruvec, lru, nr_pages, size)) {
|
2016-05-20 07:12:35 +07:00
|
|
|
VM_BUG_ON(1);
|
|
|
|
*lru_size = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nr_pages > 0)
|
|
|
|
*lru_size += nr_pages;
|
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 09:08:01 +07:00
|
|
|
}
|
2009-01-08 09:08:34 +07:00
|
|
|
|
2011-02-02 06:52:43 +07:00
|
|
|
/**
|
2011-03-24 06:42:21 +07:00
|
|
|
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
|
2012-06-21 02:53:01 +07:00
|
|
|
* @memcg: the memory cgroup
|
2011-02-02 06:52:43 +07:00
|
|
|
*
|
2011-03-24 06:42:21 +07:00
|
|
|
* Returns the maximum amount of memory @mem can be charged with, in
|
2011-03-24 06:42:36 +07:00
|
|
|
* pages.
|
2011-02-02 06:52:43 +07:00
|
|
|
*/
|
2011-11-03 03:38:15 +07:00
|
|
|
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
|
2011-02-02 06:52:43 +07:00
|
|
|
{
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
unsigned long margin = 0;
|
|
|
|
unsigned long count;
|
|
|
|
unsigned long limit;
|
2011-03-24 06:42:21 +07:00
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
count = page_counter_read(&memcg->memory);
|
2018-06-08 07:06:18 +07:00
|
|
|
limit = READ_ONCE(memcg->memory.max);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
if (count < limit)
|
|
|
|
margin = limit - count;
|
|
|
|
|
2016-01-15 06:21:23 +07:00
|
|
|
if (do_memsw_account()) {
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
count = page_counter_read(&memcg->memsw);
|
2018-06-08 07:06:18 +07:00
|
|
|
limit = READ_ONCE(memcg->memsw.max);
|
2020-06-02 11:49:36 +07:00
|
|
|
if (count < limit)
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
margin = min(margin, limit - count);
|
2016-05-28 04:27:43 +07:00
|
|
|
else
|
|
|
|
margin = 0;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
return margin;
|
2011-02-02 06:52:43 +07:00
|
|
|
}
|
|
|
|
|
memcg: avoid lock in updating file_mapped (Was fix race in file_mapped accouting flag management
At accounting file events per memory cgroup, we need to find memory cgroup
via page_cgroup->mem_cgroup. Now, we use lock_page_cgroup() for guarantee
pc->mem_cgroup is not overwritten while we make use of it.
But, considering the context which page-cgroup for files are accessed,
we can use alternative light-weight mutual execusion in the most case.
At handling file-caches, the only race we have to take care of is "moving"
account, IOW, overwriting page_cgroup->mem_cgroup. (See comment in the
patch)
Unlike charge/uncharge, "move" happens not so frequently. It happens only when
rmdir() and task-moving (with a special settings.)
This patch adds a race-checker for file-cache-status accounting v.s. account
moving. The new per-cpu-per-memcg counter MEM_CGROUP_ON_MOVE is added.
The routine for account move
1. Increment it before start moving
2. Call synchronize_rcu()
3. Decrement it after the end of moving.
By this, file-status-counting routine can check it needs to call
lock_page_cgroup(). In most case, I doesn't need to call it.
Following is a perf data of a process which mmap()/munmap 32MB of file cache
in a minute.
Before patch:
28.25% mmap mmap [.] main
22.64% mmap [kernel.kallsyms] [k] page_fault
9.96% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped
3.67% mmap [kernel.kallsyms] [k] filemap_fault
3.50% mmap [kernel.kallsyms] [k] unmap_vmas
2.99% mmap [kernel.kallsyms] [k] __do_fault
2.76% mmap [kernel.kallsyms] [k] find_get_page
After patch:
30.00% mmap mmap [.] main
23.78% mmap [kernel.kallsyms] [k] page_fault
5.52% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped
3.81% mmap [kernel.kallsyms] [k] unmap_vmas
3.26% mmap [kernel.kallsyms] [k] find_get_page
3.18% mmap [kernel.kallsyms] [k] __do_fault
3.03% mmap [kernel.kallsyms] [k] filemap_fault
2.40% mmap [kernel.kallsyms] [k] handle_mm_fault
2.40% mmap [kernel.kallsyms] [k] do_page_fault
This patch reduces memcg's cost to some extent.
(mem_cgroup_update_file_mapped is called by both of map/unmap)
Note: It seems some more improvements are required..but no idea.
maybe removing set/unset flag is required.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-10-28 05:33:40 +07:00
|
|
|
/*
|
2014-06-05 06:08:21 +07:00
|
|
|
* A routine for checking "mem" is under move_account() or not.
|
memcg: avoid lock in updating file_mapped (Was fix race in file_mapped accouting flag management
At accounting file events per memory cgroup, we need to find memory cgroup
via page_cgroup->mem_cgroup. Now, we use lock_page_cgroup() for guarantee
pc->mem_cgroup is not overwritten while we make use of it.
But, considering the context which page-cgroup for files are accessed,
we can use alternative light-weight mutual execusion in the most case.
At handling file-caches, the only race we have to take care of is "moving"
account, IOW, overwriting page_cgroup->mem_cgroup. (See comment in the
patch)
Unlike charge/uncharge, "move" happens not so frequently. It happens only when
rmdir() and task-moving (with a special settings.)
This patch adds a race-checker for file-cache-status accounting v.s. account
moving. The new per-cpu-per-memcg counter MEM_CGROUP_ON_MOVE is added.
The routine for account move
1. Increment it before start moving
2. Call synchronize_rcu()
3. Decrement it after the end of moving.
By this, file-status-counting routine can check it needs to call
lock_page_cgroup(). In most case, I doesn't need to call it.
Following is a perf data of a process which mmap()/munmap 32MB of file cache
in a minute.
Before patch:
28.25% mmap mmap [.] main
22.64% mmap [kernel.kallsyms] [k] page_fault
9.96% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped
3.67% mmap [kernel.kallsyms] [k] filemap_fault
3.50% mmap [kernel.kallsyms] [k] unmap_vmas
2.99% mmap [kernel.kallsyms] [k] __do_fault
2.76% mmap [kernel.kallsyms] [k] find_get_page
After patch:
30.00% mmap mmap [.] main
23.78% mmap [kernel.kallsyms] [k] page_fault
5.52% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped
3.81% mmap [kernel.kallsyms] [k] unmap_vmas
3.26% mmap [kernel.kallsyms] [k] find_get_page
3.18% mmap [kernel.kallsyms] [k] __do_fault
3.03% mmap [kernel.kallsyms] [k] filemap_fault
2.40% mmap [kernel.kallsyms] [k] handle_mm_fault
2.40% mmap [kernel.kallsyms] [k] do_page_fault
This patch reduces memcg's cost to some extent.
(mem_cgroup_update_file_mapped is called by both of map/unmap)
Note: It seems some more improvements are required..but no idea.
maybe removing set/unset flag is required.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-10-28 05:33:40 +07:00
|
|
|
*
|
2014-06-05 06:08:21 +07:00
|
|
|
* Checking a cgroup is mc.from or mc.to or under hierarchy of
|
|
|
|
* moving cgroups. This is for waiting at high-memory pressure
|
|
|
|
* caused by "move".
|
memcg: avoid lock in updating file_mapped (Was fix race in file_mapped accouting flag management
At accounting file events per memory cgroup, we need to find memory cgroup
via page_cgroup->mem_cgroup. Now, we use lock_page_cgroup() for guarantee
pc->mem_cgroup is not overwritten while we make use of it.
But, considering the context which page-cgroup for files are accessed,
we can use alternative light-weight mutual execusion in the most case.
At handling file-caches, the only race we have to take care of is "moving"
account, IOW, overwriting page_cgroup->mem_cgroup. (See comment in the
patch)
Unlike charge/uncharge, "move" happens not so frequently. It happens only when
rmdir() and task-moving (with a special settings.)
This patch adds a race-checker for file-cache-status accounting v.s. account
moving. The new per-cpu-per-memcg counter MEM_CGROUP_ON_MOVE is added.
The routine for account move
1. Increment it before start moving
2. Call synchronize_rcu()
3. Decrement it after the end of moving.
By this, file-status-counting routine can check it needs to call
lock_page_cgroup(). In most case, I doesn't need to call it.
Following is a perf data of a process which mmap()/munmap 32MB of file cache
in a minute.
Before patch:
28.25% mmap mmap [.] main
22.64% mmap [kernel.kallsyms] [k] page_fault
9.96% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped
3.67% mmap [kernel.kallsyms] [k] filemap_fault
3.50% mmap [kernel.kallsyms] [k] unmap_vmas
2.99% mmap [kernel.kallsyms] [k] __do_fault
2.76% mmap [kernel.kallsyms] [k] find_get_page
After patch:
30.00% mmap mmap [.] main
23.78% mmap [kernel.kallsyms] [k] page_fault
5.52% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped
3.81% mmap [kernel.kallsyms] [k] unmap_vmas
3.26% mmap [kernel.kallsyms] [k] find_get_page
3.18% mmap [kernel.kallsyms] [k] __do_fault
3.03% mmap [kernel.kallsyms] [k] filemap_fault
2.40% mmap [kernel.kallsyms] [k] handle_mm_fault
2.40% mmap [kernel.kallsyms] [k] do_page_fault
This patch reduces memcg's cost to some extent.
(mem_cgroup_update_file_mapped is called by both of map/unmap)
Note: It seems some more improvements are required..but no idea.
maybe removing set/unset flag is required.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-10-28 05:33:40 +07:00
|
|
|
*/
|
2011-11-03 03:38:15 +07:00
|
|
|
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
|
2010-08-11 08:02:57 +07:00
|
|
|
{
|
2010-08-11 08:02:58 +07:00
|
|
|
struct mem_cgroup *from;
|
|
|
|
struct mem_cgroup *to;
|
2010-08-11 08:02:57 +07:00
|
|
|
bool ret = false;
|
2010-08-11 08:02:58 +07:00
|
|
|
/*
|
|
|
|
* Unlike task_move routines, we access mc.to, mc.from not under
|
|
|
|
* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
|
|
|
|
*/
|
|
|
|
spin_lock(&mc.lock);
|
|
|
|
from = mc.from;
|
|
|
|
to = mc.to;
|
|
|
|
if (!from)
|
|
|
|
goto unlock;
|
2011-07-27 06:08:29 +07:00
|
|
|
|
2014-12-11 06:44:33 +07:00
|
|
|
ret = mem_cgroup_is_descendant(from, memcg) ||
|
|
|
|
mem_cgroup_is_descendant(to, memcg);
|
2010-08-11 08:02:58 +07:00
|
|
|
unlock:
|
|
|
|
spin_unlock(&mc.lock);
|
2010-08-11 08:02:57 +07:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
|
2010-08-11 08:02:57 +07:00
|
|
|
{
|
|
|
|
if (mc.moving_task && current != mc.moving_task) {
|
2011-11-03 03:38:15 +07:00
|
|
|
if (mem_cgroup_under_move(memcg)) {
|
2010-08-11 08:02:57 +07:00
|
|
|
DEFINE_WAIT(wait);
|
|
|
|
prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
|
|
|
|
/* moving charge context might have finished. */
|
|
|
|
if (mc.moving_task)
|
|
|
|
schedule();
|
|
|
|
finish_wait(&mc.waitq, &wait);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-10-14 06:52:59 +07:00
|
|
|
struct memory_stat {
|
|
|
|
const char *name;
|
|
|
|
unsigned int ratio;
|
|
|
|
unsigned int idx;
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct memory_stat memory_stats[] = {
|
|
|
|
{ "anon", PAGE_SIZE, NR_ANON_MAPPED },
|
|
|
|
{ "file", PAGE_SIZE, NR_FILE_PAGES },
|
|
|
|
{ "kernel_stack", 1024, NR_KERNEL_STACK_KB },
|
|
|
|
{ "percpu", 1, MEMCG_PERCPU_B },
|
|
|
|
{ "sock", PAGE_SIZE, MEMCG_SOCK },
|
|
|
|
{ "shmem", PAGE_SIZE, NR_SHMEM },
|
|
|
|
{ "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
|
|
|
|
{ "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
|
|
|
|
{ "file_writeback", PAGE_SIZE, NR_WRITEBACK },
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
/*
|
|
|
|
* The ratio will be initialized in memory_stats_init(). Because
|
|
|
|
* on some architectures, the macro of HPAGE_PMD_SIZE is not
|
|
|
|
* constant(e.g. powerpc).
|
|
|
|
*/
|
|
|
|
{ "anon_thp", 0, NR_ANON_THPS },
|
|
|
|
#endif
|
|
|
|
{ "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
|
|
|
|
{ "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
|
|
|
|
{ "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
|
|
|
|
{ "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
|
|
|
|
{ "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note: The slab_reclaimable and slab_unreclaimable must be
|
|
|
|
* together and slab_reclaimable must be in front.
|
|
|
|
*/
|
|
|
|
{ "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
|
|
|
|
{ "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
|
|
|
|
|
|
|
|
/* The memory events */
|
|
|
|
{ "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
|
|
|
|
{ "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
|
|
|
|
{ "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
|
|
|
|
{ "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
|
|
|
|
{ "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
|
|
|
|
{ "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
|
|
|
|
{ "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
|
|
|
|
};
|
|
|
|
|
|
|
|
static int __init memory_stats_init(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
if (memory_stats[i].idx == NR_ANON_THPS)
|
|
|
|
memory_stats[i].ratio = HPAGE_PMD_SIZE;
|
|
|
|
#endif
|
|
|
|
VM_BUG_ON(!memory_stats[i].ratio);
|
|
|
|
VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
pure_initcall(memory_stats_init);
|
|
|
|
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
static char *memory_stat_format(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
struct seq_buf s;
|
|
|
|
int i;
|
2017-05-04 04:55:13 +07:00
|
|
|
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
|
|
|
|
if (!s.buffer)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Provide statistics on the state of the memory subsystem as
|
|
|
|
* well as cumulative event counters that show past behavior.
|
|
|
|
*
|
|
|
|
* This list is ordered following a combination of these gradients:
|
|
|
|
* 1) generic big picture -> specifics and details
|
|
|
|
* 2) reflecting userspace activity -> reflecting kernel heuristics
|
|
|
|
*
|
|
|
|
* Current memory state:
|
|
|
|
*/
|
|
|
|
|
2020-10-14 06:52:59 +07:00
|
|
|
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
|
|
|
|
u64 size;
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
|
2020-10-14 06:52:59 +07:00
|
|
|
size = memcg_page_state(memcg, memory_stats[i].idx);
|
|
|
|
size *= memory_stats[i].ratio;
|
|
|
|
seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
|
2020-10-14 06:52:59 +07:00
|
|
|
if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
|
|
|
|
size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
|
|
|
|
memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
|
|
|
|
seq_buf_printf(&s, "slab %llu\n", size);
|
|
|
|
}
|
|
|
|
}
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
|
|
|
|
/* Accumulated memory events */
|
|
|
|
|
2019-12-05 07:49:53 +07:00
|
|
|
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
|
|
|
|
memcg_events(memcg, PGFAULT));
|
|
|
|
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
|
|
|
|
memcg_events(memcg, PGMAJFAULT));
|
|
|
|
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
|
|
|
|
memcg_events(memcg, PGREFILL));
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
seq_buf_printf(&s, "pgscan %lu\n",
|
|
|
|
memcg_events(memcg, PGSCAN_KSWAPD) +
|
|
|
|
memcg_events(memcg, PGSCAN_DIRECT));
|
|
|
|
seq_buf_printf(&s, "pgsteal %lu\n",
|
|
|
|
memcg_events(memcg, PGSTEAL_KSWAPD) +
|
|
|
|
memcg_events(memcg, PGSTEAL_DIRECT));
|
2019-12-05 07:49:53 +07:00
|
|
|
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
|
|
|
|
memcg_events(memcg, PGACTIVATE));
|
|
|
|
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
|
|
|
|
memcg_events(memcg, PGDEACTIVATE));
|
|
|
|
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
|
|
|
|
memcg_events(memcg, PGLAZYFREE));
|
|
|
|
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
|
|
|
|
memcg_events(memcg, PGLAZYFREED));
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
2019-12-05 07:49:53 +07:00
|
|
|
seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
memcg_events(memcg, THP_FAULT_ALLOC));
|
2019-12-05 07:49:53 +07:00
|
|
|
seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
memcg_events(memcg, THP_COLLAPSE_ALLOC));
|
|
|
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
|
|
|
|
|
|
|
/* The above should easily fit into one page */
|
|
|
|
WARN_ON_ONCE(seq_buf_has_overflowed(&s));
|
|
|
|
|
|
|
|
return s.buffer;
|
|
|
|
}
|
2017-05-04 04:55:13 +07:00
|
|
|
|
memcg, oom: provide more precise dump info while memcg oom happening
Currently when a memcg oom is happening the oom dump messages is still
global state and provides few useful info for users. This patch prints
more pointed memcg page statistics for memcg-oom and take hierarchy into
consideration:
Based on Michal's advice, we take hierarchy into consideration: supppose
we trigger an OOM on A's limit
root_memcg
|
A (use_hierachy=1)
/ \
B C
|
D
then the printed info will be:
Memory cgroup stats for /A:...
Memory cgroup stats for /A/B:...
Memory cgroup stats for /A/C:...
Memory cgroup stats for /A/B/D:...
Following are samples of oom output:
(1) Before change:
mal-80 invoked oom-killer:gfp_mask=0xd0, order=0, oom_score_adj=0
mal-80 cpuset=/ mems_allowed=0
Pid: 2976, comm: mal-80 Not tainted 3.7.0+ #10
Call Trace:
[<ffffffff8167fbfb>] dump_header+0x83/0x1ca
..... (call trace)
[<ffffffff8168a818>] page_fault+0x28/0x30
<<<<<<<<<<<<<<<<<<<<< memcg specific information
Task in /A/B/D killed as a result of limit of /A
memory: usage 101376kB, limit 101376kB, failcnt 57
memory+swap: usage 101376kB, limit 101376kB, failcnt 0
kmem: usage 0kB, limit 9007199254740991kB, failcnt 0
<<<<<<<<<<<<<<<<<<<<< print per cpu pageset stat
Mem-Info:
Node 0 DMA per-cpu:
CPU 0: hi: 0, btch: 1 usd: 0
......
CPU 3: hi: 0, btch: 1 usd: 0
Node 0 DMA32 per-cpu:
CPU 0: hi: 186, btch: 31 usd: 173
......
CPU 3: hi: 186, btch: 31 usd: 130
<<<<<<<<<<<<<<<<<<<<< print global page state
active_anon:92963 inactive_anon:40777 isolated_anon:0
active_file:33027 inactive_file:51718 isolated_file:0
unevictable:0 dirty:3 writeback:0 unstable:0
free:729995 slab_reclaimable:6897 slab_unreclaimable:6263
mapped:20278 shmem:35971 pagetables:5885 bounce:0
free_cma:0
<<<<<<<<<<<<<<<<<<<<< print per zone page state
Node 0 DMA free:15836kB ... all_unreclaimable? no
lowmem_reserve[]: 0 3175 3899 3899
Node 0 DMA32 free:2888564kB ... all_unrelaimable? no
lowmem_reserve[]: 0 0 724 724
lowmem_reserve[]: 0 0 0 0
Node 0 DMA: 1*4kB (U) ... 3*4096kB (M) = 15836kB
Node 0 DMA32: 41*4kB (UM) ... 702*4096kB (MR) = 2888316kB
120710 total pagecache pages
0 pages in swap cache
<<<<<<<<<<<<<<<<<<<<< print global swap cache stat
Swap cache stats: add 0, delete 0, find 0/0
Free swap = 499708kB
Total swap = 499708kB
1040368 pages RAM
58678 pages reserved
169065 pages shared
173632 pages non-shared
[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name
[ 2693] 0 2693 6005 1324 17 0 0 god
[ 2754] 0 2754 6003 1320 16 0 0 god
[ 2811] 0 2811 5992 1304 18 0 0 god
[ 2874] 0 2874 6005 1323 18 0 0 god
[ 2935] 0 2935 8720 7742 21 0 0 mal-30
[ 2976] 0 2976 21520 17577 42 0 0 mal-80
Memory cgroup out of memory: Kill process 2976 (mal-80) score 665 or sacrifice child
Killed process 2976 (mal-80) total-vm:86080kB, anon-rss:69964kB, file-rss:344kB
We can see that messages dumped by show_free_areas() are longsome and can
provide so limited info for memcg that just happen oom.
(2) After change
mal-80 invoked oom-killer: gfp_mask=0xd0, order=0, oom_score_adj=0
mal-80 cpuset=/ mems_allowed=0
Pid: 2704, comm: mal-80 Not tainted 3.7.0+ #10
Call Trace:
[<ffffffff8167fd0b>] dump_header+0x83/0x1d1
.......(call trace)
[<ffffffff8168a918>] page_fault+0x28/0x30
Task in /A/B/D killed as a result of limit of /A
<<<<<<<<<<<<<<<<<<<<< memcg specific information
memory: usage 102400kB, limit 102400kB, failcnt 140
memory+swap: usage 102400kB, limit 102400kB, failcnt 0
kmem: usage 0kB, limit 9007199254740991kB, failcnt 0
Memory cgroup stats for /A: cache:32KB rss:30984KB mapped_file:0KB swap:0KB inactive_anon:6912KB active_anon:24072KB inactive_file:32KB active_file:0KB unevictable:0KB
Memory cgroup stats for /A/B: cache:0KB rss:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:0KB inactive_file:0KB active_file:0KB unevictable:0KB
Memory cgroup stats for /A/C: cache:0KB rss:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:0KB inactive_file:0KB active_file:0KB unevictable:0KB
Memory cgroup stats for /A/B/D: cache:32KB rss:71352KB mapped_file:0KB swap:0KB inactive_anon:6656KB active_anon:64696KB inactive_file:16KB active_file:16KB unevictable:0KB
[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name
[ 2260] 0 2260 6006 1325 18 0 0 god
[ 2383] 0 2383 6003 1319 17 0 0 god
[ 2503] 0 2503 6004 1321 18 0 0 god
[ 2622] 0 2622 6004 1321 16 0 0 god
[ 2695] 0 2695 8720 7741 22 0 0 mal-30
[ 2704] 0 2704 21520 17839 43 0 0 mal-80
Memory cgroup out of memory: Kill process 2704 (mal-80) score 669 or sacrifice child
Killed process 2704 (mal-80) total-vm:86080kB, anon-rss:71016kB, file-rss:340kB
This version provides more pointed info for memcg in "Memory cgroup stats
for XXX" section.
Signed-off-by: Sha Zhengju <handai.szj@taobao.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-23 07:32:05 +07:00
|
|
|
#define K(x) ((x) << (PAGE_SHIFT-10))
|
2009-04-03 06:57:39 +07:00
|
|
|
/**
|
mm, oom: add oom victim's memcg to the oom context information
The current oom report doesn't display victim's memcg context during the
global OOM situation. While this information is not strictly needed, it
can be really helpful for containerized environments to locate which
container has lost a process. Now that we have a single line for the oom
context, we can trivially add both the oom memcg (this can be either
global_oom or a specific memcg which hits its hard limits) and task_memcg
which is the victim's memcg.
Below is the single line output in the oom report after this patch.
- global oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,global_oom,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
- memcg oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,oom_memcg=<memcg>,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
[penguin-kernel@I-love.SAKURA.ne.jp: use pr_cont() in mem_cgroup_print_oom_context()]
Link: http://lkml.kernel.org/r/201812190723.wBJ7NdkN032628@www262.sakura.ne.jp
Link: http://lkml.kernel.org/r/1542799799-36184-2-git-send-email-ufo19890607@gmail.com
Signed-off-by: yuzhoujian <yuzhoujian@didichuxing.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Yang Shi <yang.s@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 15:36:10 +07:00
|
|
|
* mem_cgroup_print_oom_context: Print OOM information relevant to
|
|
|
|
* memory controller.
|
2009-04-03 06:57:39 +07:00
|
|
|
* @memcg: The memory cgroup that went over limit
|
|
|
|
* @p: Task that is going to be killed
|
|
|
|
*
|
|
|
|
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
|
|
|
|
* enabled
|
|
|
|
*/
|
mm, oom: add oom victim's memcg to the oom context information
The current oom report doesn't display victim's memcg context during the
global OOM situation. While this information is not strictly needed, it
can be really helpful for containerized environments to locate which
container has lost a process. Now that we have a single line for the oom
context, we can trivially add both the oom memcg (this can be either
global_oom or a specific memcg which hits its hard limits) and task_memcg
which is the victim's memcg.
Below is the single line output in the oom report after this patch.
- global oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,global_oom,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
- memcg oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,oom_memcg=<memcg>,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
[penguin-kernel@I-love.SAKURA.ne.jp: use pr_cont() in mem_cgroup_print_oom_context()]
Link: http://lkml.kernel.org/r/201812190723.wBJ7NdkN032628@www262.sakura.ne.jp
Link: http://lkml.kernel.org/r/1542799799-36184-2-git-send-email-ufo19890607@gmail.com
Signed-off-by: yuzhoujian <yuzhoujian@didichuxing.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Yang Shi <yang.s@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 15:36:10 +07:00
|
|
|
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
|
2009-04-03 06:57:39 +07:00
|
|
|
{
|
|
|
|
rcu_read_lock();
|
|
|
|
|
mm, oom: add oom victim's memcg to the oom context information
The current oom report doesn't display victim's memcg context during the
global OOM situation. While this information is not strictly needed, it
can be really helpful for containerized environments to locate which
container has lost a process. Now that we have a single line for the oom
context, we can trivially add both the oom memcg (this can be either
global_oom or a specific memcg which hits its hard limits) and task_memcg
which is the victim's memcg.
Below is the single line output in the oom report after this patch.
- global oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,global_oom,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
- memcg oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,oom_memcg=<memcg>,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
[penguin-kernel@I-love.SAKURA.ne.jp: use pr_cont() in mem_cgroup_print_oom_context()]
Link: http://lkml.kernel.org/r/201812190723.wBJ7NdkN032628@www262.sakura.ne.jp
Link: http://lkml.kernel.org/r/1542799799-36184-2-git-send-email-ufo19890607@gmail.com
Signed-off-by: yuzhoujian <yuzhoujian@didichuxing.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Yang Shi <yang.s@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 15:36:10 +07:00
|
|
|
if (memcg) {
|
|
|
|
pr_cont(",oom_memcg=");
|
|
|
|
pr_cont_cgroup_path(memcg->css.cgroup);
|
|
|
|
} else
|
|
|
|
pr_cont(",global_oom");
|
2015-04-15 05:48:18 +07:00
|
|
|
if (p) {
|
mm, oom: add oom victim's memcg to the oom context information
The current oom report doesn't display victim's memcg context during the
global OOM situation. While this information is not strictly needed, it
can be really helpful for containerized environments to locate which
container has lost a process. Now that we have a single line for the oom
context, we can trivially add both the oom memcg (this can be either
global_oom or a specific memcg which hits its hard limits) and task_memcg
which is the victim's memcg.
Below is the single line output in the oom report after this patch.
- global oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,global_oom,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
- memcg oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,oom_memcg=<memcg>,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
[penguin-kernel@I-love.SAKURA.ne.jp: use pr_cont() in mem_cgroup_print_oom_context()]
Link: http://lkml.kernel.org/r/201812190723.wBJ7NdkN032628@www262.sakura.ne.jp
Link: http://lkml.kernel.org/r/1542799799-36184-2-git-send-email-ufo19890607@gmail.com
Signed-off-by: yuzhoujian <yuzhoujian@didichuxing.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Yang Shi <yang.s@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 15:36:10 +07:00
|
|
|
pr_cont(",task_memcg=");
|
2015-04-15 05:48:18 +07:00
|
|
|
pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
|
|
|
|
}
|
2009-04-03 06:57:39 +07:00
|
|
|
rcu_read_unlock();
|
mm, oom: add oom victim's memcg to the oom context information
The current oom report doesn't display victim's memcg context during the
global OOM situation. While this information is not strictly needed, it
can be really helpful for containerized environments to locate which
container has lost a process. Now that we have a single line for the oom
context, we can trivially add both the oom memcg (this can be either
global_oom or a specific memcg which hits its hard limits) and task_memcg
which is the victim's memcg.
Below is the single line output in the oom report after this patch.
- global oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,global_oom,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
- memcg oom context information:
oom-kill:constraint=<constraint>,nodemask=<nodemask>,cpuset=<cpuset>,mems_allowed=<mems_allowed>,oom_memcg=<memcg>,task_memcg=<memcg>,task=<comm>,pid=<pid>,uid=<uid>
[penguin-kernel@I-love.SAKURA.ne.jp: use pr_cont() in mem_cgroup_print_oom_context()]
Link: http://lkml.kernel.org/r/201812190723.wBJ7NdkN032628@www262.sakura.ne.jp
Link: http://lkml.kernel.org/r/1542799799-36184-2-git-send-email-ufo19890607@gmail.com
Signed-off-by: yuzhoujian <yuzhoujian@didichuxing.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Yang Shi <yang.s@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 15:36:10 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
|
|
|
|
* memory controller.
|
|
|
|
* @memcg: The memory cgroup that went over limit
|
|
|
|
*/
|
|
|
|
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
|
|
|
|
{
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
char *buf;
|
2009-04-03 06:57:39 +07:00
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
|
|
|
|
K((u64)page_counter_read(&memcg->memory)),
|
2020-04-02 11:07:20 +07:00
|
|
|
K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
|
|
|
pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
|
|
|
|
K((u64)page_counter_read(&memcg->swap)),
|
2020-04-02 11:07:30 +07:00
|
|
|
K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
else {
|
|
|
|
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
|
|
|
|
K((u64)page_counter_read(&memcg->memsw)),
|
|
|
|
K((u64)memcg->memsw.max), memcg->memsw.failcnt);
|
|
|
|
pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
|
|
|
|
K((u64)page_counter_read(&memcg->kmem)),
|
|
|
|
K((u64)memcg->kmem.max), memcg->kmem.failcnt);
|
memcg, oom: provide more precise dump info while memcg oom happening
Currently when a memcg oom is happening the oom dump messages is still
global state and provides few useful info for users. This patch prints
more pointed memcg page statistics for memcg-oom and take hierarchy into
consideration:
Based on Michal's advice, we take hierarchy into consideration: supppose
we trigger an OOM on A's limit
root_memcg
|
A (use_hierachy=1)
/ \
B C
|
D
then the printed info will be:
Memory cgroup stats for /A:...
Memory cgroup stats for /A/B:...
Memory cgroup stats for /A/C:...
Memory cgroup stats for /A/B/D:...
Following are samples of oom output:
(1) Before change:
mal-80 invoked oom-killer:gfp_mask=0xd0, order=0, oom_score_adj=0
mal-80 cpuset=/ mems_allowed=0
Pid: 2976, comm: mal-80 Not tainted 3.7.0+ #10
Call Trace:
[<ffffffff8167fbfb>] dump_header+0x83/0x1ca
..... (call trace)
[<ffffffff8168a818>] page_fault+0x28/0x30
<<<<<<<<<<<<<<<<<<<<< memcg specific information
Task in /A/B/D killed as a result of limit of /A
memory: usage 101376kB, limit 101376kB, failcnt 57
memory+swap: usage 101376kB, limit 101376kB, failcnt 0
kmem: usage 0kB, limit 9007199254740991kB, failcnt 0
<<<<<<<<<<<<<<<<<<<<< print per cpu pageset stat
Mem-Info:
Node 0 DMA per-cpu:
CPU 0: hi: 0, btch: 1 usd: 0
......
CPU 3: hi: 0, btch: 1 usd: 0
Node 0 DMA32 per-cpu:
CPU 0: hi: 186, btch: 31 usd: 173
......
CPU 3: hi: 186, btch: 31 usd: 130
<<<<<<<<<<<<<<<<<<<<< print global page state
active_anon:92963 inactive_anon:40777 isolated_anon:0
active_file:33027 inactive_file:51718 isolated_file:0
unevictable:0 dirty:3 writeback:0 unstable:0
free:729995 slab_reclaimable:6897 slab_unreclaimable:6263
mapped:20278 shmem:35971 pagetables:5885 bounce:0
free_cma:0
<<<<<<<<<<<<<<<<<<<<< print per zone page state
Node 0 DMA free:15836kB ... all_unreclaimable? no
lowmem_reserve[]: 0 3175 3899 3899
Node 0 DMA32 free:2888564kB ... all_unrelaimable? no
lowmem_reserve[]: 0 0 724 724
lowmem_reserve[]: 0 0 0 0
Node 0 DMA: 1*4kB (U) ... 3*4096kB (M) = 15836kB
Node 0 DMA32: 41*4kB (UM) ... 702*4096kB (MR) = 2888316kB
120710 total pagecache pages
0 pages in swap cache
<<<<<<<<<<<<<<<<<<<<< print global swap cache stat
Swap cache stats: add 0, delete 0, find 0/0
Free swap = 499708kB
Total swap = 499708kB
1040368 pages RAM
58678 pages reserved
169065 pages shared
173632 pages non-shared
[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name
[ 2693] 0 2693 6005 1324 17 0 0 god
[ 2754] 0 2754 6003 1320 16 0 0 god
[ 2811] 0 2811 5992 1304 18 0 0 god
[ 2874] 0 2874 6005 1323 18 0 0 god
[ 2935] 0 2935 8720 7742 21 0 0 mal-30
[ 2976] 0 2976 21520 17577 42 0 0 mal-80
Memory cgroup out of memory: Kill process 2976 (mal-80) score 665 or sacrifice child
Killed process 2976 (mal-80) total-vm:86080kB, anon-rss:69964kB, file-rss:344kB
We can see that messages dumped by show_free_areas() are longsome and can
provide so limited info for memcg that just happen oom.
(2) After change
mal-80 invoked oom-killer: gfp_mask=0xd0, order=0, oom_score_adj=0
mal-80 cpuset=/ mems_allowed=0
Pid: 2704, comm: mal-80 Not tainted 3.7.0+ #10
Call Trace:
[<ffffffff8167fd0b>] dump_header+0x83/0x1d1
.......(call trace)
[<ffffffff8168a918>] page_fault+0x28/0x30
Task in /A/B/D killed as a result of limit of /A
<<<<<<<<<<<<<<<<<<<<< memcg specific information
memory: usage 102400kB, limit 102400kB, failcnt 140
memory+swap: usage 102400kB, limit 102400kB, failcnt 0
kmem: usage 0kB, limit 9007199254740991kB, failcnt 0
Memory cgroup stats for /A: cache:32KB rss:30984KB mapped_file:0KB swap:0KB inactive_anon:6912KB active_anon:24072KB inactive_file:32KB active_file:0KB unevictable:0KB
Memory cgroup stats for /A/B: cache:0KB rss:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:0KB inactive_file:0KB active_file:0KB unevictable:0KB
Memory cgroup stats for /A/C: cache:0KB rss:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:0KB inactive_file:0KB active_file:0KB unevictable:0KB
Memory cgroup stats for /A/B/D: cache:32KB rss:71352KB mapped_file:0KB swap:0KB inactive_anon:6656KB active_anon:64696KB inactive_file:16KB active_file:16KB unevictable:0KB
[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name
[ 2260] 0 2260 6006 1325 18 0 0 god
[ 2383] 0 2383 6003 1319 17 0 0 god
[ 2503] 0 2503 6004 1321 18 0 0 god
[ 2622] 0 2622 6004 1321 16 0 0 god
[ 2695] 0 2695 8720 7741 22 0 0 mal-30
[ 2704] 0 2704 21520 17839 43 0 0 mal-80
Memory cgroup out of memory: Kill process 2704 (mal-80) score 669 or sacrifice child
Killed process 2704 (mal-80) total-vm:86080kB, anon-rss:71016kB, file-rss:340kB
This version provides more pointed info for memcg in "Memory cgroup stats
for XXX" section.
Signed-off-by: Sha Zhengju <handai.szj@taobao.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-23 07:32:05 +07:00
|
|
|
}
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
|
|
|
|
pr_info("Memory cgroup stats for ");
|
|
|
|
pr_cont_cgroup_path(memcg->css.cgroup);
|
|
|
|
pr_cont(":");
|
|
|
|
buf = memory_stat_format(memcg);
|
|
|
|
if (!buf)
|
|
|
|
return;
|
|
|
|
pr_info("%s", buf);
|
|
|
|
kfree(buf);
|
2009-04-03 06:57:39 +07:00
|
|
|
}
|
|
|
|
|
oom: badness heuristic rewrite
This a complete rewrite of the oom killer's badness() heuristic which is
used to determine which task to kill in oom conditions. The goal is to
make it as simple and predictable as possible so the results are better
understood and we end up killing the task which will lead to the most
memory freeing while still respecting the fine-tuning from userspace.
Instead of basing the heuristic on mm->total_vm for each task, the task's
rss and swap space is used instead. This is a better indication of the
amount of memory that will be freeable if the oom killed task is chosen
and subsequently exits. This helps specifically in cases where KDE or
GNOME is chosen for oom kill on desktop systems instead of a memory
hogging task.
The baseline for the heuristic is a proportion of memory that each task is
currently using in memory plus swap compared to the amount of "allowable"
memory. "Allowable," in this sense, means the system-wide resources for
unconstrained oom conditions, the set of mempolicy nodes, the mems
attached to current's cpuset, or a memory controller's limit. The
proportion is given on a scale of 0 (never kill) to 1000 (always kill),
roughly meaning that if a task has a badness() score of 500 that the task
consumes approximately 50% of allowable memory resident in RAM or in swap
space.
The proportion is always relative to the amount of "allowable" memory and
not the total amount of RAM systemwide so that mempolicies and cpusets may
operate in isolation; they shall not need to know the true size of the
machine on which they are running if they are bound to a specific set of
nodes or mems, respectively.
Root tasks are given 3% extra memory just like __vm_enough_memory()
provides in LSMs. In the event of two tasks consuming similar amounts of
memory, it is generally better to save root's task.
Because of the change in the badness() heuristic's baseline, it is also
necessary to introduce a new user interface to tune it. It's not possible
to redefine the meaning of /proc/pid/oom_adj with a new scale since the
ABI cannot be changed for backward compatability. Instead, a new tunable,
/proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may
be used to polarize the heuristic such that certain tasks are never
considered for oom kill while others may always be considered. The value
is added directly into the badness() score so a value of -500, for
example, means to discount 50% of its memory consumption in comparison to
other tasks either on the system, bound to the mempolicy, in the cpuset,
or sharing the same memory controller.
/proc/pid/oom_adj is changed so that its meaning is rescaled into the
units used by /proc/pid/oom_score_adj, and vice versa. Changing one of
these per-task tunables will rescale the value of the other to an
equivalent meaning. Although /proc/pid/oom_adj was originally defined as
a bitshift on the badness score, it now shares the same linear growth as
/proc/pid/oom_score_adj but with different granularity. This is required
so the ABI is not broken with userspace applications and allows oom_adj to
be deprecated for future removal.
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-08-10 07:19:46 +07:00
|
|
|
/*
|
|
|
|
* Return the memory (and swap, if configured) limit for a memcg.
|
|
|
|
*/
|
2018-06-08 07:06:18 +07:00
|
|
|
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
|
oom: badness heuristic rewrite
This a complete rewrite of the oom killer's badness() heuristic which is
used to determine which task to kill in oom conditions. The goal is to
make it as simple and predictable as possible so the results are better
understood and we end up killing the task which will lead to the most
memory freeing while still respecting the fine-tuning from userspace.
Instead of basing the heuristic on mm->total_vm for each task, the task's
rss and swap space is used instead. This is a better indication of the
amount of memory that will be freeable if the oom killed task is chosen
and subsequently exits. This helps specifically in cases where KDE or
GNOME is chosen for oom kill on desktop systems instead of a memory
hogging task.
The baseline for the heuristic is a proportion of memory that each task is
currently using in memory plus swap compared to the amount of "allowable"
memory. "Allowable," in this sense, means the system-wide resources for
unconstrained oom conditions, the set of mempolicy nodes, the mems
attached to current's cpuset, or a memory controller's limit. The
proportion is given on a scale of 0 (never kill) to 1000 (always kill),
roughly meaning that if a task has a badness() score of 500 that the task
consumes approximately 50% of allowable memory resident in RAM or in swap
space.
The proportion is always relative to the amount of "allowable" memory and
not the total amount of RAM systemwide so that mempolicies and cpusets may
operate in isolation; they shall not need to know the true size of the
machine on which they are running if they are bound to a specific set of
nodes or mems, respectively.
Root tasks are given 3% extra memory just like __vm_enough_memory()
provides in LSMs. In the event of two tasks consuming similar amounts of
memory, it is generally better to save root's task.
Because of the change in the badness() heuristic's baseline, it is also
necessary to introduce a new user interface to tune it. It's not possible
to redefine the meaning of /proc/pid/oom_adj with a new scale since the
ABI cannot be changed for backward compatability. Instead, a new tunable,
/proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may
be used to polarize the heuristic such that certain tasks are never
considered for oom kill while others may always be considered. The value
is added directly into the badness() score so a value of -500, for
example, means to discount 50% of its memory consumption in comparison to
other tasks either on the system, bound to the mempolicy, in the cpuset,
or sharing the same memory controller.
/proc/pid/oom_adj is changed so that its meaning is rescaled into the
units used by /proc/pid/oom_score_adj, and vice versa. Changing one of
these per-task tunables will rescale the value of the other to an
equivalent meaning. Although /proc/pid/oom_adj was originally defined as
a bitshift on the badness score, it now shares the same linear growth as
/proc/pid/oom_score_adj but with different granularity. This is required
so the ABI is not broken with userspace applications and allows oom_adj to
be deprecated for future removal.
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-08-10 07:19:46 +07:00
|
|
|
{
|
2020-10-14 06:52:52 +07:00
|
|
|
unsigned long max = READ_ONCE(memcg->memory.max);
|
|
|
|
|
|
|
|
if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
|
|
|
|
if (mem_cgroup_swappiness(memcg))
|
|
|
|
max += min(READ_ONCE(memcg->swap.max),
|
|
|
|
(unsigned long)total_swap_pages);
|
|
|
|
} else { /* v1 */
|
|
|
|
if (mem_cgroup_swappiness(memcg)) {
|
|
|
|
/* Calculate swap excess capacity from memsw limit */
|
|
|
|
unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
|
|
|
|
|
|
|
|
max += min(swap, (unsigned long)total_swap_pages);
|
|
|
|
}
|
2012-11-17 05:14:49 +07:00
|
|
|
}
|
2018-06-08 07:06:18 +07:00
|
|
|
return max;
|
oom: badness heuristic rewrite
This a complete rewrite of the oom killer's badness() heuristic which is
used to determine which task to kill in oom conditions. The goal is to
make it as simple and predictable as possible so the results are better
understood and we end up killing the task which will lead to the most
memory freeing while still respecting the fine-tuning from userspace.
Instead of basing the heuristic on mm->total_vm for each task, the task's
rss and swap space is used instead. This is a better indication of the
amount of memory that will be freeable if the oom killed task is chosen
and subsequently exits. This helps specifically in cases where KDE or
GNOME is chosen for oom kill on desktop systems instead of a memory
hogging task.
The baseline for the heuristic is a proportion of memory that each task is
currently using in memory plus swap compared to the amount of "allowable"
memory. "Allowable," in this sense, means the system-wide resources for
unconstrained oom conditions, the set of mempolicy nodes, the mems
attached to current's cpuset, or a memory controller's limit. The
proportion is given on a scale of 0 (never kill) to 1000 (always kill),
roughly meaning that if a task has a badness() score of 500 that the task
consumes approximately 50% of allowable memory resident in RAM or in swap
space.
The proportion is always relative to the amount of "allowable" memory and
not the total amount of RAM systemwide so that mempolicies and cpusets may
operate in isolation; they shall not need to know the true size of the
machine on which they are running if they are bound to a specific set of
nodes or mems, respectively.
Root tasks are given 3% extra memory just like __vm_enough_memory()
provides in LSMs. In the event of two tasks consuming similar amounts of
memory, it is generally better to save root's task.
Because of the change in the badness() heuristic's baseline, it is also
necessary to introduce a new user interface to tune it. It's not possible
to redefine the meaning of /proc/pid/oom_adj with a new scale since the
ABI cannot be changed for backward compatability. Instead, a new tunable,
/proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may
be used to polarize the heuristic such that certain tasks are never
considered for oom kill while others may always be considered. The value
is added directly into the badness() score so a value of -500, for
example, means to discount 50% of its memory consumption in comparison to
other tasks either on the system, bound to the mempolicy, in the cpuset,
or sharing the same memory controller.
/proc/pid/oom_adj is changed so that its meaning is rescaled into the
units used by /proc/pid/oom_score_adj, and vice versa. Changing one of
these per-task tunables will rescale the value of the other to an
equivalent meaning. Although /proc/pid/oom_adj was originally defined as
a bitshift on the badness score, it now shares the same linear growth as
/proc/pid/oom_score_adj but with different granularity. This is required
so the ABI is not broken with userspace applications and allows oom_adj to
be deprecated for future removal.
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-08-10 07:19:46 +07:00
|
|
|
}
|
|
|
|
|
mm, memcg: proportional memory.{low,min} reclaim
cgroup v2 introduces two memory protection thresholds: memory.low
(best-effort) and memory.min (hard protection). While they generally do
what they say on the tin, there is a limitation in their implementation
that makes them difficult to use effectively: that cliff behaviour often
manifests when they become eligible for reclaim. This patch implements
more intuitive and usable behaviour, where we gradually mount more
reclaim pressure as cgroups further and further exceed their protection
thresholds.
This cliff edge behaviour happens because we only choose whether or not
to reclaim based on whether the memcg is within its protection limits
(see the use of mem_cgroup_protected in shrink_node), but we don't vary
our reclaim behaviour based on this information. Imagine the following
timeline, with the numbers the lruvec size in this zone:
1. memory.low=1000000, memory.current=999999. 0 pages may be scanned.
2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned.
3. memory.low=1000000, memory.current=1000001. 1000001* pages may be
scanned. (?!)
* Of course, we won't usually scan all available pages in the zone even
without this patch because of scan control priority, over-reclaim
protection, etc. However, as shown by the tests at the end, these
techniques don't sufficiently throttle such an extreme change in input,
so cliff-like behaviour isn't really averted by their existence alone.
Here's an example of how this plays out in practice. At Facebook, we are
trying to protect various workloads from "system" software, like
configuration management tools, metric collectors, etc (see this[0] case
study). In order to find a suitable memory.low value, we start by
determining the expected memory range within which the workload will be
comfortable operating. This isn't an exact science -- memory usage deemed
"comfortable" will vary over time due to user behaviour, differences in
composition of work, etc, etc. As such we need to ballpark memory.low,
but doing this is currently problematic:
1. If we end up setting it too low for the workload, it won't have
*any* effect (see discussion above). The group will receive the full
weight of reclaim and won't have any priority while competing with the
less important system software, as if we had no memory.low configured
at all.
2. Because of this behaviour, we end up erring on the side of setting
it too high, such that the comfort range is reliably covered. However,
protected memory is completely unavailable to the rest of the system,
so we might cause undue memory and IO pressure there when we *know* we
have some elasticity in the workload.
3. Even if we get the value totally right, smack in the middle of the
comfort zone, we get extreme jumps between no pressure and full
pressure that cause unpredictable pressure spikes in the workload due
to the current binary reclaim behaviour.
With this patch, we can set it to our ballpark estimation without too much
worry. Any undesirable behaviour, such as too much or too little reclaim
pressure on the workload or system will be proportional to how far our
estimation is off. This means we can set memory.low much more
conservatively and thus waste less resources *without* the risk of the
workload falling off a cliff if we overshoot.
As a more abstract technical description, this unintuitive behaviour
results in having to give high-priority workloads a large protection
buffer on top of their expected usage to function reliably, as otherwise
we have abrupt periods of dramatically increased memory pressure which
hamper performance. Having to set these thresholds so high wastes
resources and generally works against the principle of work conservation.
In addition, having proportional memory reclaim behaviour has other
benefits. Most notably, before this patch it's basically mandatory to set
memory.low to a higher than desirable value because otherwise as soon as
you exceed memory.low, all protection is lost, and all pages are eligible
to scan again. By contrast, having a gradual ramp in reclaim pressure
means that you now still get some protection when thresholds are exceeded,
which means that one can now be more comfortable setting memory.low to
lower values without worrying that all protection will be lost. This is
important because workingset size is really hard to know exactly,
especially with variable workloads, so at least getting *some* protection
if your workingset size grows larger than you expect increases user
confidence in setting memory.low without a huge buffer on top being
needed.
Thanks a lot to Johannes Weiner and Tejun Heo for their advice and
assistance in thinking about how to make this work better.
In testing these changes, I intended to verify that:
1. Changes in page scanning become gradual and proportional instead of
binary.
To test this, I experimented stepping further and further down
memory.low protection on a workload that floats around 19G workingset
when under memory.low protection, watching page scan rates for the
workload cgroup:
+------------+-----------------+--------------------+--------------+
| memory.low | test (pgscan/s) | control (pgscan/s) | % of control |
+------------+-----------------+--------------------+--------------+
| 21G | 0 | 0 | N/A |
| 17G | 867 | 3799 | 23% |
| 12G | 1203 | 3543 | 34% |
| 8G | 2534 | 3979 | 64% |
| 4G | 3980 | 4147 | 96% |
| 0 | 3799 | 3980 | 95% |
+------------+-----------------+--------------------+--------------+
As you can see, the test kernel (with a kernel containing this
patch) ramps up page scanning significantly more gradually than the
control kernel (without this patch).
2. More gradual ramp up in reclaim aggression doesn't result in
premature OOMs.
To test this, I wrote a script that slowly increments the number of
pages held by stress(1)'s --vm-keep mode until a production system
entered severe overall memory contention. This script runs in a highly
protected slice taking up the majority of available system memory.
Watching vmstat revealed that page scanning continued essentially
nominally between test and control, without causing forward reclaim
progress to become arrested.
[0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project
[akpm@linux-foundation.org: reflow block comments to fit in 80 cols]
[chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection]
Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name
Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-07 07:58:32 +07:00
|
|
|
unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
return page_counter_read(&memcg->memory);
|
|
|
|
}
|
|
|
|
|
2016-03-18 04:20:28 +07:00
|
|
|
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
2012-12-12 07:00:26 +07:00
|
|
|
int order)
|
mm, memcg: introduce own oom handler to iterate only over its own threads
The global oom killer is serialized by the per-zonelist
try_set_zonelist_oom() which is used in the page allocator. Concurrent
oom kills are thus a rare event and only occur in systems using
mempolicies and with a large number of nodes.
Memory controller oom kills, however, can frequently be concurrent since
there is no serialization once the oom killer is called for oom conditions
in several different memcgs in parallel.
This creates a massive contention on tasklist_lock since the oom killer
requires the readside for the tasklist iteration. If several memcgs are
calling the oom killer, this lock can be held for a substantial amount of
time, especially if threads continue to enter it as other threads are
exiting.
Since the exit path grabs the writeside of the lock with irqs disabled in
a few different places, this can cause a soft lockup on cpus as a result
of tasklist_lock starvation.
The kernel lacks unfair writelocks, and successful calls to the oom killer
usually result in at least one thread entering the exit path, so an
alternative solution is needed.
This patch introduces a seperate oom handler for memcgs so that they do
not require tasklist_lock for as much time. Instead, it iterates only
over the threads attached to the oom memcg and grabs a reference to the
selected thread before calling oom_kill_process() to ensure it doesn't
prematurely exit.
This still requires tasklist_lock for the tasklist dump, iterating
children of the selected process, and killing all other threads on the
system sharing the same memory as the selected victim. So while this
isn't a complete solution to tasklist_lock starvation, it significantly
reduces the amount of time that it is held.
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Sha Zhengju <handai.szj@taobao.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 06:43:44 +07:00
|
|
|
{
|
2015-09-09 05:00:36 +07:00
|
|
|
struct oom_control oc = {
|
|
|
|
.zonelist = NULL,
|
|
|
|
.nodemask = NULL,
|
2016-07-27 05:22:33 +07:00
|
|
|
.memcg = memcg,
|
2015-09-09 05:00:36 +07:00
|
|
|
.gfp_mask = gfp_mask,
|
|
|
|
.order = order,
|
|
|
|
};
|
2020-08-07 13:22:08 +07:00
|
|
|
bool ret = true;
|
mm, memcg: introduce own oom handler to iterate only over its own threads
The global oom killer is serialized by the per-zonelist
try_set_zonelist_oom() which is used in the page allocator. Concurrent
oom kills are thus a rare event and only occur in systems using
mempolicies and with a large number of nodes.
Memory controller oom kills, however, can frequently be concurrent since
there is no serialization once the oom killer is called for oom conditions
in several different memcgs in parallel.
This creates a massive contention on tasklist_lock since the oom killer
requires the readside for the tasklist iteration. If several memcgs are
calling the oom killer, this lock can be held for a substantial amount of
time, especially if threads continue to enter it as other threads are
exiting.
Since the exit path grabs the writeside of the lock with irqs disabled in
a few different places, this can cause a soft lockup on cpus as a result
of tasklist_lock starvation.
The kernel lacks unfair writelocks, and successful calls to the oom killer
usually result in at least one thread entering the exit path, so an
alternative solution is needed.
This patch introduces a seperate oom handler for memcgs so that they do
not require tasklist_lock for as much time. Instead, it iterates only
over the threads attached to the oom memcg and grabs a reference to the
selected thread before calling oom_kill_process() to ensure it doesn't
prematurely exit.
This still requires tasklist_lock for the tasklist dump, iterating
children of the selected process, and killing all other threads on the
system sharing the same memory as the selected victim. So while this
isn't a complete solution to tasklist_lock starvation, it significantly
reduces the amount of time that it is held.
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Sha Zhengju <handai.szj@taobao.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 06:43:44 +07:00
|
|
|
|
2019-03-06 06:46:47 +07:00
|
|
|
if (mutex_lock_killable(&oom_lock))
|
|
|
|
return true;
|
2020-08-07 13:22:08 +07:00
|
|
|
|
|
|
|
if (mem_cgroup_margin(memcg) >= (1 << order))
|
|
|
|
goto unlock;
|
|
|
|
|
2019-03-06 06:46:47 +07:00
|
|
|
/*
|
|
|
|
* A few threads which were not waiting at mutex_lock_killable() can
|
|
|
|
* fail to bail out. Therefore, check again after holding oom_lock.
|
|
|
|
*/
|
|
|
|
ret = should_force_charge() || out_of_memory(&oc);
|
2020-08-07 13:22:08 +07:00
|
|
|
|
|
|
|
unlock:
|
2015-06-25 06:57:19 +07:00
|
|
|
mutex_unlock(&oom_lock);
|
2016-10-08 06:57:23 +07:00
|
|
|
return ret;
|
mm, memcg: introduce own oom handler to iterate only over its own threads
The global oom killer is serialized by the per-zonelist
try_set_zonelist_oom() which is used in the page allocator. Concurrent
oom kills are thus a rare event and only occur in systems using
mempolicies and with a large number of nodes.
Memory controller oom kills, however, can frequently be concurrent since
there is no serialization once the oom killer is called for oom conditions
in several different memcgs in parallel.
This creates a massive contention on tasklist_lock since the oom killer
requires the readside for the tasklist iteration. If several memcgs are
calling the oom killer, this lock can be held for a substantial amount of
time, especially if threads continue to enter it as other threads are
exiting.
Since the exit path grabs the writeside of the lock with irqs disabled in
a few different places, this can cause a soft lockup on cpus as a result
of tasklist_lock starvation.
The kernel lacks unfair writelocks, and successful calls to the oom killer
usually result in at least one thread entering the exit path, so an
alternative solution is needed.
This patch introduces a seperate oom handler for memcgs so that they do
not require tasklist_lock for as much time. Instead, it iterates only
over the threads attached to the oom memcg and grabs a reference to the
selected thread before calling oom_kill_process() to ensure it doesn't
prematurely exit.
This still requires tasklist_lock for the tasklist dump, iterating
children of the selected process, and killing all other threads on the
system sharing the same memory as the selected victim. So while this
isn't a complete solution to tasklist_lock starvation, it significantly
reduces the amount of time that it is held.
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Sha Zhengju <handai.szj@taobao.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 06:43:44 +07:00
|
|
|
}
|
|
|
|
|
2013-09-25 05:27:41 +07:00
|
|
|
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
|
2016-07-29 05:46:05 +07:00
|
|
|
pg_data_t *pgdat,
|
2013-09-25 05:27:41 +07:00
|
|
|
gfp_t gfp_mask,
|
|
|
|
unsigned long *total_scanned)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *victim = NULL;
|
|
|
|
int total = 0;
|
|
|
|
int loop = 0;
|
|
|
|
unsigned long excess;
|
|
|
|
unsigned long nr_scanned;
|
|
|
|
struct mem_cgroup_reclaim_cookie reclaim = {
|
2016-07-29 05:46:05 +07:00
|
|
|
.pgdat = pgdat,
|
2013-09-25 05:27:41 +07:00
|
|
|
};
|
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
excess = soft_limit_excess(root_memcg);
|
2013-09-25 05:27:41 +07:00
|
|
|
|
|
|
|
while (1) {
|
|
|
|
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
|
|
|
|
if (!victim) {
|
|
|
|
loop++;
|
|
|
|
if (loop >= 2) {
|
|
|
|
/*
|
|
|
|
* If we have not been able to reclaim
|
|
|
|
* anything, it might because there are
|
|
|
|
* no reclaimable pages under this hierarchy
|
|
|
|
*/
|
|
|
|
if (!total)
|
|
|
|
break;
|
|
|
|
/*
|
|
|
|
* We want to do more targeted reclaim.
|
|
|
|
* excess >> 2 is not to excessive so as to
|
|
|
|
* reclaim too much, nor too less that we keep
|
|
|
|
* coming back to reclaim from this cgroup
|
|
|
|
*/
|
|
|
|
if (total >= (excess >> 2) ||
|
|
|
|
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
2016-07-29 05:46:02 +07:00
|
|
|
total += mem_cgroup_shrink_node(victim, gfp_mask, false,
|
2016-07-29 05:46:05 +07:00
|
|
|
pgdat, &nr_scanned);
|
2013-09-25 05:27:41 +07:00
|
|
|
*total_scanned += nr_scanned;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
if (!soft_limit_excess(root_memcg))
|
2013-09-25 05:27:41 +07:00
|
|
|
break;
|
2009-01-08 09:08:06 +07:00
|
|
|
}
|
2013-09-25 05:27:41 +07:00
|
|
|
mem_cgroup_iter_break(root_memcg, victim);
|
|
|
|
return total;
|
2009-01-08 09:08:06 +07:00
|
|
|
}
|
|
|
|
|
2013-11-01 06:34:14 +07:00
|
|
|
#ifdef CONFIG_LOCKDEP
|
|
|
|
static struct lockdep_map memcg_oom_lock_dep_map = {
|
|
|
|
.name = "memcg_oom_lock",
|
|
|
|
};
|
|
|
|
#endif
|
|
|
|
|
2013-09-13 05:13:43 +07:00
|
|
|
static DEFINE_SPINLOCK(memcg_oom_lock);
|
|
|
|
|
2010-03-11 06:22:39 +07:00
|
|
|
/*
|
|
|
|
* Check OOM-Killer is already running under our hierarchy.
|
|
|
|
* If someone is running, return false.
|
|
|
|
*/
|
2013-09-13 05:13:43 +07:00
|
|
|
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
|
2010-03-11 06:22:39 +07:00
|
|
|
{
|
2011-07-27 06:08:23 +07:00
|
|
|
struct mem_cgroup *iter, *failed = NULL;
|
2009-01-08 09:08:08 +07:00
|
|
|
|
2013-09-13 05:13:43 +07:00
|
|
|
spin_lock(&memcg_oom_lock);
|
|
|
|
|
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:
Memory control groups are currently bolted onto the side of
traditional memory management in places where better integration would
be preferrable. To reclaim memory, for example, memory control groups
maintain their own LRU list and reclaim strategy aside from the global
per-zone LRU list reclaim. But an extra list head for each existing
page frame is expensive and maintaining it requires additional code.
This patchset disables the global per-zone LRU lists on memory cgroup
configurations and converts all its users to operate on the per-memory
cgroup lists instead. As LRU pages are then exclusively on one list,
this saves two list pointers for each page frame in the system:
page_cgroup array size with 4G physical memory
vanilla: allocated 31457280 bytes of page_cgroup
patched: allocated 15728640 bytes of page_cgroup
At the same time, system performance for various workloads is
unaffected:
100G sparse file cat, 4G physical memory, 10 runs, to test for code
bloat in the traditional LRU handling and kswapd & direct reclaim
paths, without/with the memory controller configured in
vanilla: 71.603(0.207) seconds
patched: 71.640(0.156) seconds
vanilla: 79.558(0.288) seconds
patched: 77.233(0.147) seconds
100G sparse file cat in 1G memory cgroup, 10 runs, to test for code
bloat in the traditional memory cgroup LRU handling and reclaim path
vanilla: 96.844(0.281) seconds
patched: 94.454(0.311) seconds
4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M
swap on SSD, 10 runs, to test for regressions in kswapd & direct
reclaim using per-memcg LRU lists with multiple memcgs and multiple
allocators within each memcg
vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]
16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M
swap on SSD, 10 runs, to test for regressions in hierarchical memcg
setups
vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]
This patch:
There are currently two different implementations of iterating over a
memory cgroup hierarchy tree.
Consolidate them into one worker function and base the convenience
looping-macros on top of it.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:48 +07:00
|
|
|
for_each_mem_cgroup_tree(iter, memcg) {
|
2011-08-26 05:59:16 +07:00
|
|
|
if (iter->oom_lock) {
|
2011-07-27 06:08:23 +07:00
|
|
|
/*
|
|
|
|
* this subtree of our hierarchy is already locked
|
|
|
|
* so we cannot give a lock.
|
|
|
|
*/
|
|
|
|
failed = iter;
|
mm: memcg: consolidate hierarchy iteration primitives
The memcg naturalization series:
Memory control groups are currently bolted onto the side of
traditional memory management in places where better integration would
be preferrable. To reclaim memory, for example, memory control groups
maintain their own LRU list and reclaim strategy aside from the global
per-zone LRU list reclaim. But an extra list head for each existing
page frame is expensive and maintaining it requires additional code.
This patchset disables the global per-zone LRU lists on memory cgroup
configurations and converts all its users to operate on the per-memory
cgroup lists instead. As LRU pages are then exclusively on one list,
this saves two list pointers for each page frame in the system:
page_cgroup array size with 4G physical memory
vanilla: allocated 31457280 bytes of page_cgroup
patched: allocated 15728640 bytes of page_cgroup
At the same time, system performance for various workloads is
unaffected:
100G sparse file cat, 4G physical memory, 10 runs, to test for code
bloat in the traditional LRU handling and kswapd & direct reclaim
paths, without/with the memory controller configured in
vanilla: 71.603(0.207) seconds
patched: 71.640(0.156) seconds
vanilla: 79.558(0.288) seconds
patched: 77.233(0.147) seconds
100G sparse file cat in 1G memory cgroup, 10 runs, to test for code
bloat in the traditional memory cgroup LRU handling and reclaim path
vanilla: 96.844(0.281) seconds
patched: 94.454(0.311) seconds
4 unlimited memcgs running kbuild -j32 each, 4G physical memory, 500M
swap on SSD, 10 runs, to test for regressions in kswapd & direct
reclaim using per-memcg LRU lists with multiple memcgs and multiple
allocators within each memcg
vanilla: 717.722(1.440) seconds [ 69720.100(11600.835) majfaults ]
patched: 714.106(2.313) seconds [ 71109.300(14886.186) majfaults ]
16 unlimited memcgs running kbuild, 1900M hierarchical limit, 500M
swap on SSD, 10 runs, to test for regressions in hierarchical memcg
setups
vanilla: 2742.058(1.992) seconds [ 26479.600(1736.737) majfaults ]
patched: 2743.267(1.214) seconds [ 27240.700(1076.063) majfaults ]
This patch:
There are currently two different implementations of iterating over a
memory cgroup hierarchy tree.
Consolidate them into one worker function and base the convenience
looping-macros on top of it.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-13 08:17:48 +07:00
|
|
|
mem_cgroup_iter_break(memcg, iter);
|
|
|
|
break;
|
2011-08-26 05:59:16 +07:00
|
|
|
} else
|
|
|
|
iter->oom_lock = true;
|
2010-10-28 05:33:41 +07:00
|
|
|
}
|
2010-03-11 06:22:39 +07:00
|
|
|
|
2013-09-13 05:13:43 +07:00
|
|
|
if (failed) {
|
|
|
|
/*
|
|
|
|
* OK, we failed to lock the whole subtree so we have
|
|
|
|
* to clean up what we set up to the failing subtree
|
|
|
|
*/
|
|
|
|
for_each_mem_cgroup_tree(iter, memcg) {
|
|
|
|
if (iter == failed) {
|
|
|
|
mem_cgroup_iter_break(memcg, iter);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
iter->oom_lock = false;
|
2011-07-27 06:08:23 +07:00
|
|
|
}
|
2013-11-01 06:34:14 +07:00
|
|
|
} else
|
|
|
|
mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
|
2013-09-13 05:13:43 +07:00
|
|
|
|
|
|
|
spin_unlock(&memcg_oom_lock);
|
|
|
|
|
|
|
|
return !failed;
|
2009-01-08 09:08:08 +07:00
|
|
|
}
|
memcg: fix OOM killer under memcg
This patch tries to fix OOM Killer problems caused by hierarchy.
Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to
kill a task in memcg.
But, when hierarchy is used, it's broken and correct task cannot
be killed. For example, in following cgroup
/groupA/ hierarchy=1, limit=1G,
01 nolimit
02 nolimit
All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to
groupA's 1Gbytes but OOM Killer just kills tasks in groupA.
This patch provides makes the bad process be selected from all tasks
under hierarchy. BTW, currently, oom_jiffies is updated against groupA
in above case. oom_jiffies of tree should be updated.
To see how oom_jiffies is used, please check mem_cgroup_oom_called()
callers.
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: const fix]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 06:57:38 +07:00
|
|
|
|
2013-09-13 05:13:43 +07:00
|
|
|
static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
|
memcg: fix OOM killer under memcg
This patch tries to fix OOM Killer problems caused by hierarchy.
Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to
kill a task in memcg.
But, when hierarchy is used, it's broken and correct task cannot
be killed. For example, in following cgroup
/groupA/ hierarchy=1, limit=1G,
01 nolimit
02 nolimit
All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to
groupA's 1Gbytes but OOM Killer just kills tasks in groupA.
This patch provides makes the bad process be selected from all tasks
under hierarchy. BTW, currently, oom_jiffies is updated against groupA
in above case. oom_jiffies of tree should be updated.
To see how oom_jiffies is used, please check mem_cgroup_oom_called()
callers.
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: const fix]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 06:57:38 +07:00
|
|
|
{
|
2010-10-28 05:33:41 +07:00
|
|
|
struct mem_cgroup *iter;
|
|
|
|
|
2013-09-13 05:13:43 +07:00
|
|
|
spin_lock(&memcg_oom_lock);
|
2019-09-19 23:09:40 +07:00
|
|
|
mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
|
2011-11-03 03:38:15 +07:00
|
|
|
for_each_mem_cgroup_tree(iter, memcg)
|
2011-07-27 06:08:23 +07:00
|
|
|
iter->oom_lock = false;
|
2013-09-13 05:13:43 +07:00
|
|
|
spin_unlock(&memcg_oom_lock);
|
2011-07-27 06:08:23 +07:00
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
|
2011-07-27 06:08:23 +07:00
|
|
|
{
|
|
|
|
struct mem_cgroup *iter;
|
|
|
|
|
2015-06-25 06:58:23 +07:00
|
|
|
spin_lock(&memcg_oom_lock);
|
2011-11-03 03:38:15 +07:00
|
|
|
for_each_mem_cgroup_tree(iter, memcg)
|
2015-06-25 06:58:23 +07:00
|
|
|
iter->under_oom++;
|
|
|
|
spin_unlock(&memcg_oom_lock);
|
2011-07-27 06:08:23 +07:00
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
|
2011-07-27 06:08:23 +07:00
|
|
|
{
|
|
|
|
struct mem_cgroup *iter;
|
|
|
|
|
2010-03-11 06:22:39 +07:00
|
|
|
/*
|
2020-10-14 06:53:05 +07:00
|
|
|
* Be careful about under_oom underflows becase a child memcg
|
|
|
|
* could have been added after mem_cgroup_mark_under_oom.
|
2010-03-11 06:22:39 +07:00
|
|
|
*/
|
2015-06-25 06:58:23 +07:00
|
|
|
spin_lock(&memcg_oom_lock);
|
2011-11-03 03:38:15 +07:00
|
|
|
for_each_mem_cgroup_tree(iter, memcg)
|
2015-06-25 06:58:23 +07:00
|
|
|
if (iter->under_oom > 0)
|
|
|
|
iter->under_oom--;
|
|
|
|
spin_unlock(&memcg_oom_lock);
|
memcg: fix OOM killer under memcg
This patch tries to fix OOM Killer problems caused by hierarchy.
Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to
kill a task in memcg.
But, when hierarchy is used, it's broken and correct task cannot
be killed. For example, in following cgroup
/groupA/ hierarchy=1, limit=1G,
01 nolimit
02 nolimit
All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to
groupA's 1Gbytes but OOM Killer just kills tasks in groupA.
This patch provides makes the bad process be selected from all tasks
under hierarchy. BTW, currently, oom_jiffies is updated against groupA
in above case. oom_jiffies of tree should be updated.
To see how oom_jiffies is used, please check mem_cgroup_oom_called()
callers.
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: const fix]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 06:57:38 +07:00
|
|
|
}
|
|
|
|
|
2010-03-11 06:22:39 +07:00
|
|
|
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
|
|
|
|
|
2010-05-27 04:42:36 +07:00
|
|
|
struct oom_wait_info {
|
2012-03-22 06:34:18 +07:00
|
|
|
struct mem_cgroup *memcg;
|
2017-06-20 17:06:13 +07:00
|
|
|
wait_queue_entry_t wait;
|
2010-05-27 04:42:36 +07:00
|
|
|
};
|
|
|
|
|
2017-06-20 17:06:13 +07:00
|
|
|
static int memcg_oom_wake_function(wait_queue_entry_t *wait,
|
2010-05-27 04:42:36 +07:00
|
|
|
unsigned mode, int sync, void *arg)
|
|
|
|
{
|
2012-03-22 06:34:18 +07:00
|
|
|
struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
|
|
|
|
struct mem_cgroup *oom_wait_memcg;
|
2010-05-27 04:42:36 +07:00
|
|
|
struct oom_wait_info *oom_wait_info;
|
|
|
|
|
|
|
|
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
|
2012-03-22 06:34:18 +07:00
|
|
|
oom_wait_memcg = oom_wait_info->memcg;
|
2010-05-27 04:42:36 +07:00
|
|
|
|
2014-12-11 06:44:33 +07:00
|
|
|
if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
|
|
|
|
!mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
|
2010-05-27 04:42:36 +07:00
|
|
|
return 0;
|
|
|
|
return autoremove_wake_function(wait, mode, sync, arg);
|
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static void memcg_oom_recover(struct mem_cgroup *memcg)
|
2010-05-27 04:42:37 +07:00
|
|
|
{
|
2015-06-25 06:58:23 +07:00
|
|
|
/*
|
|
|
|
* For the following lockless ->under_oom test, the only required
|
|
|
|
* guarantee is that it must see the state asserted by an OOM when
|
|
|
|
* this function is called as a result of userland actions
|
|
|
|
* triggered by the notification of the OOM. This is trivially
|
|
|
|
* achieved by invoking mem_cgroup_mark_under_oom() before
|
|
|
|
* triggering notification.
|
|
|
|
*/
|
|
|
|
if (memcg && memcg->under_oom)
|
2015-06-25 06:58:21 +07:00
|
|
|
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
|
2010-05-27 04:42:37 +07:00
|
|
|
}
|
|
|
|
|
2018-08-18 05:47:11 +07:00
|
|
|
enum oom_status {
|
|
|
|
OOM_SUCCESS,
|
|
|
|
OOM_FAILED,
|
|
|
|
OOM_ASYNC,
|
|
|
|
OOM_SKIPPED
|
|
|
|
};
|
|
|
|
|
|
|
|
static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
|
memcg: fix OOM killer under memcg
This patch tries to fix OOM Killer problems caused by hierarchy.
Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to
kill a task in memcg.
But, when hierarchy is used, it's broken and correct task cannot
be killed. For example, in following cgroup
/groupA/ hierarchy=1, limit=1G,
01 nolimit
02 nolimit
All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to
groupA's 1Gbytes but OOM Killer just kills tasks in groupA.
This patch provides makes the bad process be selected from all tasks
under hierarchy. BTW, currently, oom_jiffies is updated against groupA
in above case. oom_jiffies of tree should be updated.
To see how oom_jiffies is used, please check mem_cgroup_oom_called()
callers.
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: const fix]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 06:57:38 +07:00
|
|
|
{
|
2018-12-28 15:39:57 +07:00
|
|
|
enum oom_status ret;
|
|
|
|
bool locked;
|
|
|
|
|
2018-08-18 05:47:11 +07:00
|
|
|
if (order > PAGE_ALLOC_COSTLY_ORDER)
|
|
|
|
return OOM_SKIPPED;
|
|
|
|
|
mm: don't raise MEMCG_OOM event due to failed high-order allocation
It was reported that on some of our machines containers were restarted
with OOM symptoms without an obvious reason. Despite there were almost no
memory pressure and plenty of page cache, MEMCG_OOM event was raised
occasionally, causing the container management software to think, that OOM
has happened. However, no tasks have been killed.
The following investigation showed that the problem is caused by a failing
attempt to charge a high-order page. In such case, the OOM killer is
never invoked. As shown below, it can happen under conditions, which are
very far from a real OOM: e.g. there is plenty of clean page cache and no
memory pressure.
There is no sense in raising an OOM event in this case, as it might
confuse a user and lead to wrong and excessive actions (e.g. restart the
workload, as in my case).
Let's look at the charging path in try_charge(). If the memory usage is
about memory.max, which is absolutely natural for most memory cgroups, we
try to reclaim some pages. Even if we were able to reclaim enough memory
for the allocation, the following check can fail due to a race with
another concurrent allocation:
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
goto retry;
For regular pages the following condition will save us from triggering
the OOM:
if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
goto retry;
But for high-order allocation this condition will intentionally fail. The
reason behind is that we'll likely fall to regular pages anyway, so it's
ok and even preferred to return ENOMEM.
In this case the idea of raising MEMCG_OOM looks dubious.
Fix this by moving MEMCG_OOM raising to mem_cgroup_oom() after allocation
order check, so that the event won't be raised for high order allocations.
This change doesn't affect regular pages allocation and charging.
Link: http://lkml.kernel.org/r/20181004214050.7417-1-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Michal Hocko <mhocko@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-27 05:09:48 +07:00
|
|
|
memcg_memory_event(memcg, MEMCG_OOM);
|
|
|
|
|
2010-03-11 06:22:39 +07:00
|
|
|
/*
|
2013-10-17 03:46:59 +07:00
|
|
|
* We are in the middle of the charge context here, so we
|
|
|
|
* don't want to block when potentially sitting on a callstack
|
|
|
|
* that holds all kinds of filesystem and mm locks.
|
|
|
|
*
|
2018-08-18 05:47:11 +07:00
|
|
|
* cgroup1 allows disabling the OOM killer and waiting for outside
|
|
|
|
* handling until the charge can succeed; remember the context and put
|
|
|
|
* the task to sleep at the end of the page fault when all locks are
|
|
|
|
* released.
|
2013-10-17 03:46:59 +07:00
|
|
|
*
|
2018-08-18 05:47:11 +07:00
|
|
|
* On the other hand, in-kernel OOM killer allows for an async victim
|
|
|
|
* memory reclaim (oom_reaper) and that means that we are not solely
|
|
|
|
* relying on the oom victim to make a forward progress and we can
|
|
|
|
* invoke the oom killer here.
|
|
|
|
*
|
|
|
|
* Please note that mem_cgroup_out_of_memory might fail to find a
|
|
|
|
* victim and then we have to bail out from the charge path.
|
2010-03-11 06:22:39 +07:00
|
|
|
*/
|
2018-08-18 05:47:11 +07:00
|
|
|
if (memcg->oom_kill_disable) {
|
|
|
|
if (!current->in_user_fault)
|
|
|
|
return OOM_SKIPPED;
|
|
|
|
css_get(&memcg->css);
|
|
|
|
current->memcg_in_oom = memcg;
|
|
|
|
current->memcg_oom_gfp_mask = mask;
|
|
|
|
current->memcg_oom_order = order;
|
|
|
|
|
|
|
|
return OOM_ASYNC;
|
|
|
|
}
|
|
|
|
|
2018-12-28 15:39:57 +07:00
|
|
|
mem_cgroup_mark_under_oom(memcg);
|
|
|
|
|
|
|
|
locked = mem_cgroup_oom_trylock(memcg);
|
|
|
|
|
|
|
|
if (locked)
|
|
|
|
mem_cgroup_oom_notify(memcg);
|
|
|
|
|
|
|
|
mem_cgroup_unmark_under_oom(memcg);
|
2018-08-18 05:47:11 +07:00
|
|
|
if (mem_cgroup_out_of_memory(memcg, mask, order))
|
2018-12-28 15:39:57 +07:00
|
|
|
ret = OOM_SUCCESS;
|
|
|
|
else
|
|
|
|
ret = OOM_FAILED;
|
|
|
|
|
|
|
|
if (locked)
|
|
|
|
mem_cgroup_oom_unlock(memcg);
|
2018-08-18 05:47:11 +07:00
|
|
|
|
2018-12-28 15:39:57 +07:00
|
|
|
return ret;
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* mem_cgroup_oom_synchronize - complete memcg OOM handling
|
2013-10-17 03:46:59 +07:00
|
|
|
* @handle: actually kill/wait or just clean up the OOM state
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
*
|
2013-10-17 03:46:59 +07:00
|
|
|
* This has to be called at the end of a page fault if the memcg OOM
|
|
|
|
* handler was enabled.
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
*
|
2013-10-17 03:46:59 +07:00
|
|
|
* Memcg supports userspace OOM handling where failed allocations must
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
* sleep on a waitqueue until the userspace task resolves the
|
|
|
|
* situation. Sleeping directly in the charge context with all kinds
|
|
|
|
* of locks held is not a good idea, instead we remember an OOM state
|
|
|
|
* in the task and mem_cgroup_oom_synchronize() has to be called at
|
2013-10-17 03:46:59 +07:00
|
|
|
* the end of the page fault to complete the OOM handling.
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
*
|
|
|
|
* Returns %true if an ongoing memcg OOM situation was detected and
|
2013-10-17 03:46:59 +07:00
|
|
|
* completed, %false otherwise.
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
*/
|
2013-10-17 03:46:59 +07:00
|
|
|
bool mem_cgroup_oom_synchronize(bool handle)
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
{
|
2015-11-06 09:46:09 +07:00
|
|
|
struct mem_cgroup *memcg = current->memcg_in_oom;
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
struct oom_wait_info owait;
|
2013-10-17 03:46:59 +07:00
|
|
|
bool locked;
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
|
|
|
|
/* OOM is global, do not handle */
|
|
|
|
if (!memcg)
|
2013-10-17 03:46:59 +07:00
|
|
|
return false;
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
|
2016-10-08 06:57:23 +07:00
|
|
|
if (!handle)
|
2013-10-17 03:46:59 +07:00
|
|
|
goto cleanup;
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
|
|
|
|
owait.memcg = memcg;
|
|
|
|
owait.wait.flags = 0;
|
|
|
|
owait.wait.func = memcg_oom_wake_function;
|
|
|
|
owait.wait.private = current;
|
sched/wait: Disambiguate wq_entry->task_list and wq_head->task_list naming
So I've noticed a number of instances where it was not obvious from the
code whether ->task_list was for a wait-queue head or a wait-queue entry.
Furthermore, there's a number of wait-queue users where the lists are
not for 'tasks' but other entities (poll tables, etc.), in which case
the 'task_list' name is actively confusing.
To clear this all up, name the wait-queue head and entry list structure
fields unambiguously:
struct wait_queue_head::task_list => ::head
struct wait_queue_entry::task_list => ::entry
For example, this code:
rqw->wait.task_list.next != &wait->task_list
... is was pretty unclear (to me) what it's doing, while now it's written this way:
rqw->wait.head.next != &wait->entry
... which makes it pretty clear that we are iterating a list until we see the head.
Other examples are:
list_for_each_entry_safe(pos, next, &x->task_list, task_list) {
list_for_each_entry(wq, &fence->wait.task_list, task_list) {
... where it's unclear (to me) what we are iterating, and during review it's
hard to tell whether it's trying to walk a wait-queue entry (which would be
a bug), while now it's written as:
list_for_each_entry_safe(pos, next, &x->head, entry) {
list_for_each_entry(wq, &fence->wait.head, entry) {
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-20 17:06:46 +07:00
|
|
|
INIT_LIST_HEAD(&owait.wait.entry);
|
2010-03-11 06:22:39 +07:00
|
|
|
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
|
2013-10-17 03:46:59 +07:00
|
|
|
mem_cgroup_mark_under_oom(memcg);
|
|
|
|
|
|
|
|
locked = mem_cgroup_oom_trylock(memcg);
|
|
|
|
|
|
|
|
if (locked)
|
|
|
|
mem_cgroup_oom_notify(memcg);
|
|
|
|
|
|
|
|
if (locked && !memcg->oom_kill_disable) {
|
|
|
|
mem_cgroup_unmark_under_oom(memcg);
|
|
|
|
finish_wait(&memcg_oom_waitq, &owait.wait);
|
2015-11-06 09:46:09 +07:00
|
|
|
mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
|
|
|
|
current->memcg_oom_order);
|
2013-10-17 03:46:59 +07:00
|
|
|
} else {
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
schedule();
|
2013-10-17 03:46:59 +07:00
|
|
|
mem_cgroup_unmark_under_oom(memcg);
|
|
|
|
finish_wait(&memcg_oom_waitq, &owait.wait);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (locked) {
|
2013-09-13 05:13:43 +07:00
|
|
|
mem_cgroup_oom_unlock(memcg);
|
|
|
|
/*
|
|
|
|
* There is no guarantee that an OOM-lock contender
|
|
|
|
* sees the wakeups triggered by the OOM kill
|
|
|
|
* uncharges. Wake any sleepers explicitely.
|
|
|
|
*/
|
|
|
|
memcg_oom_recover(memcg);
|
|
|
|
}
|
2013-10-17 03:46:59 +07:00
|
|
|
cleanup:
|
2015-11-06 09:46:09 +07:00
|
|
|
current->memcg_in_oom = NULL;
|
mm: memcg: do not trap chargers with full callstack on OOM
The memcg OOM handling is incredibly fragile and can deadlock. When a
task fails to charge memory, it invokes the OOM killer and loops right
there in the charge code until it succeeds. Comparably, any other task
that enters the charge path at this point will go to a waitqueue right
then and there and sleep until the OOM situation is resolved. The problem
is that these tasks may hold filesystem locks and the mmap_sem; locks that
the selected OOM victim may need to exit.
For example, in one reported case, the task invoking the OOM killer was
about to charge a page cache page during a write(), which holds the
i_mutex. The OOM killer selected a task that was just entering truncate()
and trying to acquire the i_mutex:
OOM invoking task:
mem_cgroup_handle_oom+0x241/0x3b0
mem_cgroup_cache_charge+0xbe/0xe0
add_to_page_cache_locked+0x4c/0x140
add_to_page_cache_lru+0x22/0x50
grab_cache_page_write_begin+0x8b/0xe0
ext3_write_begin+0x88/0x270
generic_file_buffered_write+0x116/0x290
__generic_file_aio_write+0x27c/0x480
generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
do_sync_write+0xea/0x130
vfs_write+0xf3/0x1f0
sys_write+0x51/0x90
system_call_fastpath+0x18/0x1d
OOM kill victim:
do_truncate+0x58/0xa0 # takes i_mutex
do_last+0x250/0xa30
path_openat+0xd7/0x440
do_filp_open+0x49/0xa0
do_sys_open+0x106/0x240
sys_open+0x20/0x30
system_call_fastpath+0x18/0x1d
The OOM handling task will retry the charge indefinitely while the OOM
killed task is not releasing any resources.
A similar scenario can happen when the kernel OOM killer for a memcg is
disabled and a userspace task is in charge of resolving OOM situations.
In this case, ALL tasks that enter the OOM path will be made to sleep on
the OOM waitqueue and wait for userspace to free resources or increase
the group's limit. But a userspace OOM handler is prone to deadlock
itself on the locks held by the waiting tasks. For example one of the
sleeping tasks may be stuck in a brk() call with the mmap_sem held for
writing but the userspace handler, in order to pick an optimal victim,
may need to read files from /proc/<pid>, which tries to acquire the same
mmap_sem for reading and deadlocks.
This patch changes the way tasks behave after detecting a memcg OOM and
makes sure nobody loops or sleeps with locks held:
1. When OOMing in a user fault, invoke the OOM killer and restart the
fault instead of looping on the charge attempt. This way, the OOM
victim can not get stuck on locks the looping task may hold.
2. When OOMing in a user fault but somebody else is handling it
(either the kernel OOM killer or a userspace handler), don't go to
sleep in the charge context. Instead, remember the OOMing memcg in
the task struct and then fully unwind the page fault stack with
-ENOMEM. pagefault_out_of_memory() will then call back into the
memcg code to check if the -ENOMEM came from the memcg, and then
either put the task to sleep on the memcg's OOM waitqueue or just
restart the fault. The OOM victim can no longer get stuck on any
lock a sleeping task may hold.
Debugged by Michal Hocko.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: azurIt <azurit@pobox.sk>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-13 05:13:44 +07:00
|
|
|
css_put(&memcg->css);
|
2010-03-11 06:22:39 +07:00
|
|
|
return true;
|
memcg: fix OOM killer under memcg
This patch tries to fix OOM Killer problems caused by hierarchy.
Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to
kill a task in memcg.
But, when hierarchy is used, it's broken and correct task cannot
be killed. For example, in following cgroup
/groupA/ hierarchy=1, limit=1G,
01 nolimit
02 nolimit
All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to
groupA's 1Gbytes but OOM Killer just kills tasks in groupA.
This patch provides makes the bad process be selected from all tasks
under hierarchy. BTW, currently, oom_jiffies is updated against groupA
in above case. oom_jiffies of tree should be updated.
To see how oom_jiffies is used, please check mem_cgroup_oom_called()
callers.
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: const fix]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 06:57:38 +07:00
|
|
|
}
|
|
|
|
|
2018-08-22 11:53:54 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
|
|
|
|
* @victim: task to be killed by the OOM killer
|
|
|
|
* @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
|
|
|
|
*
|
|
|
|
* Returns a pointer to a memory cgroup, which has to be cleaned up
|
|
|
|
* by killing all belonging OOM-killable tasks.
|
|
|
|
*
|
|
|
|
* Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
|
|
|
|
*/
|
|
|
|
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
|
|
|
|
struct mem_cgroup *oom_domain)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *oom_group = NULL;
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
|
|
|
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
if (!oom_domain)
|
|
|
|
oom_domain = root_mem_cgroup;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
|
|
|
|
memcg = mem_cgroup_from_task(victim);
|
|
|
|
if (memcg == root_mem_cgroup)
|
|
|
|
goto out;
|
|
|
|
|
2020-04-02 11:07:39 +07:00
|
|
|
/*
|
|
|
|
* If the victim task has been asynchronously moved to a different
|
|
|
|
* memory cgroup, we might end up killing tasks outside oom_domain.
|
|
|
|
* In this case it's better to ignore memory.group.oom.
|
|
|
|
*/
|
|
|
|
if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
|
|
|
|
goto out;
|
|
|
|
|
2018-08-22 11:53:54 +07:00
|
|
|
/*
|
|
|
|
* Traverse the memory cgroup hierarchy from the victim task's
|
|
|
|
* cgroup up to the OOMing cgroup (or root) to find the
|
|
|
|
* highest-level memory cgroup with oom.group set.
|
|
|
|
*/
|
|
|
|
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
|
|
|
|
if (memcg->oom_group)
|
|
|
|
oom_group = memcg;
|
|
|
|
|
|
|
|
if (memcg == oom_domain)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (oom_group)
|
|
|
|
css_get(&oom_group->css);
|
|
|
|
out:
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return oom_group;
|
|
|
|
}
|
|
|
|
|
|
|
|
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
pr_info("Tasks in ");
|
|
|
|
pr_cont_cgroup_path(memcg->css.cgroup);
|
|
|
|
pr_cont(" are going to be killed due to memory.oom.group set\n");
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: fix missed end-writeback page accounting
Commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed
page migration to uncharge the old page right away. The page is locked,
unmapped, truncated, and off the LRU, but it could race with writeback
ending, which then doesn't unaccount the page properly:
test_clear_page_writeback() migration
wait_on_page_writeback()
TestClearPageWriteback()
mem_cgroup_migrate()
clear PCG_USED
mem_cgroup_update_page_stat()
if (PageCgroupUsed(pc))
decrease memcg pages under writeback
release pc->mem_cgroup->move_lock
The per-page statistics interface is heavily optimized to avoid a
function call and a lookup_page_cgroup() in the file unmap fast path,
which means it doesn't verify whether a page is still charged before
clearing PageWriteback() and it has to do it in the stat update later.
Rework it so that it looks up the page's memcg once at the beginning of
the transaction and then uses it throughout. The charge will be
verified before clearing PageWriteback() and migration can't uncharge
the page as long as that is still set. The RCU lock will protect the
memcg past uncharge.
As far as losing the optimization goes, the following test results are
from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
three times in a nested fashion, so that there are two negative passes
that don't account but still go through the new transaction overhead.
There is no actual difference:
old: 33.195102545 seconds time elapsed ( +- 0.01% )
new: 33.199231369 seconds time elapsed ( +- 0.03% )
The time spent in page_remove_rmap()'s callees still adds up to the
same, but the time spent in the function itself seems reduced:
# Children Self Command Shared Object Symbol
old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap
new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: <stable@vger.kernel.org> [3.17.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-30 04:50:48 +07:00
|
|
|
/**
|
2016-03-16 04:57:04 +07:00
|
|
|
* lock_page_memcg - lock a page->mem_cgroup binding
|
|
|
|
* @page: the page
|
memcg: avoid lock in updating file_mapped (Was fix race in file_mapped accouting flag management
At accounting file events per memory cgroup, we need to find memory cgroup
via page_cgroup->mem_cgroup. Now, we use lock_page_cgroup() for guarantee
pc->mem_cgroup is not overwritten while we make use of it.
But, considering the context which page-cgroup for files are accessed,
we can use alternative light-weight mutual execusion in the most case.
At handling file-caches, the only race we have to take care of is "moving"
account, IOW, overwriting page_cgroup->mem_cgroup. (See comment in the
patch)
Unlike charge/uncharge, "move" happens not so frequently. It happens only when
rmdir() and task-moving (with a special settings.)
This patch adds a race-checker for file-cache-status accounting v.s. account
moving. The new per-cpu-per-memcg counter MEM_CGROUP_ON_MOVE is added.
The routine for account move
1. Increment it before start moving
2. Call synchronize_rcu()
3. Decrement it after the end of moving.
By this, file-status-counting routine can check it needs to call
lock_page_cgroup(). In most case, I doesn't need to call it.
Following is a perf data of a process which mmap()/munmap 32MB of file cache
in a minute.
Before patch:
28.25% mmap mmap [.] main
22.64% mmap [kernel.kallsyms] [k] page_fault
9.96% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped
3.67% mmap [kernel.kallsyms] [k] filemap_fault
3.50% mmap [kernel.kallsyms] [k] unmap_vmas
2.99% mmap [kernel.kallsyms] [k] __do_fault
2.76% mmap [kernel.kallsyms] [k] find_get_page
After patch:
30.00% mmap mmap [.] main
23.78% mmap [kernel.kallsyms] [k] page_fault
5.52% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped
3.81% mmap [kernel.kallsyms] [k] unmap_vmas
3.26% mmap [kernel.kallsyms] [k] find_get_page
3.18% mmap [kernel.kallsyms] [k] __do_fault
3.03% mmap [kernel.kallsyms] [k] filemap_fault
2.40% mmap [kernel.kallsyms] [k] handle_mm_fault
2.40% mmap [kernel.kallsyms] [k] do_page_fault
This patch reduces memcg's cost to some extent.
(mem_cgroup_update_file_mapped is called by both of map/unmap)
Note: It seems some more improvements are required..but no idea.
maybe removing set/unset flag is required.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-10-28 05:33:40 +07:00
|
|
|
*
|
2016-03-16 04:57:04 +07:00
|
|
|
* This function protects unlocked LRU pages from being moved to
|
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:
BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
<IRQ>
end_page_writeback+0x47/0x70
f2fs_write_end_io+0x76/0x180 [f2fs]
bio_endio+0x9f/0x120
blk_update_request+0xa8/0x2f0
scsi_end_request+0x39/0x1d0
scsi_io_completion+0x211/0x690
scsi_finish_command+0xd9/0x120
scsi_softirq_done+0x127/0x150
__blk_mq_complete_request_remote+0x13/0x20
flush_smp_call_function_queue+0x56/0x110
generic_smp_call_function_single_interrupt+0x13/0x30
smp_call_function_single_interrupt+0x27/0x40
call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10
(gdb) l *(test_clear_page_writeback+0x12e)
0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,
The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.
It appears this race window has been around for a while, but less likely
to trigger when the memcg stats were updated first thing after
PG_writeback is cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.
Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reported-by: Bradley Bolen <bradleybolen@gmail.com>
Tested-by: Brad Bolen <bradleybolen@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org> [4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-08-19 05:15:48 +07:00
|
|
|
* another cgroup.
|
|
|
|
*
|
|
|
|
* It ensures lifetime of the returned memcg. Caller is responsible
|
|
|
|
* for the lifetime of the page; __unlock_page_memcg() is available
|
|
|
|
* when @page might get freed inside the locked section.
|
2009-06-18 06:26:34 +07:00
|
|
|
*/
|
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:
BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
<IRQ>
end_page_writeback+0x47/0x70
f2fs_write_end_io+0x76/0x180 [f2fs]
bio_endio+0x9f/0x120
blk_update_request+0xa8/0x2f0
scsi_end_request+0x39/0x1d0
scsi_io_completion+0x211/0x690
scsi_finish_command+0xd9/0x120
scsi_softirq_done+0x127/0x150
__blk_mq_complete_request_remote+0x13/0x20
flush_smp_call_function_queue+0x56/0x110
generic_smp_call_function_single_interrupt+0x13/0x30
smp_call_function_single_interrupt+0x27/0x40
call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10
(gdb) l *(test_clear_page_writeback+0x12e)
0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,
The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.
It appears this race window has been around for a while, but less likely
to trigger when the memcg stats were updated first thing after
PG_writeback is cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.
Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reported-by: Bradley Bolen <bradleybolen@gmail.com>
Tested-by: Brad Bolen <bradleybolen@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org> [4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-08-19 05:15:48 +07:00
|
|
|
struct mem_cgroup *lock_page_memcg(struct page *page)
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:34:25 +07:00
|
|
|
{
|
2020-06-04 06:01:51 +07:00
|
|
|
struct page *head = compound_head(page); /* rmap on tail pages */
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:34:25 +07:00
|
|
|
struct mem_cgroup *memcg;
|
2015-02-12 06:25:01 +07:00
|
|
|
unsigned long flags;
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:34:25 +07:00
|
|
|
|
2015-02-12 06:25:01 +07:00
|
|
|
/*
|
|
|
|
* The RCU lock is held throughout the transaction. The fast
|
|
|
|
* path can get away without acquiring the memcg->move_lock
|
|
|
|
* because page moving starts with an RCU grace period.
|
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:
BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
<IRQ>
end_page_writeback+0x47/0x70
f2fs_write_end_io+0x76/0x180 [f2fs]
bio_endio+0x9f/0x120
blk_update_request+0xa8/0x2f0
scsi_end_request+0x39/0x1d0
scsi_io_completion+0x211/0x690
scsi_finish_command+0xd9/0x120
scsi_softirq_done+0x127/0x150
__blk_mq_complete_request_remote+0x13/0x20
flush_smp_call_function_queue+0x56/0x110
generic_smp_call_function_single_interrupt+0x13/0x30
smp_call_function_single_interrupt+0x27/0x40
call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10
(gdb) l *(test_clear_page_writeback+0x12e)
0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,
The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.
It appears this race window has been around for a while, but less likely
to trigger when the memcg stats were updated first thing after
PG_writeback is cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.
Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reported-by: Bradley Bolen <bradleybolen@gmail.com>
Tested-by: Brad Bolen <bradleybolen@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org> [4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-08-19 05:15:48 +07:00
|
|
|
*
|
|
|
|
* The RCU lock also protects the memcg from being freed when
|
|
|
|
* the page state that is going to change is the only thing
|
|
|
|
* preventing the page itself from being freed. E.g. writeback
|
|
|
|
* doesn't hold a page reference and relies on PG_writeback to
|
|
|
|
* keep off truncation, migration and so forth.
|
|
|
|
*/
|
mm: memcontrol: fix missed end-writeback page accounting
Commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed
page migration to uncharge the old page right away. The page is locked,
unmapped, truncated, and off the LRU, but it could race with writeback
ending, which then doesn't unaccount the page properly:
test_clear_page_writeback() migration
wait_on_page_writeback()
TestClearPageWriteback()
mem_cgroup_migrate()
clear PCG_USED
mem_cgroup_update_page_stat()
if (PageCgroupUsed(pc))
decrease memcg pages under writeback
release pc->mem_cgroup->move_lock
The per-page statistics interface is heavily optimized to avoid a
function call and a lookup_page_cgroup() in the file unmap fast path,
which means it doesn't verify whether a page is still charged before
clearing PageWriteback() and it has to do it in the stat update later.
Rework it so that it looks up the page's memcg once at the beginning of
the transaction and then uses it throughout. The charge will be
verified before clearing PageWriteback() and migration can't uncharge
the page as long as that is still set. The RCU lock will protect the
memcg past uncharge.
As far as losing the optimization goes, the following test results are
from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
three times in a nested fashion, so that there are two negative passes
that don't account but still go through the new transaction overhead.
There is no actual difference:
old: 33.195102545 seconds time elapsed ( +- 0.01% )
new: 33.199231369 seconds time elapsed ( +- 0.03% )
The time spent in page_remove_rmap()'s callees still adds up to the
same, but the time spent in the function itself seems reduced:
# Children Self Command Shared Object Symbol
old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap
new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: <stable@vger.kernel.org> [3.17.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-30 04:50:48 +07:00
|
|
|
rcu_read_lock();
|
|
|
|
|
|
|
|
if (mem_cgroup_disabled())
|
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:
BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
<IRQ>
end_page_writeback+0x47/0x70
f2fs_write_end_io+0x76/0x180 [f2fs]
bio_endio+0x9f/0x120
blk_update_request+0xa8/0x2f0
scsi_end_request+0x39/0x1d0
scsi_io_completion+0x211/0x690
scsi_finish_command+0xd9/0x120
scsi_softirq_done+0x127/0x150
__blk_mq_complete_request_remote+0x13/0x20
flush_smp_call_function_queue+0x56/0x110
generic_smp_call_function_single_interrupt+0x13/0x30
smp_call_function_single_interrupt+0x27/0x40
call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10
(gdb) l *(test_clear_page_writeback+0x12e)
0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,
The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.
It appears this race window has been around for a while, but less likely
to trigger when the memcg stats were updated first thing after
PG_writeback is cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.
Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reported-by: Bradley Bolen <bradleybolen@gmail.com>
Tested-by: Brad Bolen <bradleybolen@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org> [4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-08-19 05:15:48 +07:00
|
|
|
return NULL;
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:34:25 +07:00
|
|
|
again:
|
2020-06-04 06:01:51 +07:00
|
|
|
memcg = head->mem_cgroup;
|
2014-12-11 06:44:02 +07:00
|
|
|
if (unlikely(!memcg))
|
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:
BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
<IRQ>
end_page_writeback+0x47/0x70
f2fs_write_end_io+0x76/0x180 [f2fs]
bio_endio+0x9f/0x120
blk_update_request+0xa8/0x2f0
scsi_end_request+0x39/0x1d0
scsi_io_completion+0x211/0x690
scsi_finish_command+0xd9/0x120
scsi_softirq_done+0x127/0x150
__blk_mq_complete_request_remote+0x13/0x20
flush_smp_call_function_queue+0x56/0x110
generic_smp_call_function_single_interrupt+0x13/0x30
smp_call_function_single_interrupt+0x27/0x40
call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10
(gdb) l *(test_clear_page_writeback+0x12e)
0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,
The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.
It appears this race window has been around for a while, but less likely
to trigger when the memcg stats were updated first thing after
PG_writeback is cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.
Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reported-by: Bradley Bolen <bradleybolen@gmail.com>
Tested-by: Brad Bolen <bradleybolen@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org> [4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-08-19 05:15:48 +07:00
|
|
|
return NULL;
|
mm: memcontrol: fix missed end-writeback page accounting
Commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed
page migration to uncharge the old page right away. The page is locked,
unmapped, truncated, and off the LRU, but it could race with writeback
ending, which then doesn't unaccount the page properly:
test_clear_page_writeback() migration
wait_on_page_writeback()
TestClearPageWriteback()
mem_cgroup_migrate()
clear PCG_USED
mem_cgroup_update_page_stat()
if (PageCgroupUsed(pc))
decrease memcg pages under writeback
release pc->mem_cgroup->move_lock
The per-page statistics interface is heavily optimized to avoid a
function call and a lookup_page_cgroup() in the file unmap fast path,
which means it doesn't verify whether a page is still charged before
clearing PageWriteback() and it has to do it in the stat update later.
Rework it so that it looks up the page's memcg once at the beginning of
the transaction and then uses it throughout. The charge will be
verified before clearing PageWriteback() and migration can't uncharge
the page as long as that is still set. The RCU lock will protect the
memcg past uncharge.
As far as losing the optimization goes, the following test results are
from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
three times in a nested fashion, so that there are two negative passes
that don't account but still go through the new transaction overhead.
There is no actual difference:
old: 33.195102545 seconds time elapsed ( +- 0.01% )
new: 33.199231369 seconds time elapsed ( +- 0.03% )
The time spent in page_remove_rmap()'s callees still adds up to the
same, but the time spent in the function itself seems reduced:
# Children Self Command Shared Object Symbol
old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap
new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: <stable@vger.kernel.org> [3.17.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-30 04:50:48 +07:00
|
|
|
|
2014-06-05 06:08:21 +07:00
|
|
|
if (atomic_read(&memcg->moving_account) <= 0)
|
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:
BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
<IRQ>
end_page_writeback+0x47/0x70
f2fs_write_end_io+0x76/0x180 [f2fs]
bio_endio+0x9f/0x120
blk_update_request+0xa8/0x2f0
scsi_end_request+0x39/0x1d0
scsi_io_completion+0x211/0x690
scsi_finish_command+0xd9/0x120
scsi_softirq_done+0x127/0x150
__blk_mq_complete_request_remote+0x13/0x20
flush_smp_call_function_queue+0x56/0x110
generic_smp_call_function_single_interrupt+0x13/0x30
smp_call_function_single_interrupt+0x27/0x40
call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10
(gdb) l *(test_clear_page_writeback+0x12e)
0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,
The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.
It appears this race window has been around for a while, but less likely
to trigger when the memcg stats were updated first thing after
PG_writeback is cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.
Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reported-by: Bradley Bolen <bradleybolen@gmail.com>
Tested-by: Brad Bolen <bradleybolen@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org> [4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-08-19 05:15:48 +07:00
|
|
|
return memcg;
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:34:25 +07:00
|
|
|
|
2015-02-12 06:25:01 +07:00
|
|
|
spin_lock_irqsave(&memcg->move_lock, flags);
|
2020-06-04 06:01:51 +07:00
|
|
|
if (memcg != head->mem_cgroup) {
|
2015-02-12 06:25:01 +07:00
|
|
|
spin_unlock_irqrestore(&memcg->move_lock, flags);
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:34:25 +07:00
|
|
|
goto again;
|
|
|
|
}
|
2015-02-12 06:25:01 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* When charge migration first begins, we can have locked and
|
|
|
|
* unlocked page stat updates happening concurrently. Track
|
2016-03-16 04:57:04 +07:00
|
|
|
* the task who has the lock for unlock_page_memcg().
|
2015-02-12 06:25:01 +07:00
|
|
|
*/
|
|
|
|
memcg->move_lock_task = current;
|
|
|
|
memcg->move_lock_flags = flags;
|
mm: memcontrol: fix missed end-writeback page accounting
Commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed
page migration to uncharge the old page right away. The page is locked,
unmapped, truncated, and off the LRU, but it could race with writeback
ending, which then doesn't unaccount the page properly:
test_clear_page_writeback() migration
wait_on_page_writeback()
TestClearPageWriteback()
mem_cgroup_migrate()
clear PCG_USED
mem_cgroup_update_page_stat()
if (PageCgroupUsed(pc))
decrease memcg pages under writeback
release pc->mem_cgroup->move_lock
The per-page statistics interface is heavily optimized to avoid a
function call and a lookup_page_cgroup() in the file unmap fast path,
which means it doesn't verify whether a page is still charged before
clearing PageWriteback() and it has to do it in the stat update later.
Rework it so that it looks up the page's memcg once at the beginning of
the transaction and then uses it throughout. The charge will be
verified before clearing PageWriteback() and migration can't uncharge
the page as long as that is still set. The RCU lock will protect the
memcg past uncharge.
As far as losing the optimization goes, the following test results are
from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
three times in a nested fashion, so that there are two negative passes
that don't account but still go through the new transaction overhead.
There is no actual difference:
old: 33.195102545 seconds time elapsed ( +- 0.01% )
new: 33.199231369 seconds time elapsed ( +- 0.03% )
The time spent in page_remove_rmap()'s callees still adds up to the
same, but the time spent in the function itself seems reduced:
# Children Self Command Shared Object Symbol
old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap
new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: <stable@vger.kernel.org> [3.17.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-30 04:50:48 +07:00
|
|
|
|
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:
BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
<IRQ>
end_page_writeback+0x47/0x70
f2fs_write_end_io+0x76/0x180 [f2fs]
bio_endio+0x9f/0x120
blk_update_request+0xa8/0x2f0
scsi_end_request+0x39/0x1d0
scsi_io_completion+0x211/0x690
scsi_finish_command+0xd9/0x120
scsi_softirq_done+0x127/0x150
__blk_mq_complete_request_remote+0x13/0x20
flush_smp_call_function_queue+0x56/0x110
generic_smp_call_function_single_interrupt+0x13/0x30
smp_call_function_single_interrupt+0x27/0x40
call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10
(gdb) l *(test_clear_page_writeback+0x12e)
0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,
The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.
It appears this race window has been around for a while, but less likely
to trigger when the memcg stats were updated first thing after
PG_writeback is cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.
Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reported-by: Bradley Bolen <bradleybolen@gmail.com>
Tested-by: Brad Bolen <bradleybolen@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org> [4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-08-19 05:15:48 +07:00
|
|
|
return memcg;
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:34:25 +07:00
|
|
|
}
|
2016-03-16 04:57:04 +07:00
|
|
|
EXPORT_SYMBOL(lock_page_memcg);
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:34:25 +07:00
|
|
|
|
mm: memcontrol: fix missed end-writeback page accounting
Commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed
page migration to uncharge the old page right away. The page is locked,
unmapped, truncated, and off the LRU, but it could race with writeback
ending, which then doesn't unaccount the page properly:
test_clear_page_writeback() migration
wait_on_page_writeback()
TestClearPageWriteback()
mem_cgroup_migrate()
clear PCG_USED
mem_cgroup_update_page_stat()
if (PageCgroupUsed(pc))
decrease memcg pages under writeback
release pc->mem_cgroup->move_lock
The per-page statistics interface is heavily optimized to avoid a
function call and a lookup_page_cgroup() in the file unmap fast path,
which means it doesn't verify whether a page is still charged before
clearing PageWriteback() and it has to do it in the stat update later.
Rework it so that it looks up the page's memcg once at the beginning of
the transaction and then uses it throughout. The charge will be
verified before clearing PageWriteback() and migration can't uncharge
the page as long as that is still set. The RCU lock will protect the
memcg past uncharge.
As far as losing the optimization goes, the following test results are
from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
three times in a nested fashion, so that there are two negative passes
that don't account but still go through the new transaction overhead.
There is no actual difference:
old: 33.195102545 seconds time elapsed ( +- 0.01% )
new: 33.199231369 seconds time elapsed ( +- 0.03% )
The time spent in page_remove_rmap()'s callees still adds up to the
same, but the time spent in the function itself seems reduced:
# Children Self Command Shared Object Symbol
old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap
new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: <stable@vger.kernel.org> [3.17.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-30 04:50:48 +07:00
|
|
|
/**
|
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:
BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
<IRQ>
end_page_writeback+0x47/0x70
f2fs_write_end_io+0x76/0x180 [f2fs]
bio_endio+0x9f/0x120
blk_update_request+0xa8/0x2f0
scsi_end_request+0x39/0x1d0
scsi_io_completion+0x211/0x690
scsi_finish_command+0xd9/0x120
scsi_softirq_done+0x127/0x150
__blk_mq_complete_request_remote+0x13/0x20
flush_smp_call_function_queue+0x56/0x110
generic_smp_call_function_single_interrupt+0x13/0x30
smp_call_function_single_interrupt+0x27/0x40
call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10
(gdb) l *(test_clear_page_writeback+0x12e)
0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,
The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.
It appears this race window has been around for a while, but less likely
to trigger when the memcg stats were updated first thing after
PG_writeback is cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.
Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reported-by: Bradley Bolen <bradleybolen@gmail.com>
Tested-by: Brad Bolen <bradleybolen@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org> [4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-08-19 05:15:48 +07:00
|
|
|
* __unlock_page_memcg - unlock and unpin a memcg
|
|
|
|
* @memcg: the memcg
|
|
|
|
*
|
|
|
|
* Unlock and unpin a memcg returned by lock_page_memcg().
|
mm: memcontrol: fix missed end-writeback page accounting
Commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed
page migration to uncharge the old page right away. The page is locked,
unmapped, truncated, and off the LRU, but it could race with writeback
ending, which then doesn't unaccount the page properly:
test_clear_page_writeback() migration
wait_on_page_writeback()
TestClearPageWriteback()
mem_cgroup_migrate()
clear PCG_USED
mem_cgroup_update_page_stat()
if (PageCgroupUsed(pc))
decrease memcg pages under writeback
release pc->mem_cgroup->move_lock
The per-page statistics interface is heavily optimized to avoid a
function call and a lookup_page_cgroup() in the file unmap fast path,
which means it doesn't verify whether a page is still charged before
clearing PageWriteback() and it has to do it in the stat update later.
Rework it so that it looks up the page's memcg once at the beginning of
the transaction and then uses it throughout. The charge will be
verified before clearing PageWriteback() and migration can't uncharge
the page as long as that is still set. The RCU lock will protect the
memcg past uncharge.
As far as losing the optimization goes, the following test results are
from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
three times in a nested fashion, so that there are two negative passes
that don't account but still go through the new transaction overhead.
There is no actual difference:
old: 33.195102545 seconds time elapsed ( +- 0.01% )
new: 33.199231369 seconds time elapsed ( +- 0.03% )
The time spent in page_remove_rmap()'s callees still adds up to the
same, but the time spent in the function itself seems reduced:
# Children Self Command Shared Object Symbol
old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap
new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: <stable@vger.kernel.org> [3.17.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-30 04:50:48 +07:00
|
|
|
*/
|
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:
BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
<IRQ>
end_page_writeback+0x47/0x70
f2fs_write_end_io+0x76/0x180 [f2fs]
bio_endio+0x9f/0x120
blk_update_request+0xa8/0x2f0
scsi_end_request+0x39/0x1d0
scsi_io_completion+0x211/0x690
scsi_finish_command+0xd9/0x120
scsi_softirq_done+0x127/0x150
__blk_mq_complete_request_remote+0x13/0x20
flush_smp_call_function_queue+0x56/0x110
generic_smp_call_function_single_interrupt+0x13/0x30
smp_call_function_single_interrupt+0x27/0x40
call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10
(gdb) l *(test_clear_page_writeback+0x12e)
0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,
The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.
It appears this race window has been around for a while, but less likely
to trigger when the memcg stats were updated first thing after
PG_writeback is cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.
Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reported-by: Bradley Bolen <bradleybolen@gmail.com>
Tested-by: Brad Bolen <bradleybolen@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org> [4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-08-19 05:15:48 +07:00
|
|
|
void __unlock_page_memcg(struct mem_cgroup *memcg)
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:34:25 +07:00
|
|
|
{
|
2015-02-12 06:25:01 +07:00
|
|
|
if (memcg && memcg->move_lock_task == current) {
|
|
|
|
unsigned long flags = memcg->move_lock_flags;
|
|
|
|
|
|
|
|
memcg->move_lock_task = NULL;
|
|
|
|
memcg->move_lock_flags = 0;
|
|
|
|
|
|
|
|
spin_unlock_irqrestore(&memcg->move_lock, flags);
|
|
|
|
}
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:34:25 +07:00
|
|
|
|
mm: memcontrol: fix missed end-writeback page accounting
Commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed
page migration to uncharge the old page right away. The page is locked,
unmapped, truncated, and off the LRU, but it could race with writeback
ending, which then doesn't unaccount the page properly:
test_clear_page_writeback() migration
wait_on_page_writeback()
TestClearPageWriteback()
mem_cgroup_migrate()
clear PCG_USED
mem_cgroup_update_page_stat()
if (PageCgroupUsed(pc))
decrease memcg pages under writeback
release pc->mem_cgroup->move_lock
The per-page statistics interface is heavily optimized to avoid a
function call and a lookup_page_cgroup() in the file unmap fast path,
which means it doesn't verify whether a page is still charged before
clearing PageWriteback() and it has to do it in the stat update later.
Rework it so that it looks up the page's memcg once at the beginning of
the transaction and then uses it throughout. The charge will be
verified before clearing PageWriteback() and migration can't uncharge
the page as long as that is still set. The RCU lock will protect the
memcg past uncharge.
As far as losing the optimization goes, the following test results are
from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
three times in a nested fashion, so that there are two negative passes
that don't account but still go through the new transaction overhead.
There is no actual difference:
old: 33.195102545 seconds time elapsed ( +- 0.01% )
new: 33.199231369 seconds time elapsed ( +- 0.03% )
The time spent in page_remove_rmap()'s callees still adds up to the
same, but the time spent in the function itself seems reduced:
# Children Self Command Shared Object Symbol
old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap
new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: <stable@vger.kernel.org> [3.17.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-30 04:50:48 +07:00
|
|
|
rcu_read_unlock();
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:34:25 +07:00
|
|
|
}
|
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:
BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
<IRQ>
end_page_writeback+0x47/0x70
f2fs_write_end_io+0x76/0x180 [f2fs]
bio_endio+0x9f/0x120
blk_update_request+0xa8/0x2f0
scsi_end_request+0x39/0x1d0
scsi_io_completion+0x211/0x690
scsi_finish_command+0xd9/0x120
scsi_softirq_done+0x127/0x150
__blk_mq_complete_request_remote+0x13/0x20
flush_smp_call_function_queue+0x56/0x110
generic_smp_call_function_single_interrupt+0x13/0x30
smp_call_function_single_interrupt+0x27/0x40
call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10
(gdb) l *(test_clear_page_writeback+0x12e)
0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,
The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.
It appears this race window has been around for a while, but less likely
to trigger when the memcg stats were updated first thing after
PG_writeback is cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.
Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reported-by: Bradley Bolen <bradleybolen@gmail.com>
Tested-by: Brad Bolen <bradleybolen@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org> [4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-08-19 05:15:48 +07:00
|
|
|
|
|
|
|
/**
|
|
|
|
* unlock_page_memcg - unlock a page->mem_cgroup binding
|
|
|
|
* @page: the page
|
|
|
|
*/
|
|
|
|
void unlock_page_memcg(struct page *page)
|
|
|
|
{
|
2020-06-04 06:01:51 +07:00
|
|
|
struct page *head = compound_head(page);
|
|
|
|
|
|
|
|
__unlock_page_memcg(head->mem_cgroup);
|
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:
BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
<IRQ>
end_page_writeback+0x47/0x70
f2fs_write_end_io+0x76/0x180 [f2fs]
bio_endio+0x9f/0x120
blk_update_request+0xa8/0x2f0
scsi_end_request+0x39/0x1d0
scsi_io_completion+0x211/0x690
scsi_finish_command+0xd9/0x120
scsi_softirq_done+0x127/0x150
__blk_mq_complete_request_remote+0x13/0x20
flush_smp_call_function_queue+0x56/0x110
generic_smp_call_function_single_interrupt+0x13/0x30
smp_call_function_single_interrupt+0x27/0x40
call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10
(gdb) l *(test_clear_page_writeback+0x12e)
0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,
The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback(). The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles. But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.
It appears this race window has been around for a while, but less likely
to trigger when the memcg stats were updated first thing after
PG_writeback is cleared. Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.
Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward. It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Tested-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reported-by: Bradley Bolen <bradleybolen@gmail.com>
Tested-by: Brad Bolen <bradleybolen@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@vger.kernel.org> [4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-08-19 05:15:48 +07:00
|
|
|
}
|
2016-03-16 04:57:04 +07:00
|
|
|
EXPORT_SYMBOL(unlock_page_memcg);
|
memcg: use new logic for page stat accounting
Now, page-stat-per-memcg is recorded into per page_cgroup flag by
duplicating page's status into the flag. The reason is that memcg has a
feature to move a page from a group to another group and we have race
between "move" and "page stat accounting",
Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B
does "page stat accounting".
When CPU-A goes 1st,
CPU-A CPU-B
update "struct page" info.
move_lock_mem_cgroup(memcg)
see pc->flags
copy page stat to new group
overwrite pc->mem_cgroup.
move_unlock_mem_cgroup(memcg)
move_lock_mem_cgroup(mem)
set pc->flags
update page stat accounting
move_unlock_mem_cgroup(mem)
stat accounting is guarded by move_lock_mem_cgroup() and "move" logic
(CPU-A) doesn't see changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And, there is a potential problem.
For example, assume we have PG_dirty accounting in memcg.
PG_..is a flag for struct page.
PCG_ is a flag for struct page_cgroup.
(This is just an example. The same problem can be found in any
kind of page stat accounting.)
CPU-A CPU-B
TestSet PG_dirty
(delay) TestClear PG_dirty
if (TestClear(PCG_dirty))
memcg->nr_dirty--
if (TestSet(PCG_dirty))
memcg->nr_dirty++
Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg
Thelen <gthelen@google.com>. Now, only FILE_MAPPED is supported but
fortunately, it's serialized by page table lock and this is not real bug,
_now_,
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix this by using
original 'struct page' information. But we'll have a problem in "move
account"
Assume we use only PG_dirty.
CPU-A CPU-B
TestSet PG_dirty
(delay) move_lock_mem_cgroup()
if (PageDirty(page))
new_memcg->nr_dirty++
pc->mem_cgroup = new_memcg;
move_unlock_mem_cgroup()
move_lock_mem_cgroup()
memcg = pc->mem_cgroup
new_memcg->nr_dirty++
accounting information may be double-counted. This was original reason to
have PCG_xxx flags but it seems PCG_xxx has another problem.
I think we need a bigger lock as
move_lock_mem_cgroup(page)
TestSetPageDirty(page)
update page stats (without any checks)
move_unlock_mem_cgroup(page)
This fixes both of problems and we don't have to duplicate page flag into
page_cgroup. Please note: move_lock_mem_cgroup() is held only when there
are possibility of "account move" under the system. So, in most path,
status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat() both should be called at modifying
'struct page' information if memcg takes care of it. as
mem_cgroup_begin_update_page_stat()
modify page information
mem_cgroup_update_page_stat()
=> never check any 'struct page' info, just update counters.
mem_cgroup_end_update_page_stat().
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of accounted will be changed or not. A
following patch adds an easy optimization and reduces the cost.
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:34:25 +07:00
|
|
|
|
2009-12-16 07:47:08 +07:00
|
|
|
struct memcg_stock_pcp {
|
|
|
|
struct mem_cgroup *cached; /* this never be root cgroup */
|
2011-03-24 06:42:34 +07:00
|
|
|
unsigned int nr_pages;
|
2020-08-07 13:20:49 +07:00
|
|
|
|
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
|
|
|
struct obj_cgroup *cached_objcg;
|
|
|
|
unsigned int nr_bytes;
|
|
|
|
#endif
|
|
|
|
|
2009-12-16 07:47:08 +07:00
|
|
|
struct work_struct work;
|
2011-06-16 05:08:45 +07:00
|
|
|
unsigned long flags;
|
2012-05-30 05:06:56 +07:00
|
|
|
#define FLUSHING_CACHED_CHARGE 0
|
2009-12-16 07:47:08 +07:00
|
|
|
};
|
|
|
|
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
|
2011-08-09 16:56:26 +07:00
|
|
|
static DEFINE_MUTEX(percpu_charge_mutex);
|
2009-12-16 07:47:08 +07:00
|
|
|
|
2020-08-07 13:20:49 +07:00
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
|
|
|
static void drain_obj_stock(struct memcg_stock_pcp *stock);
|
|
|
|
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
|
|
|
|
struct mem_cgroup *root_memcg);
|
|
|
|
|
|
|
|
#else
|
|
|
|
static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
|
|
|
|
struct mem_cgroup *root_memcg)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2012-12-19 05:21:36 +07:00
|
|
|
/**
|
|
|
|
* consume_stock: Try to consume stocked charge on this cpu.
|
|
|
|
* @memcg: memcg to consume from.
|
|
|
|
* @nr_pages: how many pages to charge.
|
|
|
|
*
|
|
|
|
* The charges will only happen if @memcg matches the current cpu's memcg
|
|
|
|
* stock, and at least @nr_pages are available in that stock. Failure to
|
|
|
|
* service an allocation will refill the stock.
|
|
|
|
*
|
|
|
|
* returns true if successful, false otherwise.
|
2009-12-16 07:47:08 +07:00
|
|
|
*/
|
2012-12-19 05:21:36 +07:00
|
|
|
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
|
2009-12-16 07:47:08 +07:00
|
|
|
{
|
|
|
|
struct memcg_stock_pcp *stock;
|
2016-09-20 04:44:36 +07:00
|
|
|
unsigned long flags;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
bool ret = false;
|
2009-12-16 07:47:08 +07:00
|
|
|
|
2018-02-01 07:16:45 +07:00
|
|
|
if (nr_pages > MEMCG_CHARGE_BATCH)
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
return ret;
|
2012-12-19 05:21:36 +07:00
|
|
|
|
2016-09-20 04:44:36 +07:00
|
|
|
local_irq_save(flags);
|
|
|
|
|
|
|
|
stock = this_cpu_ptr(&memcg_stock);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
|
2012-12-19 05:21:36 +07:00
|
|
|
stock->nr_pages -= nr_pages;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
ret = true;
|
|
|
|
}
|
2016-09-20 04:44:36 +07:00
|
|
|
|
|
|
|
local_irq_restore(flags);
|
|
|
|
|
2009-12-16 07:47:08 +07:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
* Returns stocks cached in percpu and reset cached information.
|
2009-12-16 07:47:08 +07:00
|
|
|
*/
|
|
|
|
static void drain_stock(struct memcg_stock_pcp *stock)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *old = stock->cached;
|
|
|
|
|
2020-08-07 13:20:45 +07:00
|
|
|
if (!old)
|
|
|
|
return;
|
|
|
|
|
2011-03-24 06:42:34 +07:00
|
|
|
if (stock->nr_pages) {
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
page_counter_uncharge(&old->memory, stock->nr_pages);
|
2016-01-15 06:21:23 +07:00
|
|
|
if (do_memsw_account())
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
page_counter_uncharge(&old->memsw, stock->nr_pages);
|
2011-03-24 06:42:34 +07:00
|
|
|
stock->nr_pages = 0;
|
2009-12-16 07:47:08 +07:00
|
|
|
}
|
2020-08-07 13:20:45 +07:00
|
|
|
|
|
|
|
css_put(&old->css);
|
2009-12-16 07:47:08 +07:00
|
|
|
stock->cached = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void drain_local_stock(struct work_struct *dummy)
|
|
|
|
{
|
2016-09-20 04:44:36 +07:00
|
|
|
struct memcg_stock_pcp *stock;
|
|
|
|
unsigned long flags;
|
|
|
|
|
mm, memcg: remove hotplug locking from try_charge
The following lockdep splat has been noticed during LTP testing
======================================================
WARNING: possible circular locking dependency detected
4.13.0-rc3-next-20170807 #12 Not tainted
------------------------------------------------------
a.out/4771 is trying to acquire lock:
(cpu_hotplug_lock.rw_sem){++++++}, at: [<ffffffff812b4668>] drain_all_stock.part.35+0x18/0x140
but task is already holding lock:
(&mm->mmap_sem){++++++}, at: [<ffffffff8106eb35>] __do_page_fault+0x175/0x530
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #3 (&mm->mmap_sem){++++++}:
lock_acquire+0xc9/0x230
__might_fault+0x70/0xa0
_copy_to_user+0x23/0x70
filldir+0xa7/0x110
xfs_dir2_sf_getdents.isra.10+0x20c/0x2c0 [xfs]
xfs_readdir+0x1fa/0x2c0 [xfs]
xfs_file_readdir+0x30/0x40 [xfs]
iterate_dir+0x17a/0x1a0
SyS_getdents+0xb0/0x160
entry_SYSCALL_64_fastpath+0x1f/0xbe
-> #2 (&type->i_mutex_dir_key#3){++++++}:
lock_acquire+0xc9/0x230
down_read+0x51/0xb0
lookup_slow+0xde/0x210
walk_component+0x160/0x250
link_path_walk+0x1a6/0x610
path_openat+0xe4/0xd50
do_filp_open+0x91/0x100
file_open_name+0xf5/0x130
filp_open+0x33/0x50
kernel_read_file_from_path+0x39/0x80
_request_firmware+0x39f/0x880
request_firmware_direct+0x37/0x50
request_microcode_fw+0x64/0xe0
reload_store+0xf7/0x180
dev_attr_store+0x18/0x30
sysfs_kf_write+0x44/0x60
kernfs_fop_write+0x113/0x1a0
__vfs_write+0x37/0x170
vfs_write+0xc7/0x1c0
SyS_write+0x58/0xc0
do_syscall_64+0x6c/0x1f0
return_from_SYSCALL_64+0x0/0x7a
-> #1 (microcode_mutex){+.+.+.}:
lock_acquire+0xc9/0x230
__mutex_lock+0x88/0x960
mutex_lock_nested+0x1b/0x20
microcode_init+0xbb/0x208
do_one_initcall+0x51/0x1a9
kernel_init_freeable+0x208/0x2a7
kernel_init+0xe/0x104
ret_from_fork+0x2a/0x40
-> #0 (cpu_hotplug_lock.rw_sem){++++++}:
__lock_acquire+0x153c/0x1550
lock_acquire+0xc9/0x230
cpus_read_lock+0x4b/0x90
drain_all_stock.part.35+0x18/0x140
try_charge+0x3ab/0x6e0
mem_cgroup_try_charge+0x7f/0x2c0
shmem_getpage_gfp+0x25f/0x1050
shmem_fault+0x96/0x200
__do_fault+0x1e/0xa0
__handle_mm_fault+0x9c3/0xe00
handle_mm_fault+0x16e/0x380
__do_page_fault+0x24a/0x530
do_page_fault+0x30/0x80
page_fault+0x28/0x30
other info that might help us debug this:
Chain exists of:
cpu_hotplug_lock.rw_sem --> &type->i_mutex_dir_key#3 --> &mm->mmap_sem
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&mm->mmap_sem);
lock(&type->i_mutex_dir_key#3);
lock(&mm->mmap_sem);
lock(cpu_hotplug_lock.rw_sem);
*** DEADLOCK ***
2 locks held by a.out/4771:
#0: (&mm->mmap_sem){++++++}, at: [<ffffffff8106eb35>] __do_page_fault+0x175/0x530
#1: (percpu_charge_mutex){+.+...}, at: [<ffffffff812b4c97>] try_charge+0x397/0x6e0
The problem is very similar to the one fixed by commit a459eeb7b852
("mm, page_alloc: do not depend on cpu hotplug locks inside the
allocator"). We are taking hotplug locks while we can be sitting on top
of basically arbitrary locks. This just calls for problems.
We can get rid of {get,put}_online_cpus, fortunately. We do not have to
be worried about races with memory hotplug because drain_local_stock,
which is called from both the WQ draining and the memory hotplug
contexts, is always operating on the local cpu stock with IRQs disabled.
The only thing to be careful about is that the target memcg doesn't
vanish while we are still in drain_all_stock so take a reference on it.
Link: http://lkml.kernel.org/r/20170913090023.28322-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Artem Savkov <asavkov@redhat.com>
Tested-by: Artem Savkov <asavkov@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-10-04 06:14:53 +07:00
|
|
|
/*
|
|
|
|
* The only protection from memory hotplug vs. drain_stock races is
|
|
|
|
* that we always operate on local CPU stock here with IRQ disabled
|
|
|
|
*/
|
2016-09-20 04:44:36 +07:00
|
|
|
local_irq_save(flags);
|
|
|
|
|
|
|
|
stock = this_cpu_ptr(&memcg_stock);
|
2020-08-07 13:20:49 +07:00
|
|
|
drain_obj_stock(stock);
|
2009-12-16 07:47:08 +07:00
|
|
|
drain_stock(stock);
|
2011-06-16 05:08:45 +07:00
|
|
|
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
|
2016-09-20 04:44:36 +07:00
|
|
|
|
|
|
|
local_irq_restore(flags);
|
2009-12-16 07:47:08 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
* Cache charges(val) to local per_cpu area.
|
2010-03-15 21:27:28 +07:00
|
|
|
* This will be consumed by consume_stock() function, later.
|
2009-12-16 07:47:08 +07:00
|
|
|
*/
|
2011-11-03 03:38:15 +07:00
|
|
|
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
|
2009-12-16 07:47:08 +07:00
|
|
|
{
|
2016-09-20 04:44:36 +07:00
|
|
|
struct memcg_stock_pcp *stock;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
2009-12-16 07:47:08 +07:00
|
|
|
|
2016-09-20 04:44:36 +07:00
|
|
|
stock = this_cpu_ptr(&memcg_stock);
|
2011-11-03 03:38:15 +07:00
|
|
|
if (stock->cached != memcg) { /* reset if necessary */
|
2009-12-16 07:47:08 +07:00
|
|
|
drain_stock(stock);
|
2020-08-07 13:20:45 +07:00
|
|
|
css_get(&memcg->css);
|
2011-11-03 03:38:15 +07:00
|
|
|
stock->cached = memcg;
|
2009-12-16 07:47:08 +07:00
|
|
|
}
|
2011-03-24 06:42:34 +07:00
|
|
|
stock->nr_pages += nr_pages;
|
2016-09-20 04:44:36 +07:00
|
|
|
|
2018-02-01 07:16:45 +07:00
|
|
|
if (stock->nr_pages > MEMCG_CHARGE_BATCH)
|
2017-09-09 06:13:09 +07:00
|
|
|
drain_stock(stock);
|
|
|
|
|
2016-09-20 04:44:36 +07:00
|
|
|
local_irq_restore(flags);
|
2009-12-16 07:47:08 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2011-11-03 03:38:15 +07:00
|
|
|
* Drains all per-CPU charge caches for given root_memcg resp. subtree
|
2014-12-11 06:42:50 +07:00
|
|
|
* of the hierarchy under it.
|
2009-12-16 07:47:08 +07:00
|
|
|
*/
|
2014-12-11 06:42:50 +07:00
|
|
|
static void drain_all_stock(struct mem_cgroup *root_memcg)
|
2009-12-16 07:47:08 +07:00
|
|
|
{
|
2011-06-16 05:08:45 +07:00
|
|
|
int cpu, curcpu;
|
2011-07-27 06:08:28 +07:00
|
|
|
|
2014-12-11 06:42:50 +07:00
|
|
|
/* If someone's already draining, avoid adding running more workers. */
|
|
|
|
if (!mutex_trylock(&percpu_charge_mutex))
|
|
|
|
return;
|
mm, memcg: remove hotplug locking from try_charge
The following lockdep splat has been noticed during LTP testing
======================================================
WARNING: possible circular locking dependency detected
4.13.0-rc3-next-20170807 #12 Not tainted
------------------------------------------------------
a.out/4771 is trying to acquire lock:
(cpu_hotplug_lock.rw_sem){++++++}, at: [<ffffffff812b4668>] drain_all_stock.part.35+0x18/0x140
but task is already holding lock:
(&mm->mmap_sem){++++++}, at: [<ffffffff8106eb35>] __do_page_fault+0x175/0x530
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #3 (&mm->mmap_sem){++++++}:
lock_acquire+0xc9/0x230
__might_fault+0x70/0xa0
_copy_to_user+0x23/0x70
filldir+0xa7/0x110
xfs_dir2_sf_getdents.isra.10+0x20c/0x2c0 [xfs]
xfs_readdir+0x1fa/0x2c0 [xfs]
xfs_file_readdir+0x30/0x40 [xfs]
iterate_dir+0x17a/0x1a0
SyS_getdents+0xb0/0x160
entry_SYSCALL_64_fastpath+0x1f/0xbe
-> #2 (&type->i_mutex_dir_key#3){++++++}:
lock_acquire+0xc9/0x230
down_read+0x51/0xb0
lookup_slow+0xde/0x210
walk_component+0x160/0x250
link_path_walk+0x1a6/0x610
path_openat+0xe4/0xd50
do_filp_open+0x91/0x100
file_open_name+0xf5/0x130
filp_open+0x33/0x50
kernel_read_file_from_path+0x39/0x80
_request_firmware+0x39f/0x880
request_firmware_direct+0x37/0x50
request_microcode_fw+0x64/0xe0
reload_store+0xf7/0x180
dev_attr_store+0x18/0x30
sysfs_kf_write+0x44/0x60
kernfs_fop_write+0x113/0x1a0
__vfs_write+0x37/0x170
vfs_write+0xc7/0x1c0
SyS_write+0x58/0xc0
do_syscall_64+0x6c/0x1f0
return_from_SYSCALL_64+0x0/0x7a
-> #1 (microcode_mutex){+.+.+.}:
lock_acquire+0xc9/0x230
__mutex_lock+0x88/0x960
mutex_lock_nested+0x1b/0x20
microcode_init+0xbb/0x208
do_one_initcall+0x51/0x1a9
kernel_init_freeable+0x208/0x2a7
kernel_init+0xe/0x104
ret_from_fork+0x2a/0x40
-> #0 (cpu_hotplug_lock.rw_sem){++++++}:
__lock_acquire+0x153c/0x1550
lock_acquire+0xc9/0x230
cpus_read_lock+0x4b/0x90
drain_all_stock.part.35+0x18/0x140
try_charge+0x3ab/0x6e0
mem_cgroup_try_charge+0x7f/0x2c0
shmem_getpage_gfp+0x25f/0x1050
shmem_fault+0x96/0x200
__do_fault+0x1e/0xa0
__handle_mm_fault+0x9c3/0xe00
handle_mm_fault+0x16e/0x380
__do_page_fault+0x24a/0x530
do_page_fault+0x30/0x80
page_fault+0x28/0x30
other info that might help us debug this:
Chain exists of:
cpu_hotplug_lock.rw_sem --> &type->i_mutex_dir_key#3 --> &mm->mmap_sem
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&mm->mmap_sem);
lock(&type->i_mutex_dir_key#3);
lock(&mm->mmap_sem);
lock(cpu_hotplug_lock.rw_sem);
*** DEADLOCK ***
2 locks held by a.out/4771:
#0: (&mm->mmap_sem){++++++}, at: [<ffffffff8106eb35>] __do_page_fault+0x175/0x530
#1: (percpu_charge_mutex){+.+...}, at: [<ffffffff812b4c97>] try_charge+0x397/0x6e0
The problem is very similar to the one fixed by commit a459eeb7b852
("mm, page_alloc: do not depend on cpu hotplug locks inside the
allocator"). We are taking hotplug locks while we can be sitting on top
of basically arbitrary locks. This just calls for problems.
We can get rid of {get,put}_online_cpus, fortunately. We do not have to
be worried about races with memory hotplug because drain_local_stock,
which is called from both the WQ draining and the memory hotplug
contexts, is always operating on the local cpu stock with IRQs disabled.
The only thing to be careful about is that the target memcg doesn't
vanish while we are still in drain_all_stock so take a reference on it.
Link: http://lkml.kernel.org/r/20170913090023.28322-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Artem Savkov <asavkov@redhat.com>
Tested-by: Artem Savkov <asavkov@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-10-04 06:14:53 +07:00
|
|
|
/*
|
|
|
|
* Notify other cpus that system-wide "drain" is running
|
|
|
|
* We do not care about races with the cpu hotplug because cpu down
|
|
|
|
* as well as workers from this path always operate on the local
|
|
|
|
* per-cpu data. CPU up doesn't touch memcg_stock at all.
|
|
|
|
*/
|
2011-08-26 05:59:07 +07:00
|
|
|
curcpu = get_cpu();
|
2009-12-16 07:47:08 +07:00
|
|
|
for_each_online_cpu(cpu) {
|
|
|
|
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
|
2011-11-03 03:38:15 +07:00
|
|
|
struct mem_cgroup *memcg;
|
mm: memcontrol: switch to rcu protection in drain_all_stock()
Commit 72f0184c8a00 ("mm, memcg: remove hotplug locking from try_charge")
introduced css_tryget()/css_put() calls in drain_all_stock(), which are
supposed to protect the target memory cgroup from being released during
the mem_cgroup_is_descendant() call.
However, it's not completely safe. In theory, memcg can go away between
reading stock->cached pointer and calling css_tryget().
This can happen if drain_all_stock() races with drain_local_stock()
performed on the remote cpu as a result of a work, scheduled by the
previous invocation of drain_all_stock().
The race is a bit theoretical and there are few chances to trigger it, but
the current code looks a bit confusing, so it makes sense to fix it
anyway. The code looks like as if css_tryget() and css_put() are used to
protect stocks drainage. It's not necessary because stocked pages are
holding references to the cached cgroup. And it obviously won't work for
works, scheduled on other cpus.
So, let's read the stock->cached pointer and evaluate the memory cgroup
inside a rcu read section, and get rid of css_tryget()/css_put() calls.
Link: http://lkml.kernel.org/r/20190802192241.3253165-1-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-24 05:34:58 +07:00
|
|
|
bool flush = false;
|
2011-06-16 05:08:45 +07:00
|
|
|
|
mm: memcontrol: switch to rcu protection in drain_all_stock()
Commit 72f0184c8a00 ("mm, memcg: remove hotplug locking from try_charge")
introduced css_tryget()/css_put() calls in drain_all_stock(), which are
supposed to protect the target memory cgroup from being released during
the mem_cgroup_is_descendant() call.
However, it's not completely safe. In theory, memcg can go away between
reading stock->cached pointer and calling css_tryget().
This can happen if drain_all_stock() races with drain_local_stock()
performed on the remote cpu as a result of a work, scheduled by the
previous invocation of drain_all_stock().
The race is a bit theoretical and there are few chances to trigger it, but
the current code looks a bit confusing, so it makes sense to fix it
anyway. The code looks like as if css_tryget() and css_put() are used to
protect stocks drainage. It's not necessary because stocked pages are
holding references to the cached cgroup. And it obviously won't work for
works, scheduled on other cpus.
So, let's read the stock->cached pointer and evaluate the memory cgroup
inside a rcu read section, and get rid of css_tryget()/css_put() calls.
Link: http://lkml.kernel.org/r/20190802192241.3253165-1-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-24 05:34:58 +07:00
|
|
|
rcu_read_lock();
|
2011-11-03 03:38:15 +07:00
|
|
|
memcg = stock->cached;
|
mm: memcontrol: switch to rcu protection in drain_all_stock()
Commit 72f0184c8a00 ("mm, memcg: remove hotplug locking from try_charge")
introduced css_tryget()/css_put() calls in drain_all_stock(), which are
supposed to protect the target memory cgroup from being released during
the mem_cgroup_is_descendant() call.
However, it's not completely safe. In theory, memcg can go away between
reading stock->cached pointer and calling css_tryget().
This can happen if drain_all_stock() races with drain_local_stock()
performed on the remote cpu as a result of a work, scheduled by the
previous invocation of drain_all_stock().
The race is a bit theoretical and there are few chances to trigger it, but
the current code looks a bit confusing, so it makes sense to fix it
anyway. The code looks like as if css_tryget() and css_put() are used to
protect stocks drainage. It's not necessary because stocked pages are
holding references to the cached cgroup. And it obviously won't work for
works, scheduled on other cpus.
So, let's read the stock->cached pointer and evaluate the memory cgroup
inside a rcu read section, and get rid of css_tryget()/css_put() calls.
Link: http://lkml.kernel.org/r/20190802192241.3253165-1-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-24 05:34:58 +07:00
|
|
|
if (memcg && stock->nr_pages &&
|
|
|
|
mem_cgroup_is_descendant(memcg, root_memcg))
|
|
|
|
flush = true;
|
2020-08-07 13:20:49 +07:00
|
|
|
if (obj_stock_flush_required(stock, root_memcg))
|
|
|
|
flush = true;
|
mm: memcontrol: switch to rcu protection in drain_all_stock()
Commit 72f0184c8a00 ("mm, memcg: remove hotplug locking from try_charge")
introduced css_tryget()/css_put() calls in drain_all_stock(), which are
supposed to protect the target memory cgroup from being released during
the mem_cgroup_is_descendant() call.
However, it's not completely safe. In theory, memcg can go away between
reading stock->cached pointer and calling css_tryget().
This can happen if drain_all_stock() races with drain_local_stock()
performed on the remote cpu as a result of a work, scheduled by the
previous invocation of drain_all_stock().
The race is a bit theoretical and there are few chances to trigger it, but
the current code looks a bit confusing, so it makes sense to fix it
anyway. The code looks like as if css_tryget() and css_put() are used to
protect stocks drainage. It's not necessary because stocked pages are
holding references to the cached cgroup. And it obviously won't work for
works, scheduled on other cpus.
So, let's read the stock->cached pointer and evaluate the memory cgroup
inside a rcu read section, and get rid of css_tryget()/css_put() calls.
Link: http://lkml.kernel.org/r/20190802192241.3253165-1-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-24 05:34:58 +07:00
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
if (flush &&
|
|
|
|
!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
|
2011-07-27 06:08:27 +07:00
|
|
|
if (cpu == curcpu)
|
|
|
|
drain_local_stock(&stock->work);
|
|
|
|
else
|
|
|
|
schedule_work_on(cpu, &stock->work);
|
|
|
|
}
|
2009-12-16 07:47:08 +07:00
|
|
|
}
|
2011-08-26 05:59:07 +07:00
|
|
|
put_cpu();
|
2011-08-09 16:56:26 +07:00
|
|
|
mutex_unlock(&percpu_charge_mutex);
|
2009-12-16 07:47:08 +07:00
|
|
|
}
|
|
|
|
|
2016-11-03 21:49:59 +07:00
|
|
|
static int memcg_hotplug_cpu_dead(unsigned int cpu)
|
2009-12-16 07:47:08 +07:00
|
|
|
{
|
|
|
|
struct memcg_stock_pcp *stock;
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
struct mem_cgroup *memcg, *mi;
|
2009-12-16 07:47:08 +07:00
|
|
|
|
|
|
|
stock = &per_cpu(memcg_stock, cpu);
|
|
|
|
drain_stock(stock);
|
2018-02-01 07:16:45 +07:00
|
|
|
|
|
|
|
for_each_mem_cgroup(memcg) {
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < MEMCG_NR_STAT; i++) {
|
|
|
|
int nid;
|
|
|
|
long x;
|
|
|
|
|
2019-05-15 05:46:57 +07:00
|
|
|
x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
|
mm: memcontrol: don't batch updates of local VM stats and events
The kernel test robot noticed a 26% will-it-scale pagefault regression
from commit 42a300353577 ("mm: memcontrol: fix recursive statistics
correctness & scalabilty"). This appears to be caused by bouncing the
additional cachelines from the new hierarchical statistics counters.
We can fix this by getting rid of the batched local counters instead.
Originally, there were *only* group-local counters, and they were fully
maintained per cpu. A reader of a stats file high up in the cgroup tree
would have to walk the entire subtree and collect each level's per-cpu
counters to get the recursive view. This was prohibitively expensive,
and so we switched to per-cpu batched updates of the local counters
during a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting"), reducing the complexity from nr_subgroups *
nr_cpus to nr_subgroups.
With growing machines and cgroup trees, the tree walk itself became too
expensive for monitoring top-level groups, and this is when the culprit
patch added hierarchy counters on each cgroup level. When the per-cpu
batch size would be reached, both the local and the hierarchy counters
would get batch-updated from the per-cpu delta simultaneously.
This makes local and hierarchical counter reads blazingly fast, but it
unfortunately makes the write-side too cache line intense.
Since local counter reads were never a problem - we only centralized
them to accelerate the hierarchy walk - and use of the local counters
are becoming rarer due to replacement with hierarchical views (ongoing
rework in the page reclaim and workingset code), we can make those local
counters unbatched per-cpu counters again.
The scheme will then be as such:
when a memcg statistic changes, the writer will:
- update the local counter (per-cpu)
- update the batch counter (per-cpu). If the batch is full:
- spill the batch into the group's atomic_t
- spill the batch into all ancestors' atomic_ts
- empty out the batch counter (per-cpu)
when a local memcg counter is read, the reader will:
- collect the local counter from all cpus
when a hiearchy memcg counter is read, the reader will:
- read the atomic_t
We might be able to simplify this further and make the recursive
counters unbatched per-cpu counters as well (batch upward propagation,
but leave per-cpu collection to the readers), but that will require a
more in-depth analysis and testing of all the callsites. Deal with the
immediate regression for now.
Link: http://lkml.kernel.org/r/20190521151647.GB2870@cmpxchg.org
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: kernel test robot <rong.a.chen@intel.com>
Tested-by: kernel test robot <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-14 05:55:46 +07:00
|
|
|
if (x)
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
|
|
|
|
atomic_long_add(x, &memcg->vmstats[i]);
|
2018-02-01 07:16:45 +07:00
|
|
|
|
|
|
|
if (i >= NR_VM_NODE_STAT_ITEMS)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
for_each_node(nid) {
|
|
|
|
struct mem_cgroup_per_node *pn;
|
|
|
|
|
|
|
|
pn = mem_cgroup_nodeinfo(memcg, nid);
|
|
|
|
x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
|
mm: memcontrol: don't batch updates of local VM stats and events
The kernel test robot noticed a 26% will-it-scale pagefault regression
from commit 42a300353577 ("mm: memcontrol: fix recursive statistics
correctness & scalabilty"). This appears to be caused by bouncing the
additional cachelines from the new hierarchical statistics counters.
We can fix this by getting rid of the batched local counters instead.
Originally, there were *only* group-local counters, and they were fully
maintained per cpu. A reader of a stats file high up in the cgroup tree
would have to walk the entire subtree and collect each level's per-cpu
counters to get the recursive view. This was prohibitively expensive,
and so we switched to per-cpu batched updates of the local counters
during a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting"), reducing the complexity from nr_subgroups *
nr_cpus to nr_subgroups.
With growing machines and cgroup trees, the tree walk itself became too
expensive for monitoring top-level groups, and this is when the culprit
patch added hierarchy counters on each cgroup level. When the per-cpu
batch size would be reached, both the local and the hierarchy counters
would get batch-updated from the per-cpu delta simultaneously.
This makes local and hierarchical counter reads blazingly fast, but it
unfortunately makes the write-side too cache line intense.
Since local counter reads were never a problem - we only centralized
them to accelerate the hierarchy walk - and use of the local counters
are becoming rarer due to replacement with hierarchical views (ongoing
rework in the page reclaim and workingset code), we can make those local
counters unbatched per-cpu counters again.
The scheme will then be as such:
when a memcg statistic changes, the writer will:
- update the local counter (per-cpu)
- update the batch counter (per-cpu). If the batch is full:
- spill the batch into the group's atomic_t
- spill the batch into all ancestors' atomic_ts
- empty out the batch counter (per-cpu)
when a local memcg counter is read, the reader will:
- collect the local counter from all cpus
when a hiearchy memcg counter is read, the reader will:
- read the atomic_t
We might be able to simplify this further and make the recursive
counters unbatched per-cpu counters as well (batch upward propagation,
but leave per-cpu collection to the readers), but that will require a
more in-depth analysis and testing of all the callsites. Deal with the
immediate regression for now.
Link: http://lkml.kernel.org/r/20190521151647.GB2870@cmpxchg.org
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: kernel test robot <rong.a.chen@intel.com>
Tested-by: kernel test robot <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-14 05:55:46 +07:00
|
|
|
if (x)
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
do {
|
|
|
|
atomic_long_add(x, &pn->lruvec_stat[i]);
|
|
|
|
} while ((pn = parent_nodeinfo(pn, nid)));
|
2018-02-01 07:16:45 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-04-11 06:29:45 +07:00
|
|
|
for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
|
2018-02-01 07:16:45 +07:00
|
|
|
long x;
|
|
|
|
|
2019-05-15 05:46:57 +07:00
|
|
|
x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
|
mm: memcontrol: don't batch updates of local VM stats and events
The kernel test robot noticed a 26% will-it-scale pagefault regression
from commit 42a300353577 ("mm: memcontrol: fix recursive statistics
correctness & scalabilty"). This appears to be caused by bouncing the
additional cachelines from the new hierarchical statistics counters.
We can fix this by getting rid of the batched local counters instead.
Originally, there were *only* group-local counters, and they were fully
maintained per cpu. A reader of a stats file high up in the cgroup tree
would have to walk the entire subtree and collect each level's per-cpu
counters to get the recursive view. This was prohibitively expensive,
and so we switched to per-cpu batched updates of the local counters
during a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting"), reducing the complexity from nr_subgroups *
nr_cpus to nr_subgroups.
With growing machines and cgroup trees, the tree walk itself became too
expensive for monitoring top-level groups, and this is when the culprit
patch added hierarchy counters on each cgroup level. When the per-cpu
batch size would be reached, both the local and the hierarchy counters
would get batch-updated from the per-cpu delta simultaneously.
This makes local and hierarchical counter reads blazingly fast, but it
unfortunately makes the write-side too cache line intense.
Since local counter reads were never a problem - we only centralized
them to accelerate the hierarchy walk - and use of the local counters
are becoming rarer due to replacement with hierarchical views (ongoing
rework in the page reclaim and workingset code), we can make those local
counters unbatched per-cpu counters again.
The scheme will then be as such:
when a memcg statistic changes, the writer will:
- update the local counter (per-cpu)
- update the batch counter (per-cpu). If the batch is full:
- spill the batch into the group's atomic_t
- spill the batch into all ancestors' atomic_ts
- empty out the batch counter (per-cpu)
when a local memcg counter is read, the reader will:
- collect the local counter from all cpus
when a hiearchy memcg counter is read, the reader will:
- read the atomic_t
We might be able to simplify this further and make the recursive
counters unbatched per-cpu counters as well (batch upward propagation,
but leave per-cpu collection to the readers), but that will require a
more in-depth analysis and testing of all the callsites. Deal with the
immediate regression for now.
Link: http://lkml.kernel.org/r/20190521151647.GB2870@cmpxchg.org
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: kernel test robot <rong.a.chen@intel.com>
Tested-by: kernel test robot <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-14 05:55:46 +07:00
|
|
|
if (x)
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
|
|
|
|
atomic_long_add(x, &memcg->vmevents[i]);
|
2018-02-01 07:16:45 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-11-03 21:49:59 +07:00
|
|
|
return 0;
|
2009-12-16 07:47:08 +07:00
|
|
|
}
|
|
|
|
|
mm, memcg: reclaim more aggressively before high allocator throttling
Patch series "mm, memcg: reclaim harder before high throttling", v2.
This patch (of 2):
In Facebook production, we've seen cases where cgroups have been put into
allocator throttling even when they appear to have a lot of slack file
caches which should be trivially reclaimable.
Looking more closely, the problem is that we only try a single cgroup
reclaim walk for each return to usermode before calculating whether or not
we should throttle. This single attempt doesn't produce enough pressure
to shrink for cgroups with a rapidly growing amount of file caches prior
to entering allocator throttling.
As an example, we see that threads in an affected cgroup are stuck in
allocator throttling:
# for i in $(cat cgroup.threads); do
> grep over_high "/proc/$i/stack"
> done
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
...however, there is no I/O pressure reported by PSI, despite a lot of
slack file pages:
# cat memory.pressure
some avg10=78.50 avg60=84.99 avg300=84.53 total=5702440903
full avg10=78.50 avg60=84.99 avg300=84.53 total=5702116959
# cat io.pressure
some avg10=0.00 avg60=0.00 avg300=0.00 total=78051391
full avg10=0.00 avg60=0.00 avg300=0.00 total=78049640
# grep _file memory.stat
inactive_file 1370939392
active_file 661635072
This patch changes the behaviour to retry reclaim either until the current
task goes below the 10ms grace period, or we are making no reclaim
progress at all. In the latter case, we enter reclaim throttling as
before.
To a user, there's no intuitive reason for the reclaim behaviour to differ
from hitting memory.high as part of a new allocation, as opposed to
hitting memory.high because someone lowered its value. As such this also
brings an added benefit: it unifies the reclaim behaviour between the two.
There's precedent for this behaviour: we already do reclaim retries when
writing to memory.{high,max}, in max reclaim, and in the page allocator
itself.
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Link: http://lkml.kernel.org/r/cover.1594640214.git.chris@chrisdown.name
Link: http://lkml.kernel.org/r/a4e23b59e9ef499b575ae73a8120ee089b7d3373.1594640214.git.chris@chrisdown.name
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 13:21:54 +07:00
|
|
|
static unsigned long reclaim_high(struct mem_cgroup *memcg,
|
|
|
|
unsigned int nr_pages,
|
|
|
|
gfp_t gfp_mask)
|
2016-01-15 06:21:29 +07:00
|
|
|
{
|
mm, memcg: reclaim more aggressively before high allocator throttling
Patch series "mm, memcg: reclaim harder before high throttling", v2.
This patch (of 2):
In Facebook production, we've seen cases where cgroups have been put into
allocator throttling even when they appear to have a lot of slack file
caches which should be trivially reclaimable.
Looking more closely, the problem is that we only try a single cgroup
reclaim walk for each return to usermode before calculating whether or not
we should throttle. This single attempt doesn't produce enough pressure
to shrink for cgroups with a rapidly growing amount of file caches prior
to entering allocator throttling.
As an example, we see that threads in an affected cgroup are stuck in
allocator throttling:
# for i in $(cat cgroup.threads); do
> grep over_high "/proc/$i/stack"
> done
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
...however, there is no I/O pressure reported by PSI, despite a lot of
slack file pages:
# cat memory.pressure
some avg10=78.50 avg60=84.99 avg300=84.53 total=5702440903
full avg10=78.50 avg60=84.99 avg300=84.53 total=5702116959
# cat io.pressure
some avg10=0.00 avg60=0.00 avg300=0.00 total=78051391
full avg10=0.00 avg60=0.00 avg300=0.00 total=78049640
# grep _file memory.stat
inactive_file 1370939392
active_file 661635072
This patch changes the behaviour to retry reclaim either until the current
task goes below the 10ms grace period, or we are making no reclaim
progress at all. In the latter case, we enter reclaim throttling as
before.
To a user, there's no intuitive reason for the reclaim behaviour to differ
from hitting memory.high as part of a new allocation, as opposed to
hitting memory.high because someone lowered its value. As such this also
brings an added benefit: it unifies the reclaim behaviour between the two.
There's precedent for this behaviour: we already do reclaim retries when
writing to memory.{high,max}, in max reclaim, and in the page allocator
itself.
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Link: http://lkml.kernel.org/r/cover.1594640214.git.chris@chrisdown.name
Link: http://lkml.kernel.org/r/a4e23b59e9ef499b575ae73a8120ee089b7d3373.1594640214.git.chris@chrisdown.name
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 13:21:54 +07:00
|
|
|
unsigned long nr_reclaimed = 0;
|
|
|
|
|
2016-01-15 06:21:29 +07:00
|
|
|
do {
|
2020-08-07 13:22:15 +07:00
|
|
|
unsigned long pflags;
|
|
|
|
|
2020-06-02 11:49:49 +07:00
|
|
|
if (page_counter_read(&memcg->memory) <=
|
|
|
|
READ_ONCE(memcg->memory.high))
|
2016-01-15 06:21:29 +07:00
|
|
|
continue;
|
2020-08-07 13:22:15 +07:00
|
|
|
|
2018-04-11 06:29:45 +07:00
|
|
|
memcg_memory_event(memcg, MEMCG_HIGH);
|
2020-08-07 13:22:15 +07:00
|
|
|
|
|
|
|
psi_memstall_enter(&pflags);
|
mm, memcg: reclaim more aggressively before high allocator throttling
Patch series "mm, memcg: reclaim harder before high throttling", v2.
This patch (of 2):
In Facebook production, we've seen cases where cgroups have been put into
allocator throttling even when they appear to have a lot of slack file
caches which should be trivially reclaimable.
Looking more closely, the problem is that we only try a single cgroup
reclaim walk for each return to usermode before calculating whether or not
we should throttle. This single attempt doesn't produce enough pressure
to shrink for cgroups with a rapidly growing amount of file caches prior
to entering allocator throttling.
As an example, we see that threads in an affected cgroup are stuck in
allocator throttling:
# for i in $(cat cgroup.threads); do
> grep over_high "/proc/$i/stack"
> done
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
...however, there is no I/O pressure reported by PSI, despite a lot of
slack file pages:
# cat memory.pressure
some avg10=78.50 avg60=84.99 avg300=84.53 total=5702440903
full avg10=78.50 avg60=84.99 avg300=84.53 total=5702116959
# cat io.pressure
some avg10=0.00 avg60=0.00 avg300=0.00 total=78051391
full avg10=0.00 avg60=0.00 avg300=0.00 total=78049640
# grep _file memory.stat
inactive_file 1370939392
active_file 661635072
This patch changes the behaviour to retry reclaim either until the current
task goes below the 10ms grace period, or we are making no reclaim
progress at all. In the latter case, we enter reclaim throttling as
before.
To a user, there's no intuitive reason for the reclaim behaviour to differ
from hitting memory.high as part of a new allocation, as opposed to
hitting memory.high because someone lowered its value. As such this also
brings an added benefit: it unifies the reclaim behaviour between the two.
There's precedent for this behaviour: we already do reclaim retries when
writing to memory.{high,max}, in max reclaim, and in the page allocator
itself.
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Link: http://lkml.kernel.org/r/cover.1594640214.git.chris@chrisdown.name
Link: http://lkml.kernel.org/r/a4e23b59e9ef499b575ae73a8120ee089b7d3373.1594640214.git.chris@chrisdown.name
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 13:21:54 +07:00
|
|
|
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
|
|
|
|
gfp_mask, true);
|
2020-08-07 13:22:15 +07:00
|
|
|
psi_memstall_leave(&pflags);
|
2020-04-07 10:03:30 +07:00
|
|
|
} while ((memcg = parent_mem_cgroup(memcg)) &&
|
|
|
|
!mem_cgroup_is_root(memcg));
|
mm, memcg: reclaim more aggressively before high allocator throttling
Patch series "mm, memcg: reclaim harder before high throttling", v2.
This patch (of 2):
In Facebook production, we've seen cases where cgroups have been put into
allocator throttling even when they appear to have a lot of slack file
caches which should be trivially reclaimable.
Looking more closely, the problem is that we only try a single cgroup
reclaim walk for each return to usermode before calculating whether or not
we should throttle. This single attempt doesn't produce enough pressure
to shrink for cgroups with a rapidly growing amount of file caches prior
to entering allocator throttling.
As an example, we see that threads in an affected cgroup are stuck in
allocator throttling:
# for i in $(cat cgroup.threads); do
> grep over_high "/proc/$i/stack"
> done
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
...however, there is no I/O pressure reported by PSI, despite a lot of
slack file pages:
# cat memory.pressure
some avg10=78.50 avg60=84.99 avg300=84.53 total=5702440903
full avg10=78.50 avg60=84.99 avg300=84.53 total=5702116959
# cat io.pressure
some avg10=0.00 avg60=0.00 avg300=0.00 total=78051391
full avg10=0.00 avg60=0.00 avg300=0.00 total=78049640
# grep _file memory.stat
inactive_file 1370939392
active_file 661635072
This patch changes the behaviour to retry reclaim either until the current
task goes below the 10ms grace period, or we are making no reclaim
progress at all. In the latter case, we enter reclaim throttling as
before.
To a user, there's no intuitive reason for the reclaim behaviour to differ
from hitting memory.high as part of a new allocation, as opposed to
hitting memory.high because someone lowered its value. As such this also
brings an added benefit: it unifies the reclaim behaviour between the two.
There's precedent for this behaviour: we already do reclaim retries when
writing to memory.{high,max}, in max reclaim, and in the page allocator
itself.
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Link: http://lkml.kernel.org/r/cover.1594640214.git.chris@chrisdown.name
Link: http://lkml.kernel.org/r/a4e23b59e9ef499b575ae73a8120ee089b7d3373.1594640214.git.chris@chrisdown.name
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 13:21:54 +07:00
|
|
|
|
|
|
|
return nr_reclaimed;
|
2016-01-15 06:21:29 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void high_work_func(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
|
|
|
memcg = container_of(work, struct mem_cgroup, high_work);
|
2018-02-01 07:16:45 +07:00
|
|
|
reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
|
2016-01-15 06:21:29 +07:00
|
|
|
}
|
|
|
|
|
mm, memcg: throttle allocators when failing reclaim over memory.high
We're trying to use memory.high to limit workloads, but have found that
containment can frequently fail completely and cause OOM situations
outside of the cgroup. This happens especially with swap space -- either
when none is configured, or swap is full. These failures often also don't
have enough warning to allow one to react, whether for a human or for a
daemon monitoring PSI.
Here is output from a simple program showing how long it takes in usec
(column 2) to allocate a megabyte of anonymous memory (column 1) when a
cgroup is already beyond its memory high setting, and no swap is
available:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1035
96 1038
97 1000
98 1036
99 1048
100 1590
101 1968
102 1776
103 1863
104 1757
105 1921
106 1893
107 1760
108 1748
109 1843
110 1716
111 1924
112 1776
113 1831
114 1766
115 1836
116 1588
117 1912
118 1802
119 1857
120 1731
[...]
[System OOM in 2-3 seconds]
The delay does go up extremely marginally past the 100MB memory.high
threshold, as now we spend time scanning before returning to usermode, but
it's nowhere near enough to contain growth. It also doesn't get worse the
more pages you have, since it only considers nr_pages.
The current situation goes against both the expectations of users of
memory.high, and our intentions as cgroup v2 developers. In
cgroup-v2.txt, we claim that we will throttle and only under "extreme
conditions" will memory.high protection be breached. Likewise, cgroup v2
users generally also expect that memory.high should throttle workloads as
they exceed their high threshold. However, as seen above, this isn't
always how it works in practice -- even on banal setups like those with no
swap, or where swap has become exhausted, we can end up with memory.high
being breached and us having no weapons left in our arsenal to combat
runaway growth with, since reclaim is futile.
It's also hard for system monitoring software or users to tell how bad the
situation is, as "high" events for the memcg may in some cases be benign,
and in others be catastrophic. The current status quo is that we fail
containment in a way that doesn't provide any advance warning that things
are about to go horribly wrong (for example, we are about to invoke the
kernel OOM killer).
This patch introduces explicit throttling when reclaim is failing to keep
memcg size contained at the memory.high setting. It does so by applying
an exponential delay curve derived from the memcg's overage compared to
memory.high. In the normal case where the memcg is either below or only
marginally over its memory.high setting, no throttling will be performed.
This composes well with system health monitoring and remediation, as these
allocator delays are factored into PSI's memory pressure calculations.
This both creates a mechanism system administrators or applications
consuming the PSI interface to trivially see that the memcg in question is
struggling and use that to make more reasonable decisions, and permits
them enough time to act. Either of these can act with significantly more
nuance than that we can provide using the system OOM killer.
This is a similar idea to memory.oom_control in cgroup v1 which would put
the cgroup to sleep if the threshold was violated, but it's also
significantly improved as it results in visible memory pressure, and also
doesn't schedule indefinitely, which previously made tracing and other
introspection difficult (ie. it's clamped at 2*HZ per allocation through
MEMCG_MAX_HIGH_DELAY_JIFFIES).
Contrast the previous results with a kernel with this patch:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1002
96 1000
97 1002
98 1003
99 1000
100 1043
101 84724
102 330628
103 610511
104 1016265
105 1503969
106 2391692
107 2872061
108 3248003
109 4791904
110 5759832
111 6912509
112 8127818
113 9472203
114 12287622
115 12480079
116 14144008
117 15808029
118 16384500
119 16383242
120 16384979
[...]
As you can see, in the normal case, memory allocation takes around 1000
usec. However, as we exceed our memory.high, things start to increase
exponentially, but fairly leniently at first. Our first megabyte over
memory.high takes us 0.16 seconds, then the next is 0.46 seconds, then the
next is almost an entire second. This gets worse until we reach our
eventual 2*HZ clamp per batch, resulting in 16 seconds per megabyte.
However, this is still making forward progress, so permits tracing or
further analysis with programs like GDB.
We use an exponential curve for our delay penalty for a few reasons:
1. We run mem_cgroup_handle_over_high to potentially do reclaim after
we've already performed allocations, which means that temporarily
going over memory.high by a small amount may be perfectly legitimate,
even for compliant workloads. We don't want to unduly penalise such
cases.
2. An exponential curve (as opposed to a static or linear delay) allows
ramping up memory pressure stats more gradually, which can be useful
to work out that you have set memory.high too low, without destroying
application performance entirely.
This patch expands on earlier work by Johannes Weiner. Thanks!
[akpm@linux-foundation.org: fix max() warning]
[akpm@linux-foundation.org: fix __udivdi3 ref on 32-bit]
[akpm@linux-foundation.org: fix it even more]
[chris@chrisdown.name: fix 64-bit divide even more]
Link: http://lkml.kernel.org/r/20190723180700.GA29459@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-24 05:34:55 +07:00
|
|
|
/*
|
|
|
|
* Clamp the maximum sleep time per allocation batch to 2 seconds. This is
|
|
|
|
* enough to still cause a significant slowdown in most cases, while still
|
|
|
|
* allowing diagnostics and tracing to proceed without becoming stuck.
|
|
|
|
*/
|
|
|
|
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When calculating the delay, we use these either side of the exponentiation to
|
|
|
|
* maintain precision and scale to a reasonable number of jiffies (see the table
|
|
|
|
* below.
|
|
|
|
*
|
|
|
|
* - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
|
|
|
|
* overage ratio to a delay.
|
2020-08-12 08:33:02 +07:00
|
|
|
* - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
|
mm, memcg: throttle allocators when failing reclaim over memory.high
We're trying to use memory.high to limit workloads, but have found that
containment can frequently fail completely and cause OOM situations
outside of the cgroup. This happens especially with swap space -- either
when none is configured, or swap is full. These failures often also don't
have enough warning to allow one to react, whether for a human or for a
daemon monitoring PSI.
Here is output from a simple program showing how long it takes in usec
(column 2) to allocate a megabyte of anonymous memory (column 1) when a
cgroup is already beyond its memory high setting, and no swap is
available:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1035
96 1038
97 1000
98 1036
99 1048
100 1590
101 1968
102 1776
103 1863
104 1757
105 1921
106 1893
107 1760
108 1748
109 1843
110 1716
111 1924
112 1776
113 1831
114 1766
115 1836
116 1588
117 1912
118 1802
119 1857
120 1731
[...]
[System OOM in 2-3 seconds]
The delay does go up extremely marginally past the 100MB memory.high
threshold, as now we spend time scanning before returning to usermode, but
it's nowhere near enough to contain growth. It also doesn't get worse the
more pages you have, since it only considers nr_pages.
The current situation goes against both the expectations of users of
memory.high, and our intentions as cgroup v2 developers. In
cgroup-v2.txt, we claim that we will throttle and only under "extreme
conditions" will memory.high protection be breached. Likewise, cgroup v2
users generally also expect that memory.high should throttle workloads as
they exceed their high threshold. However, as seen above, this isn't
always how it works in practice -- even on banal setups like those with no
swap, or where swap has become exhausted, we can end up with memory.high
being breached and us having no weapons left in our arsenal to combat
runaway growth with, since reclaim is futile.
It's also hard for system monitoring software or users to tell how bad the
situation is, as "high" events for the memcg may in some cases be benign,
and in others be catastrophic. The current status quo is that we fail
containment in a way that doesn't provide any advance warning that things
are about to go horribly wrong (for example, we are about to invoke the
kernel OOM killer).
This patch introduces explicit throttling when reclaim is failing to keep
memcg size contained at the memory.high setting. It does so by applying
an exponential delay curve derived from the memcg's overage compared to
memory.high. In the normal case where the memcg is either below or only
marginally over its memory.high setting, no throttling will be performed.
This composes well with system health monitoring and remediation, as these
allocator delays are factored into PSI's memory pressure calculations.
This both creates a mechanism system administrators or applications
consuming the PSI interface to trivially see that the memcg in question is
struggling and use that to make more reasonable decisions, and permits
them enough time to act. Either of these can act with significantly more
nuance than that we can provide using the system OOM killer.
This is a similar idea to memory.oom_control in cgroup v1 which would put
the cgroup to sleep if the threshold was violated, but it's also
significantly improved as it results in visible memory pressure, and also
doesn't schedule indefinitely, which previously made tracing and other
introspection difficult (ie. it's clamped at 2*HZ per allocation through
MEMCG_MAX_HIGH_DELAY_JIFFIES).
Contrast the previous results with a kernel with this patch:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1002
96 1000
97 1002
98 1003
99 1000
100 1043
101 84724
102 330628
103 610511
104 1016265
105 1503969
106 2391692
107 2872061
108 3248003
109 4791904
110 5759832
111 6912509
112 8127818
113 9472203
114 12287622
115 12480079
116 14144008
117 15808029
118 16384500
119 16383242
120 16384979
[...]
As you can see, in the normal case, memory allocation takes around 1000
usec. However, as we exceed our memory.high, things start to increase
exponentially, but fairly leniently at first. Our first megabyte over
memory.high takes us 0.16 seconds, then the next is 0.46 seconds, then the
next is almost an entire second. This gets worse until we reach our
eventual 2*HZ clamp per batch, resulting in 16 seconds per megabyte.
However, this is still making forward progress, so permits tracing or
further analysis with programs like GDB.
We use an exponential curve for our delay penalty for a few reasons:
1. We run mem_cgroup_handle_over_high to potentially do reclaim after
we've already performed allocations, which means that temporarily
going over memory.high by a small amount may be perfectly legitimate,
even for compliant workloads. We don't want to unduly penalise such
cases.
2. An exponential curve (as opposed to a static or linear delay) allows
ramping up memory pressure stats more gradually, which can be useful
to work out that you have set memory.high too low, without destroying
application performance entirely.
This patch expands on earlier work by Johannes Weiner. Thanks!
[akpm@linux-foundation.org: fix max() warning]
[akpm@linux-foundation.org: fix __udivdi3 ref on 32-bit]
[akpm@linux-foundation.org: fix it even more]
[chris@chrisdown.name: fix 64-bit divide even more]
Link: http://lkml.kernel.org/r/20190723180700.GA29459@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-24 05:34:55 +07:00
|
|
|
* proposed penalty in order to reduce to a reasonable number of jiffies, and
|
|
|
|
* to produce a reasonable delay curve.
|
|
|
|
*
|
|
|
|
* MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
|
|
|
|
* reasonable delay curve compared to precision-adjusted overage, not
|
|
|
|
* penalising heavily at first, but still making sure that growth beyond the
|
|
|
|
* limit penalises misbehaviour cgroups by slowing them down exponentially. For
|
|
|
|
* example, with a high of 100 megabytes:
|
|
|
|
*
|
|
|
|
* +-------+------------------------+
|
|
|
|
* | usage | time to allocate in ms |
|
|
|
|
* +-------+------------------------+
|
|
|
|
* | 100M | 0 |
|
|
|
|
* | 101M | 6 |
|
|
|
|
* | 102M | 25 |
|
|
|
|
* | 103M | 57 |
|
|
|
|
* | 104M | 102 |
|
|
|
|
* | 105M | 159 |
|
|
|
|
* | 106M | 230 |
|
|
|
|
* | 107M | 313 |
|
|
|
|
* | 108M | 409 |
|
|
|
|
* | 109M | 518 |
|
|
|
|
* | 110M | 639 |
|
|
|
|
* | 111M | 774 |
|
|
|
|
* | 112M | 921 |
|
|
|
|
* | 113M | 1081 |
|
|
|
|
* | 114M | 1254 |
|
|
|
|
* | 115M | 1439 |
|
|
|
|
* | 116M | 1638 |
|
|
|
|
* | 117M | 1849 |
|
|
|
|
* | 118M | 2000 |
|
|
|
|
* | 119M | 2000 |
|
|
|
|
* | 120M | 2000 |
|
|
|
|
* +-------+------------------------+
|
|
|
|
*/
|
|
|
|
#define MEMCG_DELAY_PRECISION_SHIFT 20
|
|
|
|
#define MEMCG_DELAY_SCALING_SHIFT 14
|
|
|
|
|
2020-06-02 11:49:42 +07:00
|
|
|
static u64 calculate_overage(unsigned long usage, unsigned long high)
|
memcg: punt high overage reclaim to return-to-userland path
Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped. If a process performs only
speculative allocations, it can blow way past the high limit. This is
actually easily reproducible by simply doing "find /". slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.
This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path. If try_charge() detects that high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.
As long as kernel doesn't have a run-away allocation spree, this should
provide enough protection while making kmemcg behave more consistently.
It also has the following benefits.
- All over-high reclaims can use GFP_KERNEL regardless of the specific
gfp mask in use, e.g. GFP_NOFS, when the limit was breached.
- It copes with prio inversion. Previously, a low-prio task with
small memory.high might perform over-high reclaim with a bunch of
locks held. If a higher prio task needed any of these locks, it
would have to wait until the low prio task finished reclaim and
released the locks. By handing over-high reclaim to the task exit
path this issue can be avoided.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:11 +07:00
|
|
|
{
|
2020-06-02 11:49:42 +07:00
|
|
|
u64 overage;
|
memcg: punt high overage reclaim to return-to-userland path
Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped. If a process performs only
speculative allocations, it can blow way past the high limit. This is
actually easily reproducible by simply doing "find /". slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.
This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path. If try_charge() detects that high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.
As long as kernel doesn't have a run-away allocation spree, this should
provide enough protection while making kmemcg behave more consistently.
It also has the following benefits.
- All over-high reclaims can use GFP_KERNEL regardless of the specific
gfp mask in use, e.g. GFP_NOFS, when the limit was breached.
- It copes with prio inversion. Previously, a low-prio task with
small memory.high might perform over-high reclaim with a bunch of
locks held. If a higher prio task needed any of these locks, it
would have to wait until the low prio task finished reclaim and
released the locks. By handing over-high reclaim to the task exit
path this issue can be avoided.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:11 +07:00
|
|
|
|
2020-06-02 11:49:42 +07:00
|
|
|
if (usage <= high)
|
|
|
|
return 0;
|
2020-03-22 08:22:23 +07:00
|
|
|
|
2020-06-02 11:49:42 +07:00
|
|
|
/*
|
|
|
|
* Prevent division by 0 in overage calculation by acting as if
|
|
|
|
* it was a threshold of 1 page
|
|
|
|
*/
|
|
|
|
high = max(high, 1UL);
|
mm, memcg: do not high throttle allocators based on wraparound
If a cgroup violates its memory.high constraints, we may end up unduly
penalising it. For example, for the following hierarchy:
A: max high, 20 usage
A/B: 9 high, 10 usage
A/C: max high, 10 usage
We would end up doing the following calculation below when calculating
high delay for A/B:
A/B: 10 - 9 = 1...
A: 20 - PAGE_COUNTER_MAX = 21, so set max_overage to 21.
This gets worse with higher disparities in usage in the parent.
I have no idea how this disappeared from the final version of the patch,
but it is certainly Not Good(tm). This wasn't obvious in testing because,
for a simple cgroup hierarchy with only one child, the result is usually
roughly the same. It's only in more complex hierarchies that things go
really awry (although still, the effects are limited to a maximum of 2
seconds in schedule_timeout_killable at a maximum).
[chris@chrisdown.name: changelog]
Fixes: e26733e0d0ec ("mm, memcg: throttle allocators based on ancestral memory.high")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: <stable@vger.kernel.org> [5.4.x]
Link: http://lkml.kernel.org/r/20200331152424.GA1019937@chrisdown.name
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-11 04:32:19 +07:00
|
|
|
|
2020-06-02 11:49:42 +07:00
|
|
|
overage = usage - high;
|
|
|
|
overage <<= MEMCG_DELAY_PRECISION_SHIFT;
|
|
|
|
return div64_u64(overage, high);
|
|
|
|
}
|
2020-03-22 08:22:23 +07:00
|
|
|
|
2020-06-02 11:49:42 +07:00
|
|
|
static u64 mem_find_max_overage(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
u64 overage, max_overage = 0;
|
2020-03-22 08:22:23 +07:00
|
|
|
|
2020-06-02 11:49:42 +07:00
|
|
|
do {
|
|
|
|
overage = calculate_overage(page_counter_read(&memcg->memory),
|
2020-06-02 11:49:49 +07:00
|
|
|
READ_ONCE(memcg->memory.high));
|
2020-06-02 11:49:42 +07:00
|
|
|
max_overage = max(overage, max_overage);
|
2020-03-22 08:22:23 +07:00
|
|
|
} while ((memcg = parent_mem_cgroup(memcg)) &&
|
|
|
|
!mem_cgroup_is_root(memcg));
|
|
|
|
|
2020-06-02 11:49:42 +07:00
|
|
|
return max_overage;
|
|
|
|
}
|
|
|
|
|
2020-06-02 11:49:52 +07:00
|
|
|
static u64 swap_find_max_overage(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
u64 overage, max_overage = 0;
|
|
|
|
|
|
|
|
do {
|
|
|
|
overage = calculate_overage(page_counter_read(&memcg->swap),
|
|
|
|
READ_ONCE(memcg->swap.high));
|
|
|
|
if (overage)
|
|
|
|
memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
|
|
|
|
max_overage = max(overage, max_overage);
|
|
|
|
} while ((memcg = parent_mem_cgroup(memcg)) &&
|
|
|
|
!mem_cgroup_is_root(memcg));
|
|
|
|
|
|
|
|
return max_overage;
|
|
|
|
}
|
|
|
|
|
2020-06-02 11:49:42 +07:00
|
|
|
/*
|
|
|
|
* Get the number of jiffies that we should penalise a mischievous cgroup which
|
|
|
|
* is exceeding its memory.high by checking both it and its ancestors.
|
|
|
|
*/
|
|
|
|
static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
|
|
|
|
unsigned int nr_pages,
|
|
|
|
u64 max_overage)
|
|
|
|
{
|
|
|
|
unsigned long penalty_jiffies;
|
|
|
|
|
2020-03-22 08:22:23 +07:00
|
|
|
if (!max_overage)
|
|
|
|
return 0;
|
mm, memcg: throttle allocators when failing reclaim over memory.high
We're trying to use memory.high to limit workloads, but have found that
containment can frequently fail completely and cause OOM situations
outside of the cgroup. This happens especially with swap space -- either
when none is configured, or swap is full. These failures often also don't
have enough warning to allow one to react, whether for a human or for a
daemon monitoring PSI.
Here is output from a simple program showing how long it takes in usec
(column 2) to allocate a megabyte of anonymous memory (column 1) when a
cgroup is already beyond its memory high setting, and no swap is
available:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1035
96 1038
97 1000
98 1036
99 1048
100 1590
101 1968
102 1776
103 1863
104 1757
105 1921
106 1893
107 1760
108 1748
109 1843
110 1716
111 1924
112 1776
113 1831
114 1766
115 1836
116 1588
117 1912
118 1802
119 1857
120 1731
[...]
[System OOM in 2-3 seconds]
The delay does go up extremely marginally past the 100MB memory.high
threshold, as now we spend time scanning before returning to usermode, but
it's nowhere near enough to contain growth. It also doesn't get worse the
more pages you have, since it only considers nr_pages.
The current situation goes against both the expectations of users of
memory.high, and our intentions as cgroup v2 developers. In
cgroup-v2.txt, we claim that we will throttle and only under "extreme
conditions" will memory.high protection be breached. Likewise, cgroup v2
users generally also expect that memory.high should throttle workloads as
they exceed their high threshold. However, as seen above, this isn't
always how it works in practice -- even on banal setups like those with no
swap, or where swap has become exhausted, we can end up with memory.high
being breached and us having no weapons left in our arsenal to combat
runaway growth with, since reclaim is futile.
It's also hard for system monitoring software or users to tell how bad the
situation is, as "high" events for the memcg may in some cases be benign,
and in others be catastrophic. The current status quo is that we fail
containment in a way that doesn't provide any advance warning that things
are about to go horribly wrong (for example, we are about to invoke the
kernel OOM killer).
This patch introduces explicit throttling when reclaim is failing to keep
memcg size contained at the memory.high setting. It does so by applying
an exponential delay curve derived from the memcg's overage compared to
memory.high. In the normal case where the memcg is either below or only
marginally over its memory.high setting, no throttling will be performed.
This composes well with system health monitoring and remediation, as these
allocator delays are factored into PSI's memory pressure calculations.
This both creates a mechanism system administrators or applications
consuming the PSI interface to trivially see that the memcg in question is
struggling and use that to make more reasonable decisions, and permits
them enough time to act. Either of these can act with significantly more
nuance than that we can provide using the system OOM killer.
This is a similar idea to memory.oom_control in cgroup v1 which would put
the cgroup to sleep if the threshold was violated, but it's also
significantly improved as it results in visible memory pressure, and also
doesn't schedule indefinitely, which previously made tracing and other
introspection difficult (ie. it's clamped at 2*HZ per allocation through
MEMCG_MAX_HIGH_DELAY_JIFFIES).
Contrast the previous results with a kernel with this patch:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1002
96 1000
97 1002
98 1003
99 1000
100 1043
101 84724
102 330628
103 610511
104 1016265
105 1503969
106 2391692
107 2872061
108 3248003
109 4791904
110 5759832
111 6912509
112 8127818
113 9472203
114 12287622
115 12480079
116 14144008
117 15808029
118 16384500
119 16383242
120 16384979
[...]
As you can see, in the normal case, memory allocation takes around 1000
usec. However, as we exceed our memory.high, things start to increase
exponentially, but fairly leniently at first. Our first megabyte over
memory.high takes us 0.16 seconds, then the next is 0.46 seconds, then the
next is almost an entire second. This gets worse until we reach our
eventual 2*HZ clamp per batch, resulting in 16 seconds per megabyte.
However, this is still making forward progress, so permits tracing or
further analysis with programs like GDB.
We use an exponential curve for our delay penalty for a few reasons:
1. We run mem_cgroup_handle_over_high to potentially do reclaim after
we've already performed allocations, which means that temporarily
going over memory.high by a small amount may be perfectly legitimate,
even for compliant workloads. We don't want to unduly penalise such
cases.
2. An exponential curve (as opposed to a static or linear delay) allows
ramping up memory pressure stats more gradually, which can be useful
to work out that you have set memory.high too low, without destroying
application performance entirely.
This patch expands on earlier work by Johannes Weiner. Thanks!
[akpm@linux-foundation.org: fix max() warning]
[akpm@linux-foundation.org: fix __udivdi3 ref on 32-bit]
[akpm@linux-foundation.org: fix it even more]
[chris@chrisdown.name: fix 64-bit divide even more]
Link: http://lkml.kernel.org/r/20190723180700.GA29459@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-24 05:34:55 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We use overage compared to memory.high to calculate the number of
|
|
|
|
* jiffies to sleep (penalty_jiffies). Ideally this value should be
|
|
|
|
* fairly lenient on small overages, and increasingly harsh when the
|
|
|
|
* memcg in question makes it clear that it has no intention of stopping
|
|
|
|
* its crazy behaviour, so we exponentially increase the delay based on
|
|
|
|
* overage amount.
|
|
|
|
*/
|
2020-03-22 08:22:23 +07:00
|
|
|
penalty_jiffies = max_overage * max_overage * HZ;
|
|
|
|
penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
|
|
|
|
penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
|
mm, memcg: throttle allocators when failing reclaim over memory.high
We're trying to use memory.high to limit workloads, but have found that
containment can frequently fail completely and cause OOM situations
outside of the cgroup. This happens especially with swap space -- either
when none is configured, or swap is full. These failures often also don't
have enough warning to allow one to react, whether for a human or for a
daemon monitoring PSI.
Here is output from a simple program showing how long it takes in usec
(column 2) to allocate a megabyte of anonymous memory (column 1) when a
cgroup is already beyond its memory high setting, and no swap is
available:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1035
96 1038
97 1000
98 1036
99 1048
100 1590
101 1968
102 1776
103 1863
104 1757
105 1921
106 1893
107 1760
108 1748
109 1843
110 1716
111 1924
112 1776
113 1831
114 1766
115 1836
116 1588
117 1912
118 1802
119 1857
120 1731
[...]
[System OOM in 2-3 seconds]
The delay does go up extremely marginally past the 100MB memory.high
threshold, as now we spend time scanning before returning to usermode, but
it's nowhere near enough to contain growth. It also doesn't get worse the
more pages you have, since it only considers nr_pages.
The current situation goes against both the expectations of users of
memory.high, and our intentions as cgroup v2 developers. In
cgroup-v2.txt, we claim that we will throttle and only under "extreme
conditions" will memory.high protection be breached. Likewise, cgroup v2
users generally also expect that memory.high should throttle workloads as
they exceed their high threshold. However, as seen above, this isn't
always how it works in practice -- even on banal setups like those with no
swap, or where swap has become exhausted, we can end up with memory.high
being breached and us having no weapons left in our arsenal to combat
runaway growth with, since reclaim is futile.
It's also hard for system monitoring software or users to tell how bad the
situation is, as "high" events for the memcg may in some cases be benign,
and in others be catastrophic. The current status quo is that we fail
containment in a way that doesn't provide any advance warning that things
are about to go horribly wrong (for example, we are about to invoke the
kernel OOM killer).
This patch introduces explicit throttling when reclaim is failing to keep
memcg size contained at the memory.high setting. It does so by applying
an exponential delay curve derived from the memcg's overage compared to
memory.high. In the normal case where the memcg is either below or only
marginally over its memory.high setting, no throttling will be performed.
This composes well with system health monitoring and remediation, as these
allocator delays are factored into PSI's memory pressure calculations.
This both creates a mechanism system administrators or applications
consuming the PSI interface to trivially see that the memcg in question is
struggling and use that to make more reasonable decisions, and permits
them enough time to act. Either of these can act with significantly more
nuance than that we can provide using the system OOM killer.
This is a similar idea to memory.oom_control in cgroup v1 which would put
the cgroup to sleep if the threshold was violated, but it's also
significantly improved as it results in visible memory pressure, and also
doesn't schedule indefinitely, which previously made tracing and other
introspection difficult (ie. it's clamped at 2*HZ per allocation through
MEMCG_MAX_HIGH_DELAY_JIFFIES).
Contrast the previous results with a kernel with this patch:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1002
96 1000
97 1002
98 1003
99 1000
100 1043
101 84724
102 330628
103 610511
104 1016265
105 1503969
106 2391692
107 2872061
108 3248003
109 4791904
110 5759832
111 6912509
112 8127818
113 9472203
114 12287622
115 12480079
116 14144008
117 15808029
118 16384500
119 16383242
120 16384979
[...]
As you can see, in the normal case, memory allocation takes around 1000
usec. However, as we exceed our memory.high, things start to increase
exponentially, but fairly leniently at first. Our first megabyte over
memory.high takes us 0.16 seconds, then the next is 0.46 seconds, then the
next is almost an entire second. This gets worse until we reach our
eventual 2*HZ clamp per batch, resulting in 16 seconds per megabyte.
However, this is still making forward progress, so permits tracing or
further analysis with programs like GDB.
We use an exponential curve for our delay penalty for a few reasons:
1. We run mem_cgroup_handle_over_high to potentially do reclaim after
we've already performed allocations, which means that temporarily
going over memory.high by a small amount may be perfectly legitimate,
even for compliant workloads. We don't want to unduly penalise such
cases.
2. An exponential curve (as opposed to a static or linear delay) allows
ramping up memory pressure stats more gradually, which can be useful
to work out that you have set memory.high too low, without destroying
application performance entirely.
This patch expands on earlier work by Johannes Weiner. Thanks!
[akpm@linux-foundation.org: fix max() warning]
[akpm@linux-foundation.org: fix __udivdi3 ref on 32-bit]
[akpm@linux-foundation.org: fix it even more]
[chris@chrisdown.name: fix 64-bit divide even more]
Link: http://lkml.kernel.org/r/20190723180700.GA29459@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-24 05:34:55 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Factor in the task's own contribution to the overage, such that four
|
|
|
|
* N-sized allocations are throttled approximately the same as one
|
|
|
|
* 4N-sized allocation.
|
|
|
|
*
|
|
|
|
* MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
|
|
|
|
* larger the current charge patch is than that.
|
|
|
|
*/
|
2020-06-02 11:49:45 +07:00
|
|
|
return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
|
2020-03-22 08:22:23 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Scheduled by try_charge() to be executed from the userland return path
|
|
|
|
* and reclaims memory over the high limit.
|
|
|
|
*/
|
|
|
|
void mem_cgroup_handle_over_high(void)
|
|
|
|
{
|
|
|
|
unsigned long penalty_jiffies;
|
|
|
|
unsigned long pflags;
|
mm, memcg: reclaim more aggressively before high allocator throttling
Patch series "mm, memcg: reclaim harder before high throttling", v2.
This patch (of 2):
In Facebook production, we've seen cases where cgroups have been put into
allocator throttling even when they appear to have a lot of slack file
caches which should be trivially reclaimable.
Looking more closely, the problem is that we only try a single cgroup
reclaim walk for each return to usermode before calculating whether or not
we should throttle. This single attempt doesn't produce enough pressure
to shrink for cgroups with a rapidly growing amount of file caches prior
to entering allocator throttling.
As an example, we see that threads in an affected cgroup are stuck in
allocator throttling:
# for i in $(cat cgroup.threads); do
> grep over_high "/proc/$i/stack"
> done
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
...however, there is no I/O pressure reported by PSI, despite a lot of
slack file pages:
# cat memory.pressure
some avg10=78.50 avg60=84.99 avg300=84.53 total=5702440903
full avg10=78.50 avg60=84.99 avg300=84.53 total=5702116959
# cat io.pressure
some avg10=0.00 avg60=0.00 avg300=0.00 total=78051391
full avg10=0.00 avg60=0.00 avg300=0.00 total=78049640
# grep _file memory.stat
inactive_file 1370939392
active_file 661635072
This patch changes the behaviour to retry reclaim either until the current
task goes below the 10ms grace period, or we are making no reclaim
progress at all. In the latter case, we enter reclaim throttling as
before.
To a user, there's no intuitive reason for the reclaim behaviour to differ
from hitting memory.high as part of a new allocation, as opposed to
hitting memory.high because someone lowered its value. As such this also
brings an added benefit: it unifies the reclaim behaviour between the two.
There's precedent for this behaviour: we already do reclaim retries when
writing to memory.{high,max}, in max reclaim, and in the page allocator
itself.
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Link: http://lkml.kernel.org/r/cover.1594640214.git.chris@chrisdown.name
Link: http://lkml.kernel.org/r/a4e23b59e9ef499b575ae73a8120ee089b7d3373.1594640214.git.chris@chrisdown.name
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 13:21:54 +07:00
|
|
|
unsigned long nr_reclaimed;
|
2020-03-22 08:22:23 +07:00
|
|
|
unsigned int nr_pages = current->memcg_nr_pages_over_high;
|
2020-08-07 13:21:58 +07:00
|
|
|
int nr_retries = MAX_RECLAIM_RETRIES;
|
2020-03-22 08:22:23 +07:00
|
|
|
struct mem_cgroup *memcg;
|
mm, memcg: reclaim more aggressively before high allocator throttling
Patch series "mm, memcg: reclaim harder before high throttling", v2.
This patch (of 2):
In Facebook production, we've seen cases where cgroups have been put into
allocator throttling even when they appear to have a lot of slack file
caches which should be trivially reclaimable.
Looking more closely, the problem is that we only try a single cgroup
reclaim walk for each return to usermode before calculating whether or not
we should throttle. This single attempt doesn't produce enough pressure
to shrink for cgroups with a rapidly growing amount of file caches prior
to entering allocator throttling.
As an example, we see that threads in an affected cgroup are stuck in
allocator throttling:
# for i in $(cat cgroup.threads); do
> grep over_high "/proc/$i/stack"
> done
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
...however, there is no I/O pressure reported by PSI, despite a lot of
slack file pages:
# cat memory.pressure
some avg10=78.50 avg60=84.99 avg300=84.53 total=5702440903
full avg10=78.50 avg60=84.99 avg300=84.53 total=5702116959
# cat io.pressure
some avg10=0.00 avg60=0.00 avg300=0.00 total=78051391
full avg10=0.00 avg60=0.00 avg300=0.00 total=78049640
# grep _file memory.stat
inactive_file 1370939392
active_file 661635072
This patch changes the behaviour to retry reclaim either until the current
task goes below the 10ms grace period, or we are making no reclaim
progress at all. In the latter case, we enter reclaim throttling as
before.
To a user, there's no intuitive reason for the reclaim behaviour to differ
from hitting memory.high as part of a new allocation, as opposed to
hitting memory.high because someone lowered its value. As such this also
brings an added benefit: it unifies the reclaim behaviour between the two.
There's precedent for this behaviour: we already do reclaim retries when
writing to memory.{high,max}, in max reclaim, and in the page allocator
itself.
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Link: http://lkml.kernel.org/r/cover.1594640214.git.chris@chrisdown.name
Link: http://lkml.kernel.org/r/a4e23b59e9ef499b575ae73a8120ee089b7d3373.1594640214.git.chris@chrisdown.name
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 13:21:54 +07:00
|
|
|
bool in_retry = false;
|
2020-03-22 08:22:23 +07:00
|
|
|
|
|
|
|
if (likely(!nr_pages))
|
|
|
|
return;
|
|
|
|
|
|
|
|
memcg = get_mem_cgroup_from_mm(current->mm);
|
|
|
|
current->memcg_nr_pages_over_high = 0;
|
|
|
|
|
mm, memcg: reclaim more aggressively before high allocator throttling
Patch series "mm, memcg: reclaim harder before high throttling", v2.
This patch (of 2):
In Facebook production, we've seen cases where cgroups have been put into
allocator throttling even when they appear to have a lot of slack file
caches which should be trivially reclaimable.
Looking more closely, the problem is that we only try a single cgroup
reclaim walk for each return to usermode before calculating whether or not
we should throttle. This single attempt doesn't produce enough pressure
to shrink for cgroups with a rapidly growing amount of file caches prior
to entering allocator throttling.
As an example, we see that threads in an affected cgroup are stuck in
allocator throttling:
# for i in $(cat cgroup.threads); do
> grep over_high "/proc/$i/stack"
> done
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
...however, there is no I/O pressure reported by PSI, despite a lot of
slack file pages:
# cat memory.pressure
some avg10=78.50 avg60=84.99 avg300=84.53 total=5702440903
full avg10=78.50 avg60=84.99 avg300=84.53 total=5702116959
# cat io.pressure
some avg10=0.00 avg60=0.00 avg300=0.00 total=78051391
full avg10=0.00 avg60=0.00 avg300=0.00 total=78049640
# grep _file memory.stat
inactive_file 1370939392
active_file 661635072
This patch changes the behaviour to retry reclaim either until the current
task goes below the 10ms grace period, or we are making no reclaim
progress at all. In the latter case, we enter reclaim throttling as
before.
To a user, there's no intuitive reason for the reclaim behaviour to differ
from hitting memory.high as part of a new allocation, as opposed to
hitting memory.high because someone lowered its value. As such this also
brings an added benefit: it unifies the reclaim behaviour between the two.
There's precedent for this behaviour: we already do reclaim retries when
writing to memory.{high,max}, in max reclaim, and in the page allocator
itself.
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Link: http://lkml.kernel.org/r/cover.1594640214.git.chris@chrisdown.name
Link: http://lkml.kernel.org/r/a4e23b59e9ef499b575ae73a8120ee089b7d3373.1594640214.git.chris@chrisdown.name
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 13:21:54 +07:00
|
|
|
retry_reclaim:
|
|
|
|
/*
|
|
|
|
* The allocating task should reclaim at least the batch size, but for
|
|
|
|
* subsequent retries we only want to do what's necessary to prevent oom
|
|
|
|
* or breaching resource isolation.
|
|
|
|
*
|
|
|
|
* This is distinct from memory.max or page allocator behaviour because
|
|
|
|
* memory.high is currently batched, whereas memory.max and the page
|
|
|
|
* allocator run every time an allocation is made.
|
|
|
|
*/
|
|
|
|
nr_reclaimed = reclaim_high(memcg,
|
|
|
|
in_retry ? SWAP_CLUSTER_MAX : nr_pages,
|
|
|
|
GFP_KERNEL);
|
|
|
|
|
2020-03-22 08:22:23 +07:00
|
|
|
/*
|
|
|
|
* memory.high is breached and reclaim is unable to keep up. Throttle
|
|
|
|
* allocators proactively to slow down excessive growth.
|
|
|
|
*/
|
2020-06-02 11:49:42 +07:00
|
|
|
penalty_jiffies = calculate_high_delay(memcg, nr_pages,
|
|
|
|
mem_find_max_overage(memcg));
|
mm, memcg: throttle allocators when failing reclaim over memory.high
We're trying to use memory.high to limit workloads, but have found that
containment can frequently fail completely and cause OOM situations
outside of the cgroup. This happens especially with swap space -- either
when none is configured, or swap is full. These failures often also don't
have enough warning to allow one to react, whether for a human or for a
daemon monitoring PSI.
Here is output from a simple program showing how long it takes in usec
(column 2) to allocate a megabyte of anonymous memory (column 1) when a
cgroup is already beyond its memory high setting, and no swap is
available:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1035
96 1038
97 1000
98 1036
99 1048
100 1590
101 1968
102 1776
103 1863
104 1757
105 1921
106 1893
107 1760
108 1748
109 1843
110 1716
111 1924
112 1776
113 1831
114 1766
115 1836
116 1588
117 1912
118 1802
119 1857
120 1731
[...]
[System OOM in 2-3 seconds]
The delay does go up extremely marginally past the 100MB memory.high
threshold, as now we spend time scanning before returning to usermode, but
it's nowhere near enough to contain growth. It also doesn't get worse the
more pages you have, since it only considers nr_pages.
The current situation goes against both the expectations of users of
memory.high, and our intentions as cgroup v2 developers. In
cgroup-v2.txt, we claim that we will throttle and only under "extreme
conditions" will memory.high protection be breached. Likewise, cgroup v2
users generally also expect that memory.high should throttle workloads as
they exceed their high threshold. However, as seen above, this isn't
always how it works in practice -- even on banal setups like those with no
swap, or where swap has become exhausted, we can end up with memory.high
being breached and us having no weapons left in our arsenal to combat
runaway growth with, since reclaim is futile.
It's also hard for system monitoring software or users to tell how bad the
situation is, as "high" events for the memcg may in some cases be benign,
and in others be catastrophic. The current status quo is that we fail
containment in a way that doesn't provide any advance warning that things
are about to go horribly wrong (for example, we are about to invoke the
kernel OOM killer).
This patch introduces explicit throttling when reclaim is failing to keep
memcg size contained at the memory.high setting. It does so by applying
an exponential delay curve derived from the memcg's overage compared to
memory.high. In the normal case where the memcg is either below or only
marginally over its memory.high setting, no throttling will be performed.
This composes well with system health monitoring and remediation, as these
allocator delays are factored into PSI's memory pressure calculations.
This both creates a mechanism system administrators or applications
consuming the PSI interface to trivially see that the memcg in question is
struggling and use that to make more reasonable decisions, and permits
them enough time to act. Either of these can act with significantly more
nuance than that we can provide using the system OOM killer.
This is a similar idea to memory.oom_control in cgroup v1 which would put
the cgroup to sleep if the threshold was violated, but it's also
significantly improved as it results in visible memory pressure, and also
doesn't schedule indefinitely, which previously made tracing and other
introspection difficult (ie. it's clamped at 2*HZ per allocation through
MEMCG_MAX_HIGH_DELAY_JIFFIES).
Contrast the previous results with a kernel with this patch:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1002
96 1000
97 1002
98 1003
99 1000
100 1043
101 84724
102 330628
103 610511
104 1016265
105 1503969
106 2391692
107 2872061
108 3248003
109 4791904
110 5759832
111 6912509
112 8127818
113 9472203
114 12287622
115 12480079
116 14144008
117 15808029
118 16384500
119 16383242
120 16384979
[...]
As you can see, in the normal case, memory allocation takes around 1000
usec. However, as we exceed our memory.high, things start to increase
exponentially, but fairly leniently at first. Our first megabyte over
memory.high takes us 0.16 seconds, then the next is 0.46 seconds, then the
next is almost an entire second. This gets worse until we reach our
eventual 2*HZ clamp per batch, resulting in 16 seconds per megabyte.
However, this is still making forward progress, so permits tracing or
further analysis with programs like GDB.
We use an exponential curve for our delay penalty for a few reasons:
1. We run mem_cgroup_handle_over_high to potentially do reclaim after
we've already performed allocations, which means that temporarily
going over memory.high by a small amount may be perfectly legitimate,
even for compliant workloads. We don't want to unduly penalise such
cases.
2. An exponential curve (as opposed to a static or linear delay) allows
ramping up memory pressure stats more gradually, which can be useful
to work out that you have set memory.high too low, without destroying
application performance entirely.
This patch expands on earlier work by Johannes Weiner. Thanks!
[akpm@linux-foundation.org: fix max() warning]
[akpm@linux-foundation.org: fix __udivdi3 ref on 32-bit]
[akpm@linux-foundation.org: fix it even more]
[chris@chrisdown.name: fix 64-bit divide even more]
Link: http://lkml.kernel.org/r/20190723180700.GA29459@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-24 05:34:55 +07:00
|
|
|
|
2020-06-02 11:49:52 +07:00
|
|
|
penalty_jiffies += calculate_high_delay(memcg, nr_pages,
|
|
|
|
swap_find_max_overage(memcg));
|
|
|
|
|
2020-06-02 11:49:45 +07:00
|
|
|
/*
|
|
|
|
* Clamp the max delay per usermode return so as to still keep the
|
|
|
|
* application moving forwards and also permit diagnostics, albeit
|
|
|
|
* extremely slowly.
|
|
|
|
*/
|
|
|
|
penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
|
|
|
|
|
mm, memcg: throttle allocators when failing reclaim over memory.high
We're trying to use memory.high to limit workloads, but have found that
containment can frequently fail completely and cause OOM situations
outside of the cgroup. This happens especially with swap space -- either
when none is configured, or swap is full. These failures often also don't
have enough warning to allow one to react, whether for a human or for a
daemon monitoring PSI.
Here is output from a simple program showing how long it takes in usec
(column 2) to allocate a megabyte of anonymous memory (column 1) when a
cgroup is already beyond its memory high setting, and no swap is
available:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1035
96 1038
97 1000
98 1036
99 1048
100 1590
101 1968
102 1776
103 1863
104 1757
105 1921
106 1893
107 1760
108 1748
109 1843
110 1716
111 1924
112 1776
113 1831
114 1766
115 1836
116 1588
117 1912
118 1802
119 1857
120 1731
[...]
[System OOM in 2-3 seconds]
The delay does go up extremely marginally past the 100MB memory.high
threshold, as now we spend time scanning before returning to usermode, but
it's nowhere near enough to contain growth. It also doesn't get worse the
more pages you have, since it only considers nr_pages.
The current situation goes against both the expectations of users of
memory.high, and our intentions as cgroup v2 developers. In
cgroup-v2.txt, we claim that we will throttle and only under "extreme
conditions" will memory.high protection be breached. Likewise, cgroup v2
users generally also expect that memory.high should throttle workloads as
they exceed their high threshold. However, as seen above, this isn't
always how it works in practice -- even on banal setups like those with no
swap, or where swap has become exhausted, we can end up with memory.high
being breached and us having no weapons left in our arsenal to combat
runaway growth with, since reclaim is futile.
It's also hard for system monitoring software or users to tell how bad the
situation is, as "high" events for the memcg may in some cases be benign,
and in others be catastrophic. The current status quo is that we fail
containment in a way that doesn't provide any advance warning that things
are about to go horribly wrong (for example, we are about to invoke the
kernel OOM killer).
This patch introduces explicit throttling when reclaim is failing to keep
memcg size contained at the memory.high setting. It does so by applying
an exponential delay curve derived from the memcg's overage compared to
memory.high. In the normal case where the memcg is either below or only
marginally over its memory.high setting, no throttling will be performed.
This composes well with system health monitoring and remediation, as these
allocator delays are factored into PSI's memory pressure calculations.
This both creates a mechanism system administrators or applications
consuming the PSI interface to trivially see that the memcg in question is
struggling and use that to make more reasonable decisions, and permits
them enough time to act. Either of these can act with significantly more
nuance than that we can provide using the system OOM killer.
This is a similar idea to memory.oom_control in cgroup v1 which would put
the cgroup to sleep if the threshold was violated, but it's also
significantly improved as it results in visible memory pressure, and also
doesn't schedule indefinitely, which previously made tracing and other
introspection difficult (ie. it's clamped at 2*HZ per allocation through
MEMCG_MAX_HIGH_DELAY_JIFFIES).
Contrast the previous results with a kernel with this patch:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1002
96 1000
97 1002
98 1003
99 1000
100 1043
101 84724
102 330628
103 610511
104 1016265
105 1503969
106 2391692
107 2872061
108 3248003
109 4791904
110 5759832
111 6912509
112 8127818
113 9472203
114 12287622
115 12480079
116 14144008
117 15808029
118 16384500
119 16383242
120 16384979
[...]
As you can see, in the normal case, memory allocation takes around 1000
usec. However, as we exceed our memory.high, things start to increase
exponentially, but fairly leniently at first. Our first megabyte over
memory.high takes us 0.16 seconds, then the next is 0.46 seconds, then the
next is almost an entire second. This gets worse until we reach our
eventual 2*HZ clamp per batch, resulting in 16 seconds per megabyte.
However, this is still making forward progress, so permits tracing or
further analysis with programs like GDB.
We use an exponential curve for our delay penalty for a few reasons:
1. We run mem_cgroup_handle_over_high to potentially do reclaim after
we've already performed allocations, which means that temporarily
going over memory.high by a small amount may be perfectly legitimate,
even for compliant workloads. We don't want to unduly penalise such
cases.
2. An exponential curve (as opposed to a static or linear delay) allows
ramping up memory pressure stats more gradually, which can be useful
to work out that you have set memory.high too low, without destroying
application performance entirely.
This patch expands on earlier work by Johannes Weiner. Thanks!
[akpm@linux-foundation.org: fix max() warning]
[akpm@linux-foundation.org: fix __udivdi3 ref on 32-bit]
[akpm@linux-foundation.org: fix it even more]
[chris@chrisdown.name: fix 64-bit divide even more]
Link: http://lkml.kernel.org/r/20190723180700.GA29459@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-24 05:34:55 +07:00
|
|
|
/*
|
|
|
|
* Don't sleep if the amount of jiffies this memcg owes us is so low
|
|
|
|
* that it's not even worth doing, in an attempt to be nice to those who
|
|
|
|
* go only a small amount over their memory.high value and maybe haven't
|
|
|
|
* been aggressively reclaimed enough yet.
|
|
|
|
*/
|
|
|
|
if (penalty_jiffies <= HZ / 100)
|
|
|
|
goto out;
|
|
|
|
|
mm, memcg: reclaim more aggressively before high allocator throttling
Patch series "mm, memcg: reclaim harder before high throttling", v2.
This patch (of 2):
In Facebook production, we've seen cases where cgroups have been put into
allocator throttling even when they appear to have a lot of slack file
caches which should be trivially reclaimable.
Looking more closely, the problem is that we only try a single cgroup
reclaim walk for each return to usermode before calculating whether or not
we should throttle. This single attempt doesn't produce enough pressure
to shrink for cgroups with a rapidly growing amount of file caches prior
to entering allocator throttling.
As an example, we see that threads in an affected cgroup are stuck in
allocator throttling:
# for i in $(cat cgroup.threads); do
> grep over_high "/proc/$i/stack"
> done
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
[<0>] mem_cgroup_handle_over_high+0x10b/0x150
...however, there is no I/O pressure reported by PSI, despite a lot of
slack file pages:
# cat memory.pressure
some avg10=78.50 avg60=84.99 avg300=84.53 total=5702440903
full avg10=78.50 avg60=84.99 avg300=84.53 total=5702116959
# cat io.pressure
some avg10=0.00 avg60=0.00 avg300=0.00 total=78051391
full avg10=0.00 avg60=0.00 avg300=0.00 total=78049640
# grep _file memory.stat
inactive_file 1370939392
active_file 661635072
This patch changes the behaviour to retry reclaim either until the current
task goes below the 10ms grace period, or we are making no reclaim
progress at all. In the latter case, we enter reclaim throttling as
before.
To a user, there's no intuitive reason for the reclaim behaviour to differ
from hitting memory.high as part of a new allocation, as opposed to
hitting memory.high because someone lowered its value. As such this also
brings an added benefit: it unifies the reclaim behaviour between the two.
There's precedent for this behaviour: we already do reclaim retries when
writing to memory.{high,max}, in max reclaim, and in the page allocator
itself.
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Link: http://lkml.kernel.org/r/cover.1594640214.git.chris@chrisdown.name
Link: http://lkml.kernel.org/r/a4e23b59e9ef499b575ae73a8120ee089b7d3373.1594640214.git.chris@chrisdown.name
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 13:21:54 +07:00
|
|
|
/*
|
|
|
|
* If reclaim is making forward progress but we're still over
|
|
|
|
* memory.high, we want to encourage that rather than doing allocator
|
|
|
|
* throttling.
|
|
|
|
*/
|
|
|
|
if (nr_reclaimed || nr_retries--) {
|
|
|
|
in_retry = true;
|
|
|
|
goto retry_reclaim;
|
|
|
|
}
|
|
|
|
|
mm, memcg: throttle allocators when failing reclaim over memory.high
We're trying to use memory.high to limit workloads, but have found that
containment can frequently fail completely and cause OOM situations
outside of the cgroup. This happens especially with swap space -- either
when none is configured, or swap is full. These failures often also don't
have enough warning to allow one to react, whether for a human or for a
daemon monitoring PSI.
Here is output from a simple program showing how long it takes in usec
(column 2) to allocate a megabyte of anonymous memory (column 1) when a
cgroup is already beyond its memory high setting, and no swap is
available:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1035
96 1038
97 1000
98 1036
99 1048
100 1590
101 1968
102 1776
103 1863
104 1757
105 1921
106 1893
107 1760
108 1748
109 1843
110 1716
111 1924
112 1776
113 1831
114 1766
115 1836
116 1588
117 1912
118 1802
119 1857
120 1731
[...]
[System OOM in 2-3 seconds]
The delay does go up extremely marginally past the 100MB memory.high
threshold, as now we spend time scanning before returning to usermode, but
it's nowhere near enough to contain growth. It also doesn't get worse the
more pages you have, since it only considers nr_pages.
The current situation goes against both the expectations of users of
memory.high, and our intentions as cgroup v2 developers. In
cgroup-v2.txt, we claim that we will throttle and only under "extreme
conditions" will memory.high protection be breached. Likewise, cgroup v2
users generally also expect that memory.high should throttle workloads as
they exceed their high threshold. However, as seen above, this isn't
always how it works in practice -- even on banal setups like those with no
swap, or where swap has become exhausted, we can end up with memory.high
being breached and us having no weapons left in our arsenal to combat
runaway growth with, since reclaim is futile.
It's also hard for system monitoring software or users to tell how bad the
situation is, as "high" events for the memcg may in some cases be benign,
and in others be catastrophic. The current status quo is that we fail
containment in a way that doesn't provide any advance warning that things
are about to go horribly wrong (for example, we are about to invoke the
kernel OOM killer).
This patch introduces explicit throttling when reclaim is failing to keep
memcg size contained at the memory.high setting. It does so by applying
an exponential delay curve derived from the memcg's overage compared to
memory.high. In the normal case where the memcg is either below or only
marginally over its memory.high setting, no throttling will be performed.
This composes well with system health monitoring and remediation, as these
allocator delays are factored into PSI's memory pressure calculations.
This both creates a mechanism system administrators or applications
consuming the PSI interface to trivially see that the memcg in question is
struggling and use that to make more reasonable decisions, and permits
them enough time to act. Either of these can act with significantly more
nuance than that we can provide using the system OOM killer.
This is a similar idea to memory.oom_control in cgroup v1 which would put
the cgroup to sleep if the threshold was violated, but it's also
significantly improved as it results in visible memory pressure, and also
doesn't schedule indefinitely, which previously made tracing and other
introspection difficult (ie. it's clamped at 2*HZ per allocation through
MEMCG_MAX_HIGH_DELAY_JIFFIES).
Contrast the previous results with a kernel with this patch:
[root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \
> --wait -t timeout 300 /root/mdf
[...]
95 1002
96 1000
97 1002
98 1003
99 1000
100 1043
101 84724
102 330628
103 610511
104 1016265
105 1503969
106 2391692
107 2872061
108 3248003
109 4791904
110 5759832
111 6912509
112 8127818
113 9472203
114 12287622
115 12480079
116 14144008
117 15808029
118 16384500
119 16383242
120 16384979
[...]
As you can see, in the normal case, memory allocation takes around 1000
usec. However, as we exceed our memory.high, things start to increase
exponentially, but fairly leniently at first. Our first megabyte over
memory.high takes us 0.16 seconds, then the next is 0.46 seconds, then the
next is almost an entire second. This gets worse until we reach our
eventual 2*HZ clamp per batch, resulting in 16 seconds per megabyte.
However, this is still making forward progress, so permits tracing or
further analysis with programs like GDB.
We use an exponential curve for our delay penalty for a few reasons:
1. We run mem_cgroup_handle_over_high to potentially do reclaim after
we've already performed allocations, which means that temporarily
going over memory.high by a small amount may be perfectly legitimate,
even for compliant workloads. We don't want to unduly penalise such
cases.
2. An exponential curve (as opposed to a static or linear delay) allows
ramping up memory pressure stats more gradually, which can be useful
to work out that you have set memory.high too low, without destroying
application performance entirely.
This patch expands on earlier work by Johannes Weiner. Thanks!
[akpm@linux-foundation.org: fix max() warning]
[akpm@linux-foundation.org: fix __udivdi3 ref on 32-bit]
[akpm@linux-foundation.org: fix it even more]
[chris@chrisdown.name: fix 64-bit divide even more]
Link: http://lkml.kernel.org/r/20190723180700.GA29459@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-24 05:34:55 +07:00
|
|
|
/*
|
|
|
|
* If we exit early, we're guaranteed to die (since
|
|
|
|
* schedule_timeout_killable sets TASK_KILLABLE). This means we don't
|
|
|
|
* need to account for any ill-begotten jiffies to pay them off later.
|
|
|
|
*/
|
|
|
|
psi_memstall_enter(&pflags);
|
|
|
|
schedule_timeout_killable(penalty_jiffies);
|
|
|
|
psi_memstall_leave(&pflags);
|
|
|
|
|
|
|
|
out:
|
|
|
|
css_put(&memcg->css);
|
memcg: punt high overage reclaim to return-to-userland path
Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped. If a process performs only
speculative allocations, it can blow way past the high limit. This is
actually easily reproducible by simply doing "find /". slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.
This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path. If try_charge() detects that high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.
As long as kernel doesn't have a run-away allocation spree, this should
provide enough protection while making kmemcg behave more consistently.
It also has the following benefits.
- All over-high reclaims can use GFP_KERNEL regardless of the specific
gfp mask in use, e.g. GFP_NOFS, when the limit was breached.
- It copes with prio inversion. Previously, a low-prio task with
small memory.high might perform over-high reclaim with a bunch of
locks held. If a higher prio task needed any of these locks, it
would have to wait until the low prio task finished reclaim and
released the locks. By handing over-high reclaim to the task exit
path this issue can be avoided.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:11 +07:00
|
|
|
}
|
|
|
|
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
|
|
|
unsigned int nr_pages)
|
2008-02-07 15:13:53 +07:00
|
|
|
{
|
2018-02-01 07:16:45 +07:00
|
|
|
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
|
2020-08-07 13:21:58 +07:00
|
|
|
int nr_retries = MAX_RECLAIM_RETRIES;
|
2014-08-07 06:05:42 +07:00
|
|
|
struct mem_cgroup *mem_over_limit;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
struct page_counter *counter;
|
2020-08-07 13:22:15 +07:00
|
|
|
enum oom_status oom_status;
|
2014-08-07 06:05:42 +07:00
|
|
|
unsigned long nr_reclaimed;
|
2014-10-10 05:28:56 +07:00
|
|
|
bool may_swap = true;
|
|
|
|
bool drained = false;
|
2020-08-07 13:22:15 +07:00
|
|
|
unsigned long pflags;
|
2009-01-08 09:08:08 +07:00
|
|
|
|
2014-09-05 19:43:57 +07:00
|
|
|
if (mem_cgroup_is_root(memcg))
|
memcg: ratify and consolidate over-charge handling
try_charge() is the main charging logic of memcg. When it hits the limit
but either can't fail the allocation due to __GFP_NOFAIL or the task is
likely to free memory very soon, being OOM killed, has SIGKILL pending or
exiting, it "bypasses" the charge to the root memcg and returns -EINTR.
While this is one approach which can be taken for these situations, it has
several issues.
* It unnecessarily lies about the reality. The number itself doesn't
go over the limit but the actual usage does. memcg is either forced
to or actively chooses to go over the limit because that is the
right behavior under the circumstances, which is completely fine,
but, if at all avoidable, it shouldn't be misrepresenting what's
happening by sneaking the charges into the root memcg.
* Despite trying, we already do over-charge. kmemcg can't deal with
switching over to the root memcg by the point try_charge() returns
-EINTR, so it open-codes over-charing.
* It complicates the callers. Each try_charge() user has to handle
the weird -EINTR exception. memcg_charge_kmem() does the manual
over-charging. mem_cgroup_do_precharge() performs unnecessary
uncharging of root memcg, which BTW is inconsistent with what
memcg_charge_kmem() does but not broken as [un]charging are noops on
root memcg. mem_cgroup_try_charge() needs to switch the returned
cgroup to the root one.
The reality is that in memcg there are cases where we are forced and/or
willing to go over the limit. Each such case needs to be scrutinized and
justified but there definitely are situations where that is the right
thing to do. We alredy do this but with a superficial and inconsistent
disguise which leads to unnecessary complications.
This patch updates try_charge() so that it over-charges and returns 0 when
deemed necessary. -EINTR return is removed along with all special case
handling in the callers.
While at it, remove the local variable @ret, which was initialized to zero
and never changed, along with done: label which just returned the always
zero @ret.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:17 +07:00
|
|
|
return 0;
|
2014-08-07 06:05:42 +07:00
|
|
|
retry:
|
2014-04-08 05:37:44 +07:00
|
|
|
if (consume_stock(memcg, nr_pages))
|
memcg: ratify and consolidate over-charge handling
try_charge() is the main charging logic of memcg. When it hits the limit
but either can't fail the allocation due to __GFP_NOFAIL or the task is
likely to free memory very soon, being OOM killed, has SIGKILL pending or
exiting, it "bypasses" the charge to the root memcg and returns -EINTR.
While this is one approach which can be taken for these situations, it has
several issues.
* It unnecessarily lies about the reality. The number itself doesn't
go over the limit but the actual usage does. memcg is either forced
to or actively chooses to go over the limit because that is the
right behavior under the circumstances, which is completely fine,
but, if at all avoidable, it shouldn't be misrepresenting what's
happening by sneaking the charges into the root memcg.
* Despite trying, we already do over-charge. kmemcg can't deal with
switching over to the root memcg by the point try_charge() returns
-EINTR, so it open-codes over-charing.
* It complicates the callers. Each try_charge() user has to handle
the weird -EINTR exception. memcg_charge_kmem() does the manual
over-charging. mem_cgroup_do_precharge() performs unnecessary
uncharging of root memcg, which BTW is inconsistent with what
memcg_charge_kmem() does but not broken as [un]charging are noops on
root memcg. mem_cgroup_try_charge() needs to switch the returned
cgroup to the root one.
The reality is that in memcg there are cases where we are forced and/or
willing to go over the limit. Each such case needs to be scrutinized and
justified but there definitely are situations where that is the right
thing to do. We alredy do this but with a superficial and inconsistent
disguise which leads to unnecessary complications.
This patch updates try_charge() so that it over-charges and returns 0 when
deemed necessary. -EINTR return is removed along with all special case
handling in the callers.
While at it, remove the local variable @ret, which was initialized to zero
and never changed, along with done: label which just returned the always
zero @ret.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:17 +07:00
|
|
|
return 0;
|
2008-02-07 15:13:53 +07:00
|
|
|
|
2016-01-15 06:21:23 +07:00
|
|
|
if (!do_memsw_account() ||
|
2015-11-06 09:50:26 +07:00
|
|
|
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
|
|
|
|
if (page_counter_try_charge(&memcg->memory, batch, &counter))
|
2014-08-07 06:05:42 +07:00
|
|
|
goto done_restock;
|
2016-01-15 06:21:23 +07:00
|
|
|
if (do_memsw_account())
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
page_counter_uncharge(&memcg->memsw, batch);
|
|
|
|
mem_over_limit = mem_cgroup_from_counter(counter, memory);
|
2014-10-10 05:28:54 +07:00
|
|
|
} else {
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
mem_over_limit = mem_cgroup_from_counter(counter, memsw);
|
2014-10-10 05:28:56 +07:00
|
|
|
may_swap = false;
|
2014-10-10 05:28:54 +07:00
|
|
|
}
|
2009-01-08 09:07:48 +07:00
|
|
|
|
2014-08-07 06:05:42 +07:00
|
|
|
if (batch > nr_pages) {
|
|
|
|
batch = nr_pages;
|
|
|
|
goto retry;
|
|
|
|
}
|
2009-01-08 09:08:06 +07:00
|
|
|
|
mm: memcontrol: fix network errors from failing __GFP_ATOMIC charges
While upgrading from 4.16 to 5.2, we noticed these allocation errors in
the log of the new kernel:
SLUB: Unable to allocate memory on node -1, gfp=0xa20(GFP_ATOMIC)
cache: tw_sock_TCPv6(960:helper-logs), object size: 232, buffer size: 240, default order: 1, min order: 0
node 0: slabs: 5, objs: 170, free: 0
slab_out_of_memory+1
___slab_alloc+969
__slab_alloc+14
kmem_cache_alloc+346
inet_twsk_alloc+60
tcp_time_wait+46
tcp_fin+206
tcp_data_queue+2034
tcp_rcv_state_process+784
tcp_v6_do_rcv+405
__release_sock+118
tcp_close+385
inet_release+46
__sock_release+55
sock_close+17
__fput+170
task_work_run+127
exit_to_usermode_loop+191
do_syscall_64+212
entry_SYSCALL_64_after_hwframe+68
accompanied by an increase in machines going completely radio silent
under memory pressure.
One thing that changed since 4.16 is e699e2c6a654 ("net, mm: account
sock objects to kmemcg"), which made these slab caches subject to cgroup
memory accounting and control.
The problem with that is that cgroups, unlike the page allocator, do not
maintain dedicated atomic reserves. As a cgroup's usage hovers at its
limit, atomic allocations - such as done during network rx - can fail
consistently for extended periods of time. The kernel is not able to
operate under these conditions.
We don't want to revert the culprit patch, because it indeed tracks a
potentially substantial amount of memory used by a cgroup.
We also don't want to implement dedicated atomic reserves for cgroups.
There is no point in keeping a fixed margin of unused bytes in the
cgroup's memory budget to accomodate a consumer that is impossible to
predict - we'd be wasting memory and get into configuration headaches,
not unlike what we have going with min_free_kbytes. We do this for
physical mem because we have to, but cgroups are an accounting game.
Instead, account these privileged allocations to the cgroup, but let
them bypass the configured limit if they have to. This way, we get the
benefits of accounting the consumed memory and have it exert pressure on
the rest of the cgroup, but like with the page allocator, we shift the
burden of reclaimining on behalf of atomic allocations onto the regular
allocations that can block.
Link: http://lkml.kernel.org/r/20191022233708.365764-1-hannes@cmpxchg.org
Fixes: e699e2c6a654 ("net, mm: account sock objects to kmemcg")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: <stable@vger.kernel.org> [4.18+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-11-06 12:17:13 +07:00
|
|
|
/*
|
|
|
|
* Memcg doesn't have a dedicated reserve for atomic
|
|
|
|
* allocations. But like the global atomic pool, we need to
|
|
|
|
* put the burden of reclaim on regular allocation requests
|
|
|
|
* and let these go through as privileged allocations.
|
|
|
|
*/
|
|
|
|
if (gfp_mask & __GFP_ATOMIC)
|
|
|
|
goto force;
|
|
|
|
|
2014-08-07 06:05:44 +07:00
|
|
|
/*
|
|
|
|
* Unlike in global OOM situations, memcg is not in a physical
|
|
|
|
* memory shortage. Allow dying and OOM-killed tasks to
|
|
|
|
* bypass the last charges so that they can exit quickly and
|
|
|
|
* free their memory.
|
|
|
|
*/
|
2019-03-06 06:46:47 +07:00
|
|
|
if (unlikely(should_force_charge()))
|
memcg: ratify and consolidate over-charge handling
try_charge() is the main charging logic of memcg. When it hits the limit
but either can't fail the allocation due to __GFP_NOFAIL or the task is
likely to free memory very soon, being OOM killed, has SIGKILL pending or
exiting, it "bypasses" the charge to the root memcg and returns -EINTR.
While this is one approach which can be taken for these situations, it has
several issues.
* It unnecessarily lies about the reality. The number itself doesn't
go over the limit but the actual usage does. memcg is either forced
to or actively chooses to go over the limit because that is the
right behavior under the circumstances, which is completely fine,
but, if at all avoidable, it shouldn't be misrepresenting what's
happening by sneaking the charges into the root memcg.
* Despite trying, we already do over-charge. kmemcg can't deal with
switching over to the root memcg by the point try_charge() returns
-EINTR, so it open-codes over-charing.
* It complicates the callers. Each try_charge() user has to handle
the weird -EINTR exception. memcg_charge_kmem() does the manual
over-charging. mem_cgroup_do_precharge() performs unnecessary
uncharging of root memcg, which BTW is inconsistent with what
memcg_charge_kmem() does but not broken as [un]charging are noops on
root memcg. mem_cgroup_try_charge() needs to switch the returned
cgroup to the root one.
The reality is that in memcg there are cases where we are forced and/or
willing to go over the limit. Each such case needs to be scrutinized and
justified but there definitely are situations where that is the right
thing to do. We alredy do this but with a superficial and inconsistent
disguise which leads to unnecessary complications.
This patch updates try_charge() so that it over-charges and returns 0 when
deemed necessary. -EINTR return is removed along with all special case
handling in the callers.
While at it, remove the local variable @ret, which was initialized to zero
and never changed, along with done: label which just returned the always
zero @ret.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:17 +07:00
|
|
|
goto force;
|
2014-08-07 06:05:44 +07:00
|
|
|
|
2016-10-28 07:46:56 +07:00
|
|
|
/*
|
|
|
|
* Prevent unbounded recursion when reclaim operations need to
|
|
|
|
* allocate memory. This might exceed the limits temporarily,
|
|
|
|
* but we prefer facilitating memory reclaim and getting back
|
|
|
|
* under the limit over triggering OOM kills in these cases.
|
|
|
|
*/
|
|
|
|
if (unlikely(current->flags & PF_MEMALLOC))
|
|
|
|
goto force;
|
|
|
|
|
2014-08-07 06:05:44 +07:00
|
|
|
if (unlikely(task_in_memcg_oom(current)))
|
|
|
|
goto nomem;
|
|
|
|
|
2015-11-07 07:28:21 +07:00
|
|
|
if (!gfpflags_allow_blocking(gfp_mask))
|
2014-08-07 06:05:42 +07:00
|
|
|
goto nomem;
|
2010-08-11 08:02:57 +07:00
|
|
|
|
2018-04-11 06:29:45 +07:00
|
|
|
memcg_memory_event(mem_over_limit, MEMCG_MAX);
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
|
2020-08-07 13:22:15 +07:00
|
|
|
psi_memstall_enter(&pflags);
|
2014-10-10 05:28:56 +07:00
|
|
|
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
|
|
|
|
gfp_mask, may_swap);
|
2020-08-07 13:22:15 +07:00
|
|
|
psi_memstall_leave(&pflags);
|
2014-08-07 06:05:42 +07:00
|
|
|
|
2014-08-07 06:08:16 +07:00
|
|
|
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
|
2014-08-07 06:05:42 +07:00
|
|
|
goto retry;
|
2014-08-07 06:05:47 +07:00
|
|
|
|
2014-10-10 05:28:56 +07:00
|
|
|
if (!drained) {
|
2014-12-11 06:42:50 +07:00
|
|
|
drain_all_stock(mem_over_limit);
|
2014-10-10 05:28:56 +07:00
|
|
|
drained = true;
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
2014-08-07 06:05:47 +07:00
|
|
|
if (gfp_mask & __GFP_NORETRY)
|
|
|
|
goto nomem;
|
2014-08-07 06:05:42 +07:00
|
|
|
/*
|
|
|
|
* Even though the limit is exceeded at this point, reclaim
|
|
|
|
* may have been able to free some pages. Retry the charge
|
|
|
|
* before killing the task.
|
|
|
|
*
|
|
|
|
* Only for regular pages, though: huge pages are rather
|
|
|
|
* unlikely to succeed so close to the limit, and we fall back
|
|
|
|
* to regular pages anyway in case of failure.
|
|
|
|
*/
|
2014-08-07 06:08:16 +07:00
|
|
|
if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
|
2014-08-07 06:05:42 +07:00
|
|
|
goto retry;
|
|
|
|
/*
|
|
|
|
* At task move, charge accounts can be doubly counted. So, it's
|
|
|
|
* better to wait until the end of task_move if something is going on.
|
|
|
|
*/
|
|
|
|
if (mem_cgroup_wait_acct_move(mem_over_limit))
|
|
|
|
goto retry;
|
|
|
|
|
2014-08-07 06:05:51 +07:00
|
|
|
if (nr_retries--)
|
|
|
|
goto retry;
|
|
|
|
|
2019-07-12 10:55:48 +07:00
|
|
|
if (gfp_mask & __GFP_RETRY_MAYFAIL)
|
2018-08-18 05:47:11 +07:00
|
|
|
goto nomem;
|
|
|
|
|
2014-08-07 06:05:44 +07:00
|
|
|
if (gfp_mask & __GFP_NOFAIL)
|
memcg: ratify and consolidate over-charge handling
try_charge() is the main charging logic of memcg. When it hits the limit
but either can't fail the allocation due to __GFP_NOFAIL or the task is
likely to free memory very soon, being OOM killed, has SIGKILL pending or
exiting, it "bypasses" the charge to the root memcg and returns -EINTR.
While this is one approach which can be taken for these situations, it has
several issues.
* It unnecessarily lies about the reality. The number itself doesn't
go over the limit but the actual usage does. memcg is either forced
to or actively chooses to go over the limit because that is the
right behavior under the circumstances, which is completely fine,
but, if at all avoidable, it shouldn't be misrepresenting what's
happening by sneaking the charges into the root memcg.
* Despite trying, we already do over-charge. kmemcg can't deal with
switching over to the root memcg by the point try_charge() returns
-EINTR, so it open-codes over-charing.
* It complicates the callers. Each try_charge() user has to handle
the weird -EINTR exception. memcg_charge_kmem() does the manual
over-charging. mem_cgroup_do_precharge() performs unnecessary
uncharging of root memcg, which BTW is inconsistent with what
memcg_charge_kmem() does but not broken as [un]charging are noops on
root memcg. mem_cgroup_try_charge() needs to switch the returned
cgroup to the root one.
The reality is that in memcg there are cases where we are forced and/or
willing to go over the limit. Each such case needs to be scrutinized and
justified but there definitely are situations where that is the right
thing to do. We alredy do this but with a superficial and inconsistent
disguise which leads to unnecessary complications.
This patch updates try_charge() so that it over-charges and returns 0 when
deemed necessary. -EINTR return is removed along with all special case
handling in the callers.
While at it, remove the local variable @ret, which was initialized to zero
and never changed, along with done: label which just returned the always
zero @ret.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:17 +07:00
|
|
|
goto force;
|
2014-08-07 06:05:44 +07:00
|
|
|
|
2014-08-07 06:05:42 +07:00
|
|
|
if (fatal_signal_pending(current))
|
memcg: ratify and consolidate over-charge handling
try_charge() is the main charging logic of memcg. When it hits the limit
but either can't fail the allocation due to __GFP_NOFAIL or the task is
likely to free memory very soon, being OOM killed, has SIGKILL pending or
exiting, it "bypasses" the charge to the root memcg and returns -EINTR.
While this is one approach which can be taken for these situations, it has
several issues.
* It unnecessarily lies about the reality. The number itself doesn't
go over the limit but the actual usage does. memcg is either forced
to or actively chooses to go over the limit because that is the
right behavior under the circumstances, which is completely fine,
but, if at all avoidable, it shouldn't be misrepresenting what's
happening by sneaking the charges into the root memcg.
* Despite trying, we already do over-charge. kmemcg can't deal with
switching over to the root memcg by the point try_charge() returns
-EINTR, so it open-codes over-charing.
* It complicates the callers. Each try_charge() user has to handle
the weird -EINTR exception. memcg_charge_kmem() does the manual
over-charging. mem_cgroup_do_precharge() performs unnecessary
uncharging of root memcg, which BTW is inconsistent with what
memcg_charge_kmem() does but not broken as [un]charging are noops on
root memcg. mem_cgroup_try_charge() needs to switch the returned
cgroup to the root one.
The reality is that in memcg there are cases where we are forced and/or
willing to go over the limit. Each such case needs to be scrutinized and
justified but there definitely are situations where that is the right
thing to do. We alredy do this but with a superficial and inconsistent
disguise which leads to unnecessary complications.
This patch updates try_charge() so that it over-charges and returns 0 when
deemed necessary. -EINTR return is removed along with all special case
handling in the callers.
While at it, remove the local variable @ret, which was initialized to zero
and never changed, along with done: label which just returned the always
zero @ret.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:17 +07:00
|
|
|
goto force;
|
2014-08-07 06:05:42 +07:00
|
|
|
|
2018-08-18 05:47:11 +07:00
|
|
|
/*
|
|
|
|
* keep retrying as long as the memcg oom killer is able to make
|
|
|
|
* a forward progress or bypass the charge if the oom killer
|
|
|
|
* couldn't make any progress.
|
|
|
|
*/
|
|
|
|
oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
|
2015-11-06 09:47:29 +07:00
|
|
|
get_order(nr_pages * PAGE_SIZE));
|
2018-08-18 05:47:11 +07:00
|
|
|
switch (oom_status) {
|
|
|
|
case OOM_SUCCESS:
|
2020-08-07 13:21:58 +07:00
|
|
|
nr_retries = MAX_RECLAIM_RETRIES;
|
2018-08-18 05:47:11 +07:00
|
|
|
goto retry;
|
|
|
|
case OOM_FAILED:
|
|
|
|
goto force;
|
|
|
|
default:
|
|
|
|
goto nomem;
|
|
|
|
}
|
2009-01-08 09:07:48 +07:00
|
|
|
nomem:
|
2014-04-08 05:37:45 +07:00
|
|
|
if (!(gfp_mask & __GFP_NOFAIL))
|
2013-11-01 06:34:13 +07:00
|
|
|
return -ENOMEM;
|
memcg: ratify and consolidate over-charge handling
try_charge() is the main charging logic of memcg. When it hits the limit
but either can't fail the allocation due to __GFP_NOFAIL or the task is
likely to free memory very soon, being OOM killed, has SIGKILL pending or
exiting, it "bypasses" the charge to the root memcg and returns -EINTR.
While this is one approach which can be taken for these situations, it has
several issues.
* It unnecessarily lies about the reality. The number itself doesn't
go over the limit but the actual usage does. memcg is either forced
to or actively chooses to go over the limit because that is the
right behavior under the circumstances, which is completely fine,
but, if at all avoidable, it shouldn't be misrepresenting what's
happening by sneaking the charges into the root memcg.
* Despite trying, we already do over-charge. kmemcg can't deal with
switching over to the root memcg by the point try_charge() returns
-EINTR, so it open-codes over-charing.
* It complicates the callers. Each try_charge() user has to handle
the weird -EINTR exception. memcg_charge_kmem() does the manual
over-charging. mem_cgroup_do_precharge() performs unnecessary
uncharging of root memcg, which BTW is inconsistent with what
memcg_charge_kmem() does but not broken as [un]charging are noops on
root memcg. mem_cgroup_try_charge() needs to switch the returned
cgroup to the root one.
The reality is that in memcg there are cases where we are forced and/or
willing to go over the limit. Each such case needs to be scrutinized and
justified but there definitely are situations where that is the right
thing to do. We alredy do this but with a superficial and inconsistent
disguise which leads to unnecessary complications.
This patch updates try_charge() so that it over-charges and returns 0 when
deemed necessary. -EINTR return is removed along with all special case
handling in the callers.
While at it, remove the local variable @ret, which was initialized to zero
and never changed, along with done: label which just returned the always
zero @ret.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:17 +07:00
|
|
|
force:
|
|
|
|
/*
|
|
|
|
* The allocation either can't fail or will lead to more memory
|
|
|
|
* being freed very soon. Allow memory usage go over the limit
|
|
|
|
* temporarily by force charging it.
|
|
|
|
*/
|
|
|
|
page_counter_charge(&memcg->memory, nr_pages);
|
2016-01-15 06:21:23 +07:00
|
|
|
if (do_memsw_account())
|
memcg: ratify and consolidate over-charge handling
try_charge() is the main charging logic of memcg. When it hits the limit
but either can't fail the allocation due to __GFP_NOFAIL or the task is
likely to free memory very soon, being OOM killed, has SIGKILL pending or
exiting, it "bypasses" the charge to the root memcg and returns -EINTR.
While this is one approach which can be taken for these situations, it has
several issues.
* It unnecessarily lies about the reality. The number itself doesn't
go over the limit but the actual usage does. memcg is either forced
to or actively chooses to go over the limit because that is the
right behavior under the circumstances, which is completely fine,
but, if at all avoidable, it shouldn't be misrepresenting what's
happening by sneaking the charges into the root memcg.
* Despite trying, we already do over-charge. kmemcg can't deal with
switching over to the root memcg by the point try_charge() returns
-EINTR, so it open-codes over-charing.
* It complicates the callers. Each try_charge() user has to handle
the weird -EINTR exception. memcg_charge_kmem() does the manual
over-charging. mem_cgroup_do_precharge() performs unnecessary
uncharging of root memcg, which BTW is inconsistent with what
memcg_charge_kmem() does but not broken as [un]charging are noops on
root memcg. mem_cgroup_try_charge() needs to switch the returned
cgroup to the root one.
The reality is that in memcg there are cases where we are forced and/or
willing to go over the limit. Each such case needs to be scrutinized and
justified but there definitely are situations where that is the right
thing to do. We alredy do this but with a superficial and inconsistent
disguise which leads to unnecessary complications.
This patch updates try_charge() so that it over-charges and returns 0 when
deemed necessary. -EINTR return is removed along with all special case
handling in the callers.
While at it, remove the local variable @ret, which was initialized to zero
and never changed, along with done: label which just returned the always
zero @ret.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:17 +07:00
|
|
|
page_counter_charge(&memcg->memsw, nr_pages);
|
|
|
|
|
|
|
|
return 0;
|
2014-08-07 06:05:42 +07:00
|
|
|
|
|
|
|
done_restock:
|
|
|
|
if (batch > nr_pages)
|
|
|
|
refill_stock(memcg, batch - nr_pages);
|
memcg: punt high overage reclaim to return-to-userland path
Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped. If a process performs only
speculative allocations, it can blow way past the high limit. This is
actually easily reproducible by simply doing "find /". slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.
This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path. If try_charge() detects that high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.
As long as kernel doesn't have a run-away allocation spree, this should
provide enough protection while making kmemcg behave more consistently.
It also has the following benefits.
- All over-high reclaims can use GFP_KERNEL regardless of the specific
gfp mask in use, e.g. GFP_NOFS, when the limit was breached.
- It copes with prio inversion. Previously, a low-prio task with
small memory.high might perform over-high reclaim with a bunch of
locks held. If a higher prio task needed any of these locks, it
would have to wait until the low prio task finished reclaim and
released the locks. By handing over-high reclaim to the task exit
path this issue can be avoided.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:11 +07:00
|
|
|
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
/*
|
memcg: punt high overage reclaim to return-to-userland path
Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped. If a process performs only
speculative allocations, it can blow way past the high limit. This is
actually easily reproducible by simply doing "find /". slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.
This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path. If try_charge() detects that high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.
As long as kernel doesn't have a run-away allocation spree, this should
provide enough protection while making kmemcg behave more consistently.
It also has the following benefits.
- All over-high reclaims can use GFP_KERNEL regardless of the specific
gfp mask in use, e.g. GFP_NOFS, when the limit was breached.
- It copes with prio inversion. Previously, a low-prio task with
small memory.high might perform over-high reclaim with a bunch of
locks held. If a higher prio task needed any of these locks, it
would have to wait until the low prio task finished reclaim and
released the locks. By handing over-high reclaim to the task exit
path this issue can be avoided.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:11 +07:00
|
|
|
* If the hierarchy is above the normal consumption range, schedule
|
|
|
|
* reclaim on returning to userland. We can perform reclaim here
|
2015-11-07 07:28:28 +07:00
|
|
|
* if __GFP_RECLAIM but let's always punt for simplicity and so that
|
memcg: punt high overage reclaim to return-to-userland path
Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped. If a process performs only
speculative allocations, it can blow way past the high limit. This is
actually easily reproducible by simply doing "find /". slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.
This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path. If try_charge() detects that high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.
As long as kernel doesn't have a run-away allocation spree, this should
provide enough protection while making kmemcg behave more consistently.
It also has the following benefits.
- All over-high reclaims can use GFP_KERNEL regardless of the specific
gfp mask in use, e.g. GFP_NOFS, when the limit was breached.
- It copes with prio inversion. Previously, a low-prio task with
small memory.high might perform over-high reclaim with a bunch of
locks held. If a higher prio task needed any of these locks, it
would have to wait until the low prio task finished reclaim and
released the locks. By handing over-high reclaim to the task exit
path this issue can be avoided.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:11 +07:00
|
|
|
* GFP_KERNEL can consistently be used during reclaim. @memcg is
|
|
|
|
* not recorded as it most likely matches current's and won't
|
|
|
|
* change in the meantime. As high limit is checked again before
|
|
|
|
* reclaim, the cost of mismatch is negligible.
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
*/
|
|
|
|
do {
|
2020-06-02 11:49:52 +07:00
|
|
|
bool mem_high, swap_high;
|
|
|
|
|
|
|
|
mem_high = page_counter_read(&memcg->memory) >
|
|
|
|
READ_ONCE(memcg->memory.high);
|
|
|
|
swap_high = page_counter_read(&memcg->swap) >
|
|
|
|
READ_ONCE(memcg->swap.high);
|
|
|
|
|
|
|
|
/* Don't bother a random interrupted task */
|
|
|
|
if (in_interrupt()) {
|
|
|
|
if (mem_high) {
|
2016-01-15 06:21:29 +07:00
|
|
|
schedule_work(&memcg->high_work);
|
|
|
|
break;
|
|
|
|
}
|
2020-06-02 11:49:52 +07:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (mem_high || swap_high) {
|
|
|
|
/*
|
|
|
|
* The allocating tasks in this cgroup will need to do
|
|
|
|
* reclaim or be throttled to prevent further growth
|
|
|
|
* of the memory or swap footprints.
|
|
|
|
*
|
|
|
|
* Target some best-effort fairness between the tasks,
|
|
|
|
* and distribute reclaim work and delay penalties
|
|
|
|
* based on how much each task is actually allocating.
|
|
|
|
*/
|
2015-12-12 04:40:24 +07:00
|
|
|
current->memcg_nr_pages_over_high += batch;
|
memcg: punt high overage reclaim to return-to-userland path
Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped. If a process performs only
speculative allocations, it can blow way past the high limit. This is
actually easily reproducible by simply doing "find /". slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.
This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path. If try_charge() detects that high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.
As long as kernel doesn't have a run-away allocation spree, this should
provide enough protection while making kmemcg behave more consistently.
It also has the following benefits.
- All over-high reclaims can use GFP_KERNEL regardless of the specific
gfp mask in use, e.g. GFP_NOFS, when the limit was breached.
- It copes with prio inversion. Previously, a low-prio task with
small memory.high might perform over-high reclaim with a bunch of
locks held. If a higher prio task needed any of these locks, it
would have to wait until the low prio task finished reclaim and
released the locks. By handing over-high reclaim to the task exit
path this issue can be avoided.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:11 +07:00
|
|
|
set_notify_resume(current);
|
|
|
|
break;
|
|
|
|
}
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
} while ((memcg = parent_mem_cgroup(memcg)));
|
memcg: ratify and consolidate over-charge handling
try_charge() is the main charging logic of memcg. When it hits the limit
but either can't fail the allocation due to __GFP_NOFAIL or the task is
likely to free memory very soon, being OOM killed, has SIGKILL pending or
exiting, it "bypasses" the charge to the root memcg and returns -EINTR.
While this is one approach which can be taken for these situations, it has
several issues.
* It unnecessarily lies about the reality. The number itself doesn't
go over the limit but the actual usage does. memcg is either forced
to or actively chooses to go over the limit because that is the
right behavior under the circumstances, which is completely fine,
but, if at all avoidable, it shouldn't be misrepresenting what's
happening by sneaking the charges into the root memcg.
* Despite trying, we already do over-charge. kmemcg can't deal with
switching over to the root memcg by the point try_charge() returns
-EINTR, so it open-codes over-charing.
* It complicates the callers. Each try_charge() user has to handle
the weird -EINTR exception. memcg_charge_kmem() does the manual
over-charging. mem_cgroup_do_precharge() performs unnecessary
uncharging of root memcg, which BTW is inconsistent with what
memcg_charge_kmem() does but not broken as [un]charging are noops on
root memcg. mem_cgroup_try_charge() needs to switch the returned
cgroup to the root one.
The reality is that in memcg there are cases where we are forced and/or
willing to go over the limit. Each such case needs to be scrutinized and
justified but there definitely are situations where that is the right
thing to do. We alredy do this but with a superficial and inconsistent
disguise which leads to unnecessary complications.
This patch updates try_charge() so that it over-charges and returns 0 when
deemed necessary. -EINTR return is removed along with all special case
handling in the callers.
While at it, remove the local variable @ret, which was initialized to zero
and never changed, along with done: label which just returned the always
zero @ret.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:46:17 +07:00
|
|
|
|
|
|
|
return 0;
|
2009-01-08 09:07:48 +07:00
|
|
|
}
|
2008-02-07 15:13:53 +07:00
|
|
|
|
2020-06-04 06:02:07 +07:00
|
|
|
#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
|
2009-12-16 07:47:10 +07:00
|
|
|
{
|
2014-09-05 19:43:57 +07:00
|
|
|
if (mem_cgroup_is_root(memcg))
|
|
|
|
return;
|
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
page_counter_uncharge(&memcg->memory, nr_pages);
|
2016-01-15 06:21:23 +07:00
|
|
|
if (do_memsw_account())
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
page_counter_uncharge(&memcg->memsw, nr_pages);
|
2012-05-30 05:07:03 +07:00
|
|
|
}
|
2020-06-04 06:02:07 +07:00
|
|
|
#endif
|
2012-05-30 05:07:03 +07:00
|
|
|
|
2020-06-04 06:02:24 +07:00
|
|
|
static void commit_charge(struct page *page, struct mem_cgroup *memcg)
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
{
|
2014-12-11 06:44:52 +07:00
|
|
|
VM_BUG_ON_PAGE(page->mem_cgroup, page);
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
/*
|
2020-06-04 06:02:27 +07:00
|
|
|
* Any of the following ensures page->mem_cgroup stability:
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
*
|
2020-06-04 06:02:27 +07:00
|
|
|
* - the page lock
|
|
|
|
* - LRU isolation
|
|
|
|
* - lock_page_memcg()
|
|
|
|
* - exclusive reference
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
*/
|
2014-12-11 06:44:52 +07:00
|
|
|
page->mem_cgroup = memcg;
|
2009-01-08 09:07:48 +07:00
|
|
|
}
|
2008-02-07 15:13:56 +07:00
|
|
|
|
2018-08-18 05:47:25 +07:00
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
2020-08-07 13:21:27 +07:00
|
|
|
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
|
|
|
|
gfp_t gfp)
|
|
|
|
{
|
|
|
|
unsigned int objects = objs_per_slab_page(s, page);
|
|
|
|
void *vec;
|
|
|
|
|
|
|
|
vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
|
|
|
|
page_to_nid(page));
|
|
|
|
if (!vec)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (cmpxchg(&page->obj_cgroups, NULL,
|
|
|
|
(struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
|
|
|
|
kfree(vec);
|
|
|
|
else
|
|
|
|
kmemleak_not_leak(vec);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-03-29 09:17:25 +07:00
|
|
|
/*
|
|
|
|
* Returns a pointer to the memory cgroup to which the kernel object is charged.
|
|
|
|
*
|
|
|
|
* The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
|
|
|
|
* cgroup_mutex, etc.
|
|
|
|
*/
|
|
|
|
struct mem_cgroup *mem_cgroup_from_obj(void *p)
|
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
page = virt_to_head_page(p);
|
|
|
|
|
mm: memcg/slab: fix racy access to page->mem_cgroup in mem_cgroup_from_obj()
mem_cgroup_from_obj() checks the lowest bit of the page->mem_cgroup
pointer to determine if the page has an attached obj_cgroup vector instead
of a regular memcg pointer. If it's not set, it simple returns the
page->mem_cgroup value as a struct mem_cgroup pointer.
The commit 10befea91b61 ("mm: memcg/slab: use a single set of kmem_caches
for all allocations") changed the moment when this bit is set: if
previously it was set on the allocation of the slab page, now it can be
set well after, when the first accounted object is allocated on this page.
It opened a race: if page->mem_cgroup is set concurrently after the first
page_has_obj_cgroups(page) check, a pointer to the obj_cgroups array can
be returned as a memory cgroup pointer.
A simple check for page->mem_cgroup pointer for NULL before the
page_has_obj_cgroups() check fixes the race. Indeed, if the pointer is
not NULL, it's either a simple mem_cgroup pointer or a pointer to
obj_cgroup vector. The pointer can be asynchronously changed from NULL to
(obj_cgroup_vec | 0x1UL), but can't be changed from a valid memcg pointer
to objcg vector or back.
If the object passed to mem_cgroup_from_obj() is a slab object and
page->mem_cgroup is NULL, it means that the object is not accounted, so
the function must return NULL.
I've discovered the race looking at the code, so far I haven't seen it in
the wild.
Fixes: 10befea91b61 ("mm: memcg/slab: use a single set of kmem_caches for all allocations")
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Link: https://lkml.kernel.org/r/20200910022435.2773735-1-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-14 06:52:42 +07:00
|
|
|
/*
|
|
|
|
* If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
|
|
|
|
* or a pointer to obj_cgroup vector. In the latter case the lowest
|
|
|
|
* bit of the pointer is set.
|
|
|
|
* The page->mem_cgroup pointer can be asynchronously changed
|
|
|
|
* from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
|
|
|
|
* from a valid memcg pointer to objcg vector or back.
|
|
|
|
*/
|
|
|
|
if (!page->mem_cgroup)
|
|
|
|
return NULL;
|
|
|
|
|
2020-03-29 09:17:25 +07:00
|
|
|
/*
|
2020-08-07 13:21:10 +07:00
|
|
|
* Slab objects are accounted individually, not per-page.
|
|
|
|
* Memcg membership data for each individual object is saved in
|
|
|
|
* the page->obj_cgroups.
|
2020-03-29 09:17:25 +07:00
|
|
|
*/
|
2020-08-07 13:21:10 +07:00
|
|
|
if (page_has_obj_cgroups(page)) {
|
|
|
|
struct obj_cgroup *objcg;
|
|
|
|
unsigned int off;
|
|
|
|
|
|
|
|
off = obj_to_index(page->slab_cache, page, p);
|
|
|
|
objcg = page_obj_cgroups(page)[off];
|
2020-08-07 13:21:27 +07:00
|
|
|
if (objcg)
|
|
|
|
return obj_cgroup_memcg(objcg);
|
|
|
|
|
|
|
|
return NULL;
|
2020-08-07 13:21:10 +07:00
|
|
|
}
|
2020-03-29 09:17:25 +07:00
|
|
|
|
|
|
|
/* All other pages use page->mem_cgroup */
|
|
|
|
return page->mem_cgroup;
|
|
|
|
}
|
|
|
|
|
2020-08-07 13:20:49 +07:00
|
|
|
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
|
|
|
|
{
|
|
|
|
struct obj_cgroup *objcg = NULL;
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
2020-10-18 06:13:44 +07:00
|
|
|
if (memcg_kmem_bypass())
|
|
|
|
return NULL;
|
|
|
|
|
2020-08-07 13:20:49 +07:00
|
|
|
rcu_read_lock();
|
mm: kmem: prepare remote memcg charging infra for interrupt contexts
Remote memcg charging API uses current->active_memcg to store the
currently active memory cgroup, which overwrites the memory cgroup of the
current process. It works well for normal contexts, but doesn't work for
interrupt contexts: indeed, if an interrupt occurs during the execution of
a section with an active memcg set, all allocations inside the interrupt
will be charged to the active memcg set (given that we'll enable
accounting for allocations from an interrupt context). But because the
interrupt might have no relation to the active memcg set outside, it's
obviously wrong from the accounting prospective.
To resolve this problem, let's add a global percpu int_active_memcg
variable, which will be used to store an active memory cgroup which will
be used from interrupt contexts. set_active_memcg() will transparently
use current->active_memcg or int_active_memcg depending on the context.
To make the read part simple and transparent for the caller, let's
introduce two new functions:
- struct mem_cgroup *active_memcg(void),
- struct mem_cgroup *get_active_memcg(void).
They are returning the active memcg if it's set, hiding all implementation
details: where to get it depending on the current context.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Link: http://lkml.kernel.org/r/20200827225843.1270629-4-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-10-18 06:13:50 +07:00
|
|
|
if (unlikely(active_memcg()))
|
|
|
|
memcg = active_memcg();
|
2020-08-07 13:20:49 +07:00
|
|
|
else
|
|
|
|
memcg = mem_cgroup_from_task(current);
|
|
|
|
|
|
|
|
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
|
|
|
|
objcg = rcu_dereference(memcg->objcg);
|
|
|
|
if (objcg && obj_cgroup_tryget(objcg))
|
|
|
|
break;
|
2020-12-15 10:06:31 +07:00
|
|
|
objcg = NULL;
|
2020-08-07 13:20:49 +07:00
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return objcg;
|
|
|
|
}
|
|
|
|
|
2014-10-10 05:28:45 +07:00
|
|
|
static int memcg_alloc_cache_id(void)
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:38 +07:00
|
|
|
{
|
2014-10-10 05:28:45 +07:00
|
|
|
int id, size;
|
|
|
|
int err;
|
|
|
|
|
2015-02-13 05:58:57 +07:00
|
|
|
id = ida_simple_get(&memcg_cache_ida,
|
2014-10-10 05:28:45 +07:00
|
|
|
0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
|
|
|
|
if (id < 0)
|
|
|
|
return id;
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:38 +07:00
|
|
|
|
2015-02-13 05:58:57 +07:00
|
|
|
if (id < memcg_nr_cache_ids)
|
2014-10-10 05:28:45 +07:00
|
|
|
return id;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There's no space for the new id in memcg_caches arrays,
|
|
|
|
* so we have to grow them.
|
|
|
|
*/
|
2015-02-13 05:59:01 +07:00
|
|
|
down_write(&memcg_cache_ids_sem);
|
2014-10-10 05:28:45 +07:00
|
|
|
|
|
|
|
size = 2 * (id + 1);
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:38 +07:00
|
|
|
if (size < MEMCG_CACHES_MIN_SIZE)
|
|
|
|
size = MEMCG_CACHES_MIN_SIZE;
|
|
|
|
else if (size > MEMCG_CACHES_MAX_SIZE)
|
|
|
|
size = MEMCG_CACHES_MAX_SIZE;
|
|
|
|
|
2020-08-07 13:21:10 +07:00
|
|
|
err = memcg_update_all_list_lrus(size);
|
2015-02-13 05:59:01 +07:00
|
|
|
if (!err)
|
|
|
|
memcg_nr_cache_ids = size;
|
|
|
|
|
|
|
|
up_write(&memcg_cache_ids_sem);
|
|
|
|
|
2014-10-10 05:28:45 +07:00
|
|
|
if (err) {
|
2015-02-13 05:58:57 +07:00
|
|
|
ida_simple_remove(&memcg_cache_ida, id);
|
2014-10-10 05:28:45 +07:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
return id;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void memcg_free_cache_id(int id)
|
|
|
|
{
|
2015-02-13 05:58:57 +07:00
|
|
|
ida_simple_remove(&memcg_cache_ida, id);
|
memcg: allocate memory for memcg caches whenever a new memcg appears
Every cache that is considered a root cache (basically the "original"
caches, tied to the root memcg/no-memcg) will have an array that should be
large enough to store a cache pointer per each memcg in the system.
Theoreticaly, this is as high as 1 << sizeof(css_id), which is currently
in the 64k pointers range. Most of the time, we won't be using that much.
What goes in this patch, is a simple scheme to dynamically allocate such
an array, in order to minimize memory usage for memcg caches. Because we
would also like to avoid allocations all the time, at least for now, the
array will only grow. It will tend to be big enough to hold the maximum
number of kmem-limited memcgs ever achieved.
We'll allocate it to be a minimum of 64 kmem-limited memcgs. When we have
more than that, we'll start doubling the size of this array every time the
limit is reached.
Because we are only considering kmem limited memcgs, a natural point for
this to happen is when we write to the limit. At that point, we already
have set_limit_mutex held, so that will become our natural synchronization
mechanism.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:38 +07:00
|
|
|
}
|
|
|
|
|
2016-07-27 05:24:21 +07:00
|
|
|
/**
|
2020-04-02 11:06:56 +07:00
|
|
|
* __memcg_kmem_charge: charge a number of kernel pages to a memcg
|
mm: kmem: cleanup (__)memcg_kmem_charge_memcg() arguments
Patch series "mm: memcg: kmem API cleanup", v2.
This patchset aims to clean up the kernel memory charging API. It doesn't
bring any functional changes, just removes unused arguments, renames some
functions and fixes some comments.
Currently it's not obvious which functions are most basic
(memcg_kmem_(un)charge_memcg()) and which are based on them
(memcg_kmem_(un)charge()). The patchset renames these functions and
removes unused arguments:
TL;DR:
was:
memcg_kmem_charge_memcg(page, gfp, order, memcg)
memcg_kmem_uncharge_memcg(memcg, nr_pages)
memcg_kmem_charge(page, gfp, order)
memcg_kmem_uncharge(page, order)
now:
memcg_kmem_charge(memcg, gfp, nr_pages)
memcg_kmem_uncharge(memcg, nr_pages)
memcg_kmem_charge_page(page, gfp, order)
memcg_kmem_uncharge_page(page, order)
This patch (of 6):
The first argument of memcg_kmem_charge_memcg() and
__memcg_kmem_charge_memcg() is the page pointer and it's not used. Let's
drop it.
Memcg pointer is passed as the last argument. Move it to the first place
for consistency with other memcg functions, e.g.
__memcg_kmem_uncharge_memcg() or try_charge().
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Link: http://lkml.kernel.org/r/20200109202659.752357-2-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-02 11:06:39 +07:00
|
|
|
* @memcg: memory cgroup to charge
|
2016-07-27 05:24:21 +07:00
|
|
|
* @gfp: reclaim mode
|
2020-04-02 11:06:49 +07:00
|
|
|
* @nr_pages: number of pages to charge
|
2016-07-27 05:24:21 +07:00
|
|
|
*
|
|
|
|
* Returns 0 on success, an error code on failure.
|
|
|
|
*/
|
2020-04-02 11:06:56 +07:00
|
|
|
int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
|
|
|
|
unsigned int nr_pages)
|
2012-12-19 05:21:56 +07:00
|
|
|
{
|
memcg: unify slab and other kmem pages charging
We have memcg_kmem_charge and memcg_kmem_uncharge methods for charging and
uncharging kmem pages to memcg, but currently they are not used for
charging slab pages (i.e. they are only used for charging pages allocated
with alloc_kmem_pages). The only reason why the slab subsystem uses
special helpers, memcg_charge_slab and memcg_uncharge_slab, is that it
needs to charge to the memcg of kmem cache while memcg_charge_kmem charges
to the memcg that the current task belongs to.
To remove this diversity, this patch adds an extra argument to
__memcg_kmem_charge that can be a pointer to a memcg or NULL. If it is
not NULL, the function tries to charge to the memcg it points to,
otherwise it charge to the current context. Next, it makes the slab
subsystem use this function to charge slab pages.
Since memcg_charge_kmem and memcg_uncharge_kmem helpers are now used only
in __memcg_kmem_charge and __memcg_kmem_uncharge, they are inlined. Since
__memcg_kmem_charge stores a pointer to the memcg in the page struct, we
don't need memcg_uncharge_slab anymore and can use free_kmem_pages.
Besides, one can now detect which memcg a slab page belongs to by reading
/proc/kpagecgroup.
Note, this patch switches slab to charge-after-alloc design. Since this
design is already used for all other memcg charges, it should not make any
difference.
[hannes@cmpxchg.org: better to have an outer function than a magic parameter for the memcg lookup]
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:49:01 +07:00
|
|
|
struct page_counter *counter;
|
2012-12-19 05:21:56 +07:00
|
|
|
int ret;
|
|
|
|
|
memcg: unify slab and other kmem pages charging
We have memcg_kmem_charge and memcg_kmem_uncharge methods for charging and
uncharging kmem pages to memcg, but currently they are not used for
charging slab pages (i.e. they are only used for charging pages allocated
with alloc_kmem_pages). The only reason why the slab subsystem uses
special helpers, memcg_charge_slab and memcg_uncharge_slab, is that it
needs to charge to the memcg of kmem cache while memcg_charge_kmem charges
to the memcg that the current task belongs to.
To remove this diversity, this patch adds an extra argument to
__memcg_kmem_charge that can be a pointer to a memcg or NULL. If it is
not NULL, the function tries to charge to the memcg it points to,
otherwise it charge to the current context. Next, it makes the slab
subsystem use this function to charge slab pages.
Since memcg_charge_kmem and memcg_uncharge_kmem helpers are now used only
in __memcg_kmem_charge and __memcg_kmem_uncharge, they are inlined. Since
__memcg_kmem_charge stores a pointer to the memcg in the page struct, we
don't need memcg_uncharge_slab anymore and can use free_kmem_pages.
Besides, one can now detect which memcg a slab page belongs to by reading
/proc/kpagecgroup.
Note, this patch switches slab to charge-after-alloc design. Since this
design is already used for all other memcg charges, it should not make any
difference.
[hannes@cmpxchg.org: better to have an outer function than a magic parameter for the memcg lookup]
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:49:01 +07:00
|
|
|
ret = try_charge(memcg, gfp, nr_pages);
|
2016-01-21 06:02:35 +07:00
|
|
|
if (ret)
|
memcg: unify slab and other kmem pages charging
We have memcg_kmem_charge and memcg_kmem_uncharge methods for charging and
uncharging kmem pages to memcg, but currently they are not used for
charging slab pages (i.e. they are only used for charging pages allocated
with alloc_kmem_pages). The only reason why the slab subsystem uses
special helpers, memcg_charge_slab and memcg_uncharge_slab, is that it
needs to charge to the memcg of kmem cache while memcg_charge_kmem charges
to the memcg that the current task belongs to.
To remove this diversity, this patch adds an extra argument to
__memcg_kmem_charge that can be a pointer to a memcg or NULL. If it is
not NULL, the function tries to charge to the memcg it points to,
otherwise it charge to the current context. Next, it makes the slab
subsystem use this function to charge slab pages.
Since memcg_charge_kmem and memcg_uncharge_kmem helpers are now used only
in __memcg_kmem_charge and __memcg_kmem_uncharge, they are inlined. Since
__memcg_kmem_charge stores a pointer to the memcg in the page struct, we
don't need memcg_uncharge_slab anymore and can use free_kmem_pages.
Besides, one can now detect which memcg a slab page belongs to by reading
/proc/kpagecgroup.
Note, this patch switches slab to charge-after-alloc design. Since this
design is already used for all other memcg charges, it should not make any
difference.
[hannes@cmpxchg.org: better to have an outer function than a magic parameter for the memcg lookup]
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:49:01 +07:00
|
|
|
return ret;
|
2016-01-21 06:02:35 +07:00
|
|
|
|
|
|
|
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
|
|
|
|
!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
|
2019-09-26 06:45:53 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Enforce __GFP_NOFAIL allocation because callers are not
|
|
|
|
* prepared to see failures and likely do not have any failure
|
|
|
|
* handling code.
|
|
|
|
*/
|
|
|
|
if (gfp & __GFP_NOFAIL) {
|
|
|
|
page_counter_charge(&memcg->kmem, nr_pages);
|
|
|
|
return 0;
|
|
|
|
}
|
2016-01-21 06:02:35 +07:00
|
|
|
cancel_charge(memcg, nr_pages);
|
|
|
|
return -ENOMEM;
|
2012-12-19 05:21:56 +07:00
|
|
|
}
|
memcg: unify slab and other kmem pages charging
We have memcg_kmem_charge and memcg_kmem_uncharge methods for charging and
uncharging kmem pages to memcg, but currently they are not used for
charging slab pages (i.e. they are only used for charging pages allocated
with alloc_kmem_pages). The only reason why the slab subsystem uses
special helpers, memcg_charge_slab and memcg_uncharge_slab, is that it
needs to charge to the memcg of kmem cache while memcg_charge_kmem charges
to the memcg that the current task belongs to.
To remove this diversity, this patch adds an extra argument to
__memcg_kmem_charge that can be a pointer to a memcg or NULL. If it is
not NULL, the function tries to charge to the memcg it points to,
otherwise it charge to the current context. Next, it makes the slab
subsystem use this function to charge slab pages.
Since memcg_charge_kmem and memcg_uncharge_kmem helpers are now used only
in __memcg_kmem_charge and __memcg_kmem_uncharge, they are inlined. Since
__memcg_kmem_charge stores a pointer to the memcg in the page struct, we
don't need memcg_uncharge_slab anymore and can use free_kmem_pages.
Besides, one can now detect which memcg a slab page belongs to by reading
/proc/kpagecgroup.
Note, this patch switches slab to charge-after-alloc design. Since this
design is already used for all other memcg charges, it should not make any
difference.
[hannes@cmpxchg.org: better to have an outer function than a magic parameter for the memcg lookup]
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:49:01 +07:00
|
|
|
return 0;
|
2012-12-19 05:21:56 +07:00
|
|
|
}
|
|
|
|
|
2020-04-02 11:06:56 +07:00
|
|
|
/**
|
|
|
|
* __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
|
|
|
|
* @memcg: memcg to uncharge
|
|
|
|
* @nr_pages: number of pages to uncharge
|
|
|
|
*/
|
|
|
|
void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
|
|
|
|
{
|
|
|
|
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
|
|
|
page_counter_uncharge(&memcg->kmem, nr_pages);
|
|
|
|
|
2021-01-24 12:01:07 +07:00
|
|
|
refill_stock(memcg, nr_pages);
|
2020-04-02 11:06:56 +07:00
|
|
|
}
|
|
|
|
|
2016-07-27 05:24:21 +07:00
|
|
|
/**
|
2020-04-02 11:06:46 +07:00
|
|
|
* __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
|
2016-07-27 05:24:21 +07:00
|
|
|
* @page: page to charge
|
|
|
|
* @gfp: reclaim mode
|
|
|
|
* @order: allocation order
|
|
|
|
*
|
|
|
|
* Returns 0 on success, an error code on failure.
|
|
|
|
*/
|
2020-04-02 11:06:46 +07:00
|
|
|
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
|
2012-12-19 05:21:56 +07:00
|
|
|
{
|
memcg: unify slab and other kmem pages charging
We have memcg_kmem_charge and memcg_kmem_uncharge methods for charging and
uncharging kmem pages to memcg, but currently they are not used for
charging slab pages (i.e. they are only used for charging pages allocated
with alloc_kmem_pages). The only reason why the slab subsystem uses
special helpers, memcg_charge_slab and memcg_uncharge_slab, is that it
needs to charge to the memcg of kmem cache while memcg_charge_kmem charges
to the memcg that the current task belongs to.
To remove this diversity, this patch adds an extra argument to
__memcg_kmem_charge that can be a pointer to a memcg or NULL. If it is
not NULL, the function tries to charge to the memcg it points to,
otherwise it charge to the current context. Next, it makes the slab
subsystem use this function to charge slab pages.
Since memcg_charge_kmem and memcg_uncharge_kmem helpers are now used only
in __memcg_kmem_charge and __memcg_kmem_uncharge, they are inlined. Since
__memcg_kmem_charge stores a pointer to the memcg in the page struct, we
don't need memcg_uncharge_slab anymore and can use free_kmem_pages.
Besides, one can now detect which memcg a slab page belongs to by reading
/proc/kpagecgroup.
Note, this patch switches slab to charge-after-alloc design. Since this
design is already used for all other memcg charges, it should not make any
difference.
[hannes@cmpxchg.org: better to have an outer function than a magic parameter for the memcg lookup]
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:49:01 +07:00
|
|
|
struct mem_cgroup *memcg;
|
2016-03-18 04:17:29 +07:00
|
|
|
int ret = 0;
|
2012-12-19 05:21:56 +07:00
|
|
|
|
fs: fsnotify: account fsnotify metadata to kmemcg
Patch series "Directed kmem charging", v8.
The Linux kernel's memory cgroup allows limiting the memory usage of the
jobs running on the system to provide isolation between the jobs. All
the kernel memory allocated in the context of the job and marked with
__GFP_ACCOUNT will also be included in the memory usage and be limited
by the job's limit.
The kernel memory can only be charged to the memcg of the process in
whose context kernel memory was allocated. However there are cases
where the allocated kernel memory should be charged to the memcg
different from the current processes's memcg. This patch series
contains two such concrete use-cases i.e. fsnotify and buffer_head.
The fsnotify event objects can consume a lot of system memory for large
or unlimited queues if there is either no or slow listener. The events
are allocated in the context of the event producer. However they should
be charged to the event consumer. Similarly the buffer_head objects can
be allocated in a memcg different from the memcg of the page for which
buffer_head objects are being allocated.
To solve this issue, this patch series introduces mechanism to charge
kernel memory to a given memcg. In case of fsnotify events, the memcg
of the consumer can be used for charging and for buffer_head, the memcg
of the page can be charged. For directed charging, the caller can use
the scope API memalloc_[un]use_memcg() to specify the memcg to charge
for all the __GFP_ACCOUNT allocations within the scope.
This patch (of 2):
A lot of memory can be consumed by the events generated for the huge or
unlimited queues if there is either no or slow listener. This can cause
system level memory pressure or OOMs. So, it's better to account the
fsnotify kmem caches to the memcg of the listener.
However the listener can be in a different memcg than the memcg of the
producer and these allocations happen in the context of the event
producer. This patch introduces remote memcg charging API which the
producer can use to charge the allocations to the memcg of the listener.
There are seven fsnotify kmem caches and among them allocations from
dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and
inotify_inode_mark_cachep happens in the context of syscall from the
listener. So, SLAB_ACCOUNT is enough for these caches.
The objects from fsnotify_mark_connector_cachep are not accounted as
they are small compared to the notification mark or events and it is
unclear whom to account connector to since it is shared by all events
attached to the inode.
The allocations from the event caches happen in the context of the event
producer. For such caches we will need to remote charge the allocations
to the listener's memcg. Thus we save the memcg reference in the
fsnotify_group structure of the listener.
This patch has also moved the members of fsnotify_group to keep the size
same, at least for 64 bit build, even with additional member by filling
the holes.
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it]
Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:46:39 +07:00
|
|
|
memcg = get_mem_cgroup_from_current();
|
2020-10-18 06:13:44 +07:00
|
|
|
if (memcg && !mem_cgroup_is_root(memcg)) {
|
2020-04-02 11:06:56 +07:00
|
|
|
ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
|
2019-07-12 10:56:31 +07:00
|
|
|
if (!ret) {
|
|
|
|
page->mem_cgroup = memcg;
|
mm: memcontrol: only mark charged pages with PageKmemcg
To distinguish non-slab pages charged to kmemcg we mark them PageKmemcg,
which sets page->_mapcount to -512. Currently, we set/clear PageKmemcg
in __alloc_pages_nodemask()/free_pages_prepare() for any page allocated
with __GFP_ACCOUNT, including those that aren't actually charged to any
cgroup, i.e. allocated from the root cgroup context. To avoid overhead
in case cgroups are not used, we only do that if memcg_kmem_enabled() is
true. The latter is set iff there are kmem-enabled memory cgroups
(online or offline). The root cgroup is not considered kmem-enabled.
As a result, if a page is allocated with __GFP_ACCOUNT for the root
cgroup when there are kmem-enabled memory cgroups and is freed after all
kmem-enabled memory cgroups were removed, e.g.
# no memory cgroups has been created yet, create one
mkdir /sys/fs/cgroup/memory/test
# run something allocating pages with __GFP_ACCOUNT, e.g.
# a program using pipe
dmesg | tail
# remove the memory cgroup
rmdir /sys/fs/cgroup/memory/test
we'll get bad page state bug complaining about page->_mapcount != -1:
BUG: Bad page state in process swapper/0 pfn:1fd945c
page:ffffea007f651700 count:0 mapcount:-511 mapping: (null) index:0x0
flags: 0x1000000000000000()
To avoid that, let's mark with PageKmemcg only those pages that are
actually charged to and hence pin a non-root memory cgroup.
Fixes: 4949148ad433 ("mm: charge/uncharge kmemcg from generic page allocator paths")
Reported-and-tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-08-09 03:03:12 +07:00
|
|
|
__SetPageKmemcg(page);
|
2020-08-07 13:20:45 +07:00
|
|
|
return 0;
|
2019-07-12 10:56:31 +07:00
|
|
|
}
|
2020-10-18 06:13:44 +07:00
|
|
|
css_put(&memcg->css);
|
mm: memcontrol: only mark charged pages with PageKmemcg
To distinguish non-slab pages charged to kmemcg we mark them PageKmemcg,
which sets page->_mapcount to -512. Currently, we set/clear PageKmemcg
in __alloc_pages_nodemask()/free_pages_prepare() for any page allocated
with __GFP_ACCOUNT, including those that aren't actually charged to any
cgroup, i.e. allocated from the root cgroup context. To avoid overhead
in case cgroups are not used, we only do that if memcg_kmem_enabled() is
true. The latter is set iff there are kmem-enabled memory cgroups
(online or offline). The root cgroup is not considered kmem-enabled.
As a result, if a page is allocated with __GFP_ACCOUNT for the root
cgroup when there are kmem-enabled memory cgroups and is freed after all
kmem-enabled memory cgroups were removed, e.g.
# no memory cgroups has been created yet, create one
mkdir /sys/fs/cgroup/memory/test
# run something allocating pages with __GFP_ACCOUNT, e.g.
# a program using pipe
dmesg | tail
# remove the memory cgroup
rmdir /sys/fs/cgroup/memory/test
we'll get bad page state bug complaining about page->_mapcount != -1:
BUG: Bad page state in process swapper/0 pfn:1fd945c
page:ffffea007f651700 count:0 mapcount:-511 mapping: (null) index:0x0
flags: 0x1000000000000000()
To avoid that, let's mark with PageKmemcg only those pages that are
actually charged to and hence pin a non-root memory cgroup.
Fixes: 4949148ad433 ("mm: charge/uncharge kmemcg from generic page allocator paths")
Reported-and-tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-08-09 03:03:12 +07:00
|
|
|
}
|
2015-11-06 09:48:59 +07:00
|
|
|
return ret;
|
2012-12-19 05:21:56 +07:00
|
|
|
}
|
2019-07-12 10:56:13 +07:00
|
|
|
|
2016-07-27 05:24:21 +07:00
|
|
|
/**
|
2020-04-02 11:06:46 +07:00
|
|
|
* __memcg_kmem_uncharge_page: uncharge a kmem page
|
2016-07-27 05:24:21 +07:00
|
|
|
* @page: page to uncharge
|
|
|
|
* @order: allocation order
|
|
|
|
*/
|
2020-04-02 11:06:46 +07:00
|
|
|
void __memcg_kmem_uncharge_page(struct page *page, int order)
|
2012-12-19 05:21:56 +07:00
|
|
|
{
|
2014-12-11 06:44:52 +07:00
|
|
|
struct mem_cgroup *memcg = page->mem_cgroup;
|
memcg: unify slab and other kmem pages charging
We have memcg_kmem_charge and memcg_kmem_uncharge methods for charging and
uncharging kmem pages to memcg, but currently they are not used for
charging slab pages (i.e. they are only used for charging pages allocated
with alloc_kmem_pages). The only reason why the slab subsystem uses
special helpers, memcg_charge_slab and memcg_uncharge_slab, is that it
needs to charge to the memcg of kmem cache while memcg_charge_kmem charges
to the memcg that the current task belongs to.
To remove this diversity, this patch adds an extra argument to
__memcg_kmem_charge that can be a pointer to a memcg or NULL. If it is
not NULL, the function tries to charge to the memcg it points to,
otherwise it charge to the current context. Next, it makes the slab
subsystem use this function to charge slab pages.
Since memcg_charge_kmem and memcg_uncharge_kmem helpers are now used only
in __memcg_kmem_charge and __memcg_kmem_uncharge, they are inlined. Since
__memcg_kmem_charge stores a pointer to the memcg in the page struct, we
don't need memcg_uncharge_slab anymore and can use free_kmem_pages.
Besides, one can now detect which memcg a slab page belongs to by reading
/proc/kpagecgroup.
Note, this patch switches slab to charge-after-alloc design. Since this
design is already used for all other memcg charges, it should not make any
difference.
[hannes@cmpxchg.org: better to have an outer function than a magic parameter for the memcg lookup]
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 09:49:01 +07:00
|
|
|
unsigned int nr_pages = 1 << order;
|
2012-12-19 05:21:56 +07:00
|
|
|
|
|
|
|
if (!memcg)
|
|
|
|
return;
|
|
|
|
|
2014-01-24 06:52:54 +07:00
|
|
|
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
|
2020-04-02 11:06:56 +07:00
|
|
|
__memcg_kmem_uncharge(memcg, nr_pages);
|
2014-12-11 06:44:52 +07:00
|
|
|
page->mem_cgroup = NULL;
|
2020-08-07 13:20:45 +07:00
|
|
|
css_put(&memcg->css);
|
mm: memcontrol: only mark charged pages with PageKmemcg
To distinguish non-slab pages charged to kmemcg we mark them PageKmemcg,
which sets page->_mapcount to -512. Currently, we set/clear PageKmemcg
in __alloc_pages_nodemask()/free_pages_prepare() for any page allocated
with __GFP_ACCOUNT, including those that aren't actually charged to any
cgroup, i.e. allocated from the root cgroup context. To avoid overhead
in case cgroups are not used, we only do that if memcg_kmem_enabled() is
true. The latter is set iff there are kmem-enabled memory cgroups
(online or offline). The root cgroup is not considered kmem-enabled.
As a result, if a page is allocated with __GFP_ACCOUNT for the root
cgroup when there are kmem-enabled memory cgroups and is freed after all
kmem-enabled memory cgroups were removed, e.g.
# no memory cgroups has been created yet, create one
mkdir /sys/fs/cgroup/memory/test
# run something allocating pages with __GFP_ACCOUNT, e.g.
# a program using pipe
dmesg | tail
# remove the memory cgroup
rmdir /sys/fs/cgroup/memory/test
we'll get bad page state bug complaining about page->_mapcount != -1:
BUG: Bad page state in process swapper/0 pfn:1fd945c
page:ffffea007f651700 count:0 mapcount:-511 mapping: (null) index:0x0
flags: 0x1000000000000000()
To avoid that, let's mark with PageKmemcg only those pages that are
actually charged to and hence pin a non-root memory cgroup.
Fixes: 4949148ad433 ("mm: charge/uncharge kmemcg from generic page allocator paths")
Reported-and-tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-08-09 03:03:12 +07:00
|
|
|
|
|
|
|
/* slab pages do not have PageKmemcg flag set */
|
|
|
|
if (PageKmemcg(page))
|
|
|
|
__ClearPageKmemcg(page);
|
list_lru: introduce per-memcg lists
There are several FS shrinkers, including super_block::s_shrink, that
keep reclaimable objects in the list_lru structure. Hence to turn them
to memcg-aware shrinkers, it is enough to make list_lru per-memcg.
This patch does the trick. It adds an array of lru lists to the
list_lru_node structure (per-node part of the list_lru), one for each
kmem-active memcg, and dispatches every item addition or removal to the
list corresponding to the memcg which the item is accounted to. So now
the list_lru structure is not just per node, but per node and per memcg.
Not all list_lrus need this feature, so this patch also adds a new
method, list_lru_init_memcg, which initializes a list_lru as memcg
aware. Otherwise (i.e. if initialized with old list_lru_init), the
list_lru won't have per memcg lists.
Just like per memcg caches arrays, the arrays of per-memcg lists are
indexed by memcg_cache_id, so we must grow them whenever
memcg_nr_cache_ids is increased. So we introduce a callback,
memcg_update_all_list_lrus, invoked by memcg_alloc_cache_id if the id
space is full.
The locking is implemented in a manner similar to lruvecs, i.e. we have
one lock per node that protects all lists (both global and per cgroup) on
the node.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-13 05:59:10 +07:00
|
|
|
}
|
2020-08-07 13:20:49 +07:00
|
|
|
|
|
|
|
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
|
|
|
|
{
|
|
|
|
struct memcg_stock_pcp *stock;
|
|
|
|
unsigned long flags;
|
|
|
|
bool ret = false;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
|
|
|
|
stock = this_cpu_ptr(&memcg_stock);
|
|
|
|
if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
|
|
|
|
stock->nr_bytes -= nr_bytes;
|
|
|
|
ret = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
local_irq_restore(flags);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void drain_obj_stock(struct memcg_stock_pcp *stock)
|
|
|
|
{
|
|
|
|
struct obj_cgroup *old = stock->cached_objcg;
|
|
|
|
|
|
|
|
if (!old)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (stock->nr_bytes) {
|
|
|
|
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
|
|
|
|
unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
|
|
|
|
|
|
|
|
if (nr_pages) {
|
|
|
|
rcu_read_lock();
|
|
|
|
__memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The leftover is flushed to the centralized per-memcg value.
|
|
|
|
* On the next attempt to refill obj stock it will be moved
|
|
|
|
* to a per-cpu stock (probably, on an other CPU), see
|
|
|
|
* refill_obj_stock().
|
|
|
|
*
|
|
|
|
* How often it's flushed is a trade-off between the memory
|
|
|
|
* limit enforcement accuracy and potential CPU contention,
|
|
|
|
* so it might be changed in the future.
|
|
|
|
*/
|
|
|
|
atomic_add(nr_bytes, &old->nr_charged_bytes);
|
|
|
|
stock->nr_bytes = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
obj_cgroup_put(old);
|
|
|
|
stock->cached_objcg = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
|
|
|
|
struct mem_cgroup *root_memcg)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
|
|
|
if (stock->cached_objcg) {
|
|
|
|
memcg = obj_cgroup_memcg(stock->cached_objcg);
|
|
|
|
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
|
|
|
|
{
|
|
|
|
struct memcg_stock_pcp *stock;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
|
|
|
|
stock = this_cpu_ptr(&memcg_stock);
|
|
|
|
if (stock->cached_objcg != objcg) { /* reset if necessary */
|
|
|
|
drain_obj_stock(stock);
|
|
|
|
obj_cgroup_get(objcg);
|
|
|
|
stock->cached_objcg = objcg;
|
|
|
|
stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
|
|
|
|
}
|
|
|
|
stock->nr_bytes += nr_bytes;
|
|
|
|
|
|
|
|
if (stock->nr_bytes > PAGE_SIZE)
|
|
|
|
drain_obj_stock(stock);
|
|
|
|
|
|
|
|
local_irq_restore(flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
unsigned int nr_pages, nr_bytes;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (consume_obj_stock(objcg, size))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In theory, memcg->nr_charged_bytes can have enough
|
|
|
|
* pre-charged bytes to satisfy the allocation. However,
|
|
|
|
* flushing memcg->nr_charged_bytes requires two atomic
|
|
|
|
* operations, and memcg->nr_charged_bytes can't be big,
|
|
|
|
* so it's better to ignore it and try grab some new pages.
|
|
|
|
* memcg->nr_charged_bytes will be flushed in
|
|
|
|
* refill_obj_stock(), called from this function or
|
|
|
|
* independently later.
|
|
|
|
*/
|
|
|
|
rcu_read_lock();
|
2020-12-15 10:06:35 +07:00
|
|
|
retry:
|
2020-08-07 13:20:49 +07:00
|
|
|
memcg = obj_cgroup_memcg(objcg);
|
2020-12-15 10:06:35 +07:00
|
|
|
if (unlikely(!css_tryget(&memcg->css)))
|
|
|
|
goto retry;
|
2020-08-07 13:20:49 +07:00
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
nr_pages = size >> PAGE_SHIFT;
|
|
|
|
nr_bytes = size & (PAGE_SIZE - 1);
|
|
|
|
|
|
|
|
if (nr_bytes)
|
|
|
|
nr_pages += 1;
|
|
|
|
|
|
|
|
ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
|
|
|
|
if (!ret && nr_bytes)
|
|
|
|
refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
|
|
|
|
|
|
|
|
css_put(&memcg->css);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
|
|
|
|
{
|
|
|
|
refill_obj_stock(objcg, size);
|
|
|
|
}
|
|
|
|
|
2018-08-18 05:47:25 +07:00
|
|
|
#endif /* CONFIG_MEMCG_KMEM */
|
2012-12-19 05:21:56 +07:00
|
|
|
|
2011-01-21 05:44:24 +07:00
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Because tail pages are not marked as "used", set it. We're under
|
2019-03-06 06:49:39 +07:00
|
|
|
* pgdat->lru_lock and migration entries setup in all page mappings.
|
2011-01-21 05:44:24 +07:00
|
|
|
*/
|
2012-01-13 08:18:20 +07:00
|
|
|
void mem_cgroup_split_huge_fixup(struct page *head)
|
2011-01-21 05:44:24 +07:00
|
|
|
{
|
2020-08-07 13:20:45 +07:00
|
|
|
struct mem_cgroup *memcg = head->mem_cgroup;
|
2012-01-13 08:18:20 +07:00
|
|
|
int i;
|
2011-01-21 05:44:24 +07:00
|
|
|
|
2011-01-26 06:07:28 +07:00
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
2013-05-08 06:18:09 +07:00
|
|
|
|
2020-08-07 13:20:45 +07:00
|
|
|
for (i = 1; i < HPAGE_PMD_NR; i++) {
|
|
|
|
css_get(&memcg->css);
|
|
|
|
head[i].mem_cgroup = memcg;
|
|
|
|
}
|
2011-01-21 05:44:24 +07:00
|
|
|
}
|
2012-01-13 08:19:52 +07:00
|
|
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
2011-01-21 05:44:24 +07:00
|
|
|
|
2012-08-01 06:43:02 +07:00
|
|
|
#ifdef CONFIG_MEMCG_SWAP
|
2010-03-11 06:22:17 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
|
|
|
|
* @entry: swap entry to be moved
|
|
|
|
* @from: mem_cgroup which the entry is moved from
|
|
|
|
* @to: mem_cgroup which the entry is moved to
|
|
|
|
*
|
|
|
|
* It succeeds only when the swap_cgroup's record for this entry is the same
|
|
|
|
* as the mem_cgroup's id of @from.
|
|
|
|
*
|
|
|
|
* Returns 0 on success, -EINVAL on failure.
|
|
|
|
*
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
* The caller must have charged to @to, IOW, called page_counter_charge() about
|
2010-03-11 06:22:17 +07:00
|
|
|
* both res and memsw, and called css_get().
|
|
|
|
*/
|
|
|
|
static int mem_cgroup_move_swap_account(swp_entry_t entry,
|
2012-05-30 05:06:51 +07:00
|
|
|
struct mem_cgroup *from, struct mem_cgroup *to)
|
2010-03-11 06:22:17 +07:00
|
|
|
{
|
|
|
|
unsigned short old_id, new_id;
|
|
|
|
|
2013-09-23 15:56:01 +07:00
|
|
|
old_id = mem_cgroup_id(from);
|
|
|
|
new_id = mem_cgroup_id(to);
|
2010-03-11 06:22:17 +07:00
|
|
|
|
|
|
|
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
|
2018-02-01 07:16:37 +07:00
|
|
|
mod_memcg_state(from, MEMCG_SWAP, -1);
|
|
|
|
mod_memcg_state(to, MEMCG_SWAP, 1);
|
2010-03-11 06:22:17 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
|
2012-05-30 05:06:51 +07:00
|
|
|
struct mem_cgroup *from, struct mem_cgroup *to)
|
2010-03-11 06:22:17 +07:00
|
|
|
{
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2009-01-08 09:08:00 +07:00
|
|
|
#endif
|
2009-01-08 09:07:56 +07:00
|
|
|
|
2018-06-08 07:06:18 +07:00
|
|
|
static DEFINE_MUTEX(memcg_max_mutex);
|
2011-03-24 06:42:25 +07:00
|
|
|
|
2018-06-08 07:06:18 +07:00
|
|
|
static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
|
|
|
|
unsigned long max, bool memsw)
|
2008-07-25 15:47:20 +07:00
|
|
|
{
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
bool enlarge = false;
|
2018-06-08 07:07:27 +07:00
|
|
|
bool drained = false;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
int ret;
|
2018-02-01 07:20:02 +07:00
|
|
|
bool limits_invariant;
|
|
|
|
struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
|
2009-04-03 06:57:36 +07:00
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
do {
|
2008-07-25 15:47:20 +07:00
|
|
|
if (signal_pending(current)) {
|
|
|
|
ret = -EINTR;
|
|
|
|
break;
|
|
|
|
}
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
|
2018-06-08 07:06:18 +07:00
|
|
|
mutex_lock(&memcg_max_mutex);
|
2018-02-01 07:20:02 +07:00
|
|
|
/*
|
|
|
|
* Make sure that the new limit (memsw or memory limit) doesn't
|
2018-06-08 07:06:18 +07:00
|
|
|
* break our basic invariant rule memory.max <= memsw.max.
|
2018-02-01 07:20:02 +07:00
|
|
|
*/
|
2020-04-02 11:07:20 +07:00
|
|
|
limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
|
2018-06-08 07:06:18 +07:00
|
|
|
max <= memcg->memsw.max;
|
2018-02-01 07:20:02 +07:00
|
|
|
if (!limits_invariant) {
|
2018-06-08 07:06:18 +07:00
|
|
|
mutex_unlock(&memcg_max_mutex);
|
2009-01-08 09:08:00 +07:00
|
|
|
ret = -EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
2018-06-08 07:06:18 +07:00
|
|
|
if (max > counter->max)
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
enlarge = true;
|
2018-06-08 07:06:18 +07:00
|
|
|
ret = page_counter_set_max(counter, max);
|
|
|
|
mutex_unlock(&memcg_max_mutex);
|
2009-01-08 09:08:00 +07:00
|
|
|
|
|
|
|
if (!ret)
|
|
|
|
break;
|
|
|
|
|
2018-06-08 07:07:27 +07:00
|
|
|
if (!drained) {
|
|
|
|
drain_all_stock(memcg);
|
|
|
|
drained = true;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2018-02-01 07:20:37 +07:00
|
|
|
if (!try_to_free_mem_cgroup_pages(memcg, 1,
|
|
|
|
GFP_KERNEL, !memsw)) {
|
|
|
|
ret = -EBUSY;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} while (true);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
|
2010-05-27 04:42:37 +07:00
|
|
|
if (!ret && enlarge)
|
|
|
|
memcg_oom_recover(memcg);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
|
2008-07-25 15:47:20 +07:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
|
2013-09-25 05:27:41 +07:00
|
|
|
gfp_t gfp_mask,
|
|
|
|
unsigned long *total_scanned)
|
|
|
|
{
|
|
|
|
unsigned long nr_reclaimed = 0;
|
2016-07-29 05:46:05 +07:00
|
|
|
struct mem_cgroup_per_node *mz, *next_mz = NULL;
|
2013-09-25 05:27:41 +07:00
|
|
|
unsigned long reclaimed;
|
|
|
|
int loop = 0;
|
2016-07-29 05:46:05 +07:00
|
|
|
struct mem_cgroup_tree_per_node *mctz;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
unsigned long excess;
|
2013-09-25 05:27:41 +07:00
|
|
|
unsigned long nr_scanned;
|
|
|
|
|
|
|
|
if (order > 0)
|
|
|
|
return 0;
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
mctz = soft_limit_tree_node(pgdat->node_id);
|
2016-08-03 04:02:37 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Do not even bother to check the largest node if the root
|
|
|
|
* is empty. Do it lockless to prevent lock bouncing. Races
|
|
|
|
* are acceptable as soft limit is best effort anyway.
|
|
|
|
*/
|
mm/cgroup: avoid panic when init with low memory
The system may panic when initialisation is done when almost all the
memory is assigned to the huge pages using the kernel command line
parameter hugepage=xxxx. Panic may occur like this:
Unable to handle kernel paging request for data at address 0x00000000
Faulting instruction address: 0xc000000000302b88
Oops: Kernel access of bad area, sig: 11 [#1]
SMP NR_CPUS=2048 [ 0.082424] NUMA
pSeries
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.9.0-15-generic #16-Ubuntu
task: c00000021ed01600 task.stack: c00000010d108000
NIP: c000000000302b88 LR: c000000000270e04 CTR: c00000000016cfd0
REGS: c00000010d10b2c0 TRAP: 0300 Not tainted (4.9.0-15-generic)
MSR: 8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE>[ 0.082770] CR: 28424422 XER: 00000000
CFAR: c0000000003d28b8 DAR: 0000000000000000 DSISR: 40000000 SOFTE: 1
GPR00: c000000000270e04 c00000010d10b540 c00000000141a300 c00000010fff6300
GPR04: 0000000000000000 00000000026012c0 c00000010d10b630 0000000487ab0000
GPR08: 000000010ee90000 c000000001454fd8 0000000000000000 0000000000000000
GPR12: 0000000000004400 c00000000fb80000 00000000026012c0 00000000026012c0
GPR16: 00000000026012c0 0000000000000000 0000000000000000 0000000000000002
GPR20: 000000000000000c 0000000000000000 0000000000000000 00000000024200c0
GPR24: c0000000016eef48 0000000000000000 c00000010fff7d00 00000000026012c0
GPR28: 0000000000000000 c00000010fff7d00 c00000010fff6300 c00000010d10b6d0
NIP mem_cgroup_soft_limit_reclaim+0xf8/0x4f0
LR do_try_to_free_pages+0x1b4/0x450
Call Trace:
do_try_to_free_pages+0x1b4/0x450
try_to_free_pages+0xf8/0x270
__alloc_pages_nodemask+0x7a8/0xff0
new_slab+0x104/0x8e0
___slab_alloc+0x620/0x700
__slab_alloc+0x34/0x60
kmem_cache_alloc_node_trace+0xdc/0x310
mem_cgroup_init+0x158/0x1c8
do_one_initcall+0x68/0x1d0
kernel_init_freeable+0x278/0x360
kernel_init+0x24/0x170
ret_from_kernel_thread+0x5c/0x74
Instruction dump:
eb81ffe0 eba1ffe8 ebc1fff0 ebe1fff8 4e800020 3d230001 e9499a42 3d220004
3929acd8 794a1f24 7d295214 eac90100 <e9360000> 2fa90000 419eff74 3b200000
---[ end trace 342f5208b00d01b6 ]---
This is a chicken and egg issue where the kernel try to get free memory
when allocating per node data in mem_cgroup_init(), but in that path
mem_cgroup_soft_limit_reclaim() is called which assumes that these data
are allocated.
As mem_cgroup_soft_limit_reclaim() is best effort, it should return when
these data are not yet allocated.
This patch also fixes potential null pointer access in
mem_cgroup_remove_from_trees() and mem_cgroup_update_tree().
Link: http://lkml.kernel.org/r/1487856999-16581-2-git-send-email-ldufour@linux.vnet.ibm.com
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-03-10 07:17:06 +07:00
|
|
|
if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
|
2016-08-03 04:02:37 +07:00
|
|
|
return 0;
|
|
|
|
|
2013-09-25 05:27:41 +07:00
|
|
|
/*
|
|
|
|
* This loop can run a while, specially if mem_cgroup's continuously
|
|
|
|
* keep exceeding their soft limit and putting the system under
|
|
|
|
* pressure
|
|
|
|
*/
|
|
|
|
do {
|
|
|
|
if (next_mz)
|
|
|
|
mz = next_mz;
|
|
|
|
else
|
|
|
|
mz = mem_cgroup_largest_soft_limit_node(mctz);
|
|
|
|
if (!mz)
|
|
|
|
break;
|
|
|
|
|
|
|
|
nr_scanned = 0;
|
2016-07-29 05:46:05 +07:00
|
|
|
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
|
2013-09-25 05:27:41 +07:00
|
|
|
gfp_mask, &nr_scanned);
|
|
|
|
nr_reclaimed += reclaimed;
|
|
|
|
*total_scanned += nr_scanned;
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
spin_lock_irq(&mctz->lock);
|
2014-12-11 06:43:40 +07:00
|
|
|
__mem_cgroup_remove_exceeded(mz, mctz);
|
2013-09-25 05:27:41 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we failed to reclaim anything from this memory cgroup
|
|
|
|
* it is time to move on to the next cgroup
|
|
|
|
*/
|
|
|
|
next_mz = NULL;
|
2014-12-11 06:43:40 +07:00
|
|
|
if (!reclaimed)
|
|
|
|
next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
|
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
excess = soft_limit_excess(mz->memcg);
|
2013-09-25 05:27:41 +07:00
|
|
|
/*
|
|
|
|
* One school of thought says that we should not add
|
|
|
|
* back the node to the tree if reclaim returns 0.
|
|
|
|
* But our reclaim could return 0, simply because due
|
|
|
|
* to priority we are exposing a smaller subset of
|
|
|
|
* memory to reclaim from. Consider this as a longer
|
|
|
|
* term TODO.
|
|
|
|
*/
|
|
|
|
/* If excess == 0, no tree ops */
|
2014-06-07 04:38:21 +07:00
|
|
|
__mem_cgroup_insert_exceeded(mz, mctz, excess);
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
spin_unlock_irq(&mctz->lock);
|
2013-09-25 05:27:41 +07:00
|
|
|
css_put(&mz->memcg->css);
|
|
|
|
loop++;
|
|
|
|
/*
|
|
|
|
* Could not reclaim anything and there are no more
|
|
|
|
* mem cgroups to try or we seem to be looping without
|
|
|
|
* reclaiming anything.
|
|
|
|
*/
|
|
|
|
if (!nr_reclaimed &&
|
|
|
|
(next_mz == NULL ||
|
|
|
|
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
|
|
|
|
break;
|
|
|
|
} while (!nr_reclaimed);
|
|
|
|
if (next_mz)
|
|
|
|
css_put(&next_mz->memcg->css);
|
|
|
|
return nr_reclaimed;
|
|
|
|
}
|
|
|
|
|
2014-05-17 00:22:48 +07:00
|
|
|
/*
|
|
|
|
* Test whether @memcg has children, dead or alive. Note that this
|
|
|
|
* function doesn't care whether @memcg has use_hierarchy enabled and
|
|
|
|
* returns %true if there are child csses according to the cgroup
|
2020-06-05 06:49:28 +07:00
|
|
|
* hierarchy. Testing use_hierarchy is the caller's responsibility.
|
2014-05-17 00:22:48 +07:00
|
|
|
*/
|
2013-02-23 07:34:53 +07:00
|
|
|
static inline bool memcg_has_children(struct mem_cgroup *memcg)
|
|
|
|
{
|
2014-05-17 00:22:48 +07:00
|
|
|
bool ret;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
ret = css_next_child(NULL, &memcg->css);
|
|
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
2013-02-23 07:34:53 +07:00
|
|
|
}
|
|
|
|
|
2012-10-26 18:37:28 +07:00
|
|
|
/*
|
2016-05-21 06:58:18 +07:00
|
|
|
* Reclaims as many pages from the given memcg as possible.
|
2012-10-26 18:37:28 +07:00
|
|
|
*
|
|
|
|
* Caller is responsible for holding css reference for memcg.
|
|
|
|
*/
|
|
|
|
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
|
|
|
|
{
|
2020-08-07 13:21:58 +07:00
|
|
|
int nr_retries = MAX_RECLAIM_RETRIES;
|
2012-10-26 18:37:28 +07:00
|
|
|
|
2009-01-08 09:07:55 +07:00
|
|
|
/* we call try-to-free pages for make this cgroup empty */
|
|
|
|
lru_add_drain_all();
|
2018-06-08 07:07:31 +07:00
|
|
|
|
|
|
|
drain_all_stock(memcg);
|
|
|
|
|
memcg: move all acccounting to parent at rmdir()
This patch provides a function to move account information of a page
between mem_cgroups and rewrite force_empty to make use of this.
This moving of page_cgroup is done under
- lru_lock of source/destination mem_cgroup is held.
- lock_page_cgroup() is held.
Then, a routine which touches pc->mem_cgroup without lock_page_cgroup()
should confirm pc->mem_cgroup is still valid or not. Typical code can be
following.
(while page is not under lock_page())
mem = pc->mem_cgroup;
mz = page_cgroup_zoneinfo(pc)
spin_lock_irqsave(&mz->lru_lock);
if (pc->mem_cgroup == mem)
...../* some list handling */
spin_unlock_irqrestore(&mz->lru_lock);
Of course, better way is
lock_page_cgroup(pc);
....
unlock_page_cgroup(pc);
But you should confirm the nest of lock and avoid deadlock.
If you treats page_cgroup from mem_cgroup's LRU under mz->lru_lock,
you don't have to worry about what pc->mem_cgroup points to.
moved pages are added to head of lru, not to tail.
Expected users of this routine is:
- force_empty (rmdir)
- moving tasks between cgroup (for moving account information.)
- hierarchy (maybe useful.)
force_empty(rmdir) uses this move_account and move pages to its parent.
This "move" will not cause OOM (I added "oom" parameter to try_charge().)
If the parent is busy (not enough memory), force_empty calls try_to_free_page()
and reduce usage.
Purpose of this behavior is
- Fix "forget all" behavior of force_empty and avoid leak of accounting.
- By "moving first, free if necessary", keep pages on memory as much as
possible.
Adding a switch to change behavior of force_empty to
- free first, move if necessary
- free all, if there is mlocked/busy pages, return -EBUSY.
is under consideration. (I'll add if someone requtests.)
This patch also removes memory.force_empty file, a brutal debug-only interface.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 09:07:53 +07:00
|
|
|
/* try to free all pages in this cgroup */
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
while (nr_retries && page_counter_read(&memcg->memory)) {
|
memcg: move all acccounting to parent at rmdir()
This patch provides a function to move account information of a page
between mem_cgroups and rewrite force_empty to make use of this.
This moving of page_cgroup is done under
- lru_lock of source/destination mem_cgroup is held.
- lock_page_cgroup() is held.
Then, a routine which touches pc->mem_cgroup without lock_page_cgroup()
should confirm pc->mem_cgroup is still valid or not. Typical code can be
following.
(while page is not under lock_page())
mem = pc->mem_cgroup;
mz = page_cgroup_zoneinfo(pc)
spin_lock_irqsave(&mz->lru_lock);
if (pc->mem_cgroup == mem)
...../* some list handling */
spin_unlock_irqrestore(&mz->lru_lock);
Of course, better way is
lock_page_cgroup(pc);
....
unlock_page_cgroup(pc);
But you should confirm the nest of lock and avoid deadlock.
If you treats page_cgroup from mem_cgroup's LRU under mz->lru_lock,
you don't have to worry about what pc->mem_cgroup points to.
moved pages are added to head of lru, not to tail.
Expected users of this routine is:
- force_empty (rmdir)
- moving tasks between cgroup (for moving account information.)
- hierarchy (maybe useful.)
force_empty(rmdir) uses this move_account and move pages to its parent.
This "move" will not cause OOM (I added "oom" parameter to try_charge().)
If the parent is busy (not enough memory), force_empty calls try_to_free_page()
and reduce usage.
Purpose of this behavior is
- Fix "forget all" behavior of force_empty and avoid leak of accounting.
- By "moving first, free if necessary", keep pages on memory as much as
possible.
Adding a switch to change behavior of force_empty to
- free first, move if necessary
- free all, if there is mlocked/busy pages, return -EBUSY.
is under consideration. (I'll add if someone requtests.)
This patch also removes memory.force_empty file, a brutal debug-only interface.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 09:07:53 +07:00
|
|
|
int progress;
|
2009-01-08 09:07:55 +07:00
|
|
|
|
2012-10-26 18:37:28 +07:00
|
|
|
if (signal_pending(current))
|
|
|
|
return -EINTR;
|
|
|
|
|
2014-10-10 05:28:56 +07:00
|
|
|
progress = try_to_free_mem_cgroup_pages(memcg, 1,
|
|
|
|
GFP_KERNEL, true);
|
2009-01-08 09:07:55 +07:00
|
|
|
if (!progress) {
|
memcg: move all acccounting to parent at rmdir()
This patch provides a function to move account information of a page
between mem_cgroups and rewrite force_empty to make use of this.
This moving of page_cgroup is done under
- lru_lock of source/destination mem_cgroup is held.
- lock_page_cgroup() is held.
Then, a routine which touches pc->mem_cgroup without lock_page_cgroup()
should confirm pc->mem_cgroup is still valid or not. Typical code can be
following.
(while page is not under lock_page())
mem = pc->mem_cgroup;
mz = page_cgroup_zoneinfo(pc)
spin_lock_irqsave(&mz->lru_lock);
if (pc->mem_cgroup == mem)
...../* some list handling */
spin_unlock_irqrestore(&mz->lru_lock);
Of course, better way is
lock_page_cgroup(pc);
....
unlock_page_cgroup(pc);
But you should confirm the nest of lock and avoid deadlock.
If you treats page_cgroup from mem_cgroup's LRU under mz->lru_lock,
you don't have to worry about what pc->mem_cgroup points to.
moved pages are added to head of lru, not to tail.
Expected users of this routine is:
- force_empty (rmdir)
- moving tasks between cgroup (for moving account information.)
- hierarchy (maybe useful.)
force_empty(rmdir) uses this move_account and move pages to its parent.
This "move" will not cause OOM (I added "oom" parameter to try_charge().)
If the parent is busy (not enough memory), force_empty calls try_to_free_page()
and reduce usage.
Purpose of this behavior is
- Fix "forget all" behavior of force_empty and avoid leak of accounting.
- By "moving first, free if necessary", keep pages on memory as much as
possible.
Adding a switch to change behavior of force_empty to
- free first, move if necessary
- free all, if there is mlocked/busy pages, return -EBUSY.
is under consideration. (I'll add if someone requtests.)
This patch also removes memory.force_empty file, a brutal debug-only interface.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 09:07:53 +07:00
|
|
|
nr_retries--;
|
2009-01-08 09:07:55 +07:00
|
|
|
/* maybe some writeback is necessary */
|
2009-07-09 19:52:32 +07:00
|
|
|
congestion_wait(BLK_RW_ASYNC, HZ/10);
|
2009-01-08 09:07:55 +07:00
|
|
|
}
|
memcg: move all acccounting to parent at rmdir()
This patch provides a function to move account information of a page
between mem_cgroups and rewrite force_empty to make use of this.
This moving of page_cgroup is done under
- lru_lock of source/destination mem_cgroup is held.
- lock_page_cgroup() is held.
Then, a routine which touches pc->mem_cgroup without lock_page_cgroup()
should confirm pc->mem_cgroup is still valid or not. Typical code can be
following.
(while page is not under lock_page())
mem = pc->mem_cgroup;
mz = page_cgroup_zoneinfo(pc)
spin_lock_irqsave(&mz->lru_lock);
if (pc->mem_cgroup == mem)
...../* some list handling */
spin_unlock_irqrestore(&mz->lru_lock);
Of course, better way is
lock_page_cgroup(pc);
....
unlock_page_cgroup(pc);
But you should confirm the nest of lock and avoid deadlock.
If you treats page_cgroup from mem_cgroup's LRU under mz->lru_lock,
you don't have to worry about what pc->mem_cgroup points to.
moved pages are added to head of lru, not to tail.
Expected users of this routine is:
- force_empty (rmdir)
- moving tasks between cgroup (for moving account information.)
- hierarchy (maybe useful.)
force_empty(rmdir) uses this move_account and move pages to its parent.
This "move" will not cause OOM (I added "oom" parameter to try_charge().)
If the parent is busy (not enough memory), force_empty calls try_to_free_page()
and reduce usage.
Purpose of this behavior is
- Fix "forget all" behavior of force_empty and avoid leak of accounting.
- By "moving first, free if necessary", keep pages on memory as much as
possible.
Adding a switch to change behavior of force_empty to
- free first, move if necessary
- free all, if there is mlocked/busy pages, return -EBUSY.
is under consideration. (I'll add if someone requtests.)
This patch also removes memory.force_empty file, a brutal debug-only interface.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 09:07:53 +07:00
|
|
|
|
|
|
|
}
|
2012-10-26 18:37:32 +07:00
|
|
|
|
|
|
|
return 0;
|
2008-02-07 15:14:16 +07:00
|
|
|
}
|
|
|
|
|
2014-05-13 23:16:21 +07:00
|
|
|
static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
|
|
|
|
char *buf, size_t nbytes,
|
|
|
|
loff_t off)
|
2009-01-08 09:07:55 +07:00
|
|
|
{
|
2014-05-13 23:16:21 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
2012-10-26 18:37:28 +07:00
|
|
|
|
2012-10-26 18:37:29 +07:00
|
|
|
if (mem_cgroup_is_root(memcg))
|
|
|
|
return -EINVAL;
|
2014-05-13 23:16:21 +07:00
|
|
|
return mem_cgroup_force_empty(memcg) ?: nbytes;
|
2009-01-08 09:07:55 +07:00
|
|
|
}
|
|
|
|
|
2013-08-09 07:11:24 +07:00
|
|
|
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
|
|
|
|
struct cftype *cft)
|
2009-01-08 09:08:07 +07:00
|
|
|
{
|
2013-08-09 07:11:24 +07:00
|
|
|
return mem_cgroup_from_css(css)->use_hierarchy;
|
2009-01-08 09:08:07 +07:00
|
|
|
}
|
|
|
|
|
2013-08-09 07:11:24 +07:00
|
|
|
static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
|
|
|
|
struct cftype *cft, u64 val)
|
2009-01-08 09:08:07 +07:00
|
|
|
{
|
|
|
|
int retval = 0;
|
2013-08-09 07:11:24 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
2014-05-17 00:22:48 +07:00
|
|
|
struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
|
2009-01-08 09:08:07 +07:00
|
|
|
|
memcg: fix bad behavior in use_hierarchy file
I have an application that does the following:
* copy the state of all controllers attached to a hierarchy
* replicate it as a child of the current level.
I would expect writes to the files to mostly succeed, since they are
inheriting sane values from parents.
But that is not the case for use_hierarchy. If it is set to 0, we succeed
ok. If we're set to 1, the value of the file is automatically set to 1 in
the children, but if userspace tries to write the very same 1, it will
fail. That same situation happens if we set use_hierarchy, create a
child, and then try to write 1 again.
Now, there is no reason whatsoever for failing to write a value that is
already there. It doesn't even match the comments, that states:
/* If parent's use_hierarchy is set, we can't make any modifications
* in the child subtrees...
since we are not changing anything.
So test the new value against the one we're storing, and automatically
return 0 if we're not proposing a change.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Dhaval Giani <dhaval.giani@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 06:43:07 +07:00
|
|
|
if (memcg->use_hierarchy == val)
|
2016-01-21 06:02:53 +07:00
|
|
|
return 0;
|
memcg: fix bad behavior in use_hierarchy file
I have an application that does the following:
* copy the state of all controllers attached to a hierarchy
* replicate it as a child of the current level.
I would expect writes to the files to mostly succeed, since they are
inheriting sane values from parents.
But that is not the case for use_hierarchy. If it is set to 0, we succeed
ok. If we're set to 1, the value of the file is automatically set to 1 in
the children, but if userspace tries to write the very same 1, it will
fail. That same situation happens if we set use_hierarchy, create a
child, and then try to write 1 again.
Now, there is no reason whatsoever for failing to write a value that is
already there. It doesn't even match the comments, that states:
/* If parent's use_hierarchy is set, we can't make any modifications
* in the child subtrees...
since we are not changing anything.
So test the new value against the one we're storing, and automatically
return 0 if we're not proposing a change.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Dhaval Giani <dhaval.giani@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 06:43:07 +07:00
|
|
|
|
2009-01-08 09:08:07 +07:00
|
|
|
/*
|
tree-wide: fix assorted typos all over the place
That is "success", "unknown", "through", "performance", "[re|un]mapping"
, "access", "default", "reasonable", "[con]currently", "temperature"
, "channel", "[un]used", "application", "example","hierarchy", "therefore"
, "[over|under]flow", "contiguous", "threshold", "enough" and others.
Signed-off-by: André Goddard Rosa <andre.goddard@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2009-11-14 22:09:05 +07:00
|
|
|
* If parent's use_hierarchy is set, we can't make any modifications
|
2009-01-08 09:08:07 +07:00
|
|
|
* in the child subtrees. If it is unset, then the change can
|
|
|
|
* occur, provided the current cgroup has no children.
|
|
|
|
*
|
|
|
|
* For the root cgroup, parent_mem is NULL, we allow value to be
|
|
|
|
* set if there are no children.
|
|
|
|
*/
|
2011-11-03 03:38:15 +07:00
|
|
|
if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
|
2009-01-08 09:08:07 +07:00
|
|
|
(val == 1 || val == 0)) {
|
2014-05-17 00:22:48 +07:00
|
|
|
if (!memcg_has_children(memcg))
|
2011-11-03 03:38:15 +07:00
|
|
|
memcg->use_hierarchy = val;
|
2009-01-08 09:08:07 +07:00
|
|
|
else
|
|
|
|
retval = -EBUSY;
|
|
|
|
} else
|
|
|
|
retval = -EINVAL;
|
memcg: fix bad behavior in use_hierarchy file
I have an application that does the following:
* copy the state of all controllers attached to a hierarchy
* replicate it as a child of the current level.
I would expect writes to the files to mostly succeed, since they are
inheriting sane values from parents.
But that is not the case for use_hierarchy. If it is set to 0, we succeed
ok. If we're set to 1, the value of the file is automatically set to 1 in
the children, but if userspace tries to write the very same 1, it will
fail. That same situation happens if we set use_hierarchy, create a
child, and then try to write 1 again.
Now, there is no reason whatsoever for failing to write a value that is
already there. It doesn't even match the comments, that states:
/* If parent's use_hierarchy is set, we can't make any modifications
* in the child subtrees...
since we are not changing anything.
So test the new value against the one we're storing, and automatically
return 0 if we're not proposing a change.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Dhaval Giani <dhaval.giani@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 06:43:07 +07:00
|
|
|
|
2009-01-08 09:08:07 +07:00
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
2015-11-07 07:28:58 +07:00
|
|
|
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
|
2014-09-05 19:43:57 +07:00
|
|
|
{
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
unsigned long val;
|
2014-09-05 19:43:57 +07:00
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
if (mem_cgroup_is_root(memcg)) {
|
2020-06-04 06:01:54 +07:00
|
|
|
val = memcg_page_state(memcg, NR_FILE_PAGES) +
|
2020-06-04 06:01:57 +07:00
|
|
|
memcg_page_state(memcg, NR_ANON_MAPPED);
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
if (swap)
|
|
|
|
val += memcg_page_state(memcg, MEMCG_SWAP);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
} else {
|
2014-09-05 19:43:57 +07:00
|
|
|
if (!swap)
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
val = page_counter_read(&memcg->memory);
|
2014-09-05 19:43:57 +07:00
|
|
|
else
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
val = page_counter_read(&memcg->memsw);
|
2014-09-05 19:43:57 +07:00
|
|
|
}
|
2015-11-06 09:50:29 +07:00
|
|
|
return val;
|
2014-09-05 19:43:57 +07:00
|
|
|
}
|
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
enum {
|
|
|
|
RES_USAGE,
|
|
|
|
RES_LIMIT,
|
|
|
|
RES_MAX_USAGE,
|
|
|
|
RES_FAILCNT,
|
|
|
|
RES_SOFT_LIMIT,
|
|
|
|
};
|
2014-09-05 19:43:57 +07:00
|
|
|
|
2013-12-06 00:28:02 +07:00
|
|
|
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
|
2014-08-07 06:05:59 +07:00
|
|
|
struct cftype *cft)
|
2008-02-07 15:13:50 +07:00
|
|
|
{
|
2013-08-09 07:11:24 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
struct page_counter *counter;
|
2012-04-02 02:09:55 +07:00
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
switch (MEMFILE_TYPE(cft->private)) {
|
2009-01-08 09:08:00 +07:00
|
|
|
case _MEM:
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
counter = &memcg->memory;
|
|
|
|
break;
|
2009-01-08 09:08:00 +07:00
|
|
|
case _MEMSWAP:
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
counter = &memcg->memsw;
|
|
|
|
break;
|
2012-12-19 05:21:47 +07:00
|
|
|
case _KMEM:
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
counter = &memcg->kmem;
|
2012-12-19 05:21:47 +07:00
|
|
|
break;
|
2016-01-21 06:02:44 +07:00
|
|
|
case _TCP:
|
2016-01-21 06:02:50 +07:00
|
|
|
counter = &memcg->tcpmem;
|
2016-01-21 06:02:44 +07:00
|
|
|
break;
|
2009-01-08 09:08:00 +07:00
|
|
|
default:
|
|
|
|
BUG();
|
|
|
|
}
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
|
|
|
|
switch (MEMFILE_ATTR(cft->private)) {
|
|
|
|
case RES_USAGE:
|
|
|
|
if (counter == &memcg->memory)
|
2015-11-06 09:50:29 +07:00
|
|
|
return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
if (counter == &memcg->memsw)
|
2015-11-06 09:50:29 +07:00
|
|
|
return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
return (u64)page_counter_read(counter) * PAGE_SIZE;
|
|
|
|
case RES_LIMIT:
|
2018-06-08 07:06:18 +07:00
|
|
|
return (u64)counter->max * PAGE_SIZE;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
case RES_MAX_USAGE:
|
|
|
|
return (u64)counter->watermark * PAGE_SIZE;
|
|
|
|
case RES_FAILCNT:
|
|
|
|
return counter->failcnt;
|
|
|
|
case RES_SOFT_LIMIT:
|
|
|
|
return (u64)memcg->soft_limit * PAGE_SIZE;
|
|
|
|
default:
|
|
|
|
BUG();
|
|
|
|
}
|
2008-02-07 15:13:50 +07:00
|
|
|
}
|
2012-12-19 05:21:47 +07:00
|
|
|
|
mm: memcg/slab: fix percpu slab vmstats flushing
Currently slab percpu vmstats are flushed twice: during the memcg
offlining and just before freeing the memcg structure. Each time percpu
counters are summed, added to the atomic counterparts and propagated up
by the cgroup tree.
The second flushing is required due to how recursive vmstats are
implemented: counters are batched in percpu variables on a local level,
and once a percpu value is crossing some predefined threshold, it spills
over to atomic values on the local and each ascendant levels. It means
that without flushing some numbers cached in percpu variables will be
dropped on floor each time a cgroup is destroyed. And with uptime the
error on upper levels might become noticeable.
The first flushing aims to make counters on ancestor levels more
precise. Dying cgroups may resume in the dying state for a long time.
After kmem_cache reparenting which is performed during the offlining
slab counters of the dying cgroup don't have any chances to be updated,
because any slab operations will be performed on the parent level. It
means that the inaccuracy caused by percpu batching will not decrease up
to the final destruction of the cgroup. By the original idea flushing
slab counters during the offlining should minimize the visible
inaccuracy of slab counters on the parent level.
The problem is that percpu counters are not zeroed after the first
flushing. So every cached percpu value is summed twice. It creates a
small error (up to 32 pages per cpu, but usually less) which accumulates
on parent cgroup level. After creating and destroying of thousands of
child cgroups, slab counter on parent level can be way off the real
value.
For now, let's just stop flushing slab counters on memcg offlining. It
can't be done correctly without scheduling a work on each cpu: reading
and zeroing it during css offlining can race with an asynchronous
update, which doesn't expect values to be changed underneath.
With this change, slab counters on parent level will become eventually
consistent. Once all dying children are gone, values are correct. And
if not, the error is capped by 32 * NR_CPUS pages per dying cgroup.
It's not perfect, as slab are reparented, so any updates after the
reparenting will happen on the parent level. It means that if a slab
page was allocated, a counter on child level was bumped, then the page
was reparented and freed, the annihilation of positive and negative
counter values will not happen until the child cgroup is released. It
makes slab counters different from others, and it might want us to
implement flushing in a correct form again. But it's also a question of
performance: scheduling a work on each cpu isn't free, and it's an open
question if the benefit of having more accurate counters is worth it.
We might also consider flushing all counters on offlining, not only slab
counters.
So let's fix the main problem now: make the slab counters eventually
consistent, so at least the error won't grow with uptime (or more
precisely the number of created and destroyed cgroups). And think about
the accuracy of counters separately.
Link: http://lkml.kernel.org/r/20191220042728.1045881-1-guro@fb.com
Fixes: bee07b33db78 ("mm: memcontrol: flush percpu slab vmstats on kmem offlining")
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-01-14 07:29:16 +07:00
|
|
|
static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
|
2019-08-25 07:54:47 +07:00
|
|
|
{
|
mm: memcg/slab: fix percpu slab vmstats flushing
Currently slab percpu vmstats are flushed twice: during the memcg
offlining and just before freeing the memcg structure. Each time percpu
counters are summed, added to the atomic counterparts and propagated up
by the cgroup tree.
The second flushing is required due to how recursive vmstats are
implemented: counters are batched in percpu variables on a local level,
and once a percpu value is crossing some predefined threshold, it spills
over to atomic values on the local and each ascendant levels. It means
that without flushing some numbers cached in percpu variables will be
dropped on floor each time a cgroup is destroyed. And with uptime the
error on upper levels might become noticeable.
The first flushing aims to make counters on ancestor levels more
precise. Dying cgroups may resume in the dying state for a long time.
After kmem_cache reparenting which is performed during the offlining
slab counters of the dying cgroup don't have any chances to be updated,
because any slab operations will be performed on the parent level. It
means that the inaccuracy caused by percpu batching will not decrease up
to the final destruction of the cgroup. By the original idea flushing
slab counters during the offlining should minimize the visible
inaccuracy of slab counters on the parent level.
The problem is that percpu counters are not zeroed after the first
flushing. So every cached percpu value is summed twice. It creates a
small error (up to 32 pages per cpu, but usually less) which accumulates
on parent cgroup level. After creating and destroying of thousands of
child cgroups, slab counter on parent level can be way off the real
value.
For now, let's just stop flushing slab counters on memcg offlining. It
can't be done correctly without scheduling a work on each cpu: reading
and zeroing it during css offlining can race with an asynchronous
update, which doesn't expect values to be changed underneath.
With this change, slab counters on parent level will become eventually
consistent. Once all dying children are gone, values are correct. And
if not, the error is capped by 32 * NR_CPUS pages per dying cgroup.
It's not perfect, as slab are reparented, so any updates after the
reparenting will happen on the parent level. It means that if a slab
page was allocated, a counter on child level was bumped, then the page
was reparented and freed, the annihilation of positive and negative
counter values will not happen until the child cgroup is released. It
makes slab counters different from others, and it might want us to
implement flushing in a correct form again. But it's also a question of
performance: scheduling a work on each cpu isn't free, and it's an open
question if the benefit of having more accurate counters is worth it.
We might also consider flushing all counters on offlining, not only slab
counters.
So let's fix the main problem now: make the slab counters eventually
consistent, so at least the error won't grow with uptime (or more
precisely the number of created and destroyed cgroups). And think about
the accuracy of counters separately.
Link: http://lkml.kernel.org/r/20191220042728.1045881-1-guro@fb.com
Fixes: bee07b33db78 ("mm: memcontrol: flush percpu slab vmstats on kmem offlining")
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-01-14 07:29:16 +07:00
|
|
|
unsigned long stat[MEMCG_NR_STAT] = {0};
|
2019-08-25 07:54:47 +07:00
|
|
|
struct mem_cgroup *mi;
|
|
|
|
int node, cpu, i;
|
|
|
|
|
|
|
|
for_each_online_cpu(cpu)
|
mm: memcg/slab: fix percpu slab vmstats flushing
Currently slab percpu vmstats are flushed twice: during the memcg
offlining and just before freeing the memcg structure. Each time percpu
counters are summed, added to the atomic counterparts and propagated up
by the cgroup tree.
The second flushing is required due to how recursive vmstats are
implemented: counters are batched in percpu variables on a local level,
and once a percpu value is crossing some predefined threshold, it spills
over to atomic values on the local and each ascendant levels. It means
that without flushing some numbers cached in percpu variables will be
dropped on floor each time a cgroup is destroyed. And with uptime the
error on upper levels might become noticeable.
The first flushing aims to make counters on ancestor levels more
precise. Dying cgroups may resume in the dying state for a long time.
After kmem_cache reparenting which is performed during the offlining
slab counters of the dying cgroup don't have any chances to be updated,
because any slab operations will be performed on the parent level. It
means that the inaccuracy caused by percpu batching will not decrease up
to the final destruction of the cgroup. By the original idea flushing
slab counters during the offlining should minimize the visible
inaccuracy of slab counters on the parent level.
The problem is that percpu counters are not zeroed after the first
flushing. So every cached percpu value is summed twice. It creates a
small error (up to 32 pages per cpu, but usually less) which accumulates
on parent cgroup level. After creating and destroying of thousands of
child cgroups, slab counter on parent level can be way off the real
value.
For now, let's just stop flushing slab counters on memcg offlining. It
can't be done correctly without scheduling a work on each cpu: reading
and zeroing it during css offlining can race with an asynchronous
update, which doesn't expect values to be changed underneath.
With this change, slab counters on parent level will become eventually
consistent. Once all dying children are gone, values are correct. And
if not, the error is capped by 32 * NR_CPUS pages per dying cgroup.
It's not perfect, as slab are reparented, so any updates after the
reparenting will happen on the parent level. It means that if a slab
page was allocated, a counter on child level was bumped, then the page
was reparented and freed, the annihilation of positive and negative
counter values will not happen until the child cgroup is released. It
makes slab counters different from others, and it might want us to
implement flushing in a correct form again. But it's also a question of
performance: scheduling a work on each cpu isn't free, and it's an open
question if the benefit of having more accurate counters is worth it.
We might also consider flushing all counters on offlining, not only slab
counters.
So let's fix the main problem now: make the slab counters eventually
consistent, so at least the error won't grow with uptime (or more
precisely the number of created and destroyed cgroups). And think about
the accuracy of counters separately.
Link: http://lkml.kernel.org/r/20191220042728.1045881-1-guro@fb.com
Fixes: bee07b33db78 ("mm: memcontrol: flush percpu slab vmstats on kmem offlining")
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-01-14 07:29:16 +07:00
|
|
|
for (i = 0; i < MEMCG_NR_STAT; i++)
|
2019-08-31 06:04:53 +07:00
|
|
|
stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
|
2019-08-25 07:54:47 +07:00
|
|
|
|
|
|
|
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
|
mm: memcg/slab: fix percpu slab vmstats flushing
Currently slab percpu vmstats are flushed twice: during the memcg
offlining and just before freeing the memcg structure. Each time percpu
counters are summed, added to the atomic counterparts and propagated up
by the cgroup tree.
The second flushing is required due to how recursive vmstats are
implemented: counters are batched in percpu variables on a local level,
and once a percpu value is crossing some predefined threshold, it spills
over to atomic values on the local and each ascendant levels. It means
that without flushing some numbers cached in percpu variables will be
dropped on floor each time a cgroup is destroyed. And with uptime the
error on upper levels might become noticeable.
The first flushing aims to make counters on ancestor levels more
precise. Dying cgroups may resume in the dying state for a long time.
After kmem_cache reparenting which is performed during the offlining
slab counters of the dying cgroup don't have any chances to be updated,
because any slab operations will be performed on the parent level. It
means that the inaccuracy caused by percpu batching will not decrease up
to the final destruction of the cgroup. By the original idea flushing
slab counters during the offlining should minimize the visible
inaccuracy of slab counters on the parent level.
The problem is that percpu counters are not zeroed after the first
flushing. So every cached percpu value is summed twice. It creates a
small error (up to 32 pages per cpu, but usually less) which accumulates
on parent cgroup level. After creating and destroying of thousands of
child cgroups, slab counter on parent level can be way off the real
value.
For now, let's just stop flushing slab counters on memcg offlining. It
can't be done correctly without scheduling a work on each cpu: reading
and zeroing it during css offlining can race with an asynchronous
update, which doesn't expect values to be changed underneath.
With this change, slab counters on parent level will become eventually
consistent. Once all dying children are gone, values are correct. And
if not, the error is capped by 32 * NR_CPUS pages per dying cgroup.
It's not perfect, as slab are reparented, so any updates after the
reparenting will happen on the parent level. It means that if a slab
page was allocated, a counter on child level was bumped, then the page
was reparented and freed, the annihilation of positive and negative
counter values will not happen until the child cgroup is released. It
makes slab counters different from others, and it might want us to
implement flushing in a correct form again. But it's also a question of
performance: scheduling a work on each cpu isn't free, and it's an open
question if the benefit of having more accurate counters is worth it.
We might also consider flushing all counters on offlining, not only slab
counters.
So let's fix the main problem now: make the slab counters eventually
consistent, so at least the error won't grow with uptime (or more
precisely the number of created and destroyed cgroups). And think about
the accuracy of counters separately.
Link: http://lkml.kernel.org/r/20191220042728.1045881-1-guro@fb.com
Fixes: bee07b33db78 ("mm: memcontrol: flush percpu slab vmstats on kmem offlining")
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-01-14 07:29:16 +07:00
|
|
|
for (i = 0; i < MEMCG_NR_STAT; i++)
|
2019-08-25 07:54:47 +07:00
|
|
|
atomic_long_add(stat[i], &mi->vmstats[i]);
|
|
|
|
|
|
|
|
for_each_node(node) {
|
|
|
|
struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
|
|
|
|
struct mem_cgroup_per_node *pi;
|
|
|
|
|
mm: memcg/slab: fix percpu slab vmstats flushing
Currently slab percpu vmstats are flushed twice: during the memcg
offlining and just before freeing the memcg structure. Each time percpu
counters are summed, added to the atomic counterparts and propagated up
by the cgroup tree.
The second flushing is required due to how recursive vmstats are
implemented: counters are batched in percpu variables on a local level,
and once a percpu value is crossing some predefined threshold, it spills
over to atomic values on the local and each ascendant levels. It means
that without flushing some numbers cached in percpu variables will be
dropped on floor each time a cgroup is destroyed. And with uptime the
error on upper levels might become noticeable.
The first flushing aims to make counters on ancestor levels more
precise. Dying cgroups may resume in the dying state for a long time.
After kmem_cache reparenting which is performed during the offlining
slab counters of the dying cgroup don't have any chances to be updated,
because any slab operations will be performed on the parent level. It
means that the inaccuracy caused by percpu batching will not decrease up
to the final destruction of the cgroup. By the original idea flushing
slab counters during the offlining should minimize the visible
inaccuracy of slab counters on the parent level.
The problem is that percpu counters are not zeroed after the first
flushing. So every cached percpu value is summed twice. It creates a
small error (up to 32 pages per cpu, but usually less) which accumulates
on parent cgroup level. After creating and destroying of thousands of
child cgroups, slab counter on parent level can be way off the real
value.
For now, let's just stop flushing slab counters on memcg offlining. It
can't be done correctly without scheduling a work on each cpu: reading
and zeroing it during css offlining can race with an asynchronous
update, which doesn't expect values to be changed underneath.
With this change, slab counters on parent level will become eventually
consistent. Once all dying children are gone, values are correct. And
if not, the error is capped by 32 * NR_CPUS pages per dying cgroup.
It's not perfect, as slab are reparented, so any updates after the
reparenting will happen on the parent level. It means that if a slab
page was allocated, a counter on child level was bumped, then the page
was reparented and freed, the annihilation of positive and negative
counter values will not happen until the child cgroup is released. It
makes slab counters different from others, and it might want us to
implement flushing in a correct form again. But it's also a question of
performance: scheduling a work on each cpu isn't free, and it's an open
question if the benefit of having more accurate counters is worth it.
We might also consider flushing all counters on offlining, not only slab
counters.
So let's fix the main problem now: make the slab counters eventually
consistent, so at least the error won't grow with uptime (or more
precisely the number of created and destroyed cgroups). And think about
the accuracy of counters separately.
Link: http://lkml.kernel.org/r/20191220042728.1045881-1-guro@fb.com
Fixes: bee07b33db78 ("mm: memcontrol: flush percpu slab vmstats on kmem offlining")
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-01-14 07:29:16 +07:00
|
|
|
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
|
2019-08-25 07:54:47 +07:00
|
|
|
stat[i] = 0;
|
|
|
|
|
|
|
|
for_each_online_cpu(cpu)
|
mm: memcg/slab: fix percpu slab vmstats flushing
Currently slab percpu vmstats are flushed twice: during the memcg
offlining and just before freeing the memcg structure. Each time percpu
counters are summed, added to the atomic counterparts and propagated up
by the cgroup tree.
The second flushing is required due to how recursive vmstats are
implemented: counters are batched in percpu variables on a local level,
and once a percpu value is crossing some predefined threshold, it spills
over to atomic values on the local and each ascendant levels. It means
that without flushing some numbers cached in percpu variables will be
dropped on floor each time a cgroup is destroyed. And with uptime the
error on upper levels might become noticeable.
The first flushing aims to make counters on ancestor levels more
precise. Dying cgroups may resume in the dying state for a long time.
After kmem_cache reparenting which is performed during the offlining
slab counters of the dying cgroup don't have any chances to be updated,
because any slab operations will be performed on the parent level. It
means that the inaccuracy caused by percpu batching will not decrease up
to the final destruction of the cgroup. By the original idea flushing
slab counters during the offlining should minimize the visible
inaccuracy of slab counters on the parent level.
The problem is that percpu counters are not zeroed after the first
flushing. So every cached percpu value is summed twice. It creates a
small error (up to 32 pages per cpu, but usually less) which accumulates
on parent cgroup level. After creating and destroying of thousands of
child cgroups, slab counter on parent level can be way off the real
value.
For now, let's just stop flushing slab counters on memcg offlining. It
can't be done correctly without scheduling a work on each cpu: reading
and zeroing it during css offlining can race with an asynchronous
update, which doesn't expect values to be changed underneath.
With this change, slab counters on parent level will become eventually
consistent. Once all dying children are gone, values are correct. And
if not, the error is capped by 32 * NR_CPUS pages per dying cgroup.
It's not perfect, as slab are reparented, so any updates after the
reparenting will happen on the parent level. It means that if a slab
page was allocated, a counter on child level was bumped, then the page
was reparented and freed, the annihilation of positive and negative
counter values will not happen until the child cgroup is released. It
makes slab counters different from others, and it might want us to
implement flushing in a correct form again. But it's also a question of
performance: scheduling a work on each cpu isn't free, and it's an open
question if the benefit of having more accurate counters is worth it.
We might also consider flushing all counters on offlining, not only slab
counters.
So let's fix the main problem now: make the slab counters eventually
consistent, so at least the error won't grow with uptime (or more
precisely the number of created and destroyed cgroups). And think about
the accuracy of counters separately.
Link: http://lkml.kernel.org/r/20191220042728.1045881-1-guro@fb.com
Fixes: bee07b33db78 ("mm: memcontrol: flush percpu slab vmstats on kmem offlining")
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-01-14 07:29:16 +07:00
|
|
|
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
|
2019-08-31 06:04:53 +07:00
|
|
|
stat[i] += per_cpu(
|
|
|
|
pn->lruvec_stat_cpu->count[i], cpu);
|
2019-08-25 07:54:47 +07:00
|
|
|
|
|
|
|
for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
|
mm: memcg/slab: fix percpu slab vmstats flushing
Currently slab percpu vmstats are flushed twice: during the memcg
offlining and just before freeing the memcg structure. Each time percpu
counters are summed, added to the atomic counterparts and propagated up
by the cgroup tree.
The second flushing is required due to how recursive vmstats are
implemented: counters are batched in percpu variables on a local level,
and once a percpu value is crossing some predefined threshold, it spills
over to atomic values on the local and each ascendant levels. It means
that without flushing some numbers cached in percpu variables will be
dropped on floor each time a cgroup is destroyed. And with uptime the
error on upper levels might become noticeable.
The first flushing aims to make counters on ancestor levels more
precise. Dying cgroups may resume in the dying state for a long time.
After kmem_cache reparenting which is performed during the offlining
slab counters of the dying cgroup don't have any chances to be updated,
because any slab operations will be performed on the parent level. It
means that the inaccuracy caused by percpu batching will not decrease up
to the final destruction of the cgroup. By the original idea flushing
slab counters during the offlining should minimize the visible
inaccuracy of slab counters on the parent level.
The problem is that percpu counters are not zeroed after the first
flushing. So every cached percpu value is summed twice. It creates a
small error (up to 32 pages per cpu, but usually less) which accumulates
on parent cgroup level. After creating and destroying of thousands of
child cgroups, slab counter on parent level can be way off the real
value.
For now, let's just stop flushing slab counters on memcg offlining. It
can't be done correctly without scheduling a work on each cpu: reading
and zeroing it during css offlining can race with an asynchronous
update, which doesn't expect values to be changed underneath.
With this change, slab counters on parent level will become eventually
consistent. Once all dying children are gone, values are correct. And
if not, the error is capped by 32 * NR_CPUS pages per dying cgroup.
It's not perfect, as slab are reparented, so any updates after the
reparenting will happen on the parent level. It means that if a slab
page was allocated, a counter on child level was bumped, then the page
was reparented and freed, the annihilation of positive and negative
counter values will not happen until the child cgroup is released. It
makes slab counters different from others, and it might want us to
implement flushing in a correct form again. But it's also a question of
performance: scheduling a work on each cpu isn't free, and it's an open
question if the benefit of having more accurate counters is worth it.
We might also consider flushing all counters on offlining, not only slab
counters.
So let's fix the main problem now: make the slab counters eventually
consistent, so at least the error won't grow with uptime (or more
precisely the number of created and destroyed cgroups). And think about
the accuracy of counters separately.
Link: http://lkml.kernel.org/r/20191220042728.1045881-1-guro@fb.com
Fixes: bee07b33db78 ("mm: memcontrol: flush percpu slab vmstats on kmem offlining")
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-01-14 07:29:16 +07:00
|
|
|
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
|
2019-08-25 07:54:47 +07:00
|
|
|
atomic_long_add(stat[i], &pi->lruvec_stat[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-08-25 07:54:50 +07:00
|
|
|
static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
unsigned long events[NR_VM_EVENT_ITEMS];
|
|
|
|
struct mem_cgroup *mi;
|
|
|
|
int cpu, i;
|
|
|
|
|
|
|
|
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
|
|
|
|
events[i] = 0;
|
|
|
|
|
|
|
|
for_each_online_cpu(cpu)
|
|
|
|
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
|
2019-08-31 06:04:53 +07:00
|
|
|
events[i] += per_cpu(memcg->vmstats_percpu->events[i],
|
|
|
|
cpu);
|
2019-08-25 07:54:50 +07:00
|
|
|
|
|
|
|
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
|
|
|
|
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
|
|
|
|
atomic_long_add(events[i], &mi->vmevents[i]);
|
|
|
|
}
|
|
|
|
|
2018-08-18 05:47:25 +07:00
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
2016-01-21 06:02:24 +07:00
|
|
|
static int memcg_online_kmem(struct mem_cgroup *memcg)
|
2014-01-24 06:53:09 +07:00
|
|
|
{
|
2020-08-07 13:20:49 +07:00
|
|
|
struct obj_cgroup *objcg;
|
2014-01-24 06:53:09 +07:00
|
|
|
int memcg_id;
|
|
|
|
|
mm: memcontrol: enable kmem accounting for all cgroups in the legacy hierarchy
Workingset code was recently made memcg aware, but shadow node shrinker
is still global. As a result, one small cgroup can consume all memory
available for shadow nodes, possibly hurting other cgroups by reclaiming
their shadow nodes, even though reclaim distances stored in its shadow
nodes have no effect. To avoid this, we need to make shadow node
shrinker memcg aware.
The actual work is done in patch 6 of the series. Patches 1 and 2
prepare memcg/shrinker infrastructure for the change. Patch 3 is just a
collateral cleanup. Patch 4 makes radix_tree_node accounted, which is
necessary for making shadow node shrinker memcg aware. Patch 5 reduces
shadow nodes overhead in case workload mostly uses anonymous pages.
This patch:
Currently, in the legacy hierarchy kmem accounting is off for all
cgroups by default and must be enabled explicitly by writing something
to memory.kmem.limit_in_bytes. Since we don't support reclaim on
hitting kmem limit, nor do we have any plans to implement it, this is
likely to be -1, just to enable kmem accounting and limit kernel memory
consumption by the memory.limit_in_bytes along with user memory.
This user API was introduced when the implementation of kmem accounting
lacked slab shrinker support and hence was useless in practice. Things
have changed since then - slab shrinkers were made memcg aware, the
accounting overhead seems to be negligible, and a failure to charge a
kmem allocation should not have critical consequences, because we only
account those kernel objects that should be safe to fail. That's why
kmem accounting is enabled by default for all cgroups in the default
hierarchy, which will eventually replace the legacy one.
The ability to enable kmem accounting for some cgroups while keeping it
disabled for others is getting difficult to maintain. E.g. to make
shadow node shrinker memcg aware (see mm/workingset.c), we need to know
the relationship between the number of shadow nodes allocated for a
cgroup and the size of its lru list. If kmem accounting is enabled for
all cgroups there is no problem, but what should we do if kmem
accounting is enabled only for half of cgroups? We've no other choice
but use global lru stats while scanning root cgroup's shadow nodes, but
that would be wrong if kmem accounting was enabled for all cgroups
(which is the case if the unified hierarchy is used), in which case we
should use lru stats of the root cgroup's lruvec.
That being said, let's enable kmem accounting for all memory cgroups by
default. If one finds it unstable or too costly, it can always be
disabled system-wide by passing cgroup.memory=nokmem to the kernel at
boot time.
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-18 04:18:27 +07:00
|
|
|
if (cgroup_memory_nokmem)
|
|
|
|
return 0;
|
|
|
|
|
2015-02-13 05:59:32 +07:00
|
|
|
BUG_ON(memcg->kmemcg_id >= 0);
|
2016-01-21 06:02:24 +07:00
|
|
|
BUG_ON(memcg->kmem_state);
|
2014-01-24 06:53:09 +07:00
|
|
|
|
2014-10-10 05:28:45 +07:00
|
|
|
memcg_id = memcg_alloc_cache_id();
|
2016-01-21 06:02:53 +07:00
|
|
|
if (memcg_id < 0)
|
|
|
|
return memcg_id;
|
2014-01-24 06:53:09 +07:00
|
|
|
|
2020-08-07 13:20:49 +07:00
|
|
|
objcg = obj_cgroup_alloc();
|
|
|
|
if (!objcg) {
|
|
|
|
memcg_free_cache_id(memcg_id);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
objcg->memcg = memcg;
|
|
|
|
rcu_assign_pointer(memcg->objcg, objcg);
|
|
|
|
|
2020-08-07 13:20:28 +07:00
|
|
|
static_branch_enable(&memcg_kmem_enabled_key);
|
|
|
|
|
2014-01-24 06:53:09 +07:00
|
|
|
/*
|
2016-01-21 06:02:24 +07:00
|
|
|
* A memory cgroup is considered kmem-online as soon as it gets
|
2014-12-13 07:55:10 +07:00
|
|
|
* kmemcg_id. Setting the id after enabling static branching will
|
2014-01-24 06:53:09 +07:00
|
|
|
* guarantee no one starts accounting before all call sites are
|
|
|
|
* patched.
|
|
|
|
*/
|
2014-12-13 07:55:10 +07:00
|
|
|
memcg->kmemcg_id = memcg_id;
|
2016-01-21 06:02:24 +07:00
|
|
|
memcg->kmem_state = KMEM_ONLINE;
|
2016-01-21 06:02:53 +07:00
|
|
|
|
|
|
|
return 0;
|
2014-01-24 06:53:09 +07:00
|
|
|
}
|
|
|
|
|
2016-01-21 06:02:26 +07:00
|
|
|
static void memcg_offline_kmem(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
struct cgroup_subsys_state *css;
|
|
|
|
struct mem_cgroup *parent, *child;
|
|
|
|
int kmemcg_id;
|
|
|
|
|
|
|
|
if (memcg->kmem_state != KMEM_ONLINE)
|
|
|
|
return;
|
2020-08-07 13:21:10 +07:00
|
|
|
|
2016-01-21 06:02:26 +07:00
|
|
|
memcg->kmem_state = KMEM_ALLOCATED;
|
|
|
|
|
|
|
|
parent = parent_mem_cgroup(memcg);
|
|
|
|
if (!parent)
|
|
|
|
parent = root_mem_cgroup;
|
|
|
|
|
2020-08-07 13:20:49 +07:00
|
|
|
memcg_reparent_objcgs(memcg, parent);
|
mm: memcg/slab: reparent memcg kmem_caches on cgroup removal
Let's reparent non-root kmem_caches on memcg offlining. This allows us to
release the memory cgroup without waiting for the last outstanding kernel
object (e.g. dentry used by another application).
Since the parent cgroup is already charged, everything we need to do is to
splice the list of kmem_caches to the parent's kmem_caches list, swap the
memcg pointer, drop the css refcounter for each kmem_cache and adjust the
parent's css refcounter.
Please, note that kmem_cache->memcg_params.memcg isn't a stable pointer
anymore. It's safe to read it under rcu_read_lock(), cgroup_mutex held,
or any other way that protects the memory cgroup from being released.
We can race with the slab allocation and deallocation paths. It's not a
big problem: parent's charge and slab global stats are always correct, and
we don't care anymore about the child usage and global stats. The child
cgroup is already offline, so we don't use or show it anywhere.
Local slab stats (NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE) aren't
used anywhere except count_shadow_nodes(). But even there it won't break
anything: after reparenting "nodes" will be 0 on child level (because
we're already reparenting shrinker lists), and on parent level page stats
always were 0, and this patch won't change anything.
[guro@fb.com: properly handle kmem_caches reparented to root_mem_cgroup]
Link: http://lkml.kernel.org/r/20190620213427.1691847-1-guro@fb.com
Link: http://lkml.kernel.org/r/20190611231813.3148843-11-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Waiman Long <longman@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:56:34 +07:00
|
|
|
|
|
|
|
kmemcg_id = memcg->kmemcg_id;
|
|
|
|
BUG_ON(kmemcg_id < 0);
|
|
|
|
|
2016-01-21 06:02:26 +07:00
|
|
|
/*
|
|
|
|
* Change kmemcg_id of this cgroup and all its descendants to the
|
|
|
|
* parent's id, and then move all entries from this cgroup's list_lrus
|
|
|
|
* to ones of the parent. After we have finished, all list_lrus
|
|
|
|
* corresponding to this cgroup are guaranteed to remain empty. The
|
|
|
|
* ordering is imposed by list_lru_node->lock taken by
|
|
|
|
* memcg_drain_all_list_lrus().
|
|
|
|
*/
|
2016-06-04 04:55:44 +07:00
|
|
|
rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
|
2016-01-21 06:02:26 +07:00
|
|
|
css_for_each_descendant_pre(css, &memcg->css) {
|
|
|
|
child = mem_cgroup_from_css(css);
|
|
|
|
BUG_ON(child->kmemcg_id != kmemcg_id);
|
|
|
|
child->kmemcg_id = parent->kmemcg_id;
|
|
|
|
if (!memcg->use_hierarchy)
|
|
|
|
break;
|
|
|
|
}
|
2016-06-04 04:55:44 +07:00
|
|
|
rcu_read_unlock();
|
|
|
|
|
2018-08-18 05:47:58 +07:00
|
|
|
memcg_drain_all_list_lrus(kmemcg_id, parent);
|
2016-01-21 06:02:26 +07:00
|
|
|
|
|
|
|
memcg_free_cache_id(kmemcg_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void memcg_free_kmem(struct mem_cgroup *memcg)
|
|
|
|
{
|
2016-01-21 06:02:53 +07:00
|
|
|
/* css_alloc() failed, offlining didn't happen */
|
|
|
|
if (unlikely(memcg->kmem_state == KMEM_ONLINE))
|
|
|
|
memcg_offline_kmem(memcg);
|
2016-01-21 06:02:26 +07:00
|
|
|
}
|
2014-01-24 06:53:09 +07:00
|
|
|
#else
|
2016-01-21 06:02:53 +07:00
|
|
|
static int memcg_online_kmem(struct mem_cgroup *memcg)
|
2016-01-21 06:02:32 +07:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
static void memcg_offline_kmem(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static void memcg_free_kmem(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
}
|
2018-08-18 05:47:25 +07:00
|
|
|
#endif /* CONFIG_MEMCG_KMEM */
|
2016-01-21 06:02:32 +07:00
|
|
|
|
2018-06-08 07:06:18 +07:00
|
|
|
static int memcg_update_kmem_max(struct mem_cgroup *memcg,
|
|
|
|
unsigned long max)
|
2014-01-24 06:53:09 +07:00
|
|
|
{
|
mm: memcontrol: enable kmem accounting for all cgroups in the legacy hierarchy
Workingset code was recently made memcg aware, but shadow node shrinker
is still global. As a result, one small cgroup can consume all memory
available for shadow nodes, possibly hurting other cgroups by reclaiming
their shadow nodes, even though reclaim distances stored in its shadow
nodes have no effect. To avoid this, we need to make shadow node
shrinker memcg aware.
The actual work is done in patch 6 of the series. Patches 1 and 2
prepare memcg/shrinker infrastructure for the change. Patch 3 is just a
collateral cleanup. Patch 4 makes radix_tree_node accounted, which is
necessary for making shadow node shrinker memcg aware. Patch 5 reduces
shadow nodes overhead in case workload mostly uses anonymous pages.
This patch:
Currently, in the legacy hierarchy kmem accounting is off for all
cgroups by default and must be enabled explicitly by writing something
to memory.kmem.limit_in_bytes. Since we don't support reclaim on
hitting kmem limit, nor do we have any plans to implement it, this is
likely to be -1, just to enable kmem accounting and limit kernel memory
consumption by the memory.limit_in_bytes along with user memory.
This user API was introduced when the implementation of kmem accounting
lacked slab shrinker support and hence was useless in practice. Things
have changed since then - slab shrinkers were made memcg aware, the
accounting overhead seems to be negligible, and a failure to charge a
kmem allocation should not have critical consequences, because we only
account those kernel objects that should be safe to fail. That's why
kmem accounting is enabled by default for all cgroups in the default
hierarchy, which will eventually replace the legacy one.
The ability to enable kmem accounting for some cgroups while keeping it
disabled for others is getting difficult to maintain. E.g. to make
shadow node shrinker memcg aware (see mm/workingset.c), we need to know
the relationship between the number of shadow nodes allocated for a
cgroup and the size of its lru list. If kmem accounting is enabled for
all cgroups there is no problem, but what should we do if kmem
accounting is enabled only for half of cgroups? We've no other choice
but use global lru stats while scanning root cgroup's shadow nodes, but
that would be wrong if kmem accounting was enabled for all cgroups
(which is the case if the unified hierarchy is used), in which case we
should use lru stats of the root cgroup's lruvec.
That being said, let's enable kmem accounting for all memory cgroups by
default. If one finds it unstable or too costly, it can always be
disabled system-wide by passing cgroup.memory=nokmem to the kernel at
boot time.
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-18 04:18:27 +07:00
|
|
|
int ret;
|
2016-01-21 06:02:32 +07:00
|
|
|
|
2018-06-08 07:06:18 +07:00
|
|
|
mutex_lock(&memcg_max_mutex);
|
|
|
|
ret = page_counter_set_max(&memcg->kmem, max);
|
|
|
|
mutex_unlock(&memcg_max_mutex);
|
2016-01-21 06:02:32 +07:00
|
|
|
return ret;
|
2014-01-24 06:53:09 +07:00
|
|
|
}
|
2012-12-19 05:21:47 +07:00
|
|
|
|
2018-06-08 07:06:18 +07:00
|
|
|
static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
|
2016-01-21 06:02:44 +07:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2018-06-08 07:06:18 +07:00
|
|
|
mutex_lock(&memcg_max_mutex);
|
2016-01-21 06:02:44 +07:00
|
|
|
|
2018-06-08 07:06:18 +07:00
|
|
|
ret = page_counter_set_max(&memcg->tcpmem, max);
|
2016-01-21 06:02:44 +07:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
2016-01-21 06:02:50 +07:00
|
|
|
if (!memcg->tcpmem_active) {
|
2016-01-21 06:02:44 +07:00
|
|
|
/*
|
|
|
|
* The active flag needs to be written after the static_key
|
|
|
|
* update. This is what guarantees that the socket activation
|
2016-10-08 07:00:58 +07:00
|
|
|
* function is the last one to run. See mem_cgroup_sk_alloc()
|
|
|
|
* for details, and note that we don't mark any socket as
|
|
|
|
* belonging to this memcg until that flag is up.
|
2016-01-21 06:02:44 +07:00
|
|
|
*
|
|
|
|
* We need to do this, because static_keys will span multiple
|
|
|
|
* sites, but we can't control their order. If we mark a socket
|
|
|
|
* as accounted, but the accounting functions are not patched in
|
|
|
|
* yet, we'll lose accounting.
|
|
|
|
*
|
2016-10-08 07:00:58 +07:00
|
|
|
* We never race with the readers in mem_cgroup_sk_alloc(),
|
2016-01-21 06:02:44 +07:00
|
|
|
* because when this value change, the code to process it is not
|
|
|
|
* patched in yet.
|
|
|
|
*/
|
|
|
|
static_branch_inc(&memcg_sockets_enabled_key);
|
2016-01-21 06:02:50 +07:00
|
|
|
memcg->tcpmem_active = true;
|
2016-01-21 06:02:44 +07:00
|
|
|
}
|
|
|
|
out:
|
2018-06-08 07:06:18 +07:00
|
|
|
mutex_unlock(&memcg_max_mutex);
|
2016-01-21 06:02:44 +07:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2008-07-25 15:47:20 +07:00
|
|
|
/*
|
|
|
|
* The user of this function is...
|
|
|
|
* RES_LIMIT.
|
|
|
|
*/
|
2014-05-13 23:16:21 +07:00
|
|
|
static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
|
|
|
|
char *buf, size_t nbytes, loff_t off)
|
2008-02-07 15:13:50 +07:00
|
|
|
{
|
2014-05-13 23:16:21 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
unsigned long nr_pages;
|
2008-07-25 15:47:20 +07:00
|
|
|
int ret;
|
|
|
|
|
2014-05-13 23:16:21 +07:00
|
|
|
buf = strstrip(buf);
|
2015-02-12 06:26:03 +07:00
|
|
|
ret = page_counter_memparse(buf, "-1", &nr_pages);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
2012-04-02 02:09:55 +07:00
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
switch (MEMFILE_ATTR(of_cft(of)->private)) {
|
2008-07-25 15:47:20 +07:00
|
|
|
case RES_LIMIT:
|
2009-09-24 05:56:32 +07:00
|
|
|
if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
|
|
|
|
ret = -EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
switch (MEMFILE_TYPE(of_cft(of)->private)) {
|
|
|
|
case _MEM:
|
2018-06-08 07:06:18 +07:00
|
|
|
ret = mem_cgroup_resize_max(memcg, nr_pages, false);
|
2009-01-08 09:08:00 +07:00
|
|
|
break;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
case _MEMSWAP:
|
2018-06-08 07:06:18 +07:00
|
|
|
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
|
2009-09-24 05:56:36 +07:00
|
|
|
break;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
case _KMEM:
|
2019-09-24 05:37:22 +07:00
|
|
|
pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
|
|
|
|
"Please report your usecase to linux-mm@kvack.org if you "
|
|
|
|
"depend on this functionality.\n");
|
2018-06-08 07:06:18 +07:00
|
|
|
ret = memcg_update_kmem_max(memcg, nr_pages);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
break;
|
2016-01-21 06:02:44 +07:00
|
|
|
case _TCP:
|
2018-06-08 07:06:18 +07:00
|
|
|
ret = memcg_update_tcp_max(memcg, nr_pages);
|
2016-01-21 06:02:44 +07:00
|
|
|
break;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
}
|
2009-09-24 05:56:36 +07:00
|
|
|
break;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
case RES_SOFT_LIMIT:
|
|
|
|
memcg->soft_limit = nr_pages;
|
|
|
|
ret = 0;
|
2008-07-25 15:47:20 +07:00
|
|
|
break;
|
|
|
|
}
|
2014-05-13 23:16:21 +07:00
|
|
|
return ret ?: nbytes;
|
2008-02-07 15:13:50 +07:00
|
|
|
}
|
|
|
|
|
2014-05-13 23:16:21 +07:00
|
|
|
static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
|
|
|
|
size_t nbytes, loff_t off)
|
2008-04-29 15:00:17 +07:00
|
|
|
{
|
2014-05-13 23:16:21 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
struct page_counter *counter;
|
2008-04-29 15:00:17 +07:00
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
switch (MEMFILE_TYPE(of_cft(of)->private)) {
|
|
|
|
case _MEM:
|
|
|
|
counter = &memcg->memory;
|
|
|
|
break;
|
|
|
|
case _MEMSWAP:
|
|
|
|
counter = &memcg->memsw;
|
|
|
|
break;
|
|
|
|
case _KMEM:
|
|
|
|
counter = &memcg->kmem;
|
|
|
|
break;
|
2016-01-21 06:02:44 +07:00
|
|
|
case _TCP:
|
2016-01-21 06:02:50 +07:00
|
|
|
counter = &memcg->tcpmem;
|
2016-01-21 06:02:44 +07:00
|
|
|
break;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
default:
|
|
|
|
BUG();
|
|
|
|
}
|
2012-04-02 02:09:55 +07:00
|
|
|
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
switch (MEMFILE_ATTR(of_cft(of)->private)) {
|
2008-04-29 15:00:21 +07:00
|
|
|
case RES_MAX_USAGE:
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
page_counter_reset_watermark(counter);
|
2008-04-29 15:00:21 +07:00
|
|
|
break;
|
|
|
|
case RES_FAILCNT:
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
counter->failcnt = 0;
|
2008-04-29 15:00:21 +07:00
|
|
|
break;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
default:
|
|
|
|
BUG();
|
2008-04-29 15:00:21 +07:00
|
|
|
}
|
2009-09-24 05:56:37 +07:00
|
|
|
|
2014-05-13 23:16:21 +07:00
|
|
|
return nbytes;
|
2008-04-29 15:00:17 +07:00
|
|
|
}
|
|
|
|
|
2013-08-09 07:11:24 +07:00
|
|
|
static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
|
2010-03-11 06:22:13 +07:00
|
|
|
struct cftype *cft)
|
|
|
|
{
|
2013-08-09 07:11:24 +07:00
|
|
|
return mem_cgroup_from_css(css)->move_charge_at_immigrate;
|
2010-03-11 06:22:13 +07:00
|
|
|
}
|
|
|
|
|
2010-03-11 06:22:17 +07:00
|
|
|
#ifdef CONFIG_MMU
|
2013-08-09 07:11:24 +07:00
|
|
|
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
|
2010-03-11 06:22:13 +07:00
|
|
|
struct cftype *cft, u64 val)
|
|
|
|
{
|
2013-08-09 07:11:24 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
2010-03-11 06:22:13 +07:00
|
|
|
|
2015-02-12 06:26:09 +07:00
|
|
|
if (val & ~MOVE_MASK)
|
2010-03-11 06:22:13 +07:00
|
|
|
return -EINVAL;
|
memcg: prevent changes to move_charge_at_immigrate during task attach
In memcg, we use the cgroup_lock basically to synchronize against
attaching new children to a cgroup. We do this because we rely on
cgroup core to provide us with this information.
We need to guarantee that upon child creation, our tunables are
consistent. For those, the calls to cgroup_lock() all live in handlers
like mem_cgroup_hierarchy_write(), where we change a tunable in the
group that is hierarchy-related. For instance, the use_hierarchy flag
cannot be changed if the cgroup already have children.
Furthermore, those values are propagated from the parent to the child
when a new child is created. So if we don't lock like this, we can end
up with the following situation:
A B
memcg_css_alloc() mem_cgroup_hierarchy_write()
copy use hierarchy from parent change use hierarchy in parent
finish creation.
This is mainly because during create, we are still not fully connected
to the css tree. So all iterators and the such that we could use, will
fail to show that the group has children.
My observation is that all of creation can proceed in parallel with
those tasks, except value assignment. So what this patch series does is
to first move all value assignment that is dependent on parent values
from css_alloc to css_online, where the iterators all work, and then we
lock only the value assignment. This will guarantee that parent and
children always have consistent values. Together with an online test,
that can be derived from the observation that the refcount of an online
memcg can be made to be always positive, we should be able to
synchronize our side without the cgroup lock.
This patch:
Currently, we rely on the cgroup_lock() to prevent changes to
move_charge_at_immigrate during task migration. However, this is only
needed because the current strategy keeps checking this value throughout
the whole process. Since all we need is serialization, one needs only
to guarantee that whatever decision we made in the beginning of a
specific migration is respected throughout the process.
We can achieve this by just saving it in mc. By doing this, no kind of
locking is needed.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hiroyuki Kamezawa <kamezawa.hiroyuki@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-23 07:34:50 +07:00
|
|
|
|
2010-03-11 06:22:13 +07:00
|
|
|
/*
|
memcg: prevent changes to move_charge_at_immigrate during task attach
In memcg, we use the cgroup_lock basically to synchronize against
attaching new children to a cgroup. We do this because we rely on
cgroup core to provide us with this information.
We need to guarantee that upon child creation, our tunables are
consistent. For those, the calls to cgroup_lock() all live in handlers
like mem_cgroup_hierarchy_write(), where we change a tunable in the
group that is hierarchy-related. For instance, the use_hierarchy flag
cannot be changed if the cgroup already have children.
Furthermore, those values are propagated from the parent to the child
when a new child is created. So if we don't lock like this, we can end
up with the following situation:
A B
memcg_css_alloc() mem_cgroup_hierarchy_write()
copy use hierarchy from parent change use hierarchy in parent
finish creation.
This is mainly because during create, we are still not fully connected
to the css tree. So all iterators and the such that we could use, will
fail to show that the group has children.
My observation is that all of creation can proceed in parallel with
those tasks, except value assignment. So what this patch series does is
to first move all value assignment that is dependent on parent values
from css_alloc to css_online, where the iterators all work, and then we
lock only the value assignment. This will guarantee that parent and
children always have consistent values. Together with an online test,
that can be derived from the observation that the refcount of an online
memcg can be made to be always positive, we should be able to
synchronize our side without the cgroup lock.
This patch:
Currently, we rely on the cgroup_lock() to prevent changes to
move_charge_at_immigrate during task migration. However, this is only
needed because the current strategy keeps checking this value throughout
the whole process. Since all we need is serialization, one needs only
to guarantee that whatever decision we made in the beginning of a
specific migration is respected throughout the process.
We can achieve this by just saving it in mc. By doing this, no kind of
locking is needed.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hiroyuki Kamezawa <kamezawa.hiroyuki@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-23 07:34:50 +07:00
|
|
|
* No kind of locking is needed in here, because ->can_attach() will
|
|
|
|
* check this value once in the beginning of the process, and then carry
|
|
|
|
* on with stale data. This means that changes to this value will only
|
|
|
|
* affect task migrations starting after the change.
|
2010-03-11 06:22:13 +07:00
|
|
|
*/
|
2011-11-03 03:38:15 +07:00
|
|
|
memcg->move_charge_at_immigrate = val;
|
2010-03-11 06:22:13 +07:00
|
|
|
return 0;
|
|
|
|
}
|
2010-03-11 06:22:17 +07:00
|
|
|
#else
|
2013-08-09 07:11:24 +07:00
|
|
|
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
|
2010-03-11 06:22:17 +07:00
|
|
|
struct cftype *cft, u64 val)
|
|
|
|
{
|
|
|
|
return -ENOSYS;
|
|
|
|
}
|
|
|
|
#endif
|
2010-03-11 06:22:13 +07:00
|
|
|
|
2011-05-27 06:25:37 +07:00
|
|
|
#ifdef CONFIG_NUMA
|
2019-05-14 07:18:11 +07:00
|
|
|
|
|
|
|
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
|
|
|
|
#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
|
|
|
|
#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
|
|
|
|
|
|
|
|
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
|
2020-06-04 05:56:24 +07:00
|
|
|
int nid, unsigned int lru_mask, bool tree)
|
2019-05-14 07:18:11 +07:00
|
|
|
{
|
2019-12-01 08:55:34 +07:00
|
|
|
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
|
2019-05-14 07:18:11 +07:00
|
|
|
unsigned long nr = 0;
|
|
|
|
enum lru_list lru;
|
|
|
|
|
|
|
|
VM_BUG_ON((unsigned)nid >= nr_node_ids);
|
|
|
|
|
|
|
|
for_each_lru(lru) {
|
|
|
|
if (!(BIT(lru) & lru_mask))
|
|
|
|
continue;
|
2020-06-04 05:56:24 +07:00
|
|
|
if (tree)
|
|
|
|
nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
|
|
|
|
else
|
|
|
|
nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
|
2019-05-14 07:18:11 +07:00
|
|
|
}
|
|
|
|
return nr;
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
|
2020-06-04 05:56:24 +07:00
|
|
|
unsigned int lru_mask,
|
|
|
|
bool tree)
|
2019-05-14 07:18:11 +07:00
|
|
|
{
|
|
|
|
unsigned long nr = 0;
|
|
|
|
enum lru_list lru;
|
|
|
|
|
|
|
|
for_each_lru(lru) {
|
|
|
|
if (!(BIT(lru) & lru_mask))
|
|
|
|
continue;
|
2020-06-04 05:56:24 +07:00
|
|
|
if (tree)
|
|
|
|
nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
|
|
|
|
else
|
|
|
|
nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
|
2019-05-14 07:18:11 +07:00
|
|
|
}
|
|
|
|
return nr;
|
|
|
|
}
|
|
|
|
|
2013-12-06 00:28:04 +07:00
|
|
|
static int memcg_numa_stat_show(struct seq_file *m, void *v)
|
2011-05-27 06:25:37 +07:00
|
|
|
{
|
memcg: refactor mem_control_numa_stat_show()
Refactor mem_control_numa_stat_show() to use a new stats structure for
smaller and simpler code. This consolidates nearly identical code.
text data bss dec hex filename
8,137,679 1,703,496 1,896,448 11,737,623 b31a17 vmlinux.before
8,136,911 1,703,496 1,896,448 11,736,855 b31717 vmlinux.after
Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Ying Han <yinghan@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-11-13 06:07:40 +07:00
|
|
|
struct numa_stat {
|
|
|
|
const char *name;
|
|
|
|
unsigned int lru_mask;
|
|
|
|
};
|
|
|
|
|
|
|
|
static const struct numa_stat stats[] = {
|
|
|
|
{ "total", LRU_ALL },
|
|
|
|
{ "file", LRU_ALL_FILE },
|
|
|
|
{ "anon", LRU_ALL_ANON },
|
|
|
|
{ "unevictable", BIT(LRU_UNEVICTABLE) },
|
|
|
|
};
|
|
|
|
const struct numa_stat *stat;
|
2011-05-27 06:25:37 +07:00
|
|
|
int nid;
|
2019-03-06 06:45:52 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
|
2011-05-27 06:25:37 +07:00
|
|
|
|
memcg: refactor mem_control_numa_stat_show()
Refactor mem_control_numa_stat_show() to use a new stats structure for
smaller and simpler code. This consolidates nearly identical code.
text data bss dec hex filename
8,137,679 1,703,496 1,896,448 11,737,623 b31a17 vmlinux.before
8,136,911 1,703,496 1,896,448 11,736,855 b31717 vmlinux.after
Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Ying Han <yinghan@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-11-13 06:07:40 +07:00
|
|
|
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
|
2020-06-04 05:56:24 +07:00
|
|
|
seq_printf(m, "%s=%lu", stat->name,
|
|
|
|
mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
|
|
|
|
false));
|
|
|
|
for_each_node_state(nid, N_MEMORY)
|
|
|
|
seq_printf(m, " N%d=%lu", nid,
|
|
|
|
mem_cgroup_node_nr_lru_pages(memcg, nid,
|
|
|
|
stat->lru_mask, false));
|
memcg: refactor mem_control_numa_stat_show()
Refactor mem_control_numa_stat_show() to use a new stats structure for
smaller and simpler code. This consolidates nearly identical code.
text data bss dec hex filename
8,137,679 1,703,496 1,896,448 11,737,623 b31a17 vmlinux.before
8,136,911 1,703,496 1,896,448 11,736,855 b31717 vmlinux.after
Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Ying Han <yinghan@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-11-13 06:07:40 +07:00
|
|
|
seq_putc(m, '\n');
|
2011-05-27 06:25:37 +07:00
|
|
|
}
|
|
|
|
|
2013-11-13 06:07:41 +07:00
|
|
|
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
|
2020-06-04 05:56:24 +07:00
|
|
|
|
|
|
|
seq_printf(m, "hierarchical_%s=%lu", stat->name,
|
|
|
|
mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
|
|
|
|
true));
|
|
|
|
for_each_node_state(nid, N_MEMORY)
|
|
|
|
seq_printf(m, " N%d=%lu", nid,
|
|
|
|
mem_cgroup_node_nr_lru_pages(memcg, nid,
|
|
|
|
stat->lru_mask, true));
|
2013-11-13 06:07:41 +07:00
|
|
|
seq_putc(m, '\n');
|
2011-05-27 06:25:37 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
static const unsigned int memcg1_stats[] = {
|
2020-06-04 06:01:54 +07:00
|
|
|
NR_FILE_PAGES,
|
2020-06-04 06:01:57 +07:00
|
|
|
NR_ANON_MAPPED,
|
2020-06-04 06:02:01 +07:00
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
NR_ANON_THPS,
|
|
|
|
#endif
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
NR_SHMEM,
|
|
|
|
NR_FILE_MAPPED,
|
|
|
|
NR_FILE_DIRTY,
|
|
|
|
NR_WRITEBACK,
|
|
|
|
MEMCG_SWAP,
|
|
|
|
};
|
|
|
|
|
|
|
|
static const char *const memcg1_stat_names[] = {
|
|
|
|
"cache",
|
|
|
|
"rss",
|
2020-06-04 06:02:01 +07:00
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
"rss_huge",
|
2020-06-04 06:02:01 +07:00
|
|
|
#endif
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
"shmem",
|
|
|
|
"mapped_file",
|
|
|
|
"dirty",
|
|
|
|
"writeback",
|
|
|
|
"swap",
|
|
|
|
};
|
|
|
|
|
2017-05-04 04:55:10 +07:00
|
|
|
/* Universal VM events cgroup1 shows, original sort order */
|
2018-06-08 07:07:23 +07:00
|
|
|
static const unsigned int memcg1_events[] = {
|
2017-05-04 04:55:10 +07:00
|
|
|
PGPGIN,
|
|
|
|
PGPGOUT,
|
|
|
|
PGFAULT,
|
|
|
|
PGMAJFAULT,
|
|
|
|
};
|
|
|
|
|
2013-12-06 00:28:04 +07:00
|
|
|
static int memcg_stat_show(struct seq_file *m, void *v)
|
2008-02-07 15:14:25 +07:00
|
|
|
{
|
2019-03-06 06:45:52 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
unsigned long memory, memsw;
|
2012-05-30 05:07:08 +07:00
|
|
|
struct mem_cgroup *mi;
|
|
|
|
unsigned int i;
|
2011-05-27 06:25:37 +07:00
|
|
|
|
2017-05-04 04:55:13 +07:00
|
|
|
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
|
2014-12-13 07:56:41 +07:00
|
|
|
|
2017-05-04 04:55:13 +07:00
|
|
|
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
|
2020-06-04 06:02:01 +07:00
|
|
|
unsigned long nr;
|
|
|
|
|
2017-05-04 04:55:13 +07:00
|
|
|
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
|
2009-09-24 05:56:43 +07:00
|
|
|
continue;
|
2020-06-04 06:02:01 +07:00
|
|
|
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
if (memcg1_stats[i] == NR_ANON_THPS)
|
|
|
|
nr *= HPAGE_PMD_NR;
|
|
|
|
#endif
|
|
|
|
seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
|
2009-09-24 05:56:43 +07:00
|
|
|
}
|
2008-10-19 10:26:40 +07:00
|
|
|
|
2017-05-04 04:55:10 +07:00
|
|
|
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
|
2019-12-05 07:49:53 +07:00
|
|
|
seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
|
mm: memcontrol: make cgroup stats and events query API explicitly local
Patch series "mm: memcontrol: memory.stat cost & correctness".
The cgroup memory.stat file holds recursive statistics for the entire
subtree. The current implementation does this tree walk on-demand
whenever the file is read. This is giving us problems in production.
1. The cost of aggregating the statistics on-demand is high. A lot of
system service cgroups are mostly idle and their stats don't change
between reads, yet we always have to check them. There are also always
some lazily-dying cgroups sitting around that are pinned by a handful
of remaining page cache; the same applies to them.
In an application that periodically monitors memory.stat in our
fleet, we have seen the aggregation consume up to 5% CPU time.
2. When cgroups die and disappear from the cgroup tree, so do their
accumulated vm events. The result is that the event counters at
higher-level cgroups can go backwards and confuse some of our
automation, let alone people looking at the graphs over time.
To address both issues, this patch series changes the stat
implementation to spill counts upwards when the counters change.
The upward spilling is batched using the existing per-cpu cache. In a
sparse file stress test with 5 level cgroup nesting, the additional cost
of the flushing was negligible (a little under 1% of CPU at 100% CPU
utilization, compared to the 5% of reading memory.stat during regular
operation).
This patch (of 4):
memcg_page_state(), lruvec_page_state(), memcg_sum_events() are
currently returning the state of the local memcg or lruvec, not the
recursive state.
In practice there is a demand for both versions, although the callers
that want the recursive counts currently sum them up by hand.
Per default, cgroups are considered recursive entities and generally we
expect more users of the recursive counters, with the local counts being
special cases. To reflect that in the name, add a _local suffix to the
current implementations.
The following patch will re-incarnate these functions with recursive
semantics, but with an O(1) implementation.
[hannes@cmpxchg.org: fix bisection hole]
Link: http://lkml.kernel.org/r/20190417160347.GC23013@cmpxchg.org
Link: http://lkml.kernel.org/r/20190412151507.2769-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:06 +07:00
|
|
|
memcg_events_local(memcg, memcg1_events[i]));
|
2012-05-30 05:07:08 +07:00
|
|
|
|
|
|
|
for (i = 0; i < NR_LRU_LISTS; i++)
|
2019-12-05 07:49:53 +07:00
|
|
|
seq_printf(m, "%s %lu\n", lru_list_name(i),
|
mm: memcontrol: make cgroup stats and events query API explicitly local
Patch series "mm: memcontrol: memory.stat cost & correctness".
The cgroup memory.stat file holds recursive statistics for the entire
subtree. The current implementation does this tree walk on-demand
whenever the file is read. This is giving us problems in production.
1. The cost of aggregating the statistics on-demand is high. A lot of
system service cgroups are mostly idle and their stats don't change
between reads, yet we always have to check them. There are also always
some lazily-dying cgroups sitting around that are pinned by a handful
of remaining page cache; the same applies to them.
In an application that periodically monitors memory.stat in our
fleet, we have seen the aggregation consume up to 5% CPU time.
2. When cgroups die and disappear from the cgroup tree, so do their
accumulated vm events. The result is that the event counters at
higher-level cgroups can go backwards and confuse some of our
automation, let alone people looking at the graphs over time.
To address both issues, this patch series changes the stat
implementation to spill counts upwards when the counters change.
The upward spilling is batched using the existing per-cpu cache. In a
sparse file stress test with 5 level cgroup nesting, the additional cost
of the flushing was negligible (a little under 1% of CPU at 100% CPU
utilization, compared to the 5% of reading memory.stat during regular
operation).
This patch (of 4):
memcg_page_state(), lruvec_page_state(), memcg_sum_events() are
currently returning the state of the local memcg or lruvec, not the
recursive state.
In practice there is a demand for both versions, although the callers
that want the recursive counts currently sum them up by hand.
Per default, cgroups are considered recursive entities and generally we
expect more users of the recursive counters, with the local counts being
special cases. To reflect that in the name, add a _local suffix to the
current implementations.
The following patch will re-incarnate these functions with recursive
semantics, but with an O(1) implementation.
[hannes@cmpxchg.org: fix bisection hole]
Link: http://lkml.kernel.org/r/20190417160347.GC23013@cmpxchg.org
Link: http://lkml.kernel.org/r/20190412151507.2769-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:06 +07:00
|
|
|
memcg_page_state_local(memcg, NR_LRU_BASE + i) *
|
2019-05-14 07:18:08 +07:00
|
|
|
PAGE_SIZE);
|
2012-05-30 05:07:08 +07:00
|
|
|
|
2009-04-03 06:57:35 +07:00
|
|
|
/* Hierarchical information */
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
memory = memsw = PAGE_COUNTER_MAX;
|
|
|
|
for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
|
2020-04-02 11:07:20 +07:00
|
|
|
memory = min(memory, READ_ONCE(mi->memory.max));
|
|
|
|
memsw = min(memsw, READ_ONCE(mi->memsw.max));
|
2009-01-08 09:08:26 +07:00
|
|
|
}
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
seq_printf(m, "hierarchical_memory_limit %llu\n",
|
|
|
|
(u64)memory * PAGE_SIZE);
|
2016-01-15 06:21:23 +07:00
|
|
|
if (do_memsw_account())
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
seq_printf(m, "hierarchical_memsw_limit %llu\n",
|
|
|
|
(u64)memsw * PAGE_SIZE);
|
2009-01-08 09:08:22 +07:00
|
|
|
|
2018-08-22 11:53:17 +07:00
|
|
|
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
|
2020-11-02 08:07:30 +07:00
|
|
|
unsigned long nr;
|
|
|
|
|
2017-05-04 04:55:13 +07:00
|
|
|
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
|
2009-09-24 05:56:43 +07:00
|
|
|
continue;
|
2020-11-02 08:07:30 +07:00
|
|
|
nr = memcg_page_state(memcg, memcg1_stats[i]);
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
if (memcg1_stats[i] == NR_ANON_THPS)
|
|
|
|
nr *= HPAGE_PMD_NR;
|
|
|
|
#endif
|
2018-08-22 11:53:17 +07:00
|
|
|
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
|
2020-11-02 08:07:30 +07:00
|
|
|
(u64)nr * PAGE_SIZE);
|
2012-05-30 05:07:08 +07:00
|
|
|
}
|
|
|
|
|
2018-08-22 11:53:17 +07:00
|
|
|
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
|
2019-12-05 07:49:53 +07:00
|
|
|
seq_printf(m, "total_%s %llu\n",
|
|
|
|
vm_event_name(memcg1_events[i]),
|
2019-07-12 10:52:11 +07:00
|
|
|
(u64)memcg_events(memcg, memcg1_events[i]));
|
2012-05-30 05:07:08 +07:00
|
|
|
|
2018-08-22 11:53:17 +07:00
|
|
|
for (i = 0; i < NR_LRU_LISTS; i++)
|
2019-12-05 07:49:53 +07:00
|
|
|
seq_printf(m, "total_%s %llu\n", lru_list_name(i),
|
mm: memcontrol: fix recursive statistics correctness & scalabilty
Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.
There are two issues with this:
1. When a cgroup gets deleted, its stats are lost. The state counters
should all be 0 at that point, of course, but the events are not.
When this happens, the event counters, which are supposed to be
monotonic, can go backwards in the parent cgroups.
2. During regular operation, we always have a certain number of lazily
freed cgroups sitting around that have been deleted, have no tasks,
but have a few cache pages remaining. These groups' statistics do not
change until we eventually hit memory pressure, but somebody
watching, say, memory.stat on an ancestor has to iterate those every
time.
This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.
Upward propagation happens when the per-cpu caches spill over into the
local atomic counter. This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate. In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-15 05:47:12 +07:00
|
|
|
(u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
|
|
|
|
PAGE_SIZE);
|
2009-04-03 06:57:35 +07:00
|
|
|
|
2009-01-08 09:08:22 +07:00
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
{
|
2016-07-29 05:46:05 +07:00
|
|
|
pg_data_t *pgdat;
|
|
|
|
struct mem_cgroup_per_node *mz;
|
mm: base LRU balancing on an explicit cost model
Currently, scan pressure between the anon and file LRU lists is balanced
based on a mixture of reclaim efficiency and a somewhat vague notion of
"value" of having certain pages in memory over others. That concept of
value is problematic, because it has caused us to count any event that
remotely makes one LRU list more or less preferrable for reclaim, even
when these events are not directly comparable and impose very different
costs on the system. One example is referenced file pages that we still
deactivate and referenced anonymous pages that we actually rotate back to
the head of the list.
There is also conceptual overlap with the LRU algorithm itself. By
rotating recently used pages instead of reclaiming them, the algorithm
already biases the applied scan pressure based on page value. Thus, when
rebalancing scan pressure due to rotations, we should think of reclaim
cost, and leave assessing the page value to the LRU algorithm.
Lastly, considering both value-increasing as well as value-decreasing
events can sometimes cause the same type of event to be counted twice,
i.e. how rotating a page increases the LRU value, while reclaiming it
succesfully decreases the value. In itself this will balance out fine,
but it quietly skews the impact of events that are only recorded once.
The abstract metric of "value", the murky relationship with the LRU
algorithm, and accounting both negative and positive events make the
current pressure balancing model hard to reason about and modify.
This patch switches to a balancing model of accounting the concrete,
actually observed cost of reclaiming one LRU over another. For now, that
cost includes pages that are scanned but rotated back to the list head.
Subsequent patches will add consideration for IO caused by refaulting of
recently evicted pages.
Replace struct zone_reclaim_stat with two cost counters in the lruvec, and
make everything that affects cost go through a new lru_note_cost()
function.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Link: http://lkml.kernel.org/r/20200520232525.798933-9-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-04 06:02:53 +07:00
|
|
|
unsigned long anon_cost = 0;
|
|
|
|
unsigned long file_cost = 0;
|
2009-01-08 09:08:22 +07:00
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
for_each_online_pgdat(pgdat) {
|
|
|
|
mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
|
2009-01-08 09:08:22 +07:00
|
|
|
|
mm: base LRU balancing on an explicit cost model
Currently, scan pressure between the anon and file LRU lists is balanced
based on a mixture of reclaim efficiency and a somewhat vague notion of
"value" of having certain pages in memory over others. That concept of
value is problematic, because it has caused us to count any event that
remotely makes one LRU list more or less preferrable for reclaim, even
when these events are not directly comparable and impose very different
costs on the system. One example is referenced file pages that we still
deactivate and referenced anonymous pages that we actually rotate back to
the head of the list.
There is also conceptual overlap with the LRU algorithm itself. By
rotating recently used pages instead of reclaiming them, the algorithm
already biases the applied scan pressure based on page value. Thus, when
rebalancing scan pressure due to rotations, we should think of reclaim
cost, and leave assessing the page value to the LRU algorithm.
Lastly, considering both value-increasing as well as value-decreasing
events can sometimes cause the same type of event to be counted twice,
i.e. how rotating a page increases the LRU value, while reclaiming it
succesfully decreases the value. In itself this will balance out fine,
but it quietly skews the impact of events that are only recorded once.
The abstract metric of "value", the murky relationship with the LRU
algorithm, and accounting both negative and positive events make the
current pressure balancing model hard to reason about and modify.
This patch switches to a balancing model of accounting the concrete,
actually observed cost of reclaiming one LRU over another. For now, that
cost includes pages that are scanned but rotated back to the list head.
Subsequent patches will add consideration for IO caused by refaulting of
recently evicted pages.
Replace struct zone_reclaim_stat with two cost counters in the lruvec, and
make everything that affects cost go through a new lru_note_cost()
function.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Link: http://lkml.kernel.org/r/20200520232525.798933-9-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-04 06:02:53 +07:00
|
|
|
anon_cost += mz->lruvec.anon_cost;
|
|
|
|
file_cost += mz->lruvec.file_cost;
|
2016-07-29 05:46:05 +07:00
|
|
|
}
|
mm: base LRU balancing on an explicit cost model
Currently, scan pressure between the anon and file LRU lists is balanced
based on a mixture of reclaim efficiency and a somewhat vague notion of
"value" of having certain pages in memory over others. That concept of
value is problematic, because it has caused us to count any event that
remotely makes one LRU list more or less preferrable for reclaim, even
when these events are not directly comparable and impose very different
costs on the system. One example is referenced file pages that we still
deactivate and referenced anonymous pages that we actually rotate back to
the head of the list.
There is also conceptual overlap with the LRU algorithm itself. By
rotating recently used pages instead of reclaiming them, the algorithm
already biases the applied scan pressure based on page value. Thus, when
rebalancing scan pressure due to rotations, we should think of reclaim
cost, and leave assessing the page value to the LRU algorithm.
Lastly, considering both value-increasing as well as value-decreasing
events can sometimes cause the same type of event to be counted twice,
i.e. how rotating a page increases the LRU value, while reclaiming it
succesfully decreases the value. In itself this will balance out fine,
but it quietly skews the impact of events that are only recorded once.
The abstract metric of "value", the murky relationship with the LRU
algorithm, and accounting both negative and positive events make the
current pressure balancing model hard to reason about and modify.
This patch switches to a balancing model of accounting the concrete,
actually observed cost of reclaiming one LRU over another. For now, that
cost includes pages that are scanned but rotated back to the list head.
Subsequent patches will add consideration for IO caused by refaulting of
recently evicted pages.
Replace struct zone_reclaim_stat with two cost counters in the lruvec, and
make everything that affects cost go through a new lru_note_cost()
function.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Link: http://lkml.kernel.org/r/20200520232525.798933-9-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-04 06:02:53 +07:00
|
|
|
seq_printf(m, "anon_cost %lu\n", anon_cost);
|
|
|
|
seq_printf(m, "file_cost %lu\n", file_cost);
|
2009-01-08 09:08:22 +07:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2008-02-07 15:14:25 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-08-09 07:11:24 +07:00
|
|
|
static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
|
|
|
|
struct cftype *cft)
|
2009-01-08 09:08:24 +07:00
|
|
|
{
|
2013-08-09 07:11:24 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
2009-01-08 09:08:24 +07:00
|
|
|
|
2011-07-27 06:08:21 +07:00
|
|
|
return mem_cgroup_swappiness(memcg);
|
2009-01-08 09:08:24 +07:00
|
|
|
}
|
|
|
|
|
2013-08-09 07:11:24 +07:00
|
|
|
static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
|
|
|
|
struct cftype *cft, u64 val)
|
2009-01-08 09:08:24 +07:00
|
|
|
{
|
2013-08-09 07:11:24 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
2009-01-08 09:08:24 +07:00
|
|
|
|
2014-06-05 06:07:01 +07:00
|
|
|
if (val > 100)
|
2009-01-08 09:08:24 +07:00
|
|
|
return -EINVAL;
|
|
|
|
|
2014-06-10 05:03:33 +07:00
|
|
|
if (css->parent)
|
2014-06-05 06:07:01 +07:00
|
|
|
memcg->swappiness = val;
|
|
|
|
else
|
|
|
|
vm_swappiness = val;
|
2009-01-16 04:51:26 +07:00
|
|
|
|
2009-01-08 09:08:24 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-03-11 06:22:24 +07:00
|
|
|
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
|
|
|
|
{
|
|
|
|
struct mem_cgroup_threshold_ary *t;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
unsigned long usage;
|
2010-03-11 06:22:24 +07:00
|
|
|
int i;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
if (!swap)
|
2010-05-27 04:42:47 +07:00
|
|
|
t = rcu_dereference(memcg->thresholds.primary);
|
2010-03-11 06:22:24 +07:00
|
|
|
else
|
2010-05-27 04:42:47 +07:00
|
|
|
t = rcu_dereference(memcg->memsw_thresholds.primary);
|
2010-03-11 06:22:24 +07:00
|
|
|
|
|
|
|
if (!t)
|
|
|
|
goto unlock;
|
|
|
|
|
2014-09-05 19:43:57 +07:00
|
|
|
usage = mem_cgroup_usage(memcg, swap);
|
2010-03-11 06:22:24 +07:00
|
|
|
|
|
|
|
/*
|
2012-05-30 05:06:57 +07:00
|
|
|
* current_threshold points to threshold just below or equal to usage.
|
2010-03-11 06:22:24 +07:00
|
|
|
* If it's not true, a threshold was crossed after last
|
|
|
|
* call of __mem_cgroup_threshold().
|
|
|
|
*/
|
2010-05-27 04:42:42 +07:00
|
|
|
i = t->current_threshold;
|
2010-03-11 06:22:24 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Iterate backward over array of thresholds starting from
|
|
|
|
* current_threshold and check if a threshold is crossed.
|
|
|
|
* If none of thresholds below usage is crossed, we read
|
|
|
|
* only one element of the array here.
|
|
|
|
*/
|
|
|
|
for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
|
|
|
|
eventfd_signal(t->entries[i].eventfd, 1);
|
|
|
|
|
|
|
|
/* i = current_threshold + 1 */
|
|
|
|
i++;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Iterate forward over array of thresholds starting from
|
|
|
|
* current_threshold+1 and check if a threshold is crossed.
|
|
|
|
* If none of thresholds above usage is crossed, we read
|
|
|
|
* only one element of the array here.
|
|
|
|
*/
|
|
|
|
for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
|
|
|
|
eventfd_signal(t->entries[i].eventfd, 1);
|
|
|
|
|
|
|
|
/* Update current_threshold */
|
2010-05-27 04:42:42 +07:00
|
|
|
t->current_threshold = i - 1;
|
2010-03-11 06:22:24 +07:00
|
|
|
unlock:
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
|
|
|
|
{
|
2010-10-08 02:59:27 +07:00
|
|
|
while (memcg) {
|
|
|
|
__mem_cgroup_threshold(memcg, false);
|
2016-01-15 06:21:23 +07:00
|
|
|
if (do_memsw_account())
|
2010-10-08 02:59:27 +07:00
|
|
|
__mem_cgroup_threshold(memcg, true);
|
|
|
|
|
|
|
|
memcg = parent_mem_cgroup(memcg);
|
|
|
|
}
|
2010-03-11 06:22:24 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static int compare_thresholds(const void *a, const void *b)
|
|
|
|
{
|
|
|
|
const struct mem_cgroup_threshold *_a = a;
|
|
|
|
const struct mem_cgroup_threshold *_b = b;
|
|
|
|
|
2013-09-12 04:23:08 +07:00
|
|
|
if (_a->threshold > _b->threshold)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (_a->threshold < _b->threshold)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
return 0;
|
2010-03-11 06:22:24 +07:00
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
|
2010-05-27 04:42:36 +07:00
|
|
|
{
|
|
|
|
struct mem_cgroup_eventfd_list *ev;
|
|
|
|
|
2014-07-31 06:08:33 +07:00
|
|
|
spin_lock(&memcg_oom_lock);
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
list_for_each_entry(ev, &memcg->oom_notify, list)
|
2010-05-27 04:42:36 +07:00
|
|
|
eventfd_signal(ev->eventfd, 1);
|
2014-07-31 06:08:33 +07:00
|
|
|
|
|
|
|
spin_unlock(&memcg_oom_lock);
|
2010-05-27 04:42:36 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
|
2010-05-27 04:42:36 +07:00
|
|
|
{
|
2010-10-28 05:33:41 +07:00
|
|
|
struct mem_cgroup *iter;
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
for_each_mem_cgroup_tree(iter, memcg)
|
2010-10-28 05:33:41 +07:00
|
|
|
mem_cgroup_oom_notify_cb(iter);
|
2010-05-27 04:42:36 +07:00
|
|
|
}
|
|
|
|
|
2013-11-23 06:20:43 +07:00
|
|
|
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
|
2013-11-23 06:20:43 +07:00
|
|
|
struct eventfd_ctx *eventfd, const char *args, enum res_type type)
|
2010-03-11 06:22:24 +07:00
|
|
|
{
|
2010-05-27 04:42:47 +07:00
|
|
|
struct mem_cgroup_thresholds *thresholds;
|
|
|
|
struct mem_cgroup_threshold_ary *new;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
unsigned long threshold;
|
|
|
|
unsigned long usage;
|
2010-05-27 04:42:47 +07:00
|
|
|
int i, size, ret;
|
2010-03-11 06:22:24 +07:00
|
|
|
|
2015-02-12 06:26:03 +07:00
|
|
|
ret = page_counter_memparse(args, "-1", &threshold);
|
2010-03-11 06:22:24 +07:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
mutex_lock(&memcg->thresholds_lock);
|
2010-05-27 04:42:47 +07:00
|
|
|
|
2014-08-07 06:05:59 +07:00
|
|
|
if (type == _MEM) {
|
2010-05-27 04:42:47 +07:00
|
|
|
thresholds = &memcg->thresholds;
|
2014-09-05 19:43:57 +07:00
|
|
|
usage = mem_cgroup_usage(memcg, false);
|
2014-08-07 06:05:59 +07:00
|
|
|
} else if (type == _MEMSWAP) {
|
2010-05-27 04:42:47 +07:00
|
|
|
thresholds = &memcg->memsw_thresholds;
|
2014-09-05 19:43:57 +07:00
|
|
|
usage = mem_cgroup_usage(memcg, true);
|
2014-08-07 06:05:59 +07:00
|
|
|
} else
|
2010-03-11 06:22:24 +07:00
|
|
|
BUG();
|
|
|
|
|
|
|
|
/* Check if a threshold crossed before adding a new one */
|
2010-05-27 04:42:47 +07:00
|
|
|
if (thresholds->primary)
|
2010-03-11 06:22:24 +07:00
|
|
|
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
|
|
|
|
|
2010-05-27 04:42:47 +07:00
|
|
|
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
|
2010-03-11 06:22:24 +07:00
|
|
|
|
|
|
|
/* Allocate memory for new array of thresholds */
|
2019-03-06 06:44:05 +07:00
|
|
|
new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
|
2010-05-27 04:42:47 +07:00
|
|
|
if (!new) {
|
2010-03-11 06:22:24 +07:00
|
|
|
ret = -ENOMEM;
|
|
|
|
goto unlock;
|
|
|
|
}
|
2010-05-27 04:42:47 +07:00
|
|
|
new->size = size;
|
2010-03-11 06:22:24 +07:00
|
|
|
|
|
|
|
/* Copy thresholds (if any) to new array */
|
2020-10-14 06:52:36 +07:00
|
|
|
if (thresholds->primary)
|
|
|
|
memcpy(new->entries, thresholds->primary->entries,
|
|
|
|
flex_array_size(new, entries, size - 1));
|
2010-05-27 04:42:47 +07:00
|
|
|
|
2010-03-11 06:22:24 +07:00
|
|
|
/* Add new threshold */
|
2010-05-27 04:42:47 +07:00
|
|
|
new->entries[size - 1].eventfd = eventfd;
|
|
|
|
new->entries[size - 1].threshold = threshold;
|
2010-03-11 06:22:24 +07:00
|
|
|
|
|
|
|
/* Sort thresholds. Registering of new threshold isn't time-critical */
|
2020-10-14 06:52:39 +07:00
|
|
|
sort(new->entries, size, sizeof(*new->entries),
|
2010-03-11 06:22:24 +07:00
|
|
|
compare_thresholds, NULL);
|
|
|
|
|
|
|
|
/* Find current threshold */
|
2010-05-27 04:42:47 +07:00
|
|
|
new->current_threshold = -1;
|
2010-03-11 06:22:24 +07:00
|
|
|
for (i = 0; i < size; i++) {
|
2012-05-30 05:06:57 +07:00
|
|
|
if (new->entries[i].threshold <= usage) {
|
2010-03-11 06:22:24 +07:00
|
|
|
/*
|
2010-05-27 04:42:47 +07:00
|
|
|
* new->current_threshold will not be used until
|
|
|
|
* rcu_assign_pointer(), so it's safe to increment
|
2010-03-11 06:22:24 +07:00
|
|
|
* it here.
|
|
|
|
*/
|
2010-05-27 04:42:47 +07:00
|
|
|
++new->current_threshold;
|
2012-05-30 05:06:57 +07:00
|
|
|
} else
|
|
|
|
break;
|
2010-03-11 06:22:24 +07:00
|
|
|
}
|
|
|
|
|
2010-05-27 04:42:47 +07:00
|
|
|
/* Free old spare buffer and save old primary buffer as spare */
|
|
|
|
kfree(thresholds->spare);
|
|
|
|
thresholds->spare = thresholds->primary;
|
|
|
|
|
|
|
|
rcu_assign_pointer(thresholds->primary, new);
|
2010-03-11 06:22:24 +07:00
|
|
|
|
2010-05-27 04:42:46 +07:00
|
|
|
/* To be sure that nobody uses thresholds */
|
2010-03-11 06:22:24 +07:00
|
|
|
synchronize_rcu();
|
|
|
|
|
|
|
|
unlock:
|
|
|
|
mutex_unlock(&memcg->thresholds_lock);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-11-23 06:20:43 +07:00
|
|
|
static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
|
2013-11-23 06:20:43 +07:00
|
|
|
struct eventfd_ctx *eventfd, const char *args)
|
|
|
|
{
|
2013-11-23 06:20:43 +07:00
|
|
|
return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
|
2013-11-23 06:20:43 +07:00
|
|
|
}
|
|
|
|
|
2013-11-23 06:20:43 +07:00
|
|
|
static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
|
2013-11-23 06:20:43 +07:00
|
|
|
struct eventfd_ctx *eventfd, const char *args)
|
|
|
|
{
|
2013-11-23 06:20:43 +07:00
|
|
|
return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
|
2013-11-23 06:20:43 +07:00
|
|
|
}
|
|
|
|
|
2013-11-23 06:20:43 +07:00
|
|
|
static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
|
2013-11-23 06:20:43 +07:00
|
|
|
struct eventfd_ctx *eventfd, enum res_type type)
|
2010-03-11 06:22:24 +07:00
|
|
|
{
|
2010-05-27 04:42:47 +07:00
|
|
|
struct mem_cgroup_thresholds *thresholds;
|
|
|
|
struct mem_cgroup_threshold_ary *new;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
unsigned long usage;
|
memcg: fix NULL pointer dereference in __mem_cgroup_usage_unregister_event
An eventfd monitors multiple memory thresholds of the cgroup, closes them,
the kernel deletes all events related to this eventfd. Before all events
are deleted, another eventfd monitors the memory threshold of this cgroup,
leading to a crash:
BUG: kernel NULL pointer dereference, address: 0000000000000004
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
PGD 800000033058e067 P4D 800000033058e067 PUD 3355ce067 PMD 0
Oops: 0002 [#1] SMP PTI
CPU: 2 PID: 14012 Comm: kworker/2:6 Kdump: loaded Not tainted 5.6.0-rc4 #3
Hardware name: LENOVO 20AWS01K00/20AWS01K00, BIOS GLET70WW (2.24 ) 05/21/2014
Workqueue: events memcg_event_remove
RIP: 0010:__mem_cgroup_usage_unregister_event+0xb3/0x190
RSP: 0018:ffffb47e01c4fe18 EFLAGS: 00010202
RAX: 0000000000000001 RBX: ffff8bb223a8a000 RCX: 0000000000000001
RDX: 0000000000000001 RSI: ffff8bb22fb83540 RDI: 0000000000000001
RBP: ffffb47e01c4fe48 R08: 0000000000000000 R09: 0000000000000010
R10: 000000000000000c R11: 071c71c71c71c71c R12: ffff8bb226aba880
R13: ffff8bb223a8a480 R14: 0000000000000000 R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff8bb242680000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000004 CR3: 000000032c29c003 CR4: 00000000001606e0
Call Trace:
memcg_event_remove+0x32/0x90
process_one_work+0x172/0x380
worker_thread+0x49/0x3f0
kthread+0xf8/0x130
ret_from_fork+0x35/0x40
CR2: 0000000000000004
We can reproduce this problem in the following ways:
1. We create a new cgroup subdirectory and a new eventfd, and then we
monitor multiple memory thresholds of the cgroup through this eventfd.
2. closing this eventfd, and __mem_cgroup_usage_unregister_event ()
will be called multiple times to delete all events related to this
eventfd.
The first time __mem_cgroup_usage_unregister_event() is called, the
kernel will clear all items related to this eventfd in thresholds->
primary.
Since there is currently only one eventfd, thresholds-> primary becomes
empty, so the kernel will set thresholds-> primary and hresholds-> spare
to NULL. If at this time, the user creates a new eventfd and monitor
the memory threshold of this cgroup, kernel will re-initialize
thresholds-> primary.
Then when __mem_cgroup_usage_unregister_event () is called for the
second time, because thresholds-> primary is not empty, the system will
access thresholds-> spare, but thresholds-> spare is NULL, which will
trigger a crash.
In general, the longer it takes to delete all events related to this
eventfd, the easier it is to trigger this problem.
The solution is to check whether the thresholds associated with the
eventfd has been cleared when deleting the event. If so, we do nothing.
[akpm@linux-foundation.org: fix comment, per Kirill]
Fixes: 907860ed381a ("cgroups: make cftype.unregister_event() void-returning")
Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/077a6f67-aefa-4591-efec-f2f3af2b0b02@gmail.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-03-22 08:22:10 +07:00
|
|
|
int i, j, size, entries;
|
2010-03-11 06:22:24 +07:00
|
|
|
|
|
|
|
mutex_lock(&memcg->thresholds_lock);
|
2014-08-07 06:05:59 +07:00
|
|
|
|
|
|
|
if (type == _MEM) {
|
2010-05-27 04:42:47 +07:00
|
|
|
thresholds = &memcg->thresholds;
|
2014-09-05 19:43:57 +07:00
|
|
|
usage = mem_cgroup_usage(memcg, false);
|
2014-08-07 06:05:59 +07:00
|
|
|
} else if (type == _MEMSWAP) {
|
2010-05-27 04:42:47 +07:00
|
|
|
thresholds = &memcg->memsw_thresholds;
|
2014-09-05 19:43:57 +07:00
|
|
|
usage = mem_cgroup_usage(memcg, true);
|
2014-08-07 06:05:59 +07:00
|
|
|
} else
|
2010-03-11 06:22:24 +07:00
|
|
|
BUG();
|
|
|
|
|
mm: memcg: Correct unregistring of events attached to the same eventfd
There is an issue when memcg unregisters events that were attached to
the same eventfd:
- On the first call mem_cgroup_usage_unregister_event() removes all
events attached to a given eventfd, and if there were no events left,
thresholds->primary would become NULL;
- Since there were several events registered, cgroups core will call
mem_cgroup_usage_unregister_event() again, but now kernel will oops,
as the function doesn't expect that threshold->primary may be NULL.
That's a good question whether mem_cgroup_usage_unregister_event()
should actually remove all events in one go, but nowadays it can't
do any better as cftype->unregister_event callback doesn't pass
any private event-associated cookie. So, let's fix the issue by
simply checking for threshold->primary.
FWIW, w/o the patch the following oops may be observed:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000004
IP: [<ffffffff810be32c>] mem_cgroup_usage_unregister_event+0x9c/0x1f0
Pid: 574, comm: kworker/0:2 Not tainted 3.3.0-rc4+ #9 Bochs Bochs
RIP: 0010:[<ffffffff810be32c>] [<ffffffff810be32c>] mem_cgroup_usage_unregister_event+0x9c/0x1f0
RSP: 0018:ffff88001d0b9d60 EFLAGS: 00010246
Process kworker/0:2 (pid: 574, threadinfo ffff88001d0b8000, task ffff88001de91cc0)
Call Trace:
[<ffffffff8107092b>] cgroup_event_remove+0x2b/0x60
[<ffffffff8103db94>] process_one_work+0x174/0x450
[<ffffffff8103e413>] worker_thread+0x123/0x2d0
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-02-24 08:14:46 +07:00
|
|
|
if (!thresholds->primary)
|
|
|
|
goto unlock;
|
|
|
|
|
2010-03-11 06:22:24 +07:00
|
|
|
/* Check if a threshold crossed before removing */
|
|
|
|
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
|
|
|
|
|
|
|
|
/* Calculate new number of threshold */
|
memcg: fix NULL pointer dereference in __mem_cgroup_usage_unregister_event
An eventfd monitors multiple memory thresholds of the cgroup, closes them,
the kernel deletes all events related to this eventfd. Before all events
are deleted, another eventfd monitors the memory threshold of this cgroup,
leading to a crash:
BUG: kernel NULL pointer dereference, address: 0000000000000004
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
PGD 800000033058e067 P4D 800000033058e067 PUD 3355ce067 PMD 0
Oops: 0002 [#1] SMP PTI
CPU: 2 PID: 14012 Comm: kworker/2:6 Kdump: loaded Not tainted 5.6.0-rc4 #3
Hardware name: LENOVO 20AWS01K00/20AWS01K00, BIOS GLET70WW (2.24 ) 05/21/2014
Workqueue: events memcg_event_remove
RIP: 0010:__mem_cgroup_usage_unregister_event+0xb3/0x190
RSP: 0018:ffffb47e01c4fe18 EFLAGS: 00010202
RAX: 0000000000000001 RBX: ffff8bb223a8a000 RCX: 0000000000000001
RDX: 0000000000000001 RSI: ffff8bb22fb83540 RDI: 0000000000000001
RBP: ffffb47e01c4fe48 R08: 0000000000000000 R09: 0000000000000010
R10: 000000000000000c R11: 071c71c71c71c71c R12: ffff8bb226aba880
R13: ffff8bb223a8a480 R14: 0000000000000000 R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff8bb242680000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000004 CR3: 000000032c29c003 CR4: 00000000001606e0
Call Trace:
memcg_event_remove+0x32/0x90
process_one_work+0x172/0x380
worker_thread+0x49/0x3f0
kthread+0xf8/0x130
ret_from_fork+0x35/0x40
CR2: 0000000000000004
We can reproduce this problem in the following ways:
1. We create a new cgroup subdirectory and a new eventfd, and then we
monitor multiple memory thresholds of the cgroup through this eventfd.
2. closing this eventfd, and __mem_cgroup_usage_unregister_event ()
will be called multiple times to delete all events related to this
eventfd.
The first time __mem_cgroup_usage_unregister_event() is called, the
kernel will clear all items related to this eventfd in thresholds->
primary.
Since there is currently only one eventfd, thresholds-> primary becomes
empty, so the kernel will set thresholds-> primary and hresholds-> spare
to NULL. If at this time, the user creates a new eventfd and monitor
the memory threshold of this cgroup, kernel will re-initialize
thresholds-> primary.
Then when __mem_cgroup_usage_unregister_event () is called for the
second time, because thresholds-> primary is not empty, the system will
access thresholds-> spare, but thresholds-> spare is NULL, which will
trigger a crash.
In general, the longer it takes to delete all events related to this
eventfd, the easier it is to trigger this problem.
The solution is to check whether the thresholds associated with the
eventfd has been cleared when deleting the event. If so, we do nothing.
[akpm@linux-foundation.org: fix comment, per Kirill]
Fixes: 907860ed381a ("cgroups: make cftype.unregister_event() void-returning")
Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/077a6f67-aefa-4591-efec-f2f3af2b0b02@gmail.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-03-22 08:22:10 +07:00
|
|
|
size = entries = 0;
|
2010-05-27 04:42:47 +07:00
|
|
|
for (i = 0; i < thresholds->primary->size; i++) {
|
|
|
|
if (thresholds->primary->entries[i].eventfd != eventfd)
|
2010-03-11 06:22:24 +07:00
|
|
|
size++;
|
memcg: fix NULL pointer dereference in __mem_cgroup_usage_unregister_event
An eventfd monitors multiple memory thresholds of the cgroup, closes them,
the kernel deletes all events related to this eventfd. Before all events
are deleted, another eventfd monitors the memory threshold of this cgroup,
leading to a crash:
BUG: kernel NULL pointer dereference, address: 0000000000000004
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
PGD 800000033058e067 P4D 800000033058e067 PUD 3355ce067 PMD 0
Oops: 0002 [#1] SMP PTI
CPU: 2 PID: 14012 Comm: kworker/2:6 Kdump: loaded Not tainted 5.6.0-rc4 #3
Hardware name: LENOVO 20AWS01K00/20AWS01K00, BIOS GLET70WW (2.24 ) 05/21/2014
Workqueue: events memcg_event_remove
RIP: 0010:__mem_cgroup_usage_unregister_event+0xb3/0x190
RSP: 0018:ffffb47e01c4fe18 EFLAGS: 00010202
RAX: 0000000000000001 RBX: ffff8bb223a8a000 RCX: 0000000000000001
RDX: 0000000000000001 RSI: ffff8bb22fb83540 RDI: 0000000000000001
RBP: ffffb47e01c4fe48 R08: 0000000000000000 R09: 0000000000000010
R10: 000000000000000c R11: 071c71c71c71c71c R12: ffff8bb226aba880
R13: ffff8bb223a8a480 R14: 0000000000000000 R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff8bb242680000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000004 CR3: 000000032c29c003 CR4: 00000000001606e0
Call Trace:
memcg_event_remove+0x32/0x90
process_one_work+0x172/0x380
worker_thread+0x49/0x3f0
kthread+0xf8/0x130
ret_from_fork+0x35/0x40
CR2: 0000000000000004
We can reproduce this problem in the following ways:
1. We create a new cgroup subdirectory and a new eventfd, and then we
monitor multiple memory thresholds of the cgroup through this eventfd.
2. closing this eventfd, and __mem_cgroup_usage_unregister_event ()
will be called multiple times to delete all events related to this
eventfd.
The first time __mem_cgroup_usage_unregister_event() is called, the
kernel will clear all items related to this eventfd in thresholds->
primary.
Since there is currently only one eventfd, thresholds-> primary becomes
empty, so the kernel will set thresholds-> primary and hresholds-> spare
to NULL. If at this time, the user creates a new eventfd and monitor
the memory threshold of this cgroup, kernel will re-initialize
thresholds-> primary.
Then when __mem_cgroup_usage_unregister_event () is called for the
second time, because thresholds-> primary is not empty, the system will
access thresholds-> spare, but thresholds-> spare is NULL, which will
trigger a crash.
In general, the longer it takes to delete all events related to this
eventfd, the easier it is to trigger this problem.
The solution is to check whether the thresholds associated with the
eventfd has been cleared when deleting the event. If so, we do nothing.
[akpm@linux-foundation.org: fix comment, per Kirill]
Fixes: 907860ed381a ("cgroups: make cftype.unregister_event() void-returning")
Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/077a6f67-aefa-4591-efec-f2f3af2b0b02@gmail.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-03-22 08:22:10 +07:00
|
|
|
else
|
|
|
|
entries++;
|
2010-03-11 06:22:24 +07:00
|
|
|
}
|
|
|
|
|
2010-05-27 04:42:47 +07:00
|
|
|
new = thresholds->spare;
|
2010-05-27 04:42:46 +07:00
|
|
|
|
memcg: fix NULL pointer dereference in __mem_cgroup_usage_unregister_event
An eventfd monitors multiple memory thresholds of the cgroup, closes them,
the kernel deletes all events related to this eventfd. Before all events
are deleted, another eventfd monitors the memory threshold of this cgroup,
leading to a crash:
BUG: kernel NULL pointer dereference, address: 0000000000000004
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
PGD 800000033058e067 P4D 800000033058e067 PUD 3355ce067 PMD 0
Oops: 0002 [#1] SMP PTI
CPU: 2 PID: 14012 Comm: kworker/2:6 Kdump: loaded Not tainted 5.6.0-rc4 #3
Hardware name: LENOVO 20AWS01K00/20AWS01K00, BIOS GLET70WW (2.24 ) 05/21/2014
Workqueue: events memcg_event_remove
RIP: 0010:__mem_cgroup_usage_unregister_event+0xb3/0x190
RSP: 0018:ffffb47e01c4fe18 EFLAGS: 00010202
RAX: 0000000000000001 RBX: ffff8bb223a8a000 RCX: 0000000000000001
RDX: 0000000000000001 RSI: ffff8bb22fb83540 RDI: 0000000000000001
RBP: ffffb47e01c4fe48 R08: 0000000000000000 R09: 0000000000000010
R10: 000000000000000c R11: 071c71c71c71c71c R12: ffff8bb226aba880
R13: ffff8bb223a8a480 R14: 0000000000000000 R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff8bb242680000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000004 CR3: 000000032c29c003 CR4: 00000000001606e0
Call Trace:
memcg_event_remove+0x32/0x90
process_one_work+0x172/0x380
worker_thread+0x49/0x3f0
kthread+0xf8/0x130
ret_from_fork+0x35/0x40
CR2: 0000000000000004
We can reproduce this problem in the following ways:
1. We create a new cgroup subdirectory and a new eventfd, and then we
monitor multiple memory thresholds of the cgroup through this eventfd.
2. closing this eventfd, and __mem_cgroup_usage_unregister_event ()
will be called multiple times to delete all events related to this
eventfd.
The first time __mem_cgroup_usage_unregister_event() is called, the
kernel will clear all items related to this eventfd in thresholds->
primary.
Since there is currently only one eventfd, thresholds-> primary becomes
empty, so the kernel will set thresholds-> primary and hresholds-> spare
to NULL. If at this time, the user creates a new eventfd and monitor
the memory threshold of this cgroup, kernel will re-initialize
thresholds-> primary.
Then when __mem_cgroup_usage_unregister_event () is called for the
second time, because thresholds-> primary is not empty, the system will
access thresholds-> spare, but thresholds-> spare is NULL, which will
trigger a crash.
In general, the longer it takes to delete all events related to this
eventfd, the easier it is to trigger this problem.
The solution is to check whether the thresholds associated with the
eventfd has been cleared when deleting the event. If so, we do nothing.
[akpm@linux-foundation.org: fix comment, per Kirill]
Fixes: 907860ed381a ("cgroups: make cftype.unregister_event() void-returning")
Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/077a6f67-aefa-4591-efec-f2f3af2b0b02@gmail.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-03-22 08:22:10 +07:00
|
|
|
/* If no items related to eventfd have been cleared, nothing to do */
|
|
|
|
if (!entries)
|
|
|
|
goto unlock;
|
|
|
|
|
2010-03-11 06:22:24 +07:00
|
|
|
/* Set thresholds array to NULL if we don't have thresholds */
|
|
|
|
if (!size) {
|
2010-05-27 04:42:47 +07:00
|
|
|
kfree(new);
|
|
|
|
new = NULL;
|
2010-05-27 04:42:46 +07:00
|
|
|
goto swap_buffers;
|
2010-03-11 06:22:24 +07:00
|
|
|
}
|
|
|
|
|
2010-05-27 04:42:47 +07:00
|
|
|
new->size = size;
|
2010-03-11 06:22:24 +07:00
|
|
|
|
|
|
|
/* Copy thresholds and find current threshold */
|
2010-05-27 04:42:47 +07:00
|
|
|
new->current_threshold = -1;
|
|
|
|
for (i = 0, j = 0; i < thresholds->primary->size; i++) {
|
|
|
|
if (thresholds->primary->entries[i].eventfd == eventfd)
|
2010-03-11 06:22:24 +07:00
|
|
|
continue;
|
|
|
|
|
2010-05-27 04:42:47 +07:00
|
|
|
new->entries[j] = thresholds->primary->entries[i];
|
2012-05-30 05:06:57 +07:00
|
|
|
if (new->entries[j].threshold <= usage) {
|
2010-03-11 06:22:24 +07:00
|
|
|
/*
|
2010-05-27 04:42:47 +07:00
|
|
|
* new->current_threshold will not be used
|
2010-03-11 06:22:24 +07:00
|
|
|
* until rcu_assign_pointer(), so it's safe to increment
|
|
|
|
* it here.
|
|
|
|
*/
|
2010-05-27 04:42:47 +07:00
|
|
|
++new->current_threshold;
|
2010-03-11 06:22:24 +07:00
|
|
|
}
|
|
|
|
j++;
|
|
|
|
}
|
|
|
|
|
2010-05-27 04:42:46 +07:00
|
|
|
swap_buffers:
|
2010-05-27 04:42:47 +07:00
|
|
|
/* Swap primary and spare array */
|
|
|
|
thresholds->spare = thresholds->primary;
|
2012-05-11 03:01:45 +07:00
|
|
|
|
2010-05-27 04:42:47 +07:00
|
|
|
rcu_assign_pointer(thresholds->primary, new);
|
2010-03-11 06:22:24 +07:00
|
|
|
|
2010-05-27 04:42:46 +07:00
|
|
|
/* To be sure that nobody uses thresholds */
|
2010-03-11 06:22:24 +07:00
|
|
|
synchronize_rcu();
|
2016-01-16 07:57:49 +07:00
|
|
|
|
|
|
|
/* If all events are unregistered, free the spare array */
|
|
|
|
if (!new) {
|
|
|
|
kfree(thresholds->spare);
|
|
|
|
thresholds->spare = NULL;
|
|
|
|
}
|
mm: memcg: Correct unregistring of events attached to the same eventfd
There is an issue when memcg unregisters events that were attached to
the same eventfd:
- On the first call mem_cgroup_usage_unregister_event() removes all
events attached to a given eventfd, and if there were no events left,
thresholds->primary would become NULL;
- Since there were several events registered, cgroups core will call
mem_cgroup_usage_unregister_event() again, but now kernel will oops,
as the function doesn't expect that threshold->primary may be NULL.
That's a good question whether mem_cgroup_usage_unregister_event()
should actually remove all events in one go, but nowadays it can't
do any better as cftype->unregister_event callback doesn't pass
any private event-associated cookie. So, let's fix the issue by
simply checking for threshold->primary.
FWIW, w/o the patch the following oops may be observed:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000004
IP: [<ffffffff810be32c>] mem_cgroup_usage_unregister_event+0x9c/0x1f0
Pid: 574, comm: kworker/0:2 Not tainted 3.3.0-rc4+ #9 Bochs Bochs
RIP: 0010:[<ffffffff810be32c>] [<ffffffff810be32c>] mem_cgroup_usage_unregister_event+0x9c/0x1f0
RSP: 0018:ffff88001d0b9d60 EFLAGS: 00010246
Process kworker/0:2 (pid: 574, threadinfo ffff88001d0b8000, task ffff88001de91cc0)
Call Trace:
[<ffffffff8107092b>] cgroup_event_remove+0x2b/0x60
[<ffffffff8103db94>] process_one_work+0x174/0x450
[<ffffffff8103e413>] worker_thread+0x123/0x2d0
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-02-24 08:14:46 +07:00
|
|
|
unlock:
|
2010-03-11 06:22:24 +07:00
|
|
|
mutex_unlock(&memcg->thresholds_lock);
|
|
|
|
}
|
2009-01-08 09:07:55 +07:00
|
|
|
|
2013-11-23 06:20:43 +07:00
|
|
|
static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
|
2013-11-23 06:20:43 +07:00
|
|
|
struct eventfd_ctx *eventfd)
|
|
|
|
{
|
2013-11-23 06:20:43 +07:00
|
|
|
return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
|
2013-11-23 06:20:43 +07:00
|
|
|
}
|
|
|
|
|
2013-11-23 06:20:43 +07:00
|
|
|
static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
|
2013-11-23 06:20:43 +07:00
|
|
|
struct eventfd_ctx *eventfd)
|
|
|
|
{
|
2013-11-23 06:20:43 +07:00
|
|
|
return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
|
2013-11-23 06:20:43 +07:00
|
|
|
}
|
|
|
|
|
2013-11-23 06:20:43 +07:00
|
|
|
static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
|
2013-11-23 06:20:43 +07:00
|
|
|
struct eventfd_ctx *eventfd, const char *args)
|
2010-05-27 04:42:36 +07:00
|
|
|
{
|
|
|
|
struct mem_cgroup_eventfd_list *event;
|
|
|
|
|
|
|
|
event = kmalloc(sizeof(*event), GFP_KERNEL);
|
|
|
|
if (!event)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2011-07-27 06:08:24 +07:00
|
|
|
spin_lock(&memcg_oom_lock);
|
2010-05-27 04:42:36 +07:00
|
|
|
|
|
|
|
event->eventfd = eventfd;
|
|
|
|
list_add(&event->list, &memcg->oom_notify);
|
|
|
|
|
|
|
|
/* already in OOM ? */
|
2015-06-25 06:58:23 +07:00
|
|
|
if (memcg->under_oom)
|
2010-05-27 04:42:36 +07:00
|
|
|
eventfd_signal(eventfd, 1);
|
2011-07-27 06:08:24 +07:00
|
|
|
spin_unlock(&memcg_oom_lock);
|
2010-05-27 04:42:36 +07:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-11-23 06:20:43 +07:00
|
|
|
static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
|
2013-11-23 06:20:43 +07:00
|
|
|
struct eventfd_ctx *eventfd)
|
2010-05-27 04:42:36 +07:00
|
|
|
{
|
|
|
|
struct mem_cgroup_eventfd_list *ev, *tmp;
|
|
|
|
|
2011-07-27 06:08:24 +07:00
|
|
|
spin_lock(&memcg_oom_lock);
|
2010-05-27 04:42:36 +07:00
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
|
2010-05-27 04:42:36 +07:00
|
|
|
if (ev->eventfd == eventfd) {
|
|
|
|
list_del(&ev->list);
|
|
|
|
kfree(ev);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-07-27 06:08:24 +07:00
|
|
|
spin_unlock(&memcg_oom_lock);
|
2010-05-27 04:42:36 +07:00
|
|
|
}
|
|
|
|
|
2013-12-06 00:28:04 +07:00
|
|
|
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
|
2010-05-27 04:42:37 +07:00
|
|
|
{
|
2019-03-06 06:45:52 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
|
2010-05-27 04:42:37 +07:00
|
|
|
|
2013-12-06 00:28:02 +07:00
|
|
|
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
|
2015-06-25 06:58:23 +07:00
|
|
|
seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
|
2018-06-15 05:28:05 +07:00
|
|
|
seq_printf(sf, "oom_kill %lu\n",
|
|
|
|
atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
|
2010-05-27 04:42:37 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-08-09 07:11:24 +07:00
|
|
|
static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
|
2010-05-27 04:42:37 +07:00
|
|
|
struct cftype *cft, u64 val)
|
|
|
|
{
|
2013-08-09 07:11:24 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
2010-05-27 04:42:37 +07:00
|
|
|
|
|
|
|
/* cannot set to root cgroup and only 0 and 1 are allowed */
|
2014-06-10 05:03:33 +07:00
|
|
|
if (!css->parent || !((val == 0) || (val == 1)))
|
2010-05-27 04:42:37 +07:00
|
|
|
return -EINVAL;
|
|
|
|
|
2011-11-03 03:38:15 +07:00
|
|
|
memcg->oom_kill_disable = val;
|
2010-06-30 05:05:18 +07:00
|
|
|
if (!val)
|
2011-11-03 03:38:15 +07:00
|
|
|
memcg_oom_recover(memcg);
|
2014-06-05 06:07:01 +07:00
|
|
|
|
2010-05-27 04:42:37 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-05-23 04:13:37 +07:00
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
|
|
|
2019-08-30 05:47:19 +07:00
|
|
|
#include <trace/events/writeback.h>
|
|
|
|
|
2015-05-23 05:23:33 +07:00
|
|
|
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
|
|
|
|
{
|
|
|
|
return wb_domain_init(&memcg->cgwb_domain, gfp);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
wb_domain_exit(&memcg->cgwb_domain);
|
|
|
|
}
|
|
|
|
|
2015-05-23 05:23:34 +07:00
|
|
|
static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
wb_domain_size_changed(&memcg->cgwb_domain);
|
|
|
|
}
|
|
|
|
|
2015-05-23 05:23:33 +07:00
|
|
|
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
|
|
|
|
|
|
|
|
if (!memcg->css.parent)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return &memcg->cgwb_domain;
|
|
|
|
}
|
|
|
|
|
mm: writeback: use exact memcg dirty counts
Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting") memcg dirty and writeback counters are managed
as:
1) per-memcg per-cpu values in range of [-32..32]
2) per-memcg atomic counter
When a per-cpu counter cannot fit in [-32..32] it's flushed to the
atomic. Stat readers only check the atomic. Thus readers such as
balance_dirty_pages() may see a nontrivial error margin: 32 pages per
cpu.
Assuming 100 cpus:
4k x86 page_size: 13 MiB error per memcg
64k ppc page_size: 200 MiB error per memcg
Considering that dirty+writeback are used together for some decisions the
errors double.
This inaccuracy can lead to undeserved oom kills. One nasty case is
when all per-cpu counters hold positive values offsetting an atomic
negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
balance_dirty_pages() only consults the atomic and does not consider
throttling the next n_cpu*32 dirty pages. If the file_lru is in the
13..200 MiB range then there's absolutely no dirty throttling, which
burdens vmscan with only dirty+writeback pages thus resorting to oom
kill.
It could be argued that tiny containers are not supported, but it's more
subtle. It's the amount the space available for file lru that matters.
If a container has memory.max-200MiB of non reclaimable memory, then it
will also suffer such oom kills on a 100 cpu machine.
The following test reliably ooms without this patch. This patch avoids
oom kills.
$ cat test
mount -t cgroup2 none /dev/cgroup
cd /dev/cgroup
echo +io +memory > cgroup.subtree_control
mkdir test
cd test
echo 10M > memory.max
(echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
(echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)
$ cat memcg-writeback-stress.c
/*
* Dirty pages from all but one cpu.
* Clean pages from the non dirtying cpu.
* This is to stress per cpu counter imbalance.
* On a 100 cpu machine:
* - per memcg per cpu dirty count is 32 pages for each of 99 cpus
* - per memcg atomic is -99*32 pages
* - thus the complete dirty limit: sum of all counters 0
* - balance_dirty_pages() only sees atomic count -99*32 pages, which
* it max()s to 0.
* - So a workload can dirty -99*32 pages before balance_dirty_pages()
* cares.
*/
#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <sched.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysinfo.h>
#include <sys/types.h>
#include <unistd.h>
static char *buf;
static int bufSize;
static void set_affinity(int cpu)
{
cpu_set_t affinity;
CPU_ZERO(&affinity);
CPU_SET(cpu, &affinity);
if (sched_setaffinity(0, sizeof(affinity), &affinity))
err(1, "sched_setaffinity");
}
static void dirty_on(int output_fd, int cpu)
{
int i, wrote;
set_affinity(cpu);
for (i = 0; i < 32; i++) {
for (wrote = 0; wrote < bufSize; ) {
int ret = write(output_fd, buf+wrote, bufSize-wrote);
if (ret == -1)
err(1, "write");
wrote += ret;
}
}
}
int main(int argc, char **argv)
{
int cpu, flush_cpu = 1, output_fd;
const char *output;
if (argc != 2)
errx(1, "usage: output_file");
output = argv[1];
bufSize = getpagesize();
buf = malloc(getpagesize());
if (buf == NULL)
errx(1, "malloc failed");
output_fd = open(output, O_CREAT|O_RDWR);
if (output_fd == -1)
err(1, "open(%s)", output);
for (cpu = 0; cpu < get_nprocs(); cpu++) {
if (cpu != flush_cpu)
dirty_on(output_fd, cpu);
}
set_affinity(flush_cpu);
if (fsync(output_fd))
err(1, "fsync(%s)", output);
if (close(output_fd))
err(1, "close(%s)", output);
free(buf);
}
Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
collect exact per memcg counters. This avoids the aforementioned oom
kills.
This does not affect the overhead of memory.stat, which still reads the
single atomic counter.
Why not use percpu_counter? memcg already handles cpus going offline, so
no need for that overhead from percpu_counter. And the percpu_counter
spinlocks are more heavyweight than is required.
It probably also makes sense to use exact dirty and writeback counters
in memcg oom reports. But that is saved for later.
Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
Signed-off-by: Greg Thelen <gthelen@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org> [4.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-04-06 08:39:18 +07:00
|
|
|
/*
|
|
|
|
* idx can be of type enum memcg_stat_item or node_stat_item.
|
|
|
|
* Keep in sync with memcg_exact_page().
|
|
|
|
*/
|
|
|
|
static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
|
|
|
|
{
|
2019-05-15 05:46:57 +07:00
|
|
|
long x = atomic_long_read(&memcg->vmstats[idx]);
|
mm: writeback: use exact memcg dirty counts
Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting") memcg dirty and writeback counters are managed
as:
1) per-memcg per-cpu values in range of [-32..32]
2) per-memcg atomic counter
When a per-cpu counter cannot fit in [-32..32] it's flushed to the
atomic. Stat readers only check the atomic. Thus readers such as
balance_dirty_pages() may see a nontrivial error margin: 32 pages per
cpu.
Assuming 100 cpus:
4k x86 page_size: 13 MiB error per memcg
64k ppc page_size: 200 MiB error per memcg
Considering that dirty+writeback are used together for some decisions the
errors double.
This inaccuracy can lead to undeserved oom kills. One nasty case is
when all per-cpu counters hold positive values offsetting an atomic
negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
balance_dirty_pages() only consults the atomic and does not consider
throttling the next n_cpu*32 dirty pages. If the file_lru is in the
13..200 MiB range then there's absolutely no dirty throttling, which
burdens vmscan with only dirty+writeback pages thus resorting to oom
kill.
It could be argued that tiny containers are not supported, but it's more
subtle. It's the amount the space available for file lru that matters.
If a container has memory.max-200MiB of non reclaimable memory, then it
will also suffer such oom kills on a 100 cpu machine.
The following test reliably ooms without this patch. This patch avoids
oom kills.
$ cat test
mount -t cgroup2 none /dev/cgroup
cd /dev/cgroup
echo +io +memory > cgroup.subtree_control
mkdir test
cd test
echo 10M > memory.max
(echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
(echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)
$ cat memcg-writeback-stress.c
/*
* Dirty pages from all but one cpu.
* Clean pages from the non dirtying cpu.
* This is to stress per cpu counter imbalance.
* On a 100 cpu machine:
* - per memcg per cpu dirty count is 32 pages for each of 99 cpus
* - per memcg atomic is -99*32 pages
* - thus the complete dirty limit: sum of all counters 0
* - balance_dirty_pages() only sees atomic count -99*32 pages, which
* it max()s to 0.
* - So a workload can dirty -99*32 pages before balance_dirty_pages()
* cares.
*/
#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <sched.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysinfo.h>
#include <sys/types.h>
#include <unistd.h>
static char *buf;
static int bufSize;
static void set_affinity(int cpu)
{
cpu_set_t affinity;
CPU_ZERO(&affinity);
CPU_SET(cpu, &affinity);
if (sched_setaffinity(0, sizeof(affinity), &affinity))
err(1, "sched_setaffinity");
}
static void dirty_on(int output_fd, int cpu)
{
int i, wrote;
set_affinity(cpu);
for (i = 0; i < 32; i++) {
for (wrote = 0; wrote < bufSize; ) {
int ret = write(output_fd, buf+wrote, bufSize-wrote);
if (ret == -1)
err(1, "write");
wrote += ret;
}
}
}
int main(int argc, char **argv)
{
int cpu, flush_cpu = 1, output_fd;
const char *output;
if (argc != 2)
errx(1, "usage: output_file");
output = argv[1];
bufSize = getpagesize();
buf = malloc(getpagesize());
if (buf == NULL)
errx(1, "malloc failed");
output_fd = open(output, O_CREAT|O_RDWR);
if (output_fd == -1)
err(1, "open(%s)", output);
for (cpu = 0; cpu < get_nprocs(); cpu++) {
if (cpu != flush_cpu)
dirty_on(output_fd, cpu);
}
set_affinity(flush_cpu);
if (fsync(output_fd))
err(1, "fsync(%s)", output);
if (close(output_fd))
err(1, "close(%s)", output);
free(buf);
}
Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
collect exact per memcg counters. This avoids the aforementioned oom
kills.
This does not affect the overhead of memory.stat, which still reads the
single atomic counter.
Why not use percpu_counter? memcg already handles cpus going offline, so
no need for that overhead from percpu_counter. And the percpu_counter
spinlocks are more heavyweight than is required.
It probably also makes sense to use exact dirty and writeback counters
in memcg oom reports. But that is saved for later.
Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
Signed-off-by: Greg Thelen <gthelen@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org> [4.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-04-06 08:39:18 +07:00
|
|
|
int cpu;
|
|
|
|
|
|
|
|
for_each_online_cpu(cpu)
|
2019-05-15 05:46:57 +07:00
|
|
|
x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
|
mm: writeback: use exact memcg dirty counts
Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting") memcg dirty and writeback counters are managed
as:
1) per-memcg per-cpu values in range of [-32..32]
2) per-memcg atomic counter
When a per-cpu counter cannot fit in [-32..32] it's flushed to the
atomic. Stat readers only check the atomic. Thus readers such as
balance_dirty_pages() may see a nontrivial error margin: 32 pages per
cpu.
Assuming 100 cpus:
4k x86 page_size: 13 MiB error per memcg
64k ppc page_size: 200 MiB error per memcg
Considering that dirty+writeback are used together for some decisions the
errors double.
This inaccuracy can lead to undeserved oom kills. One nasty case is
when all per-cpu counters hold positive values offsetting an atomic
negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
balance_dirty_pages() only consults the atomic and does not consider
throttling the next n_cpu*32 dirty pages. If the file_lru is in the
13..200 MiB range then there's absolutely no dirty throttling, which
burdens vmscan with only dirty+writeback pages thus resorting to oom
kill.
It could be argued that tiny containers are not supported, but it's more
subtle. It's the amount the space available for file lru that matters.
If a container has memory.max-200MiB of non reclaimable memory, then it
will also suffer such oom kills on a 100 cpu machine.
The following test reliably ooms without this patch. This patch avoids
oom kills.
$ cat test
mount -t cgroup2 none /dev/cgroup
cd /dev/cgroup
echo +io +memory > cgroup.subtree_control
mkdir test
cd test
echo 10M > memory.max
(echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
(echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)
$ cat memcg-writeback-stress.c
/*
* Dirty pages from all but one cpu.
* Clean pages from the non dirtying cpu.
* This is to stress per cpu counter imbalance.
* On a 100 cpu machine:
* - per memcg per cpu dirty count is 32 pages for each of 99 cpus
* - per memcg atomic is -99*32 pages
* - thus the complete dirty limit: sum of all counters 0
* - balance_dirty_pages() only sees atomic count -99*32 pages, which
* it max()s to 0.
* - So a workload can dirty -99*32 pages before balance_dirty_pages()
* cares.
*/
#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <sched.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysinfo.h>
#include <sys/types.h>
#include <unistd.h>
static char *buf;
static int bufSize;
static void set_affinity(int cpu)
{
cpu_set_t affinity;
CPU_ZERO(&affinity);
CPU_SET(cpu, &affinity);
if (sched_setaffinity(0, sizeof(affinity), &affinity))
err(1, "sched_setaffinity");
}
static void dirty_on(int output_fd, int cpu)
{
int i, wrote;
set_affinity(cpu);
for (i = 0; i < 32; i++) {
for (wrote = 0; wrote < bufSize; ) {
int ret = write(output_fd, buf+wrote, bufSize-wrote);
if (ret == -1)
err(1, "write");
wrote += ret;
}
}
}
int main(int argc, char **argv)
{
int cpu, flush_cpu = 1, output_fd;
const char *output;
if (argc != 2)
errx(1, "usage: output_file");
output = argv[1];
bufSize = getpagesize();
buf = malloc(getpagesize());
if (buf == NULL)
errx(1, "malloc failed");
output_fd = open(output, O_CREAT|O_RDWR);
if (output_fd == -1)
err(1, "open(%s)", output);
for (cpu = 0; cpu < get_nprocs(); cpu++) {
if (cpu != flush_cpu)
dirty_on(output_fd, cpu);
}
set_affinity(flush_cpu);
if (fsync(output_fd))
err(1, "fsync(%s)", output);
if (close(output_fd))
err(1, "close(%s)", output);
free(buf);
}
Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
collect exact per memcg counters. This avoids the aforementioned oom
kills.
This does not affect the overhead of memory.stat, which still reads the
single atomic counter.
Why not use percpu_counter? memcg already handles cpus going offline, so
no need for that overhead from percpu_counter. And the percpu_counter
spinlocks are more heavyweight than is required.
It probably also makes sense to use exact dirty and writeback counters
in memcg oom reports. But that is saved for later.
Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
Signed-off-by: Greg Thelen <gthelen@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org> [4.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-04-06 08:39:18 +07:00
|
|
|
if (x < 0)
|
|
|
|
x = 0;
|
|
|
|
return x;
|
|
|
|
}
|
|
|
|
|
2015-05-23 05:23:35 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
|
|
|
|
* @wb: bdi_writeback in question
|
writeback: fix incorrect calculation of available memory for memcg domains
For memcg domains, the amount of available memory was calculated as
min(the amount currently in use + headroom according to memcg,
total clean memory)
This isn't quite correct as what should be capped by the amount of
clean memory is the headroom, not the sum of memory in use and
headroom. For example, if a memcg domain has a significant amount of
dirty memory, the above can lead to a value which is lower than the
current amount in use which doesn't make much sense. In most
circumstances, the above leads to a number which is somewhat but not
drastically lower.
As the amount of memory which can be readily allocated to the memcg
domain is capped by the amount of system-wide clean memory which is
not already assigned to the memcg itself, the number we want is
the amount currently in use +
min(headroom according to memcg, clean memory elsewhere in the system)
This patch updates mem_cgroup_wb_stats() to return the number of
filepages and headroom instead of the calculated available pages.
mdtc_cap_avail() is renamed to mdtc_calc_avail() and performs the
above calculation from file, headroom, dirty and globally clean pages.
v2: Dummy mem_cgroup_wb_stats() implementation wasn't updated leading
to build failure when !CGROUP_WRITEBACK. Fixed.
Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: c2aa723a6093 ("writeback: implement memcg writeback domain based throttling")
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-09-30 00:04:26 +07:00
|
|
|
* @pfilepages: out parameter for number of file pages
|
|
|
|
* @pheadroom: out parameter for number of allocatable pages according to memcg
|
2015-05-23 05:23:35 +07:00
|
|
|
* @pdirty: out parameter for number of dirty pages
|
|
|
|
* @pwriteback: out parameter for number of pages under writeback
|
|
|
|
*
|
writeback: fix incorrect calculation of available memory for memcg domains
For memcg domains, the amount of available memory was calculated as
min(the amount currently in use + headroom according to memcg,
total clean memory)
This isn't quite correct as what should be capped by the amount of
clean memory is the headroom, not the sum of memory in use and
headroom. For example, if a memcg domain has a significant amount of
dirty memory, the above can lead to a value which is lower than the
current amount in use which doesn't make much sense. In most
circumstances, the above leads to a number which is somewhat but not
drastically lower.
As the amount of memory which can be readily allocated to the memcg
domain is capped by the amount of system-wide clean memory which is
not already assigned to the memcg itself, the number we want is
the amount currently in use +
min(headroom according to memcg, clean memory elsewhere in the system)
This patch updates mem_cgroup_wb_stats() to return the number of
filepages and headroom instead of the calculated available pages.
mdtc_cap_avail() is renamed to mdtc_calc_avail() and performs the
above calculation from file, headroom, dirty and globally clean pages.
v2: Dummy mem_cgroup_wb_stats() implementation wasn't updated leading
to build failure when !CGROUP_WRITEBACK. Fixed.
Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: c2aa723a6093 ("writeback: implement memcg writeback domain based throttling")
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-09-30 00:04:26 +07:00
|
|
|
* Determine the numbers of file, headroom, dirty, and writeback pages in
|
|
|
|
* @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
|
|
|
|
* is a bit more involved.
|
2015-05-23 05:23:35 +07:00
|
|
|
*
|
writeback: fix incorrect calculation of available memory for memcg domains
For memcg domains, the amount of available memory was calculated as
min(the amount currently in use + headroom according to memcg,
total clean memory)
This isn't quite correct as what should be capped by the amount of
clean memory is the headroom, not the sum of memory in use and
headroom. For example, if a memcg domain has a significant amount of
dirty memory, the above can lead to a value which is lower than the
current amount in use which doesn't make much sense. In most
circumstances, the above leads to a number which is somewhat but not
drastically lower.
As the amount of memory which can be readily allocated to the memcg
domain is capped by the amount of system-wide clean memory which is
not already assigned to the memcg itself, the number we want is
the amount currently in use +
min(headroom according to memcg, clean memory elsewhere in the system)
This patch updates mem_cgroup_wb_stats() to return the number of
filepages and headroom instead of the calculated available pages.
mdtc_cap_avail() is renamed to mdtc_calc_avail() and performs the
above calculation from file, headroom, dirty and globally clean pages.
v2: Dummy mem_cgroup_wb_stats() implementation wasn't updated leading
to build failure when !CGROUP_WRITEBACK. Fixed.
Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: c2aa723a6093 ("writeback: implement memcg writeback domain based throttling")
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-09-30 00:04:26 +07:00
|
|
|
* A memcg's headroom is "min(max, high) - used". In the hierarchy, the
|
|
|
|
* headroom is calculated as the lowest headroom of itself and the
|
|
|
|
* ancestors. Note that this doesn't consider the actual amount of
|
|
|
|
* available memory in the system. The caller should further cap
|
|
|
|
* *@pheadroom accordingly.
|
2015-05-23 05:23:35 +07:00
|
|
|
*/
|
writeback: fix incorrect calculation of available memory for memcg domains
For memcg domains, the amount of available memory was calculated as
min(the amount currently in use + headroom according to memcg,
total clean memory)
This isn't quite correct as what should be capped by the amount of
clean memory is the headroom, not the sum of memory in use and
headroom. For example, if a memcg domain has a significant amount of
dirty memory, the above can lead to a value which is lower than the
current amount in use which doesn't make much sense. In most
circumstances, the above leads to a number which is somewhat but not
drastically lower.
As the amount of memory which can be readily allocated to the memcg
domain is capped by the amount of system-wide clean memory which is
not already assigned to the memcg itself, the number we want is
the amount currently in use +
min(headroom according to memcg, clean memory elsewhere in the system)
This patch updates mem_cgroup_wb_stats() to return the number of
filepages and headroom instead of the calculated available pages.
mdtc_cap_avail() is renamed to mdtc_calc_avail() and performs the
above calculation from file, headroom, dirty and globally clean pages.
v2: Dummy mem_cgroup_wb_stats() implementation wasn't updated leading
to build failure when !CGROUP_WRITEBACK. Fixed.
Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: c2aa723a6093 ("writeback: implement memcg writeback domain based throttling")
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-09-30 00:04:26 +07:00
|
|
|
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
|
|
|
|
unsigned long *pheadroom, unsigned long *pdirty,
|
|
|
|
unsigned long *pwriteback)
|
2015-05-23 05:23:35 +07:00
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
|
|
|
|
struct mem_cgroup *parent;
|
|
|
|
|
mm: writeback: use exact memcg dirty counts
Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting") memcg dirty and writeback counters are managed
as:
1) per-memcg per-cpu values in range of [-32..32]
2) per-memcg atomic counter
When a per-cpu counter cannot fit in [-32..32] it's flushed to the
atomic. Stat readers only check the atomic. Thus readers such as
balance_dirty_pages() may see a nontrivial error margin: 32 pages per
cpu.
Assuming 100 cpus:
4k x86 page_size: 13 MiB error per memcg
64k ppc page_size: 200 MiB error per memcg
Considering that dirty+writeback are used together for some decisions the
errors double.
This inaccuracy can lead to undeserved oom kills. One nasty case is
when all per-cpu counters hold positive values offsetting an atomic
negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
balance_dirty_pages() only consults the atomic and does not consider
throttling the next n_cpu*32 dirty pages. If the file_lru is in the
13..200 MiB range then there's absolutely no dirty throttling, which
burdens vmscan with only dirty+writeback pages thus resorting to oom
kill.
It could be argued that tiny containers are not supported, but it's more
subtle. It's the amount the space available for file lru that matters.
If a container has memory.max-200MiB of non reclaimable memory, then it
will also suffer such oom kills on a 100 cpu machine.
The following test reliably ooms without this patch. This patch avoids
oom kills.
$ cat test
mount -t cgroup2 none /dev/cgroup
cd /dev/cgroup
echo +io +memory > cgroup.subtree_control
mkdir test
cd test
echo 10M > memory.max
(echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
(echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)
$ cat memcg-writeback-stress.c
/*
* Dirty pages from all but one cpu.
* Clean pages from the non dirtying cpu.
* This is to stress per cpu counter imbalance.
* On a 100 cpu machine:
* - per memcg per cpu dirty count is 32 pages for each of 99 cpus
* - per memcg atomic is -99*32 pages
* - thus the complete dirty limit: sum of all counters 0
* - balance_dirty_pages() only sees atomic count -99*32 pages, which
* it max()s to 0.
* - So a workload can dirty -99*32 pages before balance_dirty_pages()
* cares.
*/
#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <sched.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysinfo.h>
#include <sys/types.h>
#include <unistd.h>
static char *buf;
static int bufSize;
static void set_affinity(int cpu)
{
cpu_set_t affinity;
CPU_ZERO(&affinity);
CPU_SET(cpu, &affinity);
if (sched_setaffinity(0, sizeof(affinity), &affinity))
err(1, "sched_setaffinity");
}
static void dirty_on(int output_fd, int cpu)
{
int i, wrote;
set_affinity(cpu);
for (i = 0; i < 32; i++) {
for (wrote = 0; wrote < bufSize; ) {
int ret = write(output_fd, buf+wrote, bufSize-wrote);
if (ret == -1)
err(1, "write");
wrote += ret;
}
}
}
int main(int argc, char **argv)
{
int cpu, flush_cpu = 1, output_fd;
const char *output;
if (argc != 2)
errx(1, "usage: output_file");
output = argv[1];
bufSize = getpagesize();
buf = malloc(getpagesize());
if (buf == NULL)
errx(1, "malloc failed");
output_fd = open(output, O_CREAT|O_RDWR);
if (output_fd == -1)
err(1, "open(%s)", output);
for (cpu = 0; cpu < get_nprocs(); cpu++) {
if (cpu != flush_cpu)
dirty_on(output_fd, cpu);
}
set_affinity(flush_cpu);
if (fsync(output_fd))
err(1, "fsync(%s)", output);
if (close(output_fd))
err(1, "close(%s)", output);
free(buf);
}
Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
collect exact per memcg counters. This avoids the aforementioned oom
kills.
This does not affect the overhead of memory.stat, which still reads the
single atomic counter.
Why not use percpu_counter? memcg already handles cpus going offline, so
no need for that overhead from percpu_counter. And the percpu_counter
spinlocks are more heavyweight than is required.
It probably also makes sense to use exact dirty and writeback counters
in memcg oom reports. But that is saved for later.
Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
Signed-off-by: Greg Thelen <gthelen@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org> [4.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-04-06 08:39:18 +07:00
|
|
|
*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
|
2015-05-23 05:23:35 +07:00
|
|
|
|
mm: writeback: use exact memcg dirty counts
Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting") memcg dirty and writeback counters are managed
as:
1) per-memcg per-cpu values in range of [-32..32]
2) per-memcg atomic counter
When a per-cpu counter cannot fit in [-32..32] it's flushed to the
atomic. Stat readers only check the atomic. Thus readers such as
balance_dirty_pages() may see a nontrivial error margin: 32 pages per
cpu.
Assuming 100 cpus:
4k x86 page_size: 13 MiB error per memcg
64k ppc page_size: 200 MiB error per memcg
Considering that dirty+writeback are used together for some decisions the
errors double.
This inaccuracy can lead to undeserved oom kills. One nasty case is
when all per-cpu counters hold positive values offsetting an atomic
negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
balance_dirty_pages() only consults the atomic and does not consider
throttling the next n_cpu*32 dirty pages. If the file_lru is in the
13..200 MiB range then there's absolutely no dirty throttling, which
burdens vmscan with only dirty+writeback pages thus resorting to oom
kill.
It could be argued that tiny containers are not supported, but it's more
subtle. It's the amount the space available for file lru that matters.
If a container has memory.max-200MiB of non reclaimable memory, then it
will also suffer such oom kills on a 100 cpu machine.
The following test reliably ooms without this patch. This patch avoids
oom kills.
$ cat test
mount -t cgroup2 none /dev/cgroup
cd /dev/cgroup
echo +io +memory > cgroup.subtree_control
mkdir test
cd test
echo 10M > memory.max
(echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
(echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)
$ cat memcg-writeback-stress.c
/*
* Dirty pages from all but one cpu.
* Clean pages from the non dirtying cpu.
* This is to stress per cpu counter imbalance.
* On a 100 cpu machine:
* - per memcg per cpu dirty count is 32 pages for each of 99 cpus
* - per memcg atomic is -99*32 pages
* - thus the complete dirty limit: sum of all counters 0
* - balance_dirty_pages() only sees atomic count -99*32 pages, which
* it max()s to 0.
* - So a workload can dirty -99*32 pages before balance_dirty_pages()
* cares.
*/
#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <sched.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysinfo.h>
#include <sys/types.h>
#include <unistd.h>
static char *buf;
static int bufSize;
static void set_affinity(int cpu)
{
cpu_set_t affinity;
CPU_ZERO(&affinity);
CPU_SET(cpu, &affinity);
if (sched_setaffinity(0, sizeof(affinity), &affinity))
err(1, "sched_setaffinity");
}
static void dirty_on(int output_fd, int cpu)
{
int i, wrote;
set_affinity(cpu);
for (i = 0; i < 32; i++) {
for (wrote = 0; wrote < bufSize; ) {
int ret = write(output_fd, buf+wrote, bufSize-wrote);
if (ret == -1)
err(1, "write");
wrote += ret;
}
}
}
int main(int argc, char **argv)
{
int cpu, flush_cpu = 1, output_fd;
const char *output;
if (argc != 2)
errx(1, "usage: output_file");
output = argv[1];
bufSize = getpagesize();
buf = malloc(getpagesize());
if (buf == NULL)
errx(1, "malloc failed");
output_fd = open(output, O_CREAT|O_RDWR);
if (output_fd == -1)
err(1, "open(%s)", output);
for (cpu = 0; cpu < get_nprocs(); cpu++) {
if (cpu != flush_cpu)
dirty_on(output_fd, cpu);
}
set_affinity(flush_cpu);
if (fsync(output_fd))
err(1, "fsync(%s)", output);
if (close(output_fd))
err(1, "close(%s)", output);
free(buf);
}
Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
collect exact per memcg counters. This avoids the aforementioned oom
kills.
This does not affect the overhead of memory.stat, which still reads the
single atomic counter.
Why not use percpu_counter? memcg already handles cpus going offline, so
no need for that overhead from percpu_counter. And the percpu_counter
spinlocks are more heavyweight than is required.
It probably also makes sense to use exact dirty and writeback counters
in memcg oom reports. But that is saved for later.
Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
Signed-off-by: Greg Thelen <gthelen@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org> [4.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-04-06 08:39:18 +07:00
|
|
|
*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
|
2019-05-14 07:18:08 +07:00
|
|
|
*pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
|
|
|
|
memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
|
writeback: fix incorrect calculation of available memory for memcg domains
For memcg domains, the amount of available memory was calculated as
min(the amount currently in use + headroom according to memcg,
total clean memory)
This isn't quite correct as what should be capped by the amount of
clean memory is the headroom, not the sum of memory in use and
headroom. For example, if a memcg domain has a significant amount of
dirty memory, the above can lead to a value which is lower than the
current amount in use which doesn't make much sense. In most
circumstances, the above leads to a number which is somewhat but not
drastically lower.
As the amount of memory which can be readily allocated to the memcg
domain is capped by the amount of system-wide clean memory which is
not already assigned to the memcg itself, the number we want is
the amount currently in use +
min(headroom according to memcg, clean memory elsewhere in the system)
This patch updates mem_cgroup_wb_stats() to return the number of
filepages and headroom instead of the calculated available pages.
mdtc_cap_avail() is renamed to mdtc_calc_avail() and performs the
above calculation from file, headroom, dirty and globally clean pages.
v2: Dummy mem_cgroup_wb_stats() implementation wasn't updated leading
to build failure when !CGROUP_WRITEBACK. Fixed.
Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: c2aa723a6093 ("writeback: implement memcg writeback domain based throttling")
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-09-30 00:04:26 +07:00
|
|
|
*pheadroom = PAGE_COUNTER_MAX;
|
2015-05-23 05:23:35 +07:00
|
|
|
|
|
|
|
while ((parent = parent_mem_cgroup(memcg))) {
|
2020-04-02 11:07:20 +07:00
|
|
|
unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
|
2020-06-02 11:49:49 +07:00
|
|
|
READ_ONCE(memcg->memory.high));
|
2015-05-23 05:23:35 +07:00
|
|
|
unsigned long used = page_counter_read(&memcg->memory);
|
|
|
|
|
writeback: fix incorrect calculation of available memory for memcg domains
For memcg domains, the amount of available memory was calculated as
min(the amount currently in use + headroom according to memcg,
total clean memory)
This isn't quite correct as what should be capped by the amount of
clean memory is the headroom, not the sum of memory in use and
headroom. For example, if a memcg domain has a significant amount of
dirty memory, the above can lead to a value which is lower than the
current amount in use which doesn't make much sense. In most
circumstances, the above leads to a number which is somewhat but not
drastically lower.
As the amount of memory which can be readily allocated to the memcg
domain is capped by the amount of system-wide clean memory which is
not already assigned to the memcg itself, the number we want is
the amount currently in use +
min(headroom according to memcg, clean memory elsewhere in the system)
This patch updates mem_cgroup_wb_stats() to return the number of
filepages and headroom instead of the calculated available pages.
mdtc_cap_avail() is renamed to mdtc_calc_avail() and performs the
above calculation from file, headroom, dirty and globally clean pages.
v2: Dummy mem_cgroup_wb_stats() implementation wasn't updated leading
to build failure when !CGROUP_WRITEBACK. Fixed.
Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: c2aa723a6093 ("writeback: implement memcg writeback domain based throttling")
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-09-30 00:04:26 +07:00
|
|
|
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
|
2015-05-23 05:23:35 +07:00
|
|
|
memcg = parent;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 23:06:56 +07:00
|
|
|
/*
|
|
|
|
* Foreign dirty flushing
|
|
|
|
*
|
|
|
|
* There's an inherent mismatch between memcg and writeback. The former
|
|
|
|
* trackes ownership per-page while the latter per-inode. This was a
|
|
|
|
* deliberate design decision because honoring per-page ownership in the
|
|
|
|
* writeback path is complicated, may lead to higher CPU and IO overheads
|
|
|
|
* and deemed unnecessary given that write-sharing an inode across
|
|
|
|
* different cgroups isn't a common use-case.
|
|
|
|
*
|
|
|
|
* Combined with inode majority-writer ownership switching, this works well
|
|
|
|
* enough in most cases but there are some pathological cases. For
|
|
|
|
* example, let's say there are two cgroups A and B which keep writing to
|
|
|
|
* different but confined parts of the same inode. B owns the inode and
|
|
|
|
* A's memory is limited far below B's. A's dirty ratio can rise enough to
|
|
|
|
* trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
|
|
|
|
* triggering background writeback. A will be slowed down without a way to
|
|
|
|
* make writeback of the dirty pages happen.
|
|
|
|
*
|
|
|
|
* Conditions like the above can lead to a cgroup getting repatedly and
|
|
|
|
* severely throttled after making some progress after each
|
|
|
|
* dirty_expire_interval while the underyling IO device is almost
|
|
|
|
* completely idle.
|
|
|
|
*
|
|
|
|
* Solving this problem completely requires matching the ownership tracking
|
|
|
|
* granularities between memcg and writeback in either direction. However,
|
|
|
|
* the more egregious behaviors can be avoided by simply remembering the
|
|
|
|
* most recent foreign dirtying events and initiating remote flushes on
|
|
|
|
* them when local writeback isn't enough to keep the memory clean enough.
|
|
|
|
*
|
|
|
|
* The following two functions implement such mechanism. When a foreign
|
|
|
|
* page - a page whose memcg and writeback ownerships don't match - is
|
|
|
|
* dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
|
|
|
|
* bdi_writeback on the page owning memcg. When balance_dirty_pages()
|
|
|
|
* decides that the memcg needs to sleep due to high dirty ratio, it calls
|
|
|
|
* mem_cgroup_flush_foreign() which queues writeback on the recorded
|
|
|
|
* foreign bdi_writebacks which haven't expired. Both the numbers of
|
|
|
|
* recorded bdi_writebacks and concurrent in-flight foreign writebacks are
|
|
|
|
* limited to MEMCG_CGWB_FRN_CNT.
|
|
|
|
*
|
|
|
|
* The mechanism only remembers IDs and doesn't hold any object references.
|
|
|
|
* As being wrong occasionally doesn't matter, updates and accesses to the
|
|
|
|
* records are lockless and racy.
|
|
|
|
*/
|
|
|
|
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
|
|
|
|
struct bdi_writeback *wb)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = page->mem_cgroup;
|
|
|
|
struct memcg_cgwb_frn *frn;
|
|
|
|
u64 now = get_jiffies_64();
|
|
|
|
u64 oldest_at = now;
|
|
|
|
int oldest = -1;
|
|
|
|
int i;
|
|
|
|
|
2019-08-30 05:47:19 +07:00
|
|
|
trace_track_foreign_dirty(page, wb);
|
|
|
|
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 23:06:56 +07:00
|
|
|
/*
|
|
|
|
* Pick the slot to use. If there is already a slot for @wb, keep
|
|
|
|
* using it. If not replace the oldest one which isn't being
|
|
|
|
* written out.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
|
|
|
|
frn = &memcg->cgwb_frn[i];
|
|
|
|
if (frn->bdi_id == wb->bdi->id &&
|
|
|
|
frn->memcg_id == wb->memcg_css->id)
|
|
|
|
break;
|
|
|
|
if (time_before64(frn->at, oldest_at) &&
|
|
|
|
atomic_read(&frn->done.cnt) == 1) {
|
|
|
|
oldest = i;
|
|
|
|
oldest_at = frn->at;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (i < MEMCG_CGWB_FRN_CNT) {
|
|
|
|
/*
|
|
|
|
* Re-using an existing one. Update timestamp lazily to
|
|
|
|
* avoid making the cacheline hot. We want them to be
|
|
|
|
* reasonably up-to-date and significantly shorter than
|
|
|
|
* dirty_expire_interval as that's what expires the record.
|
|
|
|
* Use the shorter of 1s and dirty_expire_interval / 8.
|
|
|
|
*/
|
|
|
|
unsigned long update_intv =
|
|
|
|
min_t(unsigned long, HZ,
|
|
|
|
msecs_to_jiffies(dirty_expire_interval * 10) / 8);
|
|
|
|
|
|
|
|
if (time_before64(frn->at, now - update_intv))
|
|
|
|
frn->at = now;
|
|
|
|
} else if (oldest >= 0) {
|
|
|
|
/* replace the oldest free one */
|
|
|
|
frn = &memcg->cgwb_frn[oldest];
|
|
|
|
frn->bdi_id = wb->bdi->id;
|
|
|
|
frn->memcg_id = wb->memcg_css->id;
|
|
|
|
frn->at = now;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* issue foreign writeback flushes for recorded foreign dirtying events */
|
|
|
|
void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
|
|
|
|
unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
|
|
|
|
u64 now = jiffies_64;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
|
|
|
|
struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the record is older than dirty_expire_interval,
|
|
|
|
* writeback on it has already started. No need to kick it
|
|
|
|
* off again. Also, don't start a new one if there's
|
|
|
|
* already one in flight.
|
|
|
|
*/
|
|
|
|
if (time_after64(frn->at, now - intv) &&
|
|
|
|
atomic_read(&frn->done.cnt) == 1) {
|
|
|
|
frn->at = 0;
|
2019-08-30 05:47:19 +07:00
|
|
|
trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 23:06:56 +07:00
|
|
|
cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
|
|
|
|
WB_REASON_FOREIGN_FLUSH,
|
|
|
|
&frn->done);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-05-23 05:23:33 +07:00
|
|
|
#else /* CONFIG_CGROUP_WRITEBACK */
|
|
|
|
|
|
|
|
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2015-05-23 05:23:34 +07:00
|
|
|
static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2015-05-23 04:13:37 +07:00
|
|
|
#endif /* CONFIG_CGROUP_WRITEBACK */
|
|
|
|
|
2013-11-23 06:20:44 +07:00
|
|
|
/*
|
|
|
|
* DO NOT USE IN NEW FILES.
|
|
|
|
*
|
|
|
|
* "cgroup.event_control" implementation.
|
|
|
|
*
|
|
|
|
* This is way over-engineered. It tries to support fully configurable
|
|
|
|
* events for each user. Such level of flexibility is completely
|
|
|
|
* unnecessary especially in the light of the planned unified hierarchy.
|
|
|
|
*
|
|
|
|
* Please deprecate this and replace with something simpler if at all
|
|
|
|
* possible.
|
|
|
|
*/
|
|
|
|
|
2013-11-23 06:20:42 +07:00
|
|
|
/*
|
|
|
|
* Unregister event and free resources.
|
|
|
|
*
|
|
|
|
* Gets called from workqueue.
|
|
|
|
*/
|
2013-11-23 06:20:44 +07:00
|
|
|
static void memcg_event_remove(struct work_struct *work)
|
2013-11-23 06:20:42 +07:00
|
|
|
{
|
2013-11-23 06:20:44 +07:00
|
|
|
struct mem_cgroup_event *event =
|
|
|
|
container_of(work, struct mem_cgroup_event, remove);
|
2013-11-23 06:20:43 +07:00
|
|
|
struct mem_cgroup *memcg = event->memcg;
|
2013-11-23 06:20:42 +07:00
|
|
|
|
|
|
|
remove_wait_queue(event->wqh, &event->wait);
|
|
|
|
|
2013-11-23 06:20:43 +07:00
|
|
|
event->unregister_event(memcg, event->eventfd);
|
2013-11-23 06:20:42 +07:00
|
|
|
|
|
|
|
/* Notify userspace the event is going away. */
|
|
|
|
eventfd_signal(event->eventfd, 1);
|
|
|
|
|
|
|
|
eventfd_ctx_put(event->eventfd);
|
|
|
|
kfree(event);
|
2013-11-23 06:20:43 +07:00
|
|
|
css_put(&memcg->css);
|
2013-11-23 06:20:42 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2018-02-12 05:34:03 +07:00
|
|
|
* Gets called on EPOLLHUP on eventfd when user closes it.
|
2013-11-23 06:20:42 +07:00
|
|
|
*
|
|
|
|
* Called with wqh->lock held and interrupts disabled.
|
|
|
|
*/
|
2017-06-20 17:06:13 +07:00
|
|
|
static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
|
2013-11-23 06:20:44 +07:00
|
|
|
int sync, void *key)
|
2013-11-23 06:20:42 +07:00
|
|
|
{
|
2013-11-23 06:20:44 +07:00
|
|
|
struct mem_cgroup_event *event =
|
|
|
|
container_of(wait, struct mem_cgroup_event, wait);
|
2013-11-23 06:20:43 +07:00
|
|
|
struct mem_cgroup *memcg = event->memcg;
|
2017-07-04 07:14:56 +07:00
|
|
|
__poll_t flags = key_to_poll(key);
|
2013-11-23 06:20:42 +07:00
|
|
|
|
2018-02-12 05:34:03 +07:00
|
|
|
if (flags & EPOLLHUP) {
|
2013-11-23 06:20:42 +07:00
|
|
|
/*
|
|
|
|
* If the event has been detached at cgroup removal, we
|
|
|
|
* can simply return knowing the other side will cleanup
|
|
|
|
* for us.
|
|
|
|
*
|
|
|
|
* We can't race against event freeing since the other
|
|
|
|
* side will require wqh->lock via remove_wait_queue(),
|
|
|
|
* which we hold.
|
|
|
|
*/
|
2013-11-23 06:20:43 +07:00
|
|
|
spin_lock(&memcg->event_list_lock);
|
2013-11-23 06:20:42 +07:00
|
|
|
if (!list_empty(&event->list)) {
|
|
|
|
list_del_init(&event->list);
|
|
|
|
/*
|
|
|
|
* We are in atomic context, but cgroup_event_remove()
|
|
|
|
* may sleep, so we have to call it in workqueue.
|
|
|
|
*/
|
|
|
|
schedule_work(&event->remove);
|
|
|
|
}
|
2013-11-23 06:20:43 +07:00
|
|
|
spin_unlock(&memcg->event_list_lock);
|
2013-11-23 06:20:42 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-11-23 06:20:44 +07:00
|
|
|
static void memcg_event_ptable_queue_proc(struct file *file,
|
2013-11-23 06:20:42 +07:00
|
|
|
wait_queue_head_t *wqh, poll_table *pt)
|
|
|
|
{
|
2013-11-23 06:20:44 +07:00
|
|
|
struct mem_cgroup_event *event =
|
|
|
|
container_of(pt, struct mem_cgroup_event, pt);
|
2013-11-23 06:20:42 +07:00
|
|
|
|
|
|
|
event->wqh = wqh;
|
|
|
|
add_wait_queue(wqh, &event->wait);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2013-11-23 06:20:44 +07:00
|
|
|
* DO NOT USE IN NEW FILES.
|
|
|
|
*
|
2013-11-23 06:20:42 +07:00
|
|
|
* Parse input and register new cgroup event handler.
|
|
|
|
*
|
|
|
|
* Input must be in format '<event_fd> <control_fd> <args>'.
|
|
|
|
* Interpretation of args is defined by control file implementation.
|
|
|
|
*/
|
2014-05-13 23:16:21 +07:00
|
|
|
static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
|
|
|
|
char *buf, size_t nbytes, loff_t off)
|
2013-11-23 06:20:42 +07:00
|
|
|
{
|
2014-05-13 23:16:21 +07:00
|
|
|
struct cgroup_subsys_state *css = of_css(of);
|
2013-11-23 06:20:43 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
2013-11-23 06:20:44 +07:00
|
|
|
struct mem_cgroup_event *event;
|
2013-11-23 06:20:42 +07:00
|
|
|
struct cgroup_subsys_state *cfile_css;
|
|
|
|
unsigned int efd, cfd;
|
|
|
|
struct fd efile;
|
|
|
|
struct fd cfile;
|
2013-11-23 06:20:43 +07:00
|
|
|
const char *name;
|
2013-11-23 06:20:42 +07:00
|
|
|
char *endp;
|
|
|
|
int ret;
|
|
|
|
|
2014-05-13 23:16:21 +07:00
|
|
|
buf = strstrip(buf);
|
|
|
|
|
|
|
|
efd = simple_strtoul(buf, &endp, 10);
|
2013-11-23 06:20:42 +07:00
|
|
|
if (*endp != ' ')
|
|
|
|
return -EINVAL;
|
2014-05-13 23:16:21 +07:00
|
|
|
buf = endp + 1;
|
2013-11-23 06:20:42 +07:00
|
|
|
|
2014-05-13 23:16:21 +07:00
|
|
|
cfd = simple_strtoul(buf, &endp, 10);
|
2013-11-23 06:20:42 +07:00
|
|
|
if ((*endp != ' ') && (*endp != '\0'))
|
|
|
|
return -EINVAL;
|
2014-05-13 23:16:21 +07:00
|
|
|
buf = endp + 1;
|
2013-11-23 06:20:42 +07:00
|
|
|
|
|
|
|
event = kzalloc(sizeof(*event), GFP_KERNEL);
|
|
|
|
if (!event)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2013-11-23 06:20:43 +07:00
|
|
|
event->memcg = memcg;
|
2013-11-23 06:20:42 +07:00
|
|
|
INIT_LIST_HEAD(&event->list);
|
2013-11-23 06:20:44 +07:00
|
|
|
init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
|
|
|
|
init_waitqueue_func_entry(&event->wait, memcg_event_wake);
|
|
|
|
INIT_WORK(&event->remove, memcg_event_remove);
|
2013-11-23 06:20:42 +07:00
|
|
|
|
|
|
|
efile = fdget(efd);
|
|
|
|
if (!efile.file) {
|
|
|
|
ret = -EBADF;
|
|
|
|
goto out_kfree;
|
|
|
|
}
|
|
|
|
|
|
|
|
event->eventfd = eventfd_ctx_fileget(efile.file);
|
|
|
|
if (IS_ERR(event->eventfd)) {
|
|
|
|
ret = PTR_ERR(event->eventfd);
|
|
|
|
goto out_put_efile;
|
|
|
|
}
|
|
|
|
|
|
|
|
cfile = fdget(cfd);
|
|
|
|
if (!cfile.file) {
|
|
|
|
ret = -EBADF;
|
|
|
|
goto out_put_eventfd;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* the process need read permission on control file */
|
|
|
|
/* AV: shouldn't we check that it's been opened for read instead? */
|
|
|
|
ret = inode_permission(file_inode(cfile.file), MAY_READ);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out_put_cfile;
|
|
|
|
|
2013-11-23 06:20:43 +07:00
|
|
|
/*
|
|
|
|
* Determine the event callbacks and set them in @event. This used
|
|
|
|
* to be done via struct cftype but cgroup core no longer knows
|
|
|
|
* about these events. The following is crude but the whole thing
|
|
|
|
* is for compatibility anyway.
|
2013-11-23 06:20:44 +07:00
|
|
|
*
|
|
|
|
* DO NOT ADD NEW FILES.
|
2013-11-23 06:20:43 +07:00
|
|
|
*/
|
2014-10-31 12:22:04 +07:00
|
|
|
name = cfile.file->f_path.dentry->d_name.name;
|
2013-11-23 06:20:43 +07:00
|
|
|
|
|
|
|
if (!strcmp(name, "memory.usage_in_bytes")) {
|
|
|
|
event->register_event = mem_cgroup_usage_register_event;
|
|
|
|
event->unregister_event = mem_cgroup_usage_unregister_event;
|
|
|
|
} else if (!strcmp(name, "memory.oom_control")) {
|
|
|
|
event->register_event = mem_cgroup_oom_register_event;
|
|
|
|
event->unregister_event = mem_cgroup_oom_unregister_event;
|
|
|
|
} else if (!strcmp(name, "memory.pressure_level")) {
|
|
|
|
event->register_event = vmpressure_register_event;
|
|
|
|
event->unregister_event = vmpressure_unregister_event;
|
|
|
|
} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
|
2013-11-23 06:20:43 +07:00
|
|
|
event->register_event = memsw_cgroup_usage_register_event;
|
|
|
|
event->unregister_event = memsw_cgroup_usage_unregister_event;
|
2013-11-23 06:20:43 +07:00
|
|
|
} else {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out_put_cfile;
|
|
|
|
}
|
|
|
|
|
2013-11-23 06:20:42 +07:00
|
|
|
/*
|
2013-11-23 06:20:42 +07:00
|
|
|
* Verify @cfile should belong to @css. Also, remaining events are
|
|
|
|
* automatically removed on cgroup destruction but the removal is
|
|
|
|
* asynchronous, so take an extra ref on @css.
|
2013-11-23 06:20:42 +07:00
|
|
|
*/
|
2014-10-31 12:22:04 +07:00
|
|
|
cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
|
2014-05-13 23:11:01 +07:00
|
|
|
&memory_cgrp_subsys);
|
2013-11-23 06:20:42 +07:00
|
|
|
ret = -EINVAL;
|
2014-02-11 23:52:47 +07:00
|
|
|
if (IS_ERR(cfile_css))
|
2013-11-23 06:20:42 +07:00
|
|
|
goto out_put_cfile;
|
2014-02-11 23:52:47 +07:00
|
|
|
if (cfile_css != css) {
|
|
|
|
css_put(cfile_css);
|
2013-11-23 06:20:42 +07:00
|
|
|
goto out_put_cfile;
|
2014-02-11 23:52:47 +07:00
|
|
|
}
|
2013-11-23 06:20:42 +07:00
|
|
|
|
2014-05-13 23:16:21 +07:00
|
|
|
ret = event->register_event(memcg, event->eventfd, buf);
|
2013-11-23 06:20:42 +07:00
|
|
|
if (ret)
|
|
|
|
goto out_put_css;
|
|
|
|
|
2018-03-05 22:26:05 +07:00
|
|
|
vfs_poll(efile.file, &event->pt);
|
2013-11-23 06:20:42 +07:00
|
|
|
|
2013-11-23 06:20:43 +07:00
|
|
|
spin_lock(&memcg->event_list_lock);
|
|
|
|
list_add(&event->list, &memcg->event_list);
|
|
|
|
spin_unlock(&memcg->event_list_lock);
|
2013-11-23 06:20:42 +07:00
|
|
|
|
|
|
|
fdput(cfile);
|
|
|
|
fdput(efile);
|
|
|
|
|
2014-05-13 23:16:21 +07:00
|
|
|
return nbytes;
|
2013-11-23 06:20:42 +07:00
|
|
|
|
|
|
|
out_put_css:
|
2013-11-23 06:20:42 +07:00
|
|
|
css_put(css);
|
2013-11-23 06:20:42 +07:00
|
|
|
out_put_cfile:
|
|
|
|
fdput(cfile);
|
|
|
|
out_put_eventfd:
|
|
|
|
eventfd_ctx_put(event->eventfd);
|
|
|
|
out_put_efile:
|
|
|
|
fdput(efile);
|
|
|
|
out_kfree:
|
|
|
|
kfree(event);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
static struct cftype mem_cgroup_legacy_files[] = {
|
2008-02-07 15:13:50 +07:00
|
|
|
{
|
2008-02-07 15:13:57 +07:00
|
|
|
.name = "usage_in_bytes",
|
2009-01-08 09:08:00 +07:00
|
|
|
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
|
2013-12-06 00:28:02 +07:00
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
2008-02-07 15:13:50 +07:00
|
|
|
},
|
2008-04-29 15:00:17 +07:00
|
|
|
{
|
|
|
|
.name = "max_usage_in_bytes",
|
2009-01-08 09:08:00 +07:00
|
|
|
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
|
2014-05-13 23:16:21 +07:00
|
|
|
.write = mem_cgroup_reset,
|
2013-12-06 00:28:02 +07:00
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
2008-04-29 15:00:17 +07:00
|
|
|
},
|
2008-02-07 15:13:50 +07:00
|
|
|
{
|
2008-02-07 15:13:57 +07:00
|
|
|
.name = "limit_in_bytes",
|
2009-01-08 09:08:00 +07:00
|
|
|
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
|
2014-05-13 23:16:21 +07:00
|
|
|
.write = mem_cgroup_write,
|
2013-12-06 00:28:02 +07:00
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
2008-02-07 15:13:50 +07:00
|
|
|
},
|
2009-09-24 05:56:36 +07:00
|
|
|
{
|
|
|
|
.name = "soft_limit_in_bytes",
|
|
|
|
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
|
2014-05-13 23:16:21 +07:00
|
|
|
.write = mem_cgroup_write,
|
2013-12-06 00:28:02 +07:00
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
2009-09-24 05:56:36 +07:00
|
|
|
},
|
2008-02-07 15:13:50 +07:00
|
|
|
{
|
|
|
|
.name = "failcnt",
|
2009-01-08 09:08:00 +07:00
|
|
|
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
|
2014-05-13 23:16:21 +07:00
|
|
|
.write = mem_cgroup_reset,
|
2013-12-06 00:28:02 +07:00
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
2008-02-07 15:13:50 +07:00
|
|
|
},
|
2008-02-07 15:14:25 +07:00
|
|
|
{
|
|
|
|
.name = "stat",
|
2013-12-06 00:28:04 +07:00
|
|
|
.seq_show = memcg_stat_show,
|
2008-02-07 15:14:25 +07:00
|
|
|
},
|
2009-01-08 09:07:55 +07:00
|
|
|
{
|
|
|
|
.name = "force_empty",
|
2014-05-13 23:16:21 +07:00
|
|
|
.write = mem_cgroup_force_empty_write,
|
2009-01-08 09:07:55 +07:00
|
|
|
},
|
2009-01-08 09:08:07 +07:00
|
|
|
{
|
|
|
|
.name = "use_hierarchy",
|
|
|
|
.write_u64 = mem_cgroup_hierarchy_write,
|
|
|
|
.read_u64 = mem_cgroup_hierarchy_read,
|
|
|
|
},
|
2013-11-23 06:20:42 +07:00
|
|
|
{
|
2013-11-23 06:20:44 +07:00
|
|
|
.name = "cgroup.event_control", /* XXX: for compat */
|
2014-05-13 23:16:21 +07:00
|
|
|
.write = memcg_write_event_control,
|
2015-09-19 04:54:23 +07:00
|
|
|
.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
|
2013-11-23 06:20:42 +07:00
|
|
|
},
|
2009-01-08 09:08:24 +07:00
|
|
|
{
|
|
|
|
.name = "swappiness",
|
|
|
|
.read_u64 = mem_cgroup_swappiness_read,
|
|
|
|
.write_u64 = mem_cgroup_swappiness_write,
|
|
|
|
},
|
2010-03-11 06:22:13 +07:00
|
|
|
{
|
|
|
|
.name = "move_charge_at_immigrate",
|
|
|
|
.read_u64 = mem_cgroup_move_charge_read,
|
|
|
|
.write_u64 = mem_cgroup_move_charge_write,
|
|
|
|
},
|
2010-05-27 04:42:36 +07:00
|
|
|
{
|
|
|
|
.name = "oom_control",
|
2013-12-06 00:28:04 +07:00
|
|
|
.seq_show = mem_cgroup_oom_control_read,
|
2010-05-27 04:42:37 +07:00
|
|
|
.write_u64 = mem_cgroup_oom_control_write,
|
2010-05-27 04:42:36 +07:00
|
|
|
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
|
|
|
|
},
|
memcg: add memory.pressure_level events
With this patch userland applications that want to maintain the
interactivity/memory allocation cost can use the pressure level
notifications. The levels are defined like this:
The "low" level means that the system is reclaiming memory for new
allocations. Monitoring this reclaiming activity might be useful for
maintaining cache level. Upon notification, the program (typically
"Activity Manager") might analyze vmstat and act in advance (i.e.
prematurely shutdown unimportant services).
The "medium" level means that the system is experiencing medium memory
pressure, the system might be making swap, paging out active file
caches, etc. Upon this event applications may decide to further analyze
vmstat/zoneinfo/memcg or internal memory usage statistics and free any
resources that can be easily reconstructed or re-read from a disk.
The "critical" level means that the system is actively thrashing, it is
about to out of memory (OOM) or even the in-kernel OOM killer is on its
way to trigger. Applications should do whatever they can to help the
system. It might be too late to consult with vmstat or any other
statistics, so it's advisable to take an immediate action.
The events are propagated upward until the event is handled, i.e. the
events are not pass-through. Here is what this means: for example you
have three cgroups: A->B->C. Now you set up an event listener on
cgroups A, B and C, and suppose group C experiences some pressure. In
this situation, only group C will receive the notification, i.e. groups
A and B will not receive it. This is done to avoid excessive
"broadcasting" of messages, which disturbs the system and which is
especially bad if we are low on memory or thrashing. So, organize the
cgroups wisely, or propagate the events manually (or, ask us to
implement the pass-through events, explaining why would you need them.)
Performance wise, the memory pressure notifications feature itself is
lightweight and does not require much of bookkeeping, in contrast to the
rest of memcg features. Unfortunately, as of current memcg
implementation, pages accounting is an inseparable part and cannot be
turned off. The good news is that there are some efforts[1] to improve
the situation; plus, implementing the same, fully API-compatible[2]
interface for CONFIG_MEMCG=n case (e.g. embedded) is also a viable
option, so it will not require any changes on the userland side.
[1] http://permalink.gmane.org/gmane.linux.kernel.cgroups/6291
[2] http://lkml.org/lkml/2013/2/21/454
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix CONFIG_CGROPUPS=n warnings]
Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
Acked-by: Kirill A. Shutemov <kirill@shutemov.name>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Leonid Moiseichuk <leonid.moiseichuk@nokia.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: John Stultz <john.stultz@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-30 05:08:31 +07:00
|
|
|
{
|
|
|
|
.name = "pressure_level",
|
|
|
|
},
|
2011-05-27 06:25:37 +07:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
{
|
|
|
|
.name = "numa_stat",
|
2013-12-06 00:28:04 +07:00
|
|
|
.seq_show = memcg_numa_stat_show,
|
2011-05-27 06:25:37 +07:00
|
|
|
},
|
|
|
|
#endif
|
2012-12-19 05:21:47 +07:00
|
|
|
{
|
|
|
|
.name = "kmem.limit_in_bytes",
|
|
|
|
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
|
2014-05-13 23:16:21 +07:00
|
|
|
.write = mem_cgroup_write,
|
2013-12-06 00:28:02 +07:00
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
2012-12-19 05:21:47 +07:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "kmem.usage_in_bytes",
|
|
|
|
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
|
2013-12-06 00:28:02 +07:00
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
2012-12-19 05:21:47 +07:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "kmem.failcnt",
|
|
|
|
.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
|
2014-05-13 23:16:21 +07:00
|
|
|
.write = mem_cgroup_reset,
|
2013-12-06 00:28:02 +07:00
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
2012-12-19 05:21:47 +07:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "kmem.max_usage_in_bytes",
|
|
|
|
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
|
2014-05-13 23:16:21 +07:00
|
|
|
.write = mem_cgroup_reset,
|
2013-12-06 00:28:02 +07:00
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
2012-12-19 05:21:47 +07:00
|
|
|
},
|
2020-04-02 11:06:30 +07:00
|
|
|
#if defined(CONFIG_MEMCG_KMEM) && \
|
|
|
|
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
|
2012-12-19 05:23:01 +07:00
|
|
|
{
|
|
|
|
.name = "kmem.slabinfo",
|
2014-12-11 06:44:19 +07:00
|
|
|
.seq_show = memcg_slab_show,
|
2012-12-19 05:23:01 +07:00
|
|
|
},
|
|
|
|
#endif
|
2016-01-21 06:02:44 +07:00
|
|
|
{
|
|
|
|
.name = "kmem.tcp.limit_in_bytes",
|
|
|
|
.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
|
|
|
|
.write = mem_cgroup_write,
|
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "kmem.tcp.usage_in_bytes",
|
|
|
|
.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
|
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "kmem.tcp.failcnt",
|
|
|
|
.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
|
|
|
|
.write = mem_cgroup_reset,
|
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "kmem.tcp.max_usage_in_bytes",
|
|
|
|
.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
|
|
|
|
.write = mem_cgroup_reset,
|
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
|
|
|
},
|
2012-04-02 02:09:55 +07:00
|
|
|
{ }, /* terminate */
|
2012-04-02 02:09:55 +07:00
|
|
|
};
|
2009-01-08 09:08:00 +07:00
|
|
|
|
2016-07-21 05:44:57 +07:00
|
|
|
/*
|
|
|
|
* Private memory cgroup IDR
|
|
|
|
*
|
|
|
|
* Swap-out records and page cache shadow entries need to store memcg
|
|
|
|
* references in constrained space, so we maintain an ID space that is
|
|
|
|
* limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
|
|
|
|
* memory-controlled cgroups to 64k.
|
|
|
|
*
|
2020-06-05 06:49:28 +07:00
|
|
|
* However, there usually are many references to the offline CSS after
|
2016-07-21 05:44:57 +07:00
|
|
|
* the cgroup has been destroyed, such as page cache or reclaimable
|
|
|
|
* slab objects, that don't need to hang on to the ID. We want to keep
|
|
|
|
* those dead CSS from occupying IDs, or we might quickly exhaust the
|
|
|
|
* relatively small ID space and prevent the creation of new cgroups
|
|
|
|
* even when there are much fewer than 64k cgroups - possibly none.
|
|
|
|
*
|
|
|
|
* Maintain a private 16-bit ID space for memcg, and allow the ID to
|
|
|
|
* be freed and recycled when it's no longer needed, which is usually
|
|
|
|
* when the CSS is offlined.
|
|
|
|
*
|
|
|
|
* The only exception to that are records of swapped out tmpfs/shmem
|
|
|
|
* pages that need to be attributed to live ancestors on swapin. But
|
|
|
|
* those references are manageable from userspace.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static DEFINE_IDR(mem_cgroup_idr);
|
|
|
|
|
2018-08-03 05:36:01 +07:00
|
|
|
static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
if (memcg->id.id > 0) {
|
|
|
|
idr_remove(&mem_cgroup_idr, memcg->id.id);
|
|
|
|
memcg->id.id = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-04-02 11:07:13 +07:00
|
|
|
static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
|
|
|
|
unsigned int n)
|
2016-07-21 05:44:57 +07:00
|
|
|
{
|
2018-10-27 05:09:28 +07:00
|
|
|
refcount_add(n, &memcg->id.ref);
|
2016-07-21 05:44:57 +07:00
|
|
|
}
|
|
|
|
|
2016-08-12 05:33:03 +07:00
|
|
|
static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
|
2016-07-21 05:44:57 +07:00
|
|
|
{
|
2018-10-27 05:09:28 +07:00
|
|
|
if (refcount_sub_and_test(n, &memcg->id.ref)) {
|
2018-08-03 05:36:01 +07:00
|
|
|
mem_cgroup_id_remove(memcg);
|
2016-07-21 05:44:57 +07:00
|
|
|
|
|
|
|
/* Memcg ID pins CSS */
|
|
|
|
css_put(&memcg->css);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-08-12 05:33:03 +07:00
|
|
|
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
mem_cgroup_id_put_many(memcg, 1);
|
|
|
|
}
|
|
|
|
|
2016-07-21 05:44:57 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_from_id - look up a memcg from a memcg id
|
|
|
|
* @id: the memcg id to look up
|
|
|
|
*
|
|
|
|
* Caller must hold rcu_read_lock().
|
|
|
|
*/
|
|
|
|
struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
|
|
|
|
{
|
|
|
|
WARN_ON_ONCE(!rcu_read_lock_held());
|
|
|
|
return idr_find(&mem_cgroup_idr, id);
|
|
|
|
}
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
|
2008-02-07 15:14:31 +07:00
|
|
|
{
|
|
|
|
struct mem_cgroup_per_node *pn;
|
2016-07-29 05:46:05 +07:00
|
|
|
int tmp = node;
|
2008-02-07 15:14:38 +07:00
|
|
|
/*
|
|
|
|
* This routine is called against possible nodes.
|
|
|
|
* But it's BUG to call kmalloc() against offline node.
|
|
|
|
*
|
|
|
|
* TODO: this routine can waste much memory for nodes which will
|
|
|
|
* never be onlined. It's better to use memory hotplug callback
|
|
|
|
* function.
|
|
|
|
*/
|
2008-04-09 07:41:54 +07:00
|
|
|
if (!node_state(node, N_NORMAL_MEMORY))
|
|
|
|
tmp = -1;
|
2011-01-14 06:47:42 +07:00
|
|
|
pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
|
2008-02-07 15:14:31 +07:00
|
|
|
if (!pn)
|
|
|
|
return 1;
|
2008-02-07 15:14:38 +07:00
|
|
|
|
2020-08-12 08:30:25 +07:00
|
|
|
pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
|
|
|
|
GFP_KERNEL_ACCOUNT);
|
mm: memcontrol: don't batch updates of local VM stats and events
The kernel test robot noticed a 26% will-it-scale pagefault regression
from commit 42a300353577 ("mm: memcontrol: fix recursive statistics
correctness & scalabilty"). This appears to be caused by bouncing the
additional cachelines from the new hierarchical statistics counters.
We can fix this by getting rid of the batched local counters instead.
Originally, there were *only* group-local counters, and they were fully
maintained per cpu. A reader of a stats file high up in the cgroup tree
would have to walk the entire subtree and collect each level's per-cpu
counters to get the recursive view. This was prohibitively expensive,
and so we switched to per-cpu batched updates of the local counters
during a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting"), reducing the complexity from nr_subgroups *
nr_cpus to nr_subgroups.
With growing machines and cgroup trees, the tree walk itself became too
expensive for monitoring top-level groups, and this is when the culprit
patch added hierarchy counters on each cgroup level. When the per-cpu
batch size would be reached, both the local and the hierarchy counters
would get batch-updated from the per-cpu delta simultaneously.
This makes local and hierarchical counter reads blazingly fast, but it
unfortunately makes the write-side too cache line intense.
Since local counter reads were never a problem - we only centralized
them to accelerate the hierarchy walk - and use of the local counters
are becoming rarer due to replacement with hierarchical views (ongoing
rework in the page reclaim and workingset code), we can make those local
counters unbatched per-cpu counters again.
The scheme will then be as such:
when a memcg statistic changes, the writer will:
- update the local counter (per-cpu)
- update the batch counter (per-cpu). If the batch is full:
- spill the batch into the group's atomic_t
- spill the batch into all ancestors' atomic_ts
- empty out the batch counter (per-cpu)
when a local memcg counter is read, the reader will:
- collect the local counter from all cpus
when a hiearchy memcg counter is read, the reader will:
- read the atomic_t
We might be able to simplify this further and make the recursive
counters unbatched per-cpu counters as well (batch upward propagation,
but leave per-cpu collection to the readers), but that will require a
more in-depth analysis and testing of all the callsites. Deal with the
immediate regression for now.
Link: http://lkml.kernel.org/r/20190521151647.GB2870@cmpxchg.org
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: kernel test robot <rong.a.chen@intel.com>
Tested-by: kernel test robot <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-14 05:55:46 +07:00
|
|
|
if (!pn->lruvec_stat_local) {
|
|
|
|
kfree(pn);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2020-08-12 08:30:25 +07:00
|
|
|
pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
|
|
|
|
GFP_KERNEL_ACCOUNT);
|
2018-02-01 07:16:45 +07:00
|
|
|
if (!pn->lruvec_stat_cpu) {
|
mm: memcontrol: don't batch updates of local VM stats and events
The kernel test robot noticed a 26% will-it-scale pagefault regression
from commit 42a300353577 ("mm: memcontrol: fix recursive statistics
correctness & scalabilty"). This appears to be caused by bouncing the
additional cachelines from the new hierarchical statistics counters.
We can fix this by getting rid of the batched local counters instead.
Originally, there were *only* group-local counters, and they were fully
maintained per cpu. A reader of a stats file high up in the cgroup tree
would have to walk the entire subtree and collect each level's per-cpu
counters to get the recursive view. This was prohibitively expensive,
and so we switched to per-cpu batched updates of the local counters
during a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting"), reducing the complexity from nr_subgroups *
nr_cpus to nr_subgroups.
With growing machines and cgroup trees, the tree walk itself became too
expensive for monitoring top-level groups, and this is when the culprit
patch added hierarchy counters on each cgroup level. When the per-cpu
batch size would be reached, both the local and the hierarchy counters
would get batch-updated from the per-cpu delta simultaneously.
This makes local and hierarchical counter reads blazingly fast, but it
unfortunately makes the write-side too cache line intense.
Since local counter reads were never a problem - we only centralized
them to accelerate the hierarchy walk - and use of the local counters
are becoming rarer due to replacement with hierarchical views (ongoing
rework in the page reclaim and workingset code), we can make those local
counters unbatched per-cpu counters again.
The scheme will then be as such:
when a memcg statistic changes, the writer will:
- update the local counter (per-cpu)
- update the batch counter (per-cpu). If the batch is full:
- spill the batch into the group's atomic_t
- spill the batch into all ancestors' atomic_ts
- empty out the batch counter (per-cpu)
when a local memcg counter is read, the reader will:
- collect the local counter from all cpus
when a hiearchy memcg counter is read, the reader will:
- read the atomic_t
We might be able to simplify this further and make the recursive
counters unbatched per-cpu counters as well (batch upward propagation,
but leave per-cpu collection to the readers), but that will require a
more in-depth analysis and testing of all the callsites. Deal with the
immediate regression for now.
Link: http://lkml.kernel.org/r/20190521151647.GB2870@cmpxchg.org
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: kernel test robot <rong.a.chen@intel.com>
Tested-by: kernel test robot <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-14 05:55:46 +07:00
|
|
|
free_percpu(pn->lruvec_stat_local);
|
2017-07-07 05:40:52 +07:00
|
|
|
kfree(pn);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
lruvec_init(&pn->lruvec);
|
|
|
|
pn->usage_in_excess = 0;
|
|
|
|
pn->on_tree = false;
|
|
|
|
pn->memcg = memcg;
|
|
|
|
|
2013-07-09 05:59:49 +07:00
|
|
|
memcg->nodeinfo[node] = pn;
|
2008-02-07 15:14:31 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
|
2008-02-07 15:14:38 +07:00
|
|
|
{
|
2017-07-07 05:40:52 +07:00
|
|
|
struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
|
|
|
|
|
2018-04-11 06:29:52 +07:00
|
|
|
if (!pn)
|
|
|
|
return;
|
|
|
|
|
2018-02-01 07:16:45 +07:00
|
|
|
free_percpu(pn->lruvec_stat_cpu);
|
mm: memcontrol: don't batch updates of local VM stats and events
The kernel test robot noticed a 26% will-it-scale pagefault regression
from commit 42a300353577 ("mm: memcontrol: fix recursive statistics
correctness & scalabilty"). This appears to be caused by bouncing the
additional cachelines from the new hierarchical statistics counters.
We can fix this by getting rid of the batched local counters instead.
Originally, there were *only* group-local counters, and they were fully
maintained per cpu. A reader of a stats file high up in the cgroup tree
would have to walk the entire subtree and collect each level's per-cpu
counters to get the recursive view. This was prohibitively expensive,
and so we switched to per-cpu batched updates of the local counters
during a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting"), reducing the complexity from nr_subgroups *
nr_cpus to nr_subgroups.
With growing machines and cgroup trees, the tree walk itself became too
expensive for monitoring top-level groups, and this is when the culprit
patch added hierarchy counters on each cgroup level. When the per-cpu
batch size would be reached, both the local and the hierarchy counters
would get batch-updated from the per-cpu delta simultaneously.
This makes local and hierarchical counter reads blazingly fast, but it
unfortunately makes the write-side too cache line intense.
Since local counter reads were never a problem - we only centralized
them to accelerate the hierarchy walk - and use of the local counters
are becoming rarer due to replacement with hierarchical views (ongoing
rework in the page reclaim and workingset code), we can make those local
counters unbatched per-cpu counters again.
The scheme will then be as such:
when a memcg statistic changes, the writer will:
- update the local counter (per-cpu)
- update the batch counter (per-cpu). If the batch is full:
- spill the batch into the group's atomic_t
- spill the batch into all ancestors' atomic_ts
- empty out the batch counter (per-cpu)
when a local memcg counter is read, the reader will:
- collect the local counter from all cpus
when a hiearchy memcg counter is read, the reader will:
- read the atomic_t
We might be able to simplify this further and make the recursive
counters unbatched per-cpu counters as well (batch upward propagation,
but leave per-cpu collection to the readers), but that will require a
more in-depth analysis and testing of all the callsites. Deal with the
immediate regression for now.
Link: http://lkml.kernel.org/r/20190521151647.GB2870@cmpxchg.org
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: kernel test robot <rong.a.chen@intel.com>
Tested-by: kernel test robot <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-14 05:55:46 +07:00
|
|
|
free_percpu(pn->lruvec_stat_local);
|
2017-07-07 05:40:52 +07:00
|
|
|
kfree(pn);
|
2008-02-07 15:14:38 +07:00
|
|
|
}
|
|
|
|
|
2017-03-10 07:17:26 +07:00
|
|
|
static void __mem_cgroup_free(struct mem_cgroup *memcg)
|
memcg: free mem_cgroup by RCU to fix oops
After fixing the GPF in mem_cgroup_lru_del_list(), three times one
machine running a similar load (moving and removing memcgs while
swapping) has oopsed in mem_cgroup_zone_nr_lru_pages(), when retrieving
memcg zone numbers for get_scan_count() for shrink_mem_cgroup_zone():
this is where a struct mem_cgroup is first accessed after being chosen
by mem_cgroup_iter().
Just what protects a struct mem_cgroup from being freed, in between
mem_cgroup_iter()'s css_get_next() and its css_tryget()? css_tryget()
fails once css->refcnt is zero with CSS_REMOVED set in flags, yes: but
what if that memory is freed and reused for something else, which sets
"refcnt" non-zero? Hmm, and scope for an indefinite freeze if refcnt is
left at zero but flags are cleared.
It's tempting to move the css_tryget() into css_get_next(), to make it
really "get" the css, but I don't think that actually solves anything:
the same difficulty in moving from css_id found to stable css remains.
But we already have rcu_read_lock() around the two, so it's easily fixed
if __mem_cgroup_free() just uses kfree_rcu() to free mem_cgroup.
However, a big struct mem_cgroup is allocated with vzalloc() instead of
kzalloc(), and we're not allowed to vfree() at interrupt time: there
doesn't appear to be a general vfree_rcu() to help with this, so roll
our own using schedule_work(). The compiler decently removes
vfree_work() and vfree_rcu() when the config doesn't need them.
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-16 05:17:07 +07:00
|
|
|
{
|
memcg: execute the whole memcg freeing in free_worker()
A lot of the initialization we do in mem_cgroup_create() is done with
softirqs enabled. This include grabbing a css id, which holds
&ss->id_lock->rlock, and the per-zone trees, which holds
rtpz->lock->rlock. All of those signal to the lockdep mechanism that
those locks can be used in SOFTIRQ-ON-W context.
This means that the freeing of memcg structure must happen in a
compatible context, otherwise we'll get a deadlock, like the one below,
caught by lockdep:
free_accounted_pages+0x47/0x4c
free_task+0x31/0x5c
__put_task_struct+0xc2/0xdb
put_task_struct+0x1e/0x22
delayed_put_task_struct+0x7a/0x98
__rcu_process_callbacks+0x269/0x3df
rcu_process_callbacks+0x31/0x5b
__do_softirq+0x122/0x277
This usage pattern could not be triggered before kmem came into play.
With the introduction of kmem stack handling, it is possible that we call
the last mem_cgroup_put() from the task destructor, which is run in an rcu
callback. Such callbacks are run with softirqs disabled, leading to the
offensive usage pattern.
In general, we have little, if any, means to guarantee in which context
the last memcg_put will happen. The best we can do is test it and try to
make sure no invalid context releases are happening. But as we add more
code to memcg, the possible interactions grow in number and expose more
ways to get context conflicts. One thing to keep in mind, is that part of
the freeing process is already deferred to a worker, such as vfree(), that
can only be called from process context.
For the moment, the only two functions we really need moved away are:
* free_css_id(), and
* mem_cgroup_remove_from_trees().
But because the later accesses per-zone info,
free_mem_cgroup_per_zone_info() needs to be moved as well. With that, we
are left with the per_cpu stats only. Better move it all.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Tested-by: Greg Thelen <gthelen@google.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:13 +07:00
|
|
|
int node;
|
memcg: free mem_cgroup by RCU to fix oops
After fixing the GPF in mem_cgroup_lru_del_list(), three times one
machine running a similar load (moving and removing memcgs while
swapping) has oopsed in mem_cgroup_zone_nr_lru_pages(), when retrieving
memcg zone numbers for get_scan_count() for shrink_mem_cgroup_zone():
this is where a struct mem_cgroup is first accessed after being chosen
by mem_cgroup_iter().
Just what protects a struct mem_cgroup from being freed, in between
mem_cgroup_iter()'s css_get_next() and its css_tryget()? css_tryget()
fails once css->refcnt is zero with CSS_REMOVED set in flags, yes: but
what if that memory is freed and reused for something else, which sets
"refcnt" non-zero? Hmm, and scope for an indefinite freeze if refcnt is
left at zero but flags are cleared.
It's tempting to move the css_tryget() into css_get_next(), to make it
really "get" the css, but I don't think that actually solves anything:
the same difficulty in moving from css_id found to stable css remains.
But we already have rcu_read_lock() around the two, so it's easily fixed
if __mem_cgroup_free() just uses kfree_rcu() to free mem_cgroup.
However, a big struct mem_cgroup is allocated with vzalloc() instead of
kzalloc(), and we're not allowed to vfree() at interrupt time: there
doesn't appear to be a general vfree_rcu() to help with this, so roll
our own using schedule_work(). The compiler decently removes
vfree_work() and vfree_rcu() when the config doesn't need them.
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-16 05:17:07 +07:00
|
|
|
|
memcg: execute the whole memcg freeing in free_worker()
A lot of the initialization we do in mem_cgroup_create() is done with
softirqs enabled. This include grabbing a css id, which holds
&ss->id_lock->rlock, and the per-zone trees, which holds
rtpz->lock->rlock. All of those signal to the lockdep mechanism that
those locks can be used in SOFTIRQ-ON-W context.
This means that the freeing of memcg structure must happen in a
compatible context, otherwise we'll get a deadlock, like the one below,
caught by lockdep:
free_accounted_pages+0x47/0x4c
free_task+0x31/0x5c
__put_task_struct+0xc2/0xdb
put_task_struct+0x1e/0x22
delayed_put_task_struct+0x7a/0x98
__rcu_process_callbacks+0x269/0x3df
rcu_process_callbacks+0x31/0x5b
__do_softirq+0x122/0x277
This usage pattern could not be triggered before kmem came into play.
With the introduction of kmem stack handling, it is possible that we call
the last mem_cgroup_put() from the task destructor, which is run in an rcu
callback. Such callbacks are run with softirqs disabled, leading to the
offensive usage pattern.
In general, we have little, if any, means to guarantee in which context
the last memcg_put will happen. The best we can do is test it and try to
make sure no invalid context releases are happening. But as we add more
code to memcg, the possible interactions grow in number and expose more
ways to get context conflicts. One thing to keep in mind, is that part of
the freeing process is already deferred to a worker, such as vfree(), that
can only be called from process context.
For the moment, the only two functions we really need moved away are:
* free_css_id(), and
* mem_cgroup_remove_from_trees().
But because the later accesses per-zone info,
free_mem_cgroup_per_zone_info() needs to be moved as well. With that, we
are left with the per_cpu stats only. Better move it all.
Signed-off-by: Glauber Costa <glommer@parallels.com>
Tested-by: Greg Thelen <gthelen@google.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-19 05:22:13 +07:00
|
|
|
for_each_node(node)
|
2016-07-29 05:46:05 +07:00
|
|
|
free_mem_cgroup_per_node_info(memcg, node);
|
2019-05-15 05:46:57 +07:00
|
|
|
free_percpu(memcg->vmstats_percpu);
|
mm: memcontrol: don't batch updates of local VM stats and events
The kernel test robot noticed a 26% will-it-scale pagefault regression
from commit 42a300353577 ("mm: memcontrol: fix recursive statistics
correctness & scalabilty"). This appears to be caused by bouncing the
additional cachelines from the new hierarchical statistics counters.
We can fix this by getting rid of the batched local counters instead.
Originally, there were *only* group-local counters, and they were fully
maintained per cpu. A reader of a stats file high up in the cgroup tree
would have to walk the entire subtree and collect each level's per-cpu
counters to get the recursive view. This was prohibitively expensive,
and so we switched to per-cpu batched updates of the local counters
during a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting"), reducing the complexity from nr_subgroups *
nr_cpus to nr_subgroups.
With growing machines and cgroup trees, the tree walk itself became too
expensive for monitoring top-level groups, and this is when the culprit
patch added hierarchy counters on each cgroup level. When the per-cpu
batch size would be reached, both the local and the hierarchy counters
would get batch-updated from the per-cpu delta simultaneously.
This makes local and hierarchical counter reads blazingly fast, but it
unfortunately makes the write-side too cache line intense.
Since local counter reads were never a problem - we only centralized
them to accelerate the hierarchy walk - and use of the local counters
are becoming rarer due to replacement with hierarchical views (ongoing
rework in the page reclaim and workingset code), we can make those local
counters unbatched per-cpu counters again.
The scheme will then be as such:
when a memcg statistic changes, the writer will:
- update the local counter (per-cpu)
- update the batch counter (per-cpu). If the batch is full:
- spill the batch into the group's atomic_t
- spill the batch into all ancestors' atomic_ts
- empty out the batch counter (per-cpu)
when a local memcg counter is read, the reader will:
- collect the local counter from all cpus
when a hiearchy memcg counter is read, the reader will:
- read the atomic_t
We might be able to simplify this further and make the recursive
counters unbatched per-cpu counters as well (batch upward propagation,
but leave per-cpu collection to the readers), but that will require a
more in-depth analysis and testing of all the callsites. Deal with the
immediate regression for now.
Link: http://lkml.kernel.org/r/20190521151647.GB2870@cmpxchg.org
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: kernel test robot <rong.a.chen@intel.com>
Tested-by: kernel test robot <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-14 05:55:46 +07:00
|
|
|
free_percpu(memcg->vmstats_local);
|
2014-01-24 06:52:52 +07:00
|
|
|
kfree(memcg);
|
memcg: free mem_cgroup by RCU to fix oops
After fixing the GPF in mem_cgroup_lru_del_list(), three times one
machine running a similar load (moving and removing memcgs while
swapping) has oopsed in mem_cgroup_zone_nr_lru_pages(), when retrieving
memcg zone numbers for get_scan_count() for shrink_mem_cgroup_zone():
this is where a struct mem_cgroup is first accessed after being chosen
by mem_cgroup_iter().
Just what protects a struct mem_cgroup from being freed, in between
mem_cgroup_iter()'s css_get_next() and its css_tryget()? css_tryget()
fails once css->refcnt is zero with CSS_REMOVED set in flags, yes: but
what if that memory is freed and reused for something else, which sets
"refcnt" non-zero? Hmm, and scope for an indefinite freeze if refcnt is
left at zero but flags are cleared.
It's tempting to move the css_tryget() into css_get_next(), to make it
really "get" the css, but I don't think that actually solves anything:
the same difficulty in moving from css_id found to stable css remains.
But we already have rcu_read_lock() around the two, so it's easily fixed
if __mem_cgroup_free() just uses kfree_rcu() to free mem_cgroup.
However, a big struct mem_cgroup is allocated with vzalloc() instead of
kzalloc(), and we're not allowed to vfree() at interrupt time: there
doesn't appear to be a general vfree_rcu() to help with this, so roll
our own using schedule_work(). The compiler decently removes
vfree_work() and vfree_rcu() when the config doesn't need them.
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-16 05:17:07 +07:00
|
|
|
}
|
2012-05-30 05:07:10 +07:00
|
|
|
|
2017-03-10 07:17:26 +07:00
|
|
|
static void mem_cgroup_free(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
memcg_wb_domain_exit(memcg);
|
2019-11-06 12:16:21 +07:00
|
|
|
/*
|
|
|
|
* Flush percpu vmstats and vmevents to guarantee the value correctness
|
|
|
|
* on parent's and all ancestor levels.
|
|
|
|
*/
|
mm: memcg/slab: fix percpu slab vmstats flushing
Currently slab percpu vmstats are flushed twice: during the memcg
offlining and just before freeing the memcg structure. Each time percpu
counters are summed, added to the atomic counterparts and propagated up
by the cgroup tree.
The second flushing is required due to how recursive vmstats are
implemented: counters are batched in percpu variables on a local level,
and once a percpu value is crossing some predefined threshold, it spills
over to atomic values on the local and each ascendant levels. It means
that without flushing some numbers cached in percpu variables will be
dropped on floor each time a cgroup is destroyed. And with uptime the
error on upper levels might become noticeable.
The first flushing aims to make counters on ancestor levels more
precise. Dying cgroups may resume in the dying state for a long time.
After kmem_cache reparenting which is performed during the offlining
slab counters of the dying cgroup don't have any chances to be updated,
because any slab operations will be performed on the parent level. It
means that the inaccuracy caused by percpu batching will not decrease up
to the final destruction of the cgroup. By the original idea flushing
slab counters during the offlining should minimize the visible
inaccuracy of slab counters on the parent level.
The problem is that percpu counters are not zeroed after the first
flushing. So every cached percpu value is summed twice. It creates a
small error (up to 32 pages per cpu, but usually less) which accumulates
on parent cgroup level. After creating and destroying of thousands of
child cgroups, slab counter on parent level can be way off the real
value.
For now, let's just stop flushing slab counters on memcg offlining. It
can't be done correctly without scheduling a work on each cpu: reading
and zeroing it during css offlining can race with an asynchronous
update, which doesn't expect values to be changed underneath.
With this change, slab counters on parent level will become eventually
consistent. Once all dying children are gone, values are correct. And
if not, the error is capped by 32 * NR_CPUS pages per dying cgroup.
It's not perfect, as slab are reparented, so any updates after the
reparenting will happen on the parent level. It means that if a slab
page was allocated, a counter on child level was bumped, then the page
was reparented and freed, the annihilation of positive and negative
counter values will not happen until the child cgroup is released. It
makes slab counters different from others, and it might want us to
implement flushing in a correct form again. But it's also a question of
performance: scheduling a work on each cpu isn't free, and it's an open
question if the benefit of having more accurate counters is worth it.
We might also consider flushing all counters on offlining, not only slab
counters.
So let's fix the main problem now: make the slab counters eventually
consistent, so at least the error won't grow with uptime (or more
precisely the number of created and destroyed cgroups). And think about
the accuracy of counters separately.
Link: http://lkml.kernel.org/r/20191220042728.1045881-1-guro@fb.com
Fixes: bee07b33db78 ("mm: memcontrol: flush percpu slab vmstats on kmem offlining")
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-01-14 07:29:16 +07:00
|
|
|
memcg_flush_percpu_vmstats(memcg);
|
2019-11-06 12:16:21 +07:00
|
|
|
memcg_flush_percpu_vmevents(memcg);
|
2017-03-10 07:17:26 +07:00
|
|
|
__mem_cgroup_free(memcg);
|
|
|
|
}
|
|
|
|
|
2016-01-21 06:02:53 +07:00
|
|
|
static struct mem_cgroup *mem_cgroup_alloc(void)
|
2008-02-07 15:13:50 +07:00
|
|
|
{
|
2013-02-23 07:34:52 +07:00
|
|
|
struct mem_cgroup *memcg;
|
2019-03-06 06:48:26 +07:00
|
|
|
unsigned int size;
|
2008-02-07 15:14:31 +07:00
|
|
|
int node;
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 23:06:56 +07:00
|
|
|
int __maybe_unused i;
|
2020-05-08 08:35:43 +07:00
|
|
|
long error = -ENOMEM;
|
2008-02-07 15:13:50 +07:00
|
|
|
|
2016-01-21 06:02:53 +07:00
|
|
|
size = sizeof(struct mem_cgroup);
|
|
|
|
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
|
|
|
|
|
|
|
|
memcg = kzalloc(size, GFP_KERNEL);
|
2011-11-03 03:38:15 +07:00
|
|
|
if (!memcg)
|
2020-05-08 08:35:43 +07:00
|
|
|
return ERR_PTR(error);
|
2016-01-21 06:02:53 +07:00
|
|
|
|
2016-07-21 05:44:57 +07:00
|
|
|
memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
|
|
|
|
1, MEM_CGROUP_ID_MAX,
|
|
|
|
GFP_KERNEL);
|
2020-05-08 08:35:43 +07:00
|
|
|
if (memcg->id.id < 0) {
|
|
|
|
error = memcg->id.id;
|
2016-07-21 05:44:57 +07:00
|
|
|
goto fail;
|
2020-05-08 08:35:43 +07:00
|
|
|
}
|
2016-07-21 05:44:57 +07:00
|
|
|
|
2020-08-12 08:30:25 +07:00
|
|
|
memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
|
|
|
|
GFP_KERNEL_ACCOUNT);
|
mm: memcontrol: don't batch updates of local VM stats and events
The kernel test robot noticed a 26% will-it-scale pagefault regression
from commit 42a300353577 ("mm: memcontrol: fix recursive statistics
correctness & scalabilty"). This appears to be caused by bouncing the
additional cachelines from the new hierarchical statistics counters.
We can fix this by getting rid of the batched local counters instead.
Originally, there were *only* group-local counters, and they were fully
maintained per cpu. A reader of a stats file high up in the cgroup tree
would have to walk the entire subtree and collect each level's per-cpu
counters to get the recursive view. This was prohibitively expensive,
and so we switched to per-cpu batched updates of the local counters
during a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting"), reducing the complexity from nr_subgroups *
nr_cpus to nr_subgroups.
With growing machines and cgroup trees, the tree walk itself became too
expensive for monitoring top-level groups, and this is when the culprit
patch added hierarchy counters on each cgroup level. When the per-cpu
batch size would be reached, both the local and the hierarchy counters
would get batch-updated from the per-cpu delta simultaneously.
This makes local and hierarchical counter reads blazingly fast, but it
unfortunately makes the write-side too cache line intense.
Since local counter reads were never a problem - we only centralized
them to accelerate the hierarchy walk - and use of the local counters
are becoming rarer due to replacement with hierarchical views (ongoing
rework in the page reclaim and workingset code), we can make those local
counters unbatched per-cpu counters again.
The scheme will then be as such:
when a memcg statistic changes, the writer will:
- update the local counter (per-cpu)
- update the batch counter (per-cpu). If the batch is full:
- spill the batch into the group's atomic_t
- spill the batch into all ancestors' atomic_ts
- empty out the batch counter (per-cpu)
when a local memcg counter is read, the reader will:
- collect the local counter from all cpus
when a hiearchy memcg counter is read, the reader will:
- read the atomic_t
We might be able to simplify this further and make the recursive
counters unbatched per-cpu counters as well (batch upward propagation,
but leave per-cpu collection to the readers), but that will require a
more in-depth analysis and testing of all the callsites. Deal with the
immediate regression for now.
Link: http://lkml.kernel.org/r/20190521151647.GB2870@cmpxchg.org
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: kernel test robot <rong.a.chen@intel.com>
Tested-by: kernel test robot <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-14 05:55:46 +07:00
|
|
|
if (!memcg->vmstats_local)
|
|
|
|
goto fail;
|
|
|
|
|
2020-08-12 08:30:25 +07:00
|
|
|
memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
|
|
|
|
GFP_KERNEL_ACCOUNT);
|
2019-05-15 05:46:57 +07:00
|
|
|
if (!memcg->vmstats_percpu)
|
2016-01-21 06:02:53 +07:00
|
|
|
goto fail;
|
2008-02-07 15:13:51 +07:00
|
|
|
|
2012-01-13 08:19:04 +07:00
|
|
|
for_each_node(node)
|
2016-07-29 05:46:05 +07:00
|
|
|
if (alloc_mem_cgroup_per_node_info(memcg, node))
|
2016-01-21 06:02:53 +07:00
|
|
|
goto fail;
|
2009-09-24 05:56:37 +07:00
|
|
|
|
2016-01-21 06:02:53 +07:00
|
|
|
if (memcg_wb_domain_init(memcg, GFP_KERNEL))
|
|
|
|
goto fail;
|
2009-01-08 09:08:05 +07:00
|
|
|
|
2016-01-15 06:21:29 +07:00
|
|
|
INIT_WORK(&memcg->high_work, high_work_func);
|
2013-02-23 07:34:52 +07:00
|
|
|
INIT_LIST_HEAD(&memcg->oom_notify);
|
|
|
|
mutex_init(&memcg->thresholds_lock);
|
|
|
|
spin_lock_init(&memcg->move_lock);
|
memcg: add memory.pressure_level events
With this patch userland applications that want to maintain the
interactivity/memory allocation cost can use the pressure level
notifications. The levels are defined like this:
The "low" level means that the system is reclaiming memory for new
allocations. Monitoring this reclaiming activity might be useful for
maintaining cache level. Upon notification, the program (typically
"Activity Manager") might analyze vmstat and act in advance (i.e.
prematurely shutdown unimportant services).
The "medium" level means that the system is experiencing medium memory
pressure, the system might be making swap, paging out active file
caches, etc. Upon this event applications may decide to further analyze
vmstat/zoneinfo/memcg or internal memory usage statistics and free any
resources that can be easily reconstructed or re-read from a disk.
The "critical" level means that the system is actively thrashing, it is
about to out of memory (OOM) or even the in-kernel OOM killer is on its
way to trigger. Applications should do whatever they can to help the
system. It might be too late to consult with vmstat or any other
statistics, so it's advisable to take an immediate action.
The events are propagated upward until the event is handled, i.e. the
events are not pass-through. Here is what this means: for example you
have three cgroups: A->B->C. Now you set up an event listener on
cgroups A, B and C, and suppose group C experiences some pressure. In
this situation, only group C will receive the notification, i.e. groups
A and B will not receive it. This is done to avoid excessive
"broadcasting" of messages, which disturbs the system and which is
especially bad if we are low on memory or thrashing. So, organize the
cgroups wisely, or propagate the events manually (or, ask us to
implement the pass-through events, explaining why would you need them.)
Performance wise, the memory pressure notifications feature itself is
lightweight and does not require much of bookkeeping, in contrast to the
rest of memcg features. Unfortunately, as of current memcg
implementation, pages accounting is an inseparable part and cannot be
turned off. The good news is that there are some efforts[1] to improve
the situation; plus, implementing the same, fully API-compatible[2]
interface for CONFIG_MEMCG=n case (e.g. embedded) is also a viable
option, so it will not require any changes on the userland side.
[1] http://permalink.gmane.org/gmane.linux.kernel.cgroups/6291
[2] http://lkml.org/lkml/2013/2/21/454
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix CONFIG_CGROPUPS=n warnings]
Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
Acked-by: Kirill A. Shutemov <kirill@shutemov.name>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Leonid Moiseichuk <leonid.moiseichuk@nokia.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: John Stultz <john.stultz@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-30 05:08:31 +07:00
|
|
|
vmpressure_init(&memcg->vmpressure);
|
2013-11-23 06:20:43 +07:00
|
|
|
INIT_LIST_HEAD(&memcg->event_list);
|
|
|
|
spin_lock_init(&memcg->event_list_lock);
|
2016-01-21 06:02:47 +07:00
|
|
|
memcg->socket_pressure = jiffies;
|
2018-08-18 05:47:25 +07:00
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
2014-12-13 07:55:10 +07:00
|
|
|
memcg->kmemcg_id = -1;
|
2020-08-07 13:20:49 +07:00
|
|
|
INIT_LIST_HEAD(&memcg->objcg_list);
|
2014-12-13 07:55:10 +07:00
|
|
|
#endif
|
2015-05-23 04:13:37 +07:00
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
|
|
INIT_LIST_HEAD(&memcg->cgwb_list);
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 23:06:56 +07:00
|
|
|
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
|
|
|
|
memcg->cgwb_frn[i].done =
|
|
|
|
__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
|
2019-09-24 05:38:15 +07:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
|
|
|
|
INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
|
|
|
|
memcg->deferred_split_queue.split_queue_len = 0;
|
2015-05-23 04:13:37 +07:00
|
|
|
#endif
|
2016-07-21 05:44:57 +07:00
|
|
|
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
|
2016-01-21 06:02:53 +07:00
|
|
|
return memcg;
|
|
|
|
fail:
|
2018-08-03 05:36:01 +07:00
|
|
|
mem_cgroup_id_remove(memcg);
|
2017-03-10 07:17:26 +07:00
|
|
|
__mem_cgroup_free(memcg);
|
2020-05-08 08:35:43 +07:00
|
|
|
return ERR_PTR(error);
|
2013-02-23 07:34:52 +07:00
|
|
|
}
|
|
|
|
|
2016-01-21 06:02:53 +07:00
|
|
|
static struct cgroup_subsys_state * __ref
|
|
|
|
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
|
2013-02-23 07:34:52 +07:00
|
|
|
{
|
2016-01-21 06:02:53 +07:00
|
|
|
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
|
2020-10-18 06:13:40 +07:00
|
|
|
struct mem_cgroup *memcg, *old_memcg;
|
2016-01-21 06:02:53 +07:00
|
|
|
long error = -ENOMEM;
|
2013-02-23 07:34:52 +07:00
|
|
|
|
2020-10-18 06:13:40 +07:00
|
|
|
old_memcg = set_active_memcg(parent);
|
2016-01-21 06:02:53 +07:00
|
|
|
memcg = mem_cgroup_alloc();
|
2020-10-18 06:13:40 +07:00
|
|
|
set_active_memcg(old_memcg);
|
2020-05-08 08:35:43 +07:00
|
|
|
if (IS_ERR(memcg))
|
|
|
|
return ERR_CAST(memcg);
|
2013-02-23 07:34:52 +07:00
|
|
|
|
2020-06-02 11:49:49 +07:00
|
|
|
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
|
2016-01-21 06:02:53 +07:00
|
|
|
memcg->soft_limit = PAGE_COUNTER_MAX;
|
2020-06-02 11:49:52 +07:00
|
|
|
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
|
2016-01-21 06:02:53 +07:00
|
|
|
if (parent) {
|
|
|
|
memcg->swappiness = mem_cgroup_swappiness(parent);
|
|
|
|
memcg->oom_kill_disable = parent->oom_kill_disable;
|
|
|
|
}
|
2020-11-02 08:07:34 +07:00
|
|
|
if (!parent) {
|
|
|
|
page_counter_init(&memcg->memory, NULL);
|
|
|
|
page_counter_init(&memcg->swap, NULL);
|
|
|
|
page_counter_init(&memcg->kmem, NULL);
|
|
|
|
page_counter_init(&memcg->tcpmem, NULL);
|
|
|
|
} else if (parent->use_hierarchy) {
|
2016-01-21 06:02:53 +07:00
|
|
|
memcg->use_hierarchy = true;
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
page_counter_init(&memcg->memory, &parent->memory);
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
page_counter_init(&memcg->swap, &parent->swap);
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
page_counter_init(&memcg->kmem, &parent->kmem);
|
2016-01-21 06:02:50 +07:00
|
|
|
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
|
2009-01-08 09:08:07 +07:00
|
|
|
} else {
|
2020-11-02 08:07:34 +07:00
|
|
|
page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
|
|
|
|
page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
|
|
|
|
page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
|
|
|
|
page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
|
2012-09-14 02:20:58 +07:00
|
|
|
/*
|
|
|
|
* Deeper hierachy with use_hierarchy == false doesn't make
|
|
|
|
* much sense so let cgroup subsystem know about this
|
|
|
|
* unfortunate state in our controller.
|
|
|
|
*/
|
2013-02-23 07:34:52 +07:00
|
|
|
if (parent != root_mem_cgroup)
|
2014-02-08 22:36:58 +07:00
|
|
|
memory_cgrp_subsys.broken_hierarchy = true;
|
2009-01-08 09:08:07 +07:00
|
|
|
}
|
2014-01-24 06:53:09 +07:00
|
|
|
|
2016-01-21 06:02:53 +07:00
|
|
|
/* The following stuff does not apply to the root */
|
|
|
|
if (!parent) {
|
|
|
|
root_mem_cgroup = memcg;
|
|
|
|
return &memcg->css;
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: enable kmem accounting for all cgroups in the legacy hierarchy
Workingset code was recently made memcg aware, but shadow node shrinker
is still global. As a result, one small cgroup can consume all memory
available for shadow nodes, possibly hurting other cgroups by reclaiming
their shadow nodes, even though reclaim distances stored in its shadow
nodes have no effect. To avoid this, we need to make shadow node
shrinker memcg aware.
The actual work is done in patch 6 of the series. Patches 1 and 2
prepare memcg/shrinker infrastructure for the change. Patch 3 is just a
collateral cleanup. Patch 4 makes radix_tree_node accounted, which is
necessary for making shadow node shrinker memcg aware. Patch 5 reduces
shadow nodes overhead in case workload mostly uses anonymous pages.
This patch:
Currently, in the legacy hierarchy kmem accounting is off for all
cgroups by default and must be enabled explicitly by writing something
to memory.kmem.limit_in_bytes. Since we don't support reclaim on
hitting kmem limit, nor do we have any plans to implement it, this is
likely to be -1, just to enable kmem accounting and limit kernel memory
consumption by the memory.limit_in_bytes along with user memory.
This user API was introduced when the implementation of kmem accounting
lacked slab shrinker support and hence was useless in practice. Things
have changed since then - slab shrinkers were made memcg aware, the
accounting overhead seems to be negligible, and a failure to charge a
kmem allocation should not have critical consequences, because we only
account those kernel objects that should be safe to fail. That's why
kmem accounting is enabled by default for all cgroups in the default
hierarchy, which will eventually replace the legacy one.
The ability to enable kmem accounting for some cgroups while keeping it
disabled for others is getting difficult to maintain. E.g. to make
shadow node shrinker memcg aware (see mm/workingset.c), we need to know
the relationship between the number of shadow nodes allocated for a
cgroup and the size of its lru list. If kmem accounting is enabled for
all cgroups there is no problem, but what should we do if kmem
accounting is enabled only for half of cgroups? We've no other choice
but use global lru stats while scanning root cgroup's shadow nodes, but
that would be wrong if kmem accounting was enabled for all cgroups
(which is the case if the unified hierarchy is used), in which case we
should use lru stats of the root cgroup's lruvec.
That being said, let's enable kmem accounting for all memory cgroups by
default. If one finds it unstable or too costly, it can always be
disabled system-wide by passing cgroup.memory=nokmem to the kernel at
boot time.
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-18 04:18:27 +07:00
|
|
|
error = memcg_online_kmem(memcg);
|
2016-01-21 06:02:53 +07:00
|
|
|
if (error)
|
|
|
|
goto fail;
|
2016-01-21 06:02:32 +07:00
|
|
|
|
2016-01-15 06:21:29 +07:00
|
|
|
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
|
2016-01-15 06:21:34 +07:00
|
|
|
static_branch_inc(&memcg_sockets_enabled_key);
|
2016-01-15 06:21:29 +07:00
|
|
|
|
2016-01-21 06:02:53 +07:00
|
|
|
return &memcg->css;
|
|
|
|
fail:
|
2018-08-03 05:36:01 +07:00
|
|
|
mem_cgroup_id_remove(memcg);
|
2016-01-21 06:02:53 +07:00
|
|
|
mem_cgroup_free(memcg);
|
2020-05-08 08:35:43 +07:00
|
|
|
return ERR_PTR(error);
|
2016-01-21 06:02:53 +07:00
|
|
|
}
|
|
|
|
|
2016-07-21 05:44:57 +07:00
|
|
|
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
|
2016-01-21 06:02:53 +07:00
|
|
|
{
|
2016-10-08 06:57:29 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
|
|
|
|
mm, memcg: assign memcg-aware shrinkers bitmap to memcg
Imagine a big node with many cpus, memory cgroups and containers. Let
we have 200 containers, every container has 10 mounts, and 10 cgroups.
All container tasks don't touch foreign containers mounts. If there is
intensive pages write, and global reclaim happens, a writing task has to
iterate over all memcgs to shrink slab, before it's able to go to
shrink_page_list().
Iteration over all the memcg slabs is very expensive: the task has to
visit 200 * 10 = 2000 shrinkers for every memcg, and since there are
2000 memcgs, the total calls are 2000 * 2000 = 4000000.
So, the shrinker makes 4 million do_shrink_slab() calls just to try to
isolate SWAP_CLUSTER_MAX pages in one of the actively writing memcg via
shrink_page_list(). I've observed a node spending almost 100% in
kernel, making useless iteration over already shrinked slab.
This patch adds bitmap of memcg-aware shrinkers to memcg. The size of
the bitmap depends on bitmap_nr_ids, and during memcg life it's
maintained to be enough to fit bitmap_nr_ids shrinkers. Every bit in
the map is related to corresponding shrinker id.
Next patches will maintain set bit only for really charged memcg. This
will allow shrink_slab() to increase its performance in significant way.
See the last patch for the numbers.
[ktkhai@virtuozzo.com: v9]
Link: http://lkml.kernel.org/r/153112549031.4097.3576147070498769979.stgit@localhost.localdomain
[ktkhai@virtuozzo.com: add comment to mem_cgroup_css_online()]
Link: http://lkml.kernel.org/r/521f9e5f-c436-b388-fe83-4dc870bfb489@virtuozzo.com
Link: http://lkml.kernel.org/r/153063056619.1818.12550500883688681076.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sahitya Tummala <stummala@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:47:37 +07:00
|
|
|
/*
|
|
|
|
* A memcg must be visible for memcg_expand_shrinker_maps()
|
|
|
|
* by the time the maps are allocated. So, we allocate maps
|
|
|
|
* here, when for_each_mem_cgroup() can't skip it.
|
|
|
|
*/
|
|
|
|
if (memcg_alloc_shrinker_maps(memcg)) {
|
|
|
|
mem_cgroup_id_remove(memcg);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2016-07-21 05:44:57 +07:00
|
|
|
/* Online state pins memcg ID, memcg ID pins CSS */
|
2018-10-27 05:09:28 +07:00
|
|
|
refcount_set(&memcg->id.ref, 1);
|
2016-07-21 05:44:57 +07:00
|
|
|
css_get(css);
|
2014-10-03 06:16:57 +07:00
|
|
|
return 0;
|
2008-02-07 15:13:50 +07:00
|
|
|
}
|
|
|
|
|
2013-08-09 07:11:23 +07:00
|
|
|
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
|
2008-02-07 15:14:28 +07:00
|
|
|
{
|
2013-08-09 07:11:23 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
2013-11-23 06:20:44 +07:00
|
|
|
struct mem_cgroup_event *event, *tmp;
|
2013-11-23 06:20:42 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Unregister events and notify userspace.
|
|
|
|
* Notify userspace about cgroup removing only after rmdir of cgroup
|
|
|
|
* directory to avoid race between userspace and kernelspace.
|
|
|
|
*/
|
2013-11-23 06:20:43 +07:00
|
|
|
spin_lock(&memcg->event_list_lock);
|
|
|
|
list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
|
2013-11-23 06:20:42 +07:00
|
|
|
list_del_init(&event->list);
|
|
|
|
schedule_work(&event->remove);
|
|
|
|
}
|
2013-11-23 06:20:43 +07:00
|
|
|
spin_unlock(&memcg->event_list_lock);
|
2009-04-03 06:57:26 +07:00
|
|
|
|
2018-06-08 07:07:46 +07:00
|
|
|
page_counter_set_min(&memcg->memory, 0);
|
mm: memory.low hierarchical behavior
This patch aims to address an issue in current memory.low semantics,
which makes it hard to use it in a hierarchy, where some leaf memory
cgroups are more valuable than others.
For example, there are memcgs A, A/B, A/C, A/D and A/E:
A A/memory.low = 2G, A/memory.current = 6G
//\\
BC DE B/memory.low = 3G B/memory.current = 2G
C/memory.low = 1G C/memory.current = 2G
D/memory.low = 0 D/memory.current = 2G
E/memory.low = 10G E/memory.current = 0
If we apply memory pressure, B, C and D are reclaimed at the same pace
while A's usage exceeds 2G. This is obviously wrong, as B's usage is
fully below B's memory.low, and C has 1G of protection as well. Also, A
is pushed to the size, which is less than A's 2G memory.low, which is
also wrong.
A simple bash script (provided below) can be used to reproduce
the problem. Current results are:
A: 1430097920
A/B: 711929856
A/C: 717426688
A/D: 741376
A/E: 0
To address the issue a concept of effective memory.low is introduced.
Effective memory.low is always equal or less than original memory.low.
In a case, when there is no memory.low overcommittment (and also for
top-level cgroups), these two values are equal.
Otherwise it's a part of parent's effective memory.low, calculated as a
cgroup's memory.low usage divided by sum of sibling's memory.low usages
(under memory.low usage I mean the size of actually protected memory:
memory.current if memory.current < memory.low, 0 otherwise). It's
necessary to track the actual usage, because otherwise an empty cgroup
with memory.low set (A/E in my example) will affect actual memory
distribution, which makes no sense. To avoid traversing the cgroup tree
twice, page_counters code is reused.
Calculating effective memory.low can be done in the reclaim path, as we
conveniently traversing the cgroup tree from top to bottom and check
memory.low on each level. So, it's a perfect place to calculate
effective memory low and save it to use it for children cgroups.
This also eliminates a need to traverse the cgroup tree from bottom to
top each time to check if parent's guarantee is not exceeded.
Setting/resetting effective memory.low is intentionally racy, but it's
fine and shouldn't lead to any significant differences in actual memory
distribution.
With this patch applied results are matching the expectations:
A: 2147930112
A/B: 1428721664
A/C: 718393344
A/D: 815104
A/E: 0
Test script:
#!/bin/bash
CGPATH="/sys/fs/cgroup"
truncate /file1 --size 2G
truncate /file2 --size 2G
truncate /file3 --size 2G
truncate /file4 --size 50G
mkdir "${CGPATH}/A"
echo "+memory" > "${CGPATH}/A/cgroup.subtree_control"
mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
echo 2G > "${CGPATH}/A/memory.low"
echo 3G > "${CGPATH}/A/B/memory.low"
echo 1G > "${CGPATH}/A/C/memory.low"
echo 0 > "${CGPATH}/A/D/memory.low"
echo 10G > "${CGPATH}/A/E/memory.low"
echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1
echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2
echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3
echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4
echo "A: " `cat "${CGPATH}/A/memory.current"`
echo "A/B: " `cat "${CGPATH}/A/B/memory.current"`
echo "A/C: " `cat "${CGPATH}/A/C/memory.current"`
echo "A/D: " `cat "${CGPATH}/A/D/memory.current"`
echo "A/E: " `cat "${CGPATH}/A/E/memory.current"`
rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
rmdir "${CGPATH}/A"
rm /file1 /file2 /file3 /file4
Link: http://lkml.kernel.org/r/20180405185921.4942-2-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-06-08 07:06:22 +07:00
|
|
|
page_counter_set_low(&memcg->memory, 0);
|
2017-09-07 06:21:47 +07:00
|
|
|
|
2016-01-21 06:02:24 +07:00
|
|
|
memcg_offline_kmem(memcg);
|
2015-05-23 04:13:37 +07:00
|
|
|
wb_memcg_offline(memcg);
|
2016-07-21 05:44:57 +07:00
|
|
|
|
2018-10-27 05:03:23 +07:00
|
|
|
drain_all_stock(memcg);
|
|
|
|
|
2016-07-21 05:44:57 +07:00
|
|
|
mem_cgroup_id_put(memcg);
|
2008-02-07 15:14:28 +07:00
|
|
|
}
|
|
|
|
|
2015-12-30 05:54:10 +07:00
|
|
|
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
|
|
|
|
|
|
|
invalidate_reclaim_iterators(memcg);
|
|
|
|
}
|
|
|
|
|
2013-08-09 07:11:23 +07:00
|
|
|
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
|
2008-02-07 15:13:50 +07:00
|
|
|
{
|
2013-08-09 07:11:23 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 23:06:56 +07:00
|
|
|
int __maybe_unused i;
|
2009-01-16 04:51:13 +07:00
|
|
|
|
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-08-26 23:06:56 +07:00
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
|
|
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
|
|
|
|
wb_wait_for_completion(&memcg->cgwb_frn[i].done);
|
|
|
|
#endif
|
2016-01-15 06:21:29 +07:00
|
|
|
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
|
2016-01-15 06:21:34 +07:00
|
|
|
static_branch_dec(&memcg_sockets_enabled_key);
|
2016-01-21 06:02:32 +07:00
|
|
|
|
2016-01-21 06:02:50 +07:00
|
|
|
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
|
2016-01-21 06:02:44 +07:00
|
|
|
static_branch_dec(&memcg_sockets_enabled_key);
|
2016-01-21 06:02:29 +07:00
|
|
|
|
2016-01-21 06:02:53 +07:00
|
|
|
vmpressure_cleanup(&memcg->vmpressure);
|
|
|
|
cancel_work_sync(&memcg->high_work);
|
|
|
|
mem_cgroup_remove_from_trees(memcg);
|
mm, memcg: assign memcg-aware shrinkers bitmap to memcg
Imagine a big node with many cpus, memory cgroups and containers. Let
we have 200 containers, every container has 10 mounts, and 10 cgroups.
All container tasks don't touch foreign containers mounts. If there is
intensive pages write, and global reclaim happens, a writing task has to
iterate over all memcgs to shrink slab, before it's able to go to
shrink_page_list().
Iteration over all the memcg slabs is very expensive: the task has to
visit 200 * 10 = 2000 shrinkers for every memcg, and since there are
2000 memcgs, the total calls are 2000 * 2000 = 4000000.
So, the shrinker makes 4 million do_shrink_slab() calls just to try to
isolate SWAP_CLUSTER_MAX pages in one of the actively writing memcg via
shrink_page_list(). I've observed a node spending almost 100% in
kernel, making useless iteration over already shrinked slab.
This patch adds bitmap of memcg-aware shrinkers to memcg. The size of
the bitmap depends on bitmap_nr_ids, and during memcg life it's
maintained to be enough to fit bitmap_nr_ids shrinkers. Every bit in
the map is related to corresponding shrinker id.
Next patches will maintain set bit only for really charged memcg. This
will allow shrink_slab() to increase its performance in significant way.
See the last patch for the numbers.
[ktkhai@virtuozzo.com: v9]
Link: http://lkml.kernel.org/r/153112549031.4097.3576147070498769979.stgit@localhost.localdomain
[ktkhai@virtuozzo.com: add comment to mem_cgroup_css_online()]
Link: http://lkml.kernel.org/r/521f9e5f-c436-b388-fe83-4dc870bfb489@virtuozzo.com
Link: http://lkml.kernel.org/r/153063056619.1818.12550500883688681076.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sahitya Tummala <stummala@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-08-18 05:47:37 +07:00
|
|
|
memcg_free_shrinker_maps(memcg);
|
2016-01-21 06:02:47 +07:00
|
|
|
memcg_free_kmem(memcg);
|
2016-01-21 06:02:53 +07:00
|
|
|
mem_cgroup_free(memcg);
|
2008-02-07 15:13:50 +07:00
|
|
|
}
|
|
|
|
|
2014-07-09 05:02:57 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_css_reset - reset the states of a mem_cgroup
|
|
|
|
* @css: the target css
|
|
|
|
*
|
|
|
|
* Reset the states of the mem_cgroup associated with @css. This is
|
|
|
|
* invoked when the userland requests disabling on the default hierarchy
|
|
|
|
* but the memcg is pinned through dependency. The memcg should stop
|
|
|
|
* applying policies and should revert to the vanilla state as it may be
|
|
|
|
* made visible again.
|
|
|
|
*
|
|
|
|
* The current implementation only resets the essential configurations.
|
|
|
|
* This needs to be expanded to cover all the visible parts.
|
|
|
|
*/
|
|
|
|
static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
|
|
|
|
2018-06-08 07:06:18 +07:00
|
|
|
page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
|
|
|
|
page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
|
|
|
|
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
|
|
|
|
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
|
2018-06-08 07:07:46 +07:00
|
|
|
page_counter_set_min(&memcg->memory, 0);
|
mm: memory.low hierarchical behavior
This patch aims to address an issue in current memory.low semantics,
which makes it hard to use it in a hierarchy, where some leaf memory
cgroups are more valuable than others.
For example, there are memcgs A, A/B, A/C, A/D and A/E:
A A/memory.low = 2G, A/memory.current = 6G
//\\
BC DE B/memory.low = 3G B/memory.current = 2G
C/memory.low = 1G C/memory.current = 2G
D/memory.low = 0 D/memory.current = 2G
E/memory.low = 10G E/memory.current = 0
If we apply memory pressure, B, C and D are reclaimed at the same pace
while A's usage exceeds 2G. This is obviously wrong, as B's usage is
fully below B's memory.low, and C has 1G of protection as well. Also, A
is pushed to the size, which is less than A's 2G memory.low, which is
also wrong.
A simple bash script (provided below) can be used to reproduce
the problem. Current results are:
A: 1430097920
A/B: 711929856
A/C: 717426688
A/D: 741376
A/E: 0
To address the issue a concept of effective memory.low is introduced.
Effective memory.low is always equal or less than original memory.low.
In a case, when there is no memory.low overcommittment (and also for
top-level cgroups), these two values are equal.
Otherwise it's a part of parent's effective memory.low, calculated as a
cgroup's memory.low usage divided by sum of sibling's memory.low usages
(under memory.low usage I mean the size of actually protected memory:
memory.current if memory.current < memory.low, 0 otherwise). It's
necessary to track the actual usage, because otherwise an empty cgroup
with memory.low set (A/E in my example) will affect actual memory
distribution, which makes no sense. To avoid traversing the cgroup tree
twice, page_counters code is reused.
Calculating effective memory.low can be done in the reclaim path, as we
conveniently traversing the cgroup tree from top to bottom and check
memory.low on each level. So, it's a perfect place to calculate
effective memory low and save it to use it for children cgroups.
This also eliminates a need to traverse the cgroup tree from bottom to
top each time to check if parent's guarantee is not exceeded.
Setting/resetting effective memory.low is intentionally racy, but it's
fine and shouldn't lead to any significant differences in actual memory
distribution.
With this patch applied results are matching the expectations:
A: 2147930112
A/B: 1428721664
A/C: 718393344
A/D: 815104
A/E: 0
Test script:
#!/bin/bash
CGPATH="/sys/fs/cgroup"
truncate /file1 --size 2G
truncate /file2 --size 2G
truncate /file3 --size 2G
truncate /file4 --size 50G
mkdir "${CGPATH}/A"
echo "+memory" > "${CGPATH}/A/cgroup.subtree_control"
mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
echo 2G > "${CGPATH}/A/memory.low"
echo 3G > "${CGPATH}/A/B/memory.low"
echo 1G > "${CGPATH}/A/C/memory.low"
echo 0 > "${CGPATH}/A/D/memory.low"
echo 10G > "${CGPATH}/A/E/memory.low"
echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1
echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2
echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3
echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4
echo "A: " `cat "${CGPATH}/A/memory.current"`
echo "A/B: " `cat "${CGPATH}/A/B/memory.current"`
echo "A/C: " `cat "${CGPATH}/A/C/memory.current"`
echo "A/D: " `cat "${CGPATH}/A/D/memory.current"`
echo "A/E: " `cat "${CGPATH}/A/E/memory.current"`
rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
rmdir "${CGPATH}/A"
rm /file1 /file2 /file3 /file4
Link: http://lkml.kernel.org/r/20180405185921.4942-2-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-06-08 07:06:22 +07:00
|
|
|
page_counter_set_low(&memcg->memory, 0);
|
2020-06-02 11:49:49 +07:00
|
|
|
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
|
2015-01-09 05:32:35 +07:00
|
|
|
memcg->soft_limit = PAGE_COUNTER_MAX;
|
2020-06-02 11:49:52 +07:00
|
|
|
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
|
2015-05-23 05:23:34 +07:00
|
|
|
memcg_wb_domain_size_changed(memcg);
|
2014-07-09 05:02:57 +07:00
|
|
|
}
|
|
|
|
|
2010-03-11 06:22:17 +07:00
|
|
|
#ifdef CONFIG_MMU
|
2010-03-11 06:22:13 +07:00
|
|
|
/* Handlers for move charge at task migration. */
|
2010-03-11 06:22:15 +07:00
|
|
|
static int mem_cgroup_do_precharge(unsigned long count)
|
2010-03-11 06:22:13 +07:00
|
|
|
{
|
2014-08-07 06:05:59 +07:00
|
|
|
int ret;
|
2014-08-07 06:05:55 +07:00
|
|
|
|
2015-11-07 07:28:21 +07:00
|
|
|
/* Try a single bulk charge without reclaim first, kswapd may wake */
|
|
|
|
ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
|
2014-08-07 06:05:55 +07:00
|
|
|
if (!ret) {
|
2010-03-11 06:22:15 +07:00
|
|
|
mc.precharge += count;
|
|
|
|
return ret;
|
|
|
|
}
|
2014-08-07 06:05:55 +07:00
|
|
|
|
2017-01-25 06:18:10 +07:00
|
|
|
/* Try charges one by one with reclaim, but do not retry */
|
2010-03-11 06:22:15 +07:00
|
|
|
while (count--) {
|
2017-01-25 06:18:10 +07:00
|
|
|
ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
|
2012-01-13 08:19:01 +07:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
2010-03-11 06:22:15 +07:00
|
|
|
mc.precharge++;
|
2014-08-07 06:05:55 +07:00
|
|
|
cond_resched();
|
2010-03-11 06:22:15 +07:00
|
|
|
}
|
2014-08-07 06:05:55 +07:00
|
|
|
return 0;
|
2010-03-11 06:22:14 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
union mc_target {
|
|
|
|
struct page *page;
|
2010-03-11 06:22:17 +07:00
|
|
|
swp_entry_t ent;
|
2010-03-11 06:22:14 +07:00
|
|
|
};
|
|
|
|
|
|
|
|
enum mc_target_type {
|
2012-03-22 06:34:27 +07:00
|
|
|
MC_TARGET_NONE = 0,
|
2010-03-11 06:22:14 +07:00
|
|
|
MC_TARGET_PAGE,
|
2010-03-11 06:22:17 +07:00
|
|
|
MC_TARGET_SWAP,
|
2017-09-09 06:11:54 +07:00
|
|
|
MC_TARGET_DEVICE,
|
2010-03-11 06:22:14 +07:00
|
|
|
};
|
|
|
|
|
2010-05-27 04:42:38 +07:00
|
|
|
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr, pte_t ptent)
|
2010-03-11 06:22:14 +07:00
|
|
|
{
|
2019-06-14 03:50:49 +07:00
|
|
|
struct page *page = vm_normal_page(vma, addr, ptent);
|
2010-03-11 06:22:14 +07:00
|
|
|
|
2010-05-27 04:42:38 +07:00
|
|
|
if (!page || !page_mapped(page))
|
|
|
|
return NULL;
|
|
|
|
if (PageAnon(page)) {
|
2015-02-12 06:26:09 +07:00
|
|
|
if (!(mc.flags & MOVE_ANON))
|
2010-05-27 04:42:38 +07:00
|
|
|
return NULL;
|
2015-02-12 06:26:09 +07:00
|
|
|
} else {
|
|
|
|
if (!(mc.flags & MOVE_FILE))
|
|
|
|
return NULL;
|
|
|
|
}
|
2010-05-27 04:42:38 +07:00
|
|
|
if (!get_page_unless_zero(page))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
2017-09-09 06:11:54 +07:00
|
|
|
#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
|
2010-05-27 04:42:38 +07:00
|
|
|
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
|
2016-07-27 05:22:14 +07:00
|
|
|
pte_t ptent, swp_entry_t *entry)
|
2010-05-27 04:42:38 +07:00
|
|
|
{
|
|
|
|
struct page *page = NULL;
|
|
|
|
swp_entry_t ent = pte_to_swp_entry(ptent);
|
|
|
|
|
2020-10-14 06:53:13 +07:00
|
|
|
if (!(mc.flags & MOVE_ANON))
|
2010-05-27 04:42:38 +07:00
|
|
|
return NULL;
|
2017-09-09 06:11:54 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
|
|
|
|
* a device and because they are not accessible by CPU they are store
|
|
|
|
* as special swap entry in the CPU page table.
|
|
|
|
*/
|
|
|
|
if (is_device_private_entry(ent)) {
|
|
|
|
page = device_private_entry_to_page(ent);
|
|
|
|
/*
|
|
|
|
* MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
|
|
|
|
* a refcount of 1 when free (unlike normal page)
|
|
|
|
*/
|
|
|
|
if (!page_ref_add_unless(page, 1, 1))
|
|
|
|
return NULL;
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
2020-10-14 06:53:13 +07:00
|
|
|
if (non_swap_entry(ent))
|
|
|
|
return NULL;
|
|
|
|
|
memcg: fix/change behavior of shared anon at moving task
This patch changes memcg's behavior at task_move().
At task_move(), the kernel scans a task's page table and move the changes
for mapped pages from source cgroup to target cgroup. There has been a
bug at handling shared anonymous pages for a long time.
Before patch:
- The spec says 'shared anonymous pages are not moved.'
- The implementation was 'shared anonymoys pages may be moved'.
If page_mapcount <=2, shared anonymous pages's charge were moved.
After patch:
- The spec says 'all anonymous pages are moved'.
- The implementation is 'all anonymous pages are moved'.
Considering usage of memcg, this will not affect user's experience.
'shared anonymous' pages only exists between a tree of processes which
don't do exec(). Moving one of process without exec() seems not sane.
For example, libcgroup will not be affected by this change. (Anyway, no
one noticed the implementation for a long time...)
Below is a discussion log:
- current spec/implementation are complex
- Now, shared file caches are moved
- It adds unclear check as page_mapcount(). To do correct check,
we should check swap users, etc.
- No one notice this implementation behavior. So, no one get benefit
from the design.
- In general, once task is moved to a cgroup for running, it will not
be moved....
- Finally, we have control knob as memory.move_charge_at_immigrate.
Here is a patch to allow moving shared pages, completely. This makes
memcg simpler and fix current broken code.
Suggested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Glauber Costa <glommer@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-30 05:06:51 +07:00
|
|
|
/*
|
|
|
|
* Because lookup_swap_cache() updates some statistics counter,
|
|
|
|
* we call find_get_page() with swapper_space directly.
|
|
|
|
*/
|
mm, swap: use offset of swap entry as key of swap cache
This patch is to improve the performance of swap cache operations when
the type of the swap device is not 0. Originally, the whole swap entry
value is used as the key of the swap cache, even though there is one
radix tree for each swap device. If the type of the swap device is not
0, the height of the radix tree of the swap cache will be increased
unnecessary, especially on 64bit architecture. For example, for a 1GB
swap device on the x86_64 architecture, the height of the radix tree of
the swap cache is 11. But if the offset of the swap entry is used as
the key of the swap cache, the height of the radix tree of the swap
cache is 4. The increased height causes unnecessary radix tree
descending and increased cache footprint.
This patch reduces the height of the radix tree of the swap cache via
using the offset of the swap entry instead of the whole swap entry value
as the key of the swap cache. In 32 processes sequential swap out test
case on a Xeon E5 v3 system with RAM disk as swap, the lock contention
for the spinlock of the swap cache is reduced from 20.15% to 12.19%,
when the type of the swap device is 1.
Use the whole swap entry as key,
perf-profile.calltrace.cycles-pp._raw_spin_lock_irq.__add_to_swap_cache.add_to_swap_cache.add_to_swap.shrink_page_list: 10.37,
perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.__remove_mapping.shrink_page_list.shrink_inactive_list.shrink_node_memcg: 9.78,
Use the swap offset as key,
perf-profile.calltrace.cycles-pp._raw_spin_lock_irq.__add_to_swap_cache.add_to_swap_cache.add_to_swap.shrink_page_list: 6.25,
perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.__remove_mapping.shrink_page_list.shrink_inactive_list.shrink_node_memcg: 5.94,
Link: http://lkml.kernel.org/r/1473270649-27229-1-git-send-email-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-10-08 07:00:21 +07:00
|
|
|
page = find_get_page(swap_address_space(ent), swp_offset(ent));
|
2020-06-04 06:02:14 +07:00
|
|
|
entry->val = ent.val;
|
2010-05-27 04:42:38 +07:00
|
|
|
|
|
|
|
return page;
|
|
|
|
}
|
memcg: fix/change behavior of shared anon at moving task
This patch changes memcg's behavior at task_move().
At task_move(), the kernel scans a task's page table and move the changes
for mapped pages from source cgroup to target cgroup. There has been a
bug at handling shared anonymous pages for a long time.
Before patch:
- The spec says 'shared anonymous pages are not moved.'
- The implementation was 'shared anonymoys pages may be moved'.
If page_mapcount <=2, shared anonymous pages's charge were moved.
After patch:
- The spec says 'all anonymous pages are moved'.
- The implementation is 'all anonymous pages are moved'.
Considering usage of memcg, this will not affect user's experience.
'shared anonymous' pages only exists between a tree of processes which
don't do exec(). Moving one of process without exec() seems not sane.
For example, libcgroup will not be affected by this change. (Anyway, no
one noticed the implementation for a long time...)
Below is a discussion log:
- current spec/implementation are complex
- Now, shared file caches are moved
- It adds unclear check as page_mapcount(). To do correct check,
we should check swap users, etc.
- No one notice this implementation behavior. So, no one get benefit
from the design.
- In general, once task is moved to a cgroup for running, it will not
be moved....
- Finally, we have control knob as memory.move_charge_at_immigrate.
Here is a patch to allow moving shared pages, completely. This makes
memcg simpler and fix current broken code.
Suggested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Glauber Costa <glommer@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-30 05:06:51 +07:00
|
|
|
#else
|
|
|
|
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
|
2016-07-27 05:22:14 +07:00
|
|
|
pte_t ptent, swp_entry_t *entry)
|
memcg: fix/change behavior of shared anon at moving task
This patch changes memcg's behavior at task_move().
At task_move(), the kernel scans a task's page table and move the changes
for mapped pages from source cgroup to target cgroup. There has been a
bug at handling shared anonymous pages for a long time.
Before patch:
- The spec says 'shared anonymous pages are not moved.'
- The implementation was 'shared anonymoys pages may be moved'.
If page_mapcount <=2, shared anonymous pages's charge were moved.
After patch:
- The spec says 'all anonymous pages are moved'.
- The implementation is 'all anonymous pages are moved'.
Considering usage of memcg, this will not affect user's experience.
'shared anonymous' pages only exists between a tree of processes which
don't do exec(). Moving one of process without exec() seems not sane.
For example, libcgroup will not be affected by this change. (Anyway, no
one noticed the implementation for a long time...)
Below is a discussion log:
- current spec/implementation are complex
- Now, shared file caches are moved
- It adds unclear check as page_mapcount(). To do correct check,
we should check swap users, etc.
- No one notice this implementation behavior. So, no one get benefit
from the design.
- In general, once task is moved to a cgroup for running, it will not
be moved....
- Finally, we have control knob as memory.move_charge_at_immigrate.
Here is a patch to allow moving shared pages, completely. This makes
memcg simpler and fix current broken code.
Suggested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Glauber Costa <glommer@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-30 05:06:51 +07:00
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
#endif
|
2010-05-27 04:42:38 +07:00
|
|
|
|
2010-05-27 04:42:39 +07:00
|
|
|
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr, pte_t ptent, swp_entry_t *entry)
|
|
|
|
{
|
|
|
|
if (!vma->vm_file) /* anonymous vma */
|
|
|
|
return NULL;
|
2015-02-12 06:26:09 +07:00
|
|
|
if (!(mc.flags & MOVE_FILE))
|
2010-05-27 04:42:39 +07:00
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/* page is moved even if it's not RSS of this task(page-faulted). */
|
2011-08-04 06:21:24 +07:00
|
|
|
/* shmem/tmpfs may report page out on swap: account for that too. */
|
2020-10-14 06:51:21 +07:00
|
|
|
return find_get_incore_page(vma->vm_file->f_mapping,
|
|
|
|
linear_page_index(vma, addr));
|
2010-05-27 04:42:39 +07:00
|
|
|
}
|
|
|
|
|
2015-04-15 05:47:35 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_move_account - move account of the page
|
|
|
|
* @page: the page
|
2016-07-27 05:26:56 +07:00
|
|
|
* @compound: charge the page as compound or small page
|
2015-04-15 05:47:35 +07:00
|
|
|
* @from: mem_cgroup which the page is moved from.
|
|
|
|
* @to: mem_cgroup which the page is moved to. @from != @to.
|
|
|
|
*
|
2016-01-16 07:53:07 +07:00
|
|
|
* The caller must make sure the page is not on LRU (isolate_page() is useful.)
|
2015-04-15 05:47:35 +07:00
|
|
|
*
|
|
|
|
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
|
|
|
|
* from old cgroup.
|
|
|
|
*/
|
|
|
|
static int mem_cgroup_move_account(struct page *page,
|
2016-01-16 07:52:20 +07:00
|
|
|
bool compound,
|
2015-04-15 05:47:35 +07:00
|
|
|
struct mem_cgroup *from,
|
|
|
|
struct mem_cgroup *to)
|
|
|
|
{
|
2019-10-19 10:20:11 +07:00
|
|
|
struct lruvec *from_vec, *to_vec;
|
|
|
|
struct pglist_data *pgdat;
|
2020-08-15 07:30:37 +07:00
|
|
|
unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
|
2015-04-15 05:47:35 +07:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
VM_BUG_ON(from == to);
|
|
|
|
VM_BUG_ON_PAGE(PageLRU(page), page);
|
2016-01-16 07:52:20 +07:00
|
|
|
VM_BUG_ON(compound && !PageTransHuge(page));
|
2015-04-15 05:47:35 +07:00
|
|
|
|
|
|
|
/*
|
2016-03-16 04:57:19 +07:00
|
|
|
* Prevent mem_cgroup_migrate() from looking at
|
2015-11-06 09:49:40 +07:00
|
|
|
* page->mem_cgroup of its source page while we change it.
|
2015-04-15 05:47:35 +07:00
|
|
|
*/
|
2016-01-16 07:52:20 +07:00
|
|
|
ret = -EBUSY;
|
2015-04-15 05:47:35 +07:00
|
|
|
if (!trylock_page(page))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = -EINVAL;
|
|
|
|
if (page->mem_cgroup != from)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2019-10-19 10:20:11 +07:00
|
|
|
pgdat = page_pgdat(page);
|
2019-12-01 08:55:34 +07:00
|
|
|
from_vec = mem_cgroup_lruvec(from, pgdat);
|
|
|
|
to_vec = mem_cgroup_lruvec(to, pgdat);
|
2019-10-19 10:20:11 +07:00
|
|
|
|
2020-06-04 06:01:28 +07:00
|
|
|
lock_page_memcg(page);
|
2015-04-15 05:47:35 +07:00
|
|
|
|
2020-06-04 06:01:57 +07:00
|
|
|
if (PageAnon(page)) {
|
|
|
|
if (page_mapped(page)) {
|
|
|
|
__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
|
|
|
|
__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
|
2020-06-04 06:02:01 +07:00
|
|
|
if (PageTransHuge(page)) {
|
|
|
|
__mod_lruvec_state(from_vec, NR_ANON_THPS,
|
|
|
|
-nr_pages);
|
|
|
|
__mod_lruvec_state(to_vec, NR_ANON_THPS,
|
|
|
|
nr_pages);
|
|
|
|
}
|
|
|
|
|
2020-06-04 06:01:57 +07:00
|
|
|
}
|
|
|
|
} else {
|
2020-06-04 06:01:54 +07:00
|
|
|
__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
|
|
|
|
__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
|
|
|
|
|
|
|
|
if (PageSwapBacked(page)) {
|
|
|
|
__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
|
|
|
|
__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
|
|
|
|
}
|
|
|
|
|
2020-06-04 06:01:47 +07:00
|
|
|
if (page_mapped(page)) {
|
|
|
|
__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
|
|
|
|
__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
|
|
|
|
}
|
2015-04-15 05:47:35 +07:00
|
|
|
|
2020-06-04 06:01:47 +07:00
|
|
|
if (PageDirty(page)) {
|
|
|
|
struct address_space *mapping = page_mapping(page);
|
memcg: add per cgroup dirty page accounting
When modifying PG_Dirty on cached file pages, update the new
MEM_CGROUP_STAT_DIRTY counter. This is done in the same places where
global NR_FILE_DIRTY is managed. The new memcg stat is visible in the
per memcg memory.stat cgroupfs file. The most recent past attempt at
this was http://thread.gmane.org/gmane.linux.kernel.cgroups/8632
The new accounting supports future efforts to add per cgroup dirty
page throttling and writeback. It also helps an administrator break
down a container's memory usage and provides evidence to understand
memcg oom kills (the new dirty count is included in memcg oom kill
messages).
The ability to move page accounting between memcg
(memory.move_charge_at_immigrate) makes this accounting more
complicated than the global counter. The existing
mem_cgroup_{begin,end}_page_stat() lock is used to serialize move
accounting with stat updates.
Typical update operation:
memcg = mem_cgroup_begin_page_stat(page)
if (TestSetPageDirty()) {
[...]
mem_cgroup_update_page_stat(memcg)
}
mem_cgroup_end_page_stat(memcg)
Summary of mem_cgroup_end_page_stat() overhead:
- Without CONFIG_MEMCG it's a no-op
- With CONFIG_MEMCG and no inter memcg task movement, it's just
rcu_read_lock()
- With CONFIG_MEMCG and inter memcg task movement, it's
rcu_read_lock() + spin_lock_irqsave()
A memcg parameter is added to several routines because their callers
now grab mem_cgroup_begin_page_stat() which returns the memcg later
needed by for mem_cgroup_update_page_stat().
Because mem_cgroup_begin_page_stat() may disable interrupts, some
adjustments are needed:
- move __mark_inode_dirty() from __set_page_dirty() to its caller.
__mark_inode_dirty() locking does not want interrupts disabled.
- use spin_lock_irqsave(tree_lock) rather than spin_lock_irq() in
__delete_from_page_cache(), replace_page_cache_page(),
invalidate_complete_page2(), and __remove_mapping().
text data bss dec hex filename
8925147 1774832 1785856 12485835 be84cb vmlinux-!CONFIG_MEMCG-before
8925339 1774832 1785856 12486027 be858b vmlinux-!CONFIG_MEMCG-after
+192 text bytes
8965977 1784992 1785856 12536825 bf4bf9 vmlinux-CONFIG_MEMCG-before
8966750 1784992 1785856 12537598 bf4efe vmlinux-CONFIG_MEMCG-after
+773 text bytes
Performance tests run on v4.0-rc1-36-g4f671fe2f952. Lower is better for
all metrics, they're all wall clock or cycle counts. The read and write
fault benchmarks just measure fault time, they do not include I/O time.
* CONFIG_MEMCG not set:
baseline patched
kbuild 1m25.030000(+-0.088% 3 samples) 1m25.426667(+-0.120% 3 samples)
dd write 100 MiB 0.859211561 +-15.10% 0.874162885 +-15.03%
dd write 200 MiB 1.670653105 +-17.87% 1.669384764 +-11.99%
dd write 1000 MiB 8.434691190 +-14.15% 8.474733215 +-14.77%
read fault cycles 254.0(+-0.000% 10 samples) 253.0(+-0.000% 10 samples)
write fault cycles 2021.2(+-3.070% 10 samples) 1984.5(+-1.036% 10 samples)
* CONFIG_MEMCG=y root_memcg:
baseline patched
kbuild 1m25.716667(+-0.105% 3 samples) 1m25.686667(+-0.153% 3 samples)
dd write 100 MiB 0.855650830 +-14.90% 0.887557919 +-14.90%
dd write 200 MiB 1.688322953 +-12.72% 1.667682724 +-13.33%
dd write 1000 MiB 8.418601605 +-14.30% 8.673532299 +-15.00%
read fault cycles 266.0(+-0.000% 10 samples) 266.0(+-0.000% 10 samples)
write fault cycles 2051.7(+-1.349% 10 samples) 2049.6(+-1.686% 10 samples)
* CONFIG_MEMCG=y non-root_memcg:
baseline patched
kbuild 1m26.120000(+-0.273% 3 samples) 1m25.763333(+-0.127% 3 samples)
dd write 100 MiB 0.861723964 +-15.25% 0.818129350 +-14.82%
dd write 200 MiB 1.669887569 +-13.30% 1.698645885 +-13.27%
dd write 1000 MiB 8.383191730 +-14.65% 8.351742280 +-14.52%
read fault cycles 265.7(+-0.172% 10 samples) 267.0(+-0.000% 10 samples)
write fault cycles 2070.6(+-1.512% 10 samples) 2084.4(+-2.148% 10 samples)
As expected anon page faults are not affected by this patch.
tj: Updated to apply on top of the recent cancel_dirty_page() changes.
Signed-off-by: Sha Zhengju <handai.szj@gmail.com>
Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-05-23 04:13:16 +07:00
|
|
|
|
2020-09-24 13:51:40 +07:00
|
|
|
if (mapping_can_writeback(mapping)) {
|
2020-06-04 06:01:47 +07:00
|
|
|
__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
|
|
|
|
-nr_pages);
|
|
|
|
__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
|
|
|
|
nr_pages);
|
|
|
|
}
|
memcg: add per cgroup dirty page accounting
When modifying PG_Dirty on cached file pages, update the new
MEM_CGROUP_STAT_DIRTY counter. This is done in the same places where
global NR_FILE_DIRTY is managed. The new memcg stat is visible in the
per memcg memory.stat cgroupfs file. The most recent past attempt at
this was http://thread.gmane.org/gmane.linux.kernel.cgroups/8632
The new accounting supports future efforts to add per cgroup dirty
page throttling and writeback. It also helps an administrator break
down a container's memory usage and provides evidence to understand
memcg oom kills (the new dirty count is included in memcg oom kill
messages).
The ability to move page accounting between memcg
(memory.move_charge_at_immigrate) makes this accounting more
complicated than the global counter. The existing
mem_cgroup_{begin,end}_page_stat() lock is used to serialize move
accounting with stat updates.
Typical update operation:
memcg = mem_cgroup_begin_page_stat(page)
if (TestSetPageDirty()) {
[...]
mem_cgroup_update_page_stat(memcg)
}
mem_cgroup_end_page_stat(memcg)
Summary of mem_cgroup_end_page_stat() overhead:
- Without CONFIG_MEMCG it's a no-op
- With CONFIG_MEMCG and no inter memcg task movement, it's just
rcu_read_lock()
- With CONFIG_MEMCG and inter memcg task movement, it's
rcu_read_lock() + spin_lock_irqsave()
A memcg parameter is added to several routines because their callers
now grab mem_cgroup_begin_page_stat() which returns the memcg later
needed by for mem_cgroup_update_page_stat().
Because mem_cgroup_begin_page_stat() may disable interrupts, some
adjustments are needed:
- move __mark_inode_dirty() from __set_page_dirty() to its caller.
__mark_inode_dirty() locking does not want interrupts disabled.
- use spin_lock_irqsave(tree_lock) rather than spin_lock_irq() in
__delete_from_page_cache(), replace_page_cache_page(),
invalidate_complete_page2(), and __remove_mapping().
text data bss dec hex filename
8925147 1774832 1785856 12485835 be84cb vmlinux-!CONFIG_MEMCG-before
8925339 1774832 1785856 12486027 be858b vmlinux-!CONFIG_MEMCG-after
+192 text bytes
8965977 1784992 1785856 12536825 bf4bf9 vmlinux-CONFIG_MEMCG-before
8966750 1784992 1785856 12537598 bf4efe vmlinux-CONFIG_MEMCG-after
+773 text bytes
Performance tests run on v4.0-rc1-36-g4f671fe2f952. Lower is better for
all metrics, they're all wall clock or cycle counts. The read and write
fault benchmarks just measure fault time, they do not include I/O time.
* CONFIG_MEMCG not set:
baseline patched
kbuild 1m25.030000(+-0.088% 3 samples) 1m25.426667(+-0.120% 3 samples)
dd write 100 MiB 0.859211561 +-15.10% 0.874162885 +-15.03%
dd write 200 MiB 1.670653105 +-17.87% 1.669384764 +-11.99%
dd write 1000 MiB 8.434691190 +-14.15% 8.474733215 +-14.77%
read fault cycles 254.0(+-0.000% 10 samples) 253.0(+-0.000% 10 samples)
write fault cycles 2021.2(+-3.070% 10 samples) 1984.5(+-1.036% 10 samples)
* CONFIG_MEMCG=y root_memcg:
baseline patched
kbuild 1m25.716667(+-0.105% 3 samples) 1m25.686667(+-0.153% 3 samples)
dd write 100 MiB 0.855650830 +-14.90% 0.887557919 +-14.90%
dd write 200 MiB 1.688322953 +-12.72% 1.667682724 +-13.33%
dd write 1000 MiB 8.418601605 +-14.30% 8.673532299 +-15.00%
read fault cycles 266.0(+-0.000% 10 samples) 266.0(+-0.000% 10 samples)
write fault cycles 2051.7(+-1.349% 10 samples) 2049.6(+-1.686% 10 samples)
* CONFIG_MEMCG=y non-root_memcg:
baseline patched
kbuild 1m26.120000(+-0.273% 3 samples) 1m25.763333(+-0.127% 3 samples)
dd write 100 MiB 0.861723964 +-15.25% 0.818129350 +-14.82%
dd write 200 MiB 1.669887569 +-13.30% 1.698645885 +-13.27%
dd write 1000 MiB 8.383191730 +-14.65% 8.351742280 +-14.52%
read fault cycles 265.7(+-0.172% 10 samples) 267.0(+-0.000% 10 samples)
write fault cycles 2070.6(+-1.512% 10 samples) 2084.4(+-2.148% 10 samples)
As expected anon page faults are not affected by this patch.
tj: Updated to apply on top of the recent cancel_dirty_page() changes.
Signed-off-by: Sha Zhengju <handai.szj@gmail.com>
Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-05-23 04:13:16 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-04-15 05:47:35 +07:00
|
|
|
if (PageWriteback(page)) {
|
2019-10-19 10:20:11 +07:00
|
|
|
__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
|
|
|
|
__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
|
2015-04-15 05:47:35 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2020-06-04 06:01:28 +07:00
|
|
|
* All state has been migrated, let's switch to the new memcg.
|
|
|
|
*
|
2015-04-15 05:47:35 +07:00
|
|
|
* It is safe to change page->mem_cgroup here because the page
|
2020-06-04 06:01:28 +07:00
|
|
|
* is referenced, charged, isolated, and locked: we can't race
|
|
|
|
* with (un)charging, migration, LRU putback, or anything else
|
|
|
|
* that would rely on a stable page->mem_cgroup.
|
|
|
|
*
|
|
|
|
* Note that lock_page_memcg is a memcg lock, not a page lock,
|
|
|
|
* to save space. As soon as we switch page->mem_cgroup to a
|
|
|
|
* new memcg that isn't locked, the above state can change
|
|
|
|
* concurrently again. Make sure we're truly done with it.
|
2015-04-15 05:47:35 +07:00
|
|
|
*/
|
2020-06-04 06:01:28 +07:00
|
|
|
smp_mb();
|
2015-04-15 05:47:35 +07:00
|
|
|
|
2020-08-07 13:20:45 +07:00
|
|
|
css_get(&to->css);
|
|
|
|
css_put(&from->css);
|
|
|
|
|
|
|
|
page->mem_cgroup = to;
|
2019-09-24 05:38:15 +07:00
|
|
|
|
2020-06-04 06:01:28 +07:00
|
|
|
__unlock_page_memcg(from);
|
2015-04-15 05:47:35 +07:00
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
local_irq_disable();
|
2020-06-04 06:01:31 +07:00
|
|
|
mem_cgroup_charge_statistics(to, page, nr_pages);
|
2015-04-15 05:47:35 +07:00
|
|
|
memcg_check_events(to, page);
|
2020-06-04 06:01:31 +07:00
|
|
|
mem_cgroup_charge_statistics(from, page, -nr_pages);
|
2015-04-15 05:47:35 +07:00
|
|
|
memcg_check_events(from, page);
|
|
|
|
local_irq_enable();
|
|
|
|
out_unlock:
|
|
|
|
unlock_page(page);
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-05-28 04:27:46 +07:00
|
|
|
/**
|
|
|
|
* get_mctgt_type - get target type of moving charge
|
|
|
|
* @vma: the vma the pte to be checked belongs
|
|
|
|
* @addr: the address corresponding to the pte to be checked
|
|
|
|
* @ptent: the pte to be checked
|
|
|
|
* @target: the pointer the target page or swap ent will be stored(can be NULL)
|
|
|
|
*
|
|
|
|
* Returns
|
|
|
|
* 0(MC_TARGET_NONE): if the pte is not a target for move charge.
|
|
|
|
* 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
|
|
|
|
* move charge. if @target is not NULL, the page is stored in target->page
|
|
|
|
* with extra refcnt got(Callers should handle it).
|
|
|
|
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
|
|
|
|
* target for charge migration. if @target is not NULL, the entry is stored
|
|
|
|
* in target->ent.
|
2019-06-14 03:50:49 +07:00
|
|
|
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
|
|
|
|
* (so ZONE_DEVICE page and thus not on the lru).
|
2017-09-09 06:12:24 +07:00
|
|
|
* For now we such page is charge like a regular page would be as for all
|
|
|
|
* intent and purposes it is just special memory taking the place of a
|
|
|
|
* regular page.
|
2017-09-09 06:11:54 +07:00
|
|
|
*
|
|
|
|
* See Documentations/vm/hmm.txt and include/linux/hmm.h
|
2016-05-28 04:27:46 +07:00
|
|
|
*
|
|
|
|
* Called with pte lock held.
|
|
|
|
*/
|
|
|
|
|
2012-03-22 06:34:27 +07:00
|
|
|
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
|
2010-05-27 04:42:38 +07:00
|
|
|
unsigned long addr, pte_t ptent, union mc_target *target)
|
|
|
|
{
|
|
|
|
struct page *page = NULL;
|
2012-03-22 06:34:27 +07:00
|
|
|
enum mc_target_type ret = MC_TARGET_NONE;
|
2010-05-27 04:42:38 +07:00
|
|
|
swp_entry_t ent = { .val = 0 };
|
|
|
|
|
|
|
|
if (pte_present(ptent))
|
|
|
|
page = mc_handle_present_pte(vma, addr, ptent);
|
|
|
|
else if (is_swap_pte(ptent))
|
2016-07-27 05:22:14 +07:00
|
|
|
page = mc_handle_swap_pte(vma, ptent, &ent);
|
2015-02-11 05:10:04 +07:00
|
|
|
else if (pte_none(ptent))
|
2010-05-27 04:42:39 +07:00
|
|
|
page = mc_handle_file_pte(vma, addr, ptent, &ent);
|
2010-05-27 04:42:38 +07:00
|
|
|
|
|
|
|
if (!page && !ent.val)
|
2012-03-22 06:34:27 +07:00
|
|
|
return ret;
|
2010-03-11 06:22:17 +07:00
|
|
|
if (page) {
|
|
|
|
/*
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
* Do only loose check w/o serialization.
|
2014-12-11 06:44:52 +07:00
|
|
|
* mem_cgroup_move_account() checks the page is valid or
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
* not under LRU exclusion.
|
2010-03-11 06:22:17 +07:00
|
|
|
*/
|
2014-12-11 06:44:52 +07:00
|
|
|
if (page->mem_cgroup == mc.from) {
|
2010-03-11 06:22:17 +07:00
|
|
|
ret = MC_TARGET_PAGE;
|
2019-06-14 03:50:49 +07:00
|
|
|
if (is_device_private_page(page))
|
2017-09-09 06:11:54 +07:00
|
|
|
ret = MC_TARGET_DEVICE;
|
2010-03-11 06:22:17 +07:00
|
|
|
if (target)
|
|
|
|
target->page = page;
|
|
|
|
}
|
|
|
|
if (!ret || !target)
|
|
|
|
put_page(page);
|
|
|
|
}
|
2017-09-07 06:22:37 +07:00
|
|
|
/*
|
|
|
|
* There is a swap entry and a page doesn't exist or isn't charged.
|
|
|
|
* But we cannot move a tail-page in a THP.
|
|
|
|
*/
|
|
|
|
if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
|
2013-09-23 15:56:01 +07:00
|
|
|
mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
|
2010-05-12 04:06:58 +07:00
|
|
|
ret = MC_TARGET_SWAP;
|
|
|
|
if (target)
|
|
|
|
target->ent = ent;
|
2010-03-11 06:22:14 +07:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-03-22 06:34:28 +07:00
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
/*
|
2017-09-07 06:22:45 +07:00
|
|
|
* We don't consider PMD mapped swapping or file mapped pages because THP does
|
|
|
|
* not support them for now.
|
2012-03-22 06:34:28 +07:00
|
|
|
* Caller should make sure that pmd_trans_huge(pmd) is true.
|
|
|
|
*/
|
|
|
|
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr, pmd_t pmd, union mc_target *target)
|
|
|
|
{
|
|
|
|
struct page *page = NULL;
|
|
|
|
enum mc_target_type ret = MC_TARGET_NONE;
|
|
|
|
|
mm: thp: check pmd migration entry in common path
When THP migration is being used, memory management code needs to handle
pmd migration entries properly. This patch uses !pmd_present() or
is_swap_pmd() (depending on whether pmd_none() needs separate code or
not) to check pmd migration entries at the places where a pmd entry is
present.
Since pmd-related code uses split_huge_page(), split_huge_pmd(),
pmd_trans_huge(), pmd_trans_unstable(), or
pmd_none_or_trans_huge_or_clear_bad(), this patch:
1. adds pmd migration entry split code in split_huge_pmd(),
2. takes care of pmd migration entries whenever pmd_trans_huge() is present,
3. makes pmd_none_or_trans_huge_or_clear_bad() pmd migration entry aware.
Since split_huge_page() uses split_huge_pmd() and pmd_trans_unstable()
is equivalent to pmd_none_or_trans_huge_or_clear_bad(), we do not change
them.
Until this commit, a pmd entry should be:
1. pointing to a pte page,
2. is_swap_pmd(),
3. pmd_trans_huge(),
4. pmd_devmap(), or
5. pmd_none().
Signed-off-by: Zi Yan <zi.yan@cs.rutgers.edu>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Nellans <dnellans@nvidia.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-09-09 06:11:01 +07:00
|
|
|
if (unlikely(is_swap_pmd(pmd))) {
|
|
|
|
VM_BUG_ON(thp_migration_supported() &&
|
|
|
|
!is_pmd_migration_entry(pmd));
|
|
|
|
return ret;
|
|
|
|
}
|
2012-03-22 06:34:28 +07:00
|
|
|
page = pmd_page(pmd);
|
2014-01-24 06:52:54 +07:00
|
|
|
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
|
2015-02-12 06:26:09 +07:00
|
|
|
if (!(mc.flags & MOVE_ANON))
|
2012-03-22 06:34:28 +07:00
|
|
|
return ret;
|
2014-12-11 06:44:52 +07:00
|
|
|
if (page->mem_cgroup == mc.from) {
|
2012-03-22 06:34:28 +07:00
|
|
|
ret = MC_TARGET_PAGE;
|
|
|
|
if (target) {
|
|
|
|
get_page(page);
|
|
|
|
target->page = page;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr, pmd_t pmd, union mc_target *target)
|
|
|
|
{
|
|
|
|
return MC_TARGET_NONE;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2010-03-11 06:22:14 +07:00
|
|
|
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
|
|
|
|
unsigned long addr, unsigned long end,
|
|
|
|
struct mm_walk *walk)
|
|
|
|
{
|
2015-02-12 06:27:57 +07:00
|
|
|
struct vm_area_struct *vma = walk->vma;
|
2010-03-11 06:22:14 +07:00
|
|
|
pte_t *pte;
|
|
|
|
spinlock_t *ptl;
|
|
|
|
|
2016-01-22 07:40:25 +07:00
|
|
|
ptl = pmd_trans_huge_lock(pmd, vma);
|
|
|
|
if (ptl) {
|
2017-09-09 06:11:54 +07:00
|
|
|
/*
|
|
|
|
* Note their can not be MC_TARGET_DEVICE for now as we do not
|
2019-06-14 03:50:49 +07:00
|
|
|
* support transparent huge page with MEMORY_DEVICE_PRIVATE but
|
|
|
|
* this might change.
|
2017-09-09 06:11:54 +07:00
|
|
|
*/
|
2012-03-22 06:34:28 +07:00
|
|
|
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
|
|
|
|
mc.precharge += HPAGE_PMD_NR;
|
2013-11-15 05:30:54 +07:00
|
|
|
spin_unlock(ptl);
|
mm: thp: fix pmd_bad() triggering in code paths holding mmap_sem read mode
In some cases it may happen that pmd_none_or_clear_bad() is called with
the mmap_sem hold in read mode. In those cases the huge page faults can
allocate hugepmds under pmd_none_or_clear_bad() and that can trigger a
false positive from pmd_bad() that will not like to see a pmd
materializing as trans huge.
It's not khugepaged causing the problem, khugepaged holds the mmap_sem
in write mode (and all those sites must hold the mmap_sem in read mode
to prevent pagetables to go away from under them, during code review it
seems vm86 mode on 32bit kernels requires that too unless it's
restricted to 1 thread per process or UP builds). The race is only with
the huge pagefaults that can convert a pmd_none() into a
pmd_trans_huge().
Effectively all these pmd_none_or_clear_bad() sites running with
mmap_sem in read mode are somewhat speculative with the page faults, and
the result is always undefined when they run simultaneously. This is
probably why it wasn't common to run into this. For example if the
madvise(MADV_DONTNEED) runs zap_page_range() shortly before the page
fault, the hugepage will not be zapped, if the page fault runs first it
will be zapped.
Altering pmd_bad() not to error out if it finds hugepmds won't be enough
to fix this, because zap_pmd_range would then proceed to call
zap_pte_range (which would be incorrect if the pmd become a
pmd_trans_huge()).
The simplest way to fix this is to read the pmd in the local stack
(regardless of what we read, no need of actual CPU barriers, only
compiler barrier needed), and be sure it is not changing under the code
that computes its value. Even if the real pmd is changing under the
value we hold on the stack, we don't care. If we actually end up in
zap_pte_range it means the pmd was not none already and it was not huge,
and it can't become huge from under us (khugepaged locking explained
above).
All we need is to enforce that there is no way anymore that in a code
path like below, pmd_trans_huge can be false, but pmd_none_or_clear_bad
can run into a hugepmd. The overhead of a barrier() is just a compiler
tweak and should not be measurable (I only added it for THP builds). I
don't exclude different compiler versions may have prevented the race
too by caching the value of *pmd on the stack (that hasn't been
verified, but it wouldn't be impossible considering
pmd_none_or_clear_bad, pmd_bad, pmd_trans_huge, pmd_none are all inlines
and there's no external function called in between pmd_trans_huge and
pmd_none_or_clear_bad).
if (pmd_trans_huge(*pmd)) {
if (next-addr != HPAGE_PMD_SIZE) {
VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
split_huge_page_pmd(vma->vm_mm, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
continue;
/* fall through */
}
if (pmd_none_or_clear_bad(pmd))
Because this race condition could be exercised without special
privileges this was reported in CVE-2012-1179.
The race was identified and fully explained by Ulrich who debugged it.
I'm quoting his accurate explanation below, for reference.
====== start quote =======
mapcount 0 page_mapcount 1
kernel BUG at mm/huge_memory.c:1384!
At some point prior to the panic, a "bad pmd ..." message similar to the
following is logged on the console:
mm/memory.c:145: bad pmd ffff8800376e1f98(80000000314000e7).
The "bad pmd ..." message is logged by pmd_clear_bad() before it clears
the page's PMD table entry.
143 void pmd_clear_bad(pmd_t *pmd)
144 {
-> 145 pmd_ERROR(*pmd);
146 pmd_clear(pmd);
147 }
After the PMD table entry has been cleared, there is an inconsistency
between the actual number of PMD table entries that are mapping the page
and the page's map count (_mapcount field in struct page). When the page
is subsequently reclaimed, __split_huge_page() detects this inconsistency.
1381 if (mapcount != page_mapcount(page))
1382 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1383 mapcount, page_mapcount(page));
-> 1384 BUG_ON(mapcount != page_mapcount(page));
The root cause of the problem is a race of two threads in a multithreaded
process. Thread B incurs a page fault on a virtual address that has never
been accessed (PMD entry is zero) while Thread A is executing an madvise()
system call on a virtual address within the same 2 MB (huge page) range.
virtual address space
.---------------------.
| |
| |
.-|---------------------|
| | |
| | |<-- B(fault)
| | |
2 MB | |/////////////////////|-.
huge < |/////////////////////| > A(range)
page | |/////////////////////|-'
| | |
| | |
'-|---------------------|
| |
| |
'---------------------'
- Thread A is executing an madvise(..., MADV_DONTNEED) system call
on the virtual address range "A(range)" shown in the picture.
sys_madvise
// Acquire the semaphore in shared mode.
down_read(¤t->mm->mmap_sem)
...
madvise_vma
switch (behavior)
case MADV_DONTNEED:
madvise_dontneed
zap_page_range
unmap_vmas
unmap_page_range
zap_pud_range
zap_pmd_range
//
// Assume that this huge page has never been accessed.
// I.e. content of the PMD entry is zero (not mapped).
//
if (pmd_trans_huge(*pmd)) {
// We don't get here due to the above assumption.
}
//
// Assume that Thread B incurred a page fault and
.---------> // sneaks in here as shown below.
| //
| if (pmd_none_or_clear_bad(pmd))
| {
| if (unlikely(pmd_bad(*pmd)))
| pmd_clear_bad
| {
| pmd_ERROR
| // Log "bad pmd ..." message here.
| pmd_clear
| // Clear the page's PMD entry.
| // Thread B incremented the map count
| // in page_add_new_anon_rmap(), but
| // now the page is no longer mapped
| // by a PMD entry (-> inconsistency).
| }
| }
|
v
- Thread B is handling a page fault on virtual address "B(fault)" shown
in the picture.
...
do_page_fault
__do_page_fault
// Acquire the semaphore in shared mode.
down_read_trylock(&mm->mmap_sem)
...
handle_mm_fault
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma))
// We get here due to the above assumption (PMD entry is zero).
do_huge_pmd_anonymous_page
alloc_hugepage_vma
// Allocate a new transparent huge page here.
...
__do_huge_pmd_anonymous_page
...
spin_lock(&mm->page_table_lock)
...
page_add_new_anon_rmap
// Here we increment the page's map count (starts at -1).
atomic_set(&page->_mapcount, 0)
set_pmd_at
// Here we set the page's PMD entry which will be cleared
// when Thread A calls pmd_clear_bad().
...
spin_unlock(&mm->page_table_lock)
The mmap_sem does not prevent the race because both threads are acquiring
it in shared mode (down_read). Thread B holds the page_table_lock while
the page's map count and PMD table entry are updated. However, Thread A
does not synchronize on that lock.
====== end quote =======
[akpm@linux-foundation.org: checkpatch fixes]
Reported-by: Ulrich Obergfell <uobergfe@redhat.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Jones <davej@redhat.com>
Acked-by: Larry Woodman <lwoodman@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org> [2.6.38+]
Cc: Mark Salter <msalter@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:33:42 +07:00
|
|
|
return 0;
|
2012-03-22 06:34:28 +07:00
|
|
|
}
|
2011-03-23 06:32:56 +07:00
|
|
|
|
2012-03-29 04:42:40 +07:00
|
|
|
if (pmd_trans_unstable(pmd))
|
|
|
|
return 0;
|
2010-03-11 06:22:14 +07:00
|
|
|
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
|
|
|
|
for (; addr != end; pte++, addr += PAGE_SIZE)
|
2012-03-22 06:34:27 +07:00
|
|
|
if (get_mctgt_type(vma, addr, *pte, NULL))
|
2010-03-11 06:22:14 +07:00
|
|
|
mc.precharge++; /* increment precharge temporarily */
|
|
|
|
pte_unmap_unlock(pte - 1, ptl);
|
|
|
|
cond_resched();
|
|
|
|
|
2010-03-11 06:22:13 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-08-28 21:19:54 +07:00
|
|
|
static const struct mm_walk_ops precharge_walk_ops = {
|
|
|
|
.pmd_entry = mem_cgroup_count_precharge_pte_range,
|
|
|
|
};
|
|
|
|
|
2010-03-11 06:22:14 +07:00
|
|
|
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
unsigned long precharge;
|
|
|
|
|
2020-06-09 11:33:25 +07:00
|
|
|
mmap_read_lock(mm);
|
2019-08-28 21:19:54 +07:00
|
|
|
walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
|
2020-06-09 11:33:25 +07:00
|
|
|
mmap_read_unlock(mm);
|
2010-03-11 06:22:14 +07:00
|
|
|
|
|
|
|
precharge = mc.precharge;
|
|
|
|
mc.precharge = 0;
|
|
|
|
|
|
|
|
return precharge;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
|
|
|
|
{
|
2011-01-14 06:47:41 +07:00
|
|
|
unsigned long precharge = mem_cgroup_count_precharge(mm);
|
|
|
|
|
|
|
|
VM_BUG_ON(mc.moving_task);
|
|
|
|
mc.moving_task = current;
|
|
|
|
return mem_cgroup_do_precharge(precharge);
|
2010-03-11 06:22:14 +07:00
|
|
|
}
|
|
|
|
|
2011-01-14 06:47:41 +07:00
|
|
|
/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
|
|
|
|
static void __mem_cgroup_clear_mc(void)
|
2010-03-11 06:22:14 +07:00
|
|
|
{
|
2010-08-11 08:02:58 +07:00
|
|
|
struct mem_cgroup *from = mc.from;
|
|
|
|
struct mem_cgroup *to = mc.to;
|
|
|
|
|
2010-03-11 06:22:14 +07:00
|
|
|
/* we must uncharge all the leftover precharges from mc.to */
|
2010-03-11 06:22:15 +07:00
|
|
|
if (mc.precharge) {
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
cancel_charge(mc.to, mc.precharge);
|
2010-03-11 06:22:15 +07:00
|
|
|
mc.precharge = 0;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* we didn't uncharge from mc.from at mem_cgroup_move_account(), so
|
|
|
|
* we must uncharge here.
|
|
|
|
*/
|
|
|
|
if (mc.moved_charge) {
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
cancel_charge(mc.from, mc.moved_charge);
|
2010-03-11 06:22:15 +07:00
|
|
|
mc.moved_charge = 0;
|
2010-03-11 06:22:14 +07:00
|
|
|
}
|
2010-03-11 06:22:18 +07:00
|
|
|
/* we must fixup refcnts and charges */
|
|
|
|
if (mc.moved_swap) {
|
|
|
|
/* uncharge swap account from the old cgroup */
|
2014-09-05 19:43:57 +07:00
|
|
|
if (!mem_cgroup_is_root(mc.from))
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
|
2010-03-11 06:22:18 +07:00
|
|
|
|
2016-08-12 05:33:03 +07:00
|
|
|
mem_cgroup_id_put_many(mc.from, mc.moved_swap);
|
|
|
|
|
2014-08-07 06:05:59 +07:00
|
|
|
/*
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
* we charged both to->memory and to->memsw, so we
|
|
|
|
* should uncharge to->memory.
|
2014-08-07 06:05:59 +07:00
|
|
|
*/
|
2014-09-05 19:43:57 +07:00
|
|
|
if (!mem_cgroup_is_root(mc.to))
|
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.
Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.
The removed locking overhead is noticable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:
vanilla:
18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )
132.474343877 seconds time elapsed ( +- 0.21% )
lockless:
12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )
91.369330729 seconds time elapsed ( +- 0.45% )
On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.
Notable differences between the old and new API:
- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()
- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()
- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves
- res_counter_memparse_write_strategy() is replaced by
page_counter_limit(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitely requested
hard upper limits.
- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim upto 2MB (4MB) before the limit
would have been reached. This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 06:42:31 +07:00
|
|
|
page_counter_uncharge(&mc.to->memory, mc.moved_swap);
|
|
|
|
|
2010-03-11 06:22:18 +07:00
|
|
|
mc.moved_swap = 0;
|
|
|
|
}
|
2011-01-14 06:47:41 +07:00
|
|
|
memcg_oom_recover(from);
|
|
|
|
memcg_oom_recover(to);
|
|
|
|
wake_up_all(&mc.waitq);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void mem_cgroup_clear_mc(void)
|
|
|
|
{
|
2016-04-22 06:09:02 +07:00
|
|
|
struct mm_struct *mm = mc.mm;
|
|
|
|
|
2011-01-14 06:47:41 +07:00
|
|
|
/*
|
|
|
|
* we must clear moving_task before waking up waiters at the end of
|
|
|
|
* task migration.
|
|
|
|
*/
|
|
|
|
mc.moving_task = NULL;
|
|
|
|
__mem_cgroup_clear_mc();
|
2010-08-11 08:02:58 +07:00
|
|
|
spin_lock(&mc.lock);
|
2010-03-11 06:22:14 +07:00
|
|
|
mc.from = NULL;
|
|
|
|
mc.to = NULL;
|
2016-04-22 06:09:02 +07:00
|
|
|
mc.mm = NULL;
|
2010-08-11 08:02:58 +07:00
|
|
|
spin_unlock(&mc.lock);
|
2016-04-22 06:09:02 +07:00
|
|
|
|
|
|
|
mmput(mm);
|
2010-03-11 06:22:14 +07:00
|
|
|
}
|
|
|
|
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 22:18:21 +07:00
|
|
|
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
|
2010-03-11 06:22:13 +07:00
|
|
|
{
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 22:18:21 +07:00
|
|
|
struct cgroup_subsys_state *css;
|
2015-12-24 04:53:27 +07:00
|
|
|
struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
|
2015-09-09 05:01:10 +07:00
|
|
|
struct mem_cgroup *from;
|
2015-09-12 02:00:19 +07:00
|
|
|
struct task_struct *leader, *p;
|
2015-09-09 05:01:10 +07:00
|
|
|
struct mm_struct *mm;
|
2015-02-12 06:26:09 +07:00
|
|
|
unsigned long move_flags;
|
2015-09-09 05:01:10 +07:00
|
|
|
int ret = 0;
|
2010-03-11 06:22:13 +07:00
|
|
|
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 22:18:21 +07:00
|
|
|
/* charge immigration isn't supported on the default hierarchy */
|
|
|
|
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
2015-09-09 05:01:10 +07:00
|
|
|
return 0;
|
|
|
|
|
2015-09-12 02:00:19 +07:00
|
|
|
/*
|
|
|
|
* Multi-process migrations only happen on the default hierarchy
|
|
|
|
* where charge immigration is not used. Perform charge
|
|
|
|
* immigration if @tset contains a leader and whine if there are
|
|
|
|
* multiple.
|
|
|
|
*/
|
|
|
|
p = NULL;
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 22:18:21 +07:00
|
|
|
cgroup_taskset_for_each_leader(leader, css, tset) {
|
2015-09-12 02:00:19 +07:00
|
|
|
WARN_ON_ONCE(p);
|
|
|
|
p = leader;
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 22:18:21 +07:00
|
|
|
memcg = mem_cgroup_from_css(css);
|
2015-09-12 02:00:19 +07:00
|
|
|
}
|
|
|
|
if (!p)
|
|
|
|
return 0;
|
|
|
|
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 22:18:21 +07:00
|
|
|
/*
|
|
|
|
* We are now commited to this value whatever it is. Changes in this
|
|
|
|
* tunable will only affect upcoming migrations, not the current one.
|
|
|
|
* So we need to save it, and keep it going.
|
|
|
|
*/
|
|
|
|
move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
|
|
|
|
if (!move_flags)
|
|
|
|
return 0;
|
|
|
|
|
2015-09-09 05:01:10 +07:00
|
|
|
from = mem_cgroup_from_task(p);
|
|
|
|
|
|
|
|
VM_BUG_ON(from == memcg);
|
|
|
|
|
|
|
|
mm = get_task_mm(p);
|
|
|
|
if (!mm)
|
|
|
|
return 0;
|
|
|
|
/* We move charges only when we move a owner of the mm */
|
|
|
|
if (mm->owner == p) {
|
|
|
|
VM_BUG_ON(mc.from);
|
|
|
|
VM_BUG_ON(mc.to);
|
|
|
|
VM_BUG_ON(mc.precharge);
|
|
|
|
VM_BUG_ON(mc.moved_charge);
|
|
|
|
VM_BUG_ON(mc.moved_swap);
|
|
|
|
|
|
|
|
spin_lock(&mc.lock);
|
2016-04-22 06:09:02 +07:00
|
|
|
mc.mm = mm;
|
2015-09-09 05:01:10 +07:00
|
|
|
mc.from = from;
|
|
|
|
mc.to = memcg;
|
|
|
|
mc.flags = move_flags;
|
|
|
|
spin_unlock(&mc.lock);
|
|
|
|
/* We set mc.moving_task later */
|
|
|
|
|
|
|
|
ret = mem_cgroup_precharge_mc(mm);
|
|
|
|
if (ret)
|
|
|
|
mem_cgroup_clear_mc();
|
2016-04-22 06:09:02 +07:00
|
|
|
} else {
|
|
|
|
mmput(mm);
|
2010-03-11 06:22:13 +07:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 22:18:21 +07:00
|
|
|
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
|
2010-03-11 06:22:13 +07:00
|
|
|
{
|
2014-12-11 06:44:08 +07:00
|
|
|
if (mc.to)
|
|
|
|
mem_cgroup_clear_mc();
|
2010-03-11 06:22:13 +07:00
|
|
|
}
|
|
|
|
|
2010-03-11 06:22:14 +07:00
|
|
|
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
|
|
|
|
unsigned long addr, unsigned long end,
|
|
|
|
struct mm_walk *walk)
|
2010-03-11 06:22:13 +07:00
|
|
|
{
|
2010-03-11 06:22:14 +07:00
|
|
|
int ret = 0;
|
2015-02-12 06:27:57 +07:00
|
|
|
struct vm_area_struct *vma = walk->vma;
|
2010-03-11 06:22:14 +07:00
|
|
|
pte_t *pte;
|
|
|
|
spinlock_t *ptl;
|
2012-03-22 06:34:28 +07:00
|
|
|
enum mc_target_type target_type;
|
|
|
|
union mc_target target;
|
|
|
|
struct page *page;
|
2010-03-11 06:22:14 +07:00
|
|
|
|
2016-01-22 07:40:25 +07:00
|
|
|
ptl = pmd_trans_huge_lock(pmd, vma);
|
|
|
|
if (ptl) {
|
2012-05-19 01:28:34 +07:00
|
|
|
if (mc.precharge < HPAGE_PMD_NR) {
|
2013-11-15 05:30:54 +07:00
|
|
|
spin_unlock(ptl);
|
2012-03-22 06:34:28 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
|
|
|
|
if (target_type == MC_TARGET_PAGE) {
|
|
|
|
page = target.page;
|
|
|
|
if (!isolate_lru_page(page)) {
|
2016-01-16 07:52:20 +07:00
|
|
|
if (!mem_cgroup_move_account(page, true,
|
2014-12-11 06:44:52 +07:00
|
|
|
mc.from, mc.to)) {
|
2012-03-22 06:34:28 +07:00
|
|
|
mc.precharge -= HPAGE_PMD_NR;
|
|
|
|
mc.moved_charge += HPAGE_PMD_NR;
|
|
|
|
}
|
|
|
|
putback_lru_page(page);
|
|
|
|
}
|
|
|
|
put_page(page);
|
2017-09-09 06:11:54 +07:00
|
|
|
} else if (target_type == MC_TARGET_DEVICE) {
|
|
|
|
page = target.page;
|
|
|
|
if (!mem_cgroup_move_account(page, true,
|
|
|
|
mc.from, mc.to)) {
|
|
|
|
mc.precharge -= HPAGE_PMD_NR;
|
|
|
|
mc.moved_charge += HPAGE_PMD_NR;
|
|
|
|
}
|
|
|
|
put_page(page);
|
2012-03-22 06:34:28 +07:00
|
|
|
}
|
2013-11-15 05:30:54 +07:00
|
|
|
spin_unlock(ptl);
|
mm: thp: fix pmd_bad() triggering in code paths holding mmap_sem read mode
In some cases it may happen that pmd_none_or_clear_bad() is called with
the mmap_sem hold in read mode. In those cases the huge page faults can
allocate hugepmds under pmd_none_or_clear_bad() and that can trigger a
false positive from pmd_bad() that will not like to see a pmd
materializing as trans huge.
It's not khugepaged causing the problem, khugepaged holds the mmap_sem
in write mode (and all those sites must hold the mmap_sem in read mode
to prevent pagetables to go away from under them, during code review it
seems vm86 mode on 32bit kernels requires that too unless it's
restricted to 1 thread per process or UP builds). The race is only with
the huge pagefaults that can convert a pmd_none() into a
pmd_trans_huge().
Effectively all these pmd_none_or_clear_bad() sites running with
mmap_sem in read mode are somewhat speculative with the page faults, and
the result is always undefined when they run simultaneously. This is
probably why it wasn't common to run into this. For example if the
madvise(MADV_DONTNEED) runs zap_page_range() shortly before the page
fault, the hugepage will not be zapped, if the page fault runs first it
will be zapped.
Altering pmd_bad() not to error out if it finds hugepmds won't be enough
to fix this, because zap_pmd_range would then proceed to call
zap_pte_range (which would be incorrect if the pmd become a
pmd_trans_huge()).
The simplest way to fix this is to read the pmd in the local stack
(regardless of what we read, no need of actual CPU barriers, only
compiler barrier needed), and be sure it is not changing under the code
that computes its value. Even if the real pmd is changing under the
value we hold on the stack, we don't care. If we actually end up in
zap_pte_range it means the pmd was not none already and it was not huge,
and it can't become huge from under us (khugepaged locking explained
above).
All we need is to enforce that there is no way anymore that in a code
path like below, pmd_trans_huge can be false, but pmd_none_or_clear_bad
can run into a hugepmd. The overhead of a barrier() is just a compiler
tweak and should not be measurable (I only added it for THP builds). I
don't exclude different compiler versions may have prevented the race
too by caching the value of *pmd on the stack (that hasn't been
verified, but it wouldn't be impossible considering
pmd_none_or_clear_bad, pmd_bad, pmd_trans_huge, pmd_none are all inlines
and there's no external function called in between pmd_trans_huge and
pmd_none_or_clear_bad).
if (pmd_trans_huge(*pmd)) {
if (next-addr != HPAGE_PMD_SIZE) {
VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
split_huge_page_pmd(vma->vm_mm, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
continue;
/* fall through */
}
if (pmd_none_or_clear_bad(pmd))
Because this race condition could be exercised without special
privileges this was reported in CVE-2012-1179.
The race was identified and fully explained by Ulrich who debugged it.
I'm quoting his accurate explanation below, for reference.
====== start quote =======
mapcount 0 page_mapcount 1
kernel BUG at mm/huge_memory.c:1384!
At some point prior to the panic, a "bad pmd ..." message similar to the
following is logged on the console:
mm/memory.c:145: bad pmd ffff8800376e1f98(80000000314000e7).
The "bad pmd ..." message is logged by pmd_clear_bad() before it clears
the page's PMD table entry.
143 void pmd_clear_bad(pmd_t *pmd)
144 {
-> 145 pmd_ERROR(*pmd);
146 pmd_clear(pmd);
147 }
After the PMD table entry has been cleared, there is an inconsistency
between the actual number of PMD table entries that are mapping the page
and the page's map count (_mapcount field in struct page). When the page
is subsequently reclaimed, __split_huge_page() detects this inconsistency.
1381 if (mapcount != page_mapcount(page))
1382 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1383 mapcount, page_mapcount(page));
-> 1384 BUG_ON(mapcount != page_mapcount(page));
The root cause of the problem is a race of two threads in a multithreaded
process. Thread B incurs a page fault on a virtual address that has never
been accessed (PMD entry is zero) while Thread A is executing an madvise()
system call on a virtual address within the same 2 MB (huge page) range.
virtual address space
.---------------------.
| |
| |
.-|---------------------|
| | |
| | |<-- B(fault)
| | |
2 MB | |/////////////////////|-.
huge < |/////////////////////| > A(range)
page | |/////////////////////|-'
| | |
| | |
'-|---------------------|
| |
| |
'---------------------'
- Thread A is executing an madvise(..., MADV_DONTNEED) system call
on the virtual address range "A(range)" shown in the picture.
sys_madvise
// Acquire the semaphore in shared mode.
down_read(¤t->mm->mmap_sem)
...
madvise_vma
switch (behavior)
case MADV_DONTNEED:
madvise_dontneed
zap_page_range
unmap_vmas
unmap_page_range
zap_pud_range
zap_pmd_range
//
// Assume that this huge page has never been accessed.
// I.e. content of the PMD entry is zero (not mapped).
//
if (pmd_trans_huge(*pmd)) {
// We don't get here due to the above assumption.
}
//
// Assume that Thread B incurred a page fault and
.---------> // sneaks in here as shown below.
| //
| if (pmd_none_or_clear_bad(pmd))
| {
| if (unlikely(pmd_bad(*pmd)))
| pmd_clear_bad
| {
| pmd_ERROR
| // Log "bad pmd ..." message here.
| pmd_clear
| // Clear the page's PMD entry.
| // Thread B incremented the map count
| // in page_add_new_anon_rmap(), but
| // now the page is no longer mapped
| // by a PMD entry (-> inconsistency).
| }
| }
|
v
- Thread B is handling a page fault on virtual address "B(fault)" shown
in the picture.
...
do_page_fault
__do_page_fault
// Acquire the semaphore in shared mode.
down_read_trylock(&mm->mmap_sem)
...
handle_mm_fault
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma))
// We get here due to the above assumption (PMD entry is zero).
do_huge_pmd_anonymous_page
alloc_hugepage_vma
// Allocate a new transparent huge page here.
...
__do_huge_pmd_anonymous_page
...
spin_lock(&mm->page_table_lock)
...
page_add_new_anon_rmap
// Here we increment the page's map count (starts at -1).
atomic_set(&page->_mapcount, 0)
set_pmd_at
// Here we set the page's PMD entry which will be cleared
// when Thread A calls pmd_clear_bad().
...
spin_unlock(&mm->page_table_lock)
The mmap_sem does not prevent the race because both threads are acquiring
it in shared mode (down_read). Thread B holds the page_table_lock while
the page's map count and PMD table entry are updated. However, Thread A
does not synchronize on that lock.
====== end quote =======
[akpm@linux-foundation.org: checkpatch fixes]
Reported-by: Ulrich Obergfell <uobergfe@redhat.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Jones <davej@redhat.com>
Acked-by: Larry Woodman <lwoodman@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org> [2.6.38+]
Cc: Mark Salter <msalter@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 06:33:42 +07:00
|
|
|
return 0;
|
2012-03-22 06:34:28 +07:00
|
|
|
}
|
|
|
|
|
2012-03-29 04:42:40 +07:00
|
|
|
if (pmd_trans_unstable(pmd))
|
|
|
|
return 0;
|
2010-03-11 06:22:14 +07:00
|
|
|
retry:
|
|
|
|
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
|
|
|
|
for (; addr != end; addr += PAGE_SIZE) {
|
|
|
|
pte_t ptent = *(pte++);
|
2017-09-09 06:11:54 +07:00
|
|
|
bool device = false;
|
2010-03-11 06:22:17 +07:00
|
|
|
swp_entry_t ent;
|
2010-03-11 06:22:14 +07:00
|
|
|
|
|
|
|
if (!mc.precharge)
|
|
|
|
break;
|
|
|
|
|
2012-03-22 06:34:27 +07:00
|
|
|
switch (get_mctgt_type(vma, addr, ptent, &target)) {
|
2017-09-09 06:11:54 +07:00
|
|
|
case MC_TARGET_DEVICE:
|
|
|
|
device = true;
|
2020-04-07 10:08:39 +07:00
|
|
|
fallthrough;
|
2010-03-11 06:22:14 +07:00
|
|
|
case MC_TARGET_PAGE:
|
|
|
|
page = target.page;
|
2016-01-16 07:53:42 +07:00
|
|
|
/*
|
|
|
|
* We can have a part of the split pmd here. Moving it
|
|
|
|
* can be done but it would be too convoluted so simply
|
|
|
|
* ignore such a partial THP and keep it in original
|
|
|
|
* memcg. There should be somebody mapping the head.
|
|
|
|
*/
|
|
|
|
if (PageTransCompound(page))
|
|
|
|
goto put;
|
2017-09-09 06:11:54 +07:00
|
|
|
if (!device && isolate_lru_page(page))
|
2010-03-11 06:22:14 +07:00
|
|
|
goto put;
|
2016-01-16 07:52:20 +07:00
|
|
|
if (!mem_cgroup_move_account(page, false,
|
|
|
|
mc.from, mc.to)) {
|
2010-03-11 06:22:14 +07:00
|
|
|
mc.precharge--;
|
2010-03-11 06:22:15 +07:00
|
|
|
/* we uncharge from mc.from later. */
|
|
|
|
mc.moved_charge++;
|
2010-03-11 06:22:14 +07:00
|
|
|
}
|
2017-09-09 06:11:54 +07:00
|
|
|
if (!device)
|
|
|
|
putback_lru_page(page);
|
2012-03-22 06:34:27 +07:00
|
|
|
put: /* get_mctgt_type() gets the page */
|
2010-03-11 06:22:14 +07:00
|
|
|
put_page(page);
|
|
|
|
break;
|
2010-03-11 06:22:17 +07:00
|
|
|
case MC_TARGET_SWAP:
|
|
|
|
ent = target.ent;
|
2012-05-30 05:06:51 +07:00
|
|
|
if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
|
2010-03-11 06:22:17 +07:00
|
|
|
mc.precharge--;
|
2020-07-24 11:15:24 +07:00
|
|
|
mem_cgroup_id_get_many(mc.to, 1);
|
|
|
|
/* we fixup other refcnts and charges later. */
|
2010-03-11 06:22:18 +07:00
|
|
|
mc.moved_swap++;
|
|
|
|
}
|
2010-03-11 06:22:17 +07:00
|
|
|
break;
|
2010-03-11 06:22:14 +07:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
pte_unmap_unlock(pte - 1, ptl);
|
|
|
|
cond_resched();
|
|
|
|
|
|
|
|
if (addr != end) {
|
|
|
|
/*
|
|
|
|
* We have consumed all precharges we got in can_attach().
|
|
|
|
* We try charge one by one, but don't do any additional
|
|
|
|
* charges to mc.to if we have failed in charge once in attach()
|
|
|
|
* phase.
|
|
|
|
*/
|
2010-03-11 06:22:15 +07:00
|
|
|
ret = mem_cgroup_do_precharge(1);
|
2010-03-11 06:22:14 +07:00
|
|
|
if (!ret)
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-08-28 21:19:54 +07:00
|
|
|
static const struct mm_walk_ops charge_walk_ops = {
|
|
|
|
.pmd_entry = mem_cgroup_move_charge_pte_range,
|
|
|
|
};
|
|
|
|
|
2016-04-22 06:09:02 +07:00
|
|
|
static void mem_cgroup_move_charge(void)
|
2010-03-11 06:22:14 +07:00
|
|
|
{
|
|
|
|
lru_add_drain_all();
|
2014-12-11 06:44:25 +07:00
|
|
|
/*
|
2016-03-16 04:57:04 +07:00
|
|
|
* Signal lock_page_memcg() to take the memcg's move_lock
|
|
|
|
* while we're moving its pages to another memcg. Then wait
|
|
|
|
* for already started RCU-only updates to finish.
|
2014-12-11 06:44:25 +07:00
|
|
|
*/
|
|
|
|
atomic_inc(&mc.from->moving_account);
|
|
|
|
synchronize_rcu();
|
2011-01-14 06:47:41 +07:00
|
|
|
retry:
|
2020-06-09 11:33:25 +07:00
|
|
|
if (unlikely(!mmap_read_trylock(mc.mm))) {
|
2011-01-14 06:47:41 +07:00
|
|
|
/*
|
2020-06-09 11:33:54 +07:00
|
|
|
* Someone who are holding the mmap_lock might be waiting in
|
2011-01-14 06:47:41 +07:00
|
|
|
* waitq. So we cancel all extra charges, wake up all waiters,
|
|
|
|
* and retry. Because we cancel precharges, we might not be able
|
|
|
|
* to move enough charges, but moving charge is a best-effort
|
|
|
|
* feature anyway, so it wouldn't be a big problem.
|
|
|
|
*/
|
|
|
|
__mem_cgroup_clear_mc();
|
|
|
|
cond_resched();
|
|
|
|
goto retry;
|
|
|
|
}
|
2015-02-12 06:27:57 +07:00
|
|
|
/*
|
|
|
|
* When we have consumed all precharges and failed in doing
|
|
|
|
* additional charge, the page walk just aborts.
|
|
|
|
*/
|
2019-08-28 21:19:54 +07:00
|
|
|
walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
|
|
|
|
NULL);
|
2016-10-08 07:00:12 +07:00
|
|
|
|
2020-06-09 11:33:25 +07:00
|
|
|
mmap_read_unlock(mc.mm);
|
2014-12-11 06:44:25 +07:00
|
|
|
atomic_dec(&mc.from->moving_account);
|
2010-03-11 06:22:13 +07:00
|
|
|
}
|
|
|
|
|
2016-04-22 06:09:02 +07:00
|
|
|
static void mem_cgroup_move_task(void)
|
2008-02-07 15:13:54 +07:00
|
|
|
{
|
2016-04-22 06:09:02 +07:00
|
|
|
if (mc.to) {
|
|
|
|
mem_cgroup_move_charge();
|
2011-06-16 05:08:13 +07:00
|
|
|
mem_cgroup_clear_mc();
|
2016-04-22 06:09:02 +07:00
|
|
|
}
|
2008-02-07 15:13:54 +07:00
|
|
|
}
|
2010-03-24 03:35:11 +07:00
|
|
|
#else /* !CONFIG_MMU */
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 22:18:21 +07:00
|
|
|
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
|
2010-03-24 03:35:11 +07:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 22:18:21 +07:00
|
|
|
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
|
2010-03-24 03:35:11 +07:00
|
|
|
{
|
|
|
|
}
|
2016-04-22 06:09:02 +07:00
|
|
|
static void mem_cgroup_move_task(void)
|
2010-03-24 03:35:11 +07:00
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
2008-02-07 15:13:54 +07:00
|
|
|
|
2013-04-16 03:41:15 +07:00
|
|
|
/*
|
|
|
|
* Cgroup retains root cgroups across [un]mount cycles making it necessary
|
2014-07-09 21:08:08 +07:00
|
|
|
* to verify whether we're attached to the default hierarchy on each mount
|
|
|
|
* attempt.
|
2013-04-16 03:41:15 +07:00
|
|
|
*/
|
2013-08-09 07:11:23 +07:00
|
|
|
static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
|
2013-04-16 03:41:15 +07:00
|
|
|
{
|
|
|
|
/*
|
2014-07-09 21:08:08 +07:00
|
|
|
* use_hierarchy is forced on the default hierarchy. cgroup core
|
2013-04-16 03:41:15 +07:00
|
|
|
* guarantees that @root doesn't have any children, so turning it
|
|
|
|
* on for the root memcg is enough.
|
|
|
|
*/
|
2015-09-18 22:56:28 +07:00
|
|
|
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
2015-03-13 06:26:19 +07:00
|
|
|
root_mem_cgroup->use_hierarchy = true;
|
|
|
|
else
|
|
|
|
root_mem_cgroup->use_hierarchy = false;
|
2013-04-16 03:41:15 +07:00
|
|
|
}
|
|
|
|
|
2019-03-06 06:45:55 +07:00
|
|
|
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
|
|
|
|
{
|
|
|
|
if (value == PAGE_COUNTER_MAX)
|
|
|
|
seq_puts(m, "max\n");
|
|
|
|
else
|
|
|
|
seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
static u64 memory_current_read(struct cgroup_subsys_state *css,
|
|
|
|
struct cftype *cft)
|
|
|
|
{
|
2015-11-06 09:50:23 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
|
|
|
|
|
|
|
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
}
|
|
|
|
|
2018-06-08 07:07:46 +07:00
|
|
|
static int memory_min_show(struct seq_file *m, void *v)
|
|
|
|
{
|
2019-03-06 06:45:55 +07:00
|
|
|
return seq_puts_memcg_tunable(m,
|
|
|
|
READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
|
2018-06-08 07:07:46 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t memory_min_write(struct kernfs_open_file *of,
|
|
|
|
char *buf, size_t nbytes, loff_t off)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
|
|
|
unsigned long min;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
buf = strstrip(buf);
|
|
|
|
err = page_counter_memparse(buf, "max", &min);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
page_counter_set_min(&memcg->memory, min);
|
|
|
|
|
|
|
|
return nbytes;
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
static int memory_low_show(struct seq_file *m, void *v)
|
|
|
|
{
|
2019-03-06 06:45:55 +07:00
|
|
|
return seq_puts_memcg_tunable(m,
|
|
|
|
READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t memory_low_write(struct kernfs_open_file *of,
|
|
|
|
char *buf, size_t nbytes, loff_t off)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
|
|
|
unsigned long low;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
buf = strstrip(buf);
|
2015-02-28 06:52:04 +07:00
|
|
|
err = page_counter_memparse(buf, "max", &low);
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
mm: memory.low hierarchical behavior
This patch aims to address an issue in current memory.low semantics,
which makes it hard to use it in a hierarchy, where some leaf memory
cgroups are more valuable than others.
For example, there are memcgs A, A/B, A/C, A/D and A/E:
A A/memory.low = 2G, A/memory.current = 6G
//\\
BC DE B/memory.low = 3G B/memory.current = 2G
C/memory.low = 1G C/memory.current = 2G
D/memory.low = 0 D/memory.current = 2G
E/memory.low = 10G E/memory.current = 0
If we apply memory pressure, B, C and D are reclaimed at the same pace
while A's usage exceeds 2G. This is obviously wrong, as B's usage is
fully below B's memory.low, and C has 1G of protection as well. Also, A
is pushed to the size, which is less than A's 2G memory.low, which is
also wrong.
A simple bash script (provided below) can be used to reproduce
the problem. Current results are:
A: 1430097920
A/B: 711929856
A/C: 717426688
A/D: 741376
A/E: 0
To address the issue a concept of effective memory.low is introduced.
Effective memory.low is always equal or less than original memory.low.
In a case, when there is no memory.low overcommittment (and also for
top-level cgroups), these two values are equal.
Otherwise it's a part of parent's effective memory.low, calculated as a
cgroup's memory.low usage divided by sum of sibling's memory.low usages
(under memory.low usage I mean the size of actually protected memory:
memory.current if memory.current < memory.low, 0 otherwise). It's
necessary to track the actual usage, because otherwise an empty cgroup
with memory.low set (A/E in my example) will affect actual memory
distribution, which makes no sense. To avoid traversing the cgroup tree
twice, page_counters code is reused.
Calculating effective memory.low can be done in the reclaim path, as we
conveniently traversing the cgroup tree from top to bottom and check
memory.low on each level. So, it's a perfect place to calculate
effective memory low and save it to use it for children cgroups.
This also eliminates a need to traverse the cgroup tree from bottom to
top each time to check if parent's guarantee is not exceeded.
Setting/resetting effective memory.low is intentionally racy, but it's
fine and shouldn't lead to any significant differences in actual memory
distribution.
With this patch applied results are matching the expectations:
A: 2147930112
A/B: 1428721664
A/C: 718393344
A/D: 815104
A/E: 0
Test script:
#!/bin/bash
CGPATH="/sys/fs/cgroup"
truncate /file1 --size 2G
truncate /file2 --size 2G
truncate /file3 --size 2G
truncate /file4 --size 50G
mkdir "${CGPATH}/A"
echo "+memory" > "${CGPATH}/A/cgroup.subtree_control"
mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
echo 2G > "${CGPATH}/A/memory.low"
echo 3G > "${CGPATH}/A/B/memory.low"
echo 1G > "${CGPATH}/A/C/memory.low"
echo 0 > "${CGPATH}/A/D/memory.low"
echo 10G > "${CGPATH}/A/E/memory.low"
echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1
echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2
echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3
echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4
echo "A: " `cat "${CGPATH}/A/memory.current"`
echo "A/B: " `cat "${CGPATH}/A/B/memory.current"`
echo "A/C: " `cat "${CGPATH}/A/C/memory.current"`
echo "A/D: " `cat "${CGPATH}/A/D/memory.current"`
echo "A/E: " `cat "${CGPATH}/A/E/memory.current"`
rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
rmdir "${CGPATH}/A"
rm /file1 /file2 /file3 /file4
Link: http://lkml.kernel.org/r/20180405185921.4942-2-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-06-08 07:06:22 +07:00
|
|
|
page_counter_set_low(&memcg->memory, low);
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
|
|
|
|
return nbytes;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int memory_high_show(struct seq_file *m, void *v)
|
|
|
|
{
|
2020-06-02 11:49:49 +07:00
|
|
|
return seq_puts_memcg_tunable(m,
|
|
|
|
READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t memory_high_write(struct kernfs_open_file *of,
|
|
|
|
char *buf, size_t nbytes, loff_t off)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
2020-08-07 13:21:58 +07:00
|
|
|
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
|
2019-12-01 08:50:09 +07:00
|
|
|
bool drained = false;
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
unsigned long high;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
buf = strstrip(buf);
|
2015-02-28 06:52:04 +07:00
|
|
|
err = page_counter_memparse(buf, "max", &high);
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2019-12-01 08:50:09 +07:00
|
|
|
for (;;) {
|
|
|
|
unsigned long nr_pages = page_counter_read(&memcg->memory);
|
|
|
|
unsigned long reclaimed;
|
|
|
|
|
|
|
|
if (nr_pages <= high)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (signal_pending(current))
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (!drained) {
|
|
|
|
drain_all_stock(memcg);
|
|
|
|
drained = true;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
|
|
|
|
GFP_KERNEL, true);
|
|
|
|
|
|
|
|
if (!reclaimed && !nr_retries--)
|
|
|
|
break;
|
|
|
|
}
|
2016-03-18 04:20:25 +07:00
|
|
|
|
2020-08-07 13:21:51 +07:00
|
|
|
page_counter_set_high(&memcg->memory, high);
|
|
|
|
|
2020-08-07 13:22:12 +07:00
|
|
|
memcg_wb_domain_size_changed(memcg);
|
|
|
|
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
return nbytes;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int memory_max_show(struct seq_file *m, void *v)
|
|
|
|
{
|
2019-03-06 06:45:55 +07:00
|
|
|
return seq_puts_memcg_tunable(m,
|
|
|
|
READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t memory_max_write(struct kernfs_open_file *of,
|
|
|
|
char *buf, size_t nbytes, loff_t off)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
2020-08-07 13:21:58 +07:00
|
|
|
unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
|
2016-03-18 04:20:28 +07:00
|
|
|
bool drained = false;
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
unsigned long max;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
buf = strstrip(buf);
|
2015-02-28 06:52:04 +07:00
|
|
|
err = page_counter_memparse(buf, "max", &max);
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2018-06-08 07:06:18 +07:00
|
|
|
xchg(&memcg->memory.max, max);
|
2016-03-18 04:20:28 +07:00
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
unsigned long nr_pages = page_counter_read(&memcg->memory);
|
|
|
|
|
|
|
|
if (nr_pages <= max)
|
|
|
|
break;
|
|
|
|
|
2019-12-01 08:50:06 +07:00
|
|
|
if (signal_pending(current))
|
2016-03-18 04:20:28 +07:00
|
|
|
break;
|
|
|
|
|
|
|
|
if (!drained) {
|
|
|
|
drain_all_stock(memcg);
|
|
|
|
drained = true;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nr_reclaims) {
|
|
|
|
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
|
|
|
|
GFP_KERNEL, true))
|
|
|
|
nr_reclaims--;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2018-04-11 06:29:45 +07:00
|
|
|
memcg_memory_event(memcg, MEMCG_OOM);
|
2016-03-18 04:20:28 +07:00
|
|
|
if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
|
|
|
|
break;
|
|
|
|
}
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
|
2015-05-23 05:23:34 +07:00
|
|
|
memcg_wb_domain_size_changed(memcg);
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
return nbytes;
|
|
|
|
}
|
|
|
|
|
2019-07-12 10:55:55 +07:00
|
|
|
static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
|
|
|
|
{
|
|
|
|
seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
|
|
|
|
seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
|
|
|
|
seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
|
|
|
|
seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
|
|
|
|
seq_printf(m, "oom_kill %lu\n",
|
|
|
|
atomic_long_read(&events[MEMCG_OOM_KILL]));
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
static int memory_events_show(struct seq_file *m, void *v)
|
|
|
|
{
|
2019-03-06 06:45:52 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
|
2019-07-12 10:55:55 +07:00
|
|
|
__memory_events_show(m, memcg->memory_events);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int memory_events_local_show(struct seq_file *m, void *v)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
|
2019-07-12 10:55:55 +07:00
|
|
|
__memory_events_show(m, memcg->memory_events_local);
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-01-21 06:03:19 +07:00
|
|
|
static int memory_stat_show(struct seq_file *m, void *v)
|
|
|
|
{
|
2019-03-06 06:45:52 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
char *buf;
|
2019-03-06 06:48:09 +07:00
|
|
|
|
mm: memcontrol: dump memory.stat during cgroup OOM
The current cgroup OOM memory info dump doesn't include all the memory
we are tracking, nor does it give insight into what the VM tried to do
leading up to the OOM. All that useful info is in memory.stat.
Furthermore, the recursive printing for every child cgroup can
generate absurd amounts of data on the console for larger cgroup
trees, and it's not like we provide a per-cgroup breakdown during
global OOM kills.
When an OOM kill is triggered, print one set of recursive memory.stat
items at the level whose limit triggered the OOM condition.
Example output:
stress invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0
CPU: 2 PID: 210 Comm: stress Not tainted 5.2.0-rc2-mm1-00247-g47d49835983c #135
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014
Call Trace:
dump_stack+0x46/0x60
dump_header+0x4c/0x2d0
oom_kill_process.cold.10+0xb/0x10
out_of_memory+0x200/0x270
? try_to_free_mem_cgroup_pages+0xdf/0x130
mem_cgroup_out_of_memory+0xb7/0xc0
try_charge+0x680/0x6f0
mem_cgroup_try_charge+0xb5/0x160
__add_to_page_cache_locked+0xc6/0x300
? list_lru_destroy+0x80/0x80
add_to_page_cache_lru+0x45/0xc0
pagecache_get_page+0x11b/0x290
filemap_fault+0x458/0x6d0
ext4_filemap_fault+0x27/0x36
__do_fault+0x2f/0xb0
__handle_mm_fault+0x9c5/0x1140
? apic_timer_interrupt+0xa/0x20
handle_mm_fault+0xc5/0x180
__do_page_fault+0x1ab/0x440
? page_fault+0x8/0x30
page_fault+0x1e/0x30
RIP: 0033:0x55c32167fc10
Code: Bad RIP value.
RSP: 002b:00007fff1d031c50 EFLAGS: 00010206
RAX: 000000000dc00000 RBX: 00007fd2db000010 RCX: 00007fd2db000010
RDX: 0000000000000000 RSI: 0000000010001000 RDI: 0000000000000000
RBP: 000055c321680a54 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000002 R14: 0000000000001000 R15: 0000000010000000
memory: usage 1024kB, limit 1024kB, failcnt 75131
swap: usage 0kB, limit 9007199254740988kB, failcnt 0
Memory cgroup stats for /foo:
anon 0
file 0
kernel_stack 36864
slab 274432
sock 0
shmem 0
file_mapped 0
file_dirty 0
file_writeback 0
anon_thp 0
inactive_anon 126976
active_anon 0
inactive_file 0
active_file 0
unevictable 0
slab_reclaimable 0
slab_unreclaimable 274432
pgfault 59466
pgmajfault 1617
workingset_refault 2145
workingset_activate 0
workingset_nodereclaim 0
pgrefill 98952
pgscan 200060
pgsteal 59340
pgactivate 40095
pgdeactivate 96787
pglazyfree 0
pglazyfreed 0
thp_fault_alloc 0
thp_collapse_alloc 0
Tasks state (memory values in pages):
[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name
[ 200] 0 200 1121 884 53248 29 0 bash
[ 209] 0 209 905 246 45056 19 0 stress
[ 210] 0 210 66442 56 499712 56349 0 stress
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),oom_memcg=/foo,task_memcg=/foo,task=stress,pid=210,uid=0
Memory cgroup out of memory: Killed process 210 (stress) total-vm:265768kB, anon-rss:0kB, file-rss:224kB, shmem-rss:0kB
oom_reaper: reaped process 210 (stress), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[hannes@cmpxchg.org: s/kvmalloc/kmalloc/ per Michal]
Link: http://lkml.kernel.org/r/20190605161133.GA12453@cmpxchg.org
Link: http://lkml.kernel.org/r/20190604210509.9744-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-12 10:55:59 +07:00
|
|
|
buf = memory_stat_format(memcg);
|
|
|
|
if (!buf)
|
|
|
|
return -ENOMEM;
|
|
|
|
seq_puts(m, buf);
|
|
|
|
kfree(buf);
|
2016-01-21 06:03:19 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-10-14 06:52:59 +07:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
static int memory_numa_stat_show(struct seq_file *m, void *v)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
|
|
|
|
int nid;
|
|
|
|
|
|
|
|
if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
seq_printf(m, "%s", memory_stats[i].name);
|
|
|
|
for_each_node_state(nid, N_MEMORY) {
|
|
|
|
u64 size;
|
|
|
|
struct lruvec *lruvec;
|
|
|
|
|
|
|
|
lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
|
|
|
|
size = lruvec_page_state(lruvec, memory_stats[i].idx);
|
|
|
|
size *= memory_stats[i].ratio;
|
|
|
|
seq_printf(m, " N%d=%llu", nid, size);
|
|
|
|
}
|
|
|
|
seq_putc(m, '\n');
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2018-08-22 11:53:54 +07:00
|
|
|
static int memory_oom_group_show(struct seq_file *m, void *v)
|
|
|
|
{
|
2019-03-06 06:45:52 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
|
2018-08-22 11:53:54 +07:00
|
|
|
|
|
|
|
seq_printf(m, "%d\n", memcg->oom_group);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
|
|
|
|
char *buf, size_t nbytes, loff_t off)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
|
|
|
int ret, oom_group;
|
|
|
|
|
|
|
|
buf = strstrip(buf);
|
|
|
|
if (!buf)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
ret = kstrtoint(buf, 0, &oom_group);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (oom_group != 0 && oom_group != 1)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
memcg->oom_group = oom_group;
|
|
|
|
|
|
|
|
return nbytes;
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
static struct cftype memory_files[] = {
|
|
|
|
{
|
|
|
|
.name = "current",
|
2015-11-06 09:50:23 +07:00
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
.read_u64 = memory_current_read,
|
|
|
|
},
|
2018-06-08 07:07:46 +07:00
|
|
|
{
|
|
|
|
.name = "min",
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
|
|
|
.seq_show = memory_min_show,
|
|
|
|
.write = memory_min_write,
|
|
|
|
},
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
{
|
|
|
|
.name = "low",
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
|
|
|
.seq_show = memory_low_show,
|
|
|
|
.write = memory_low_write,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "high",
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
|
|
|
.seq_show = memory_high_show,
|
|
|
|
.write = memory_high_write,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "max",
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
|
|
|
.seq_show = memory_max_show,
|
|
|
|
.write = memory_max_write,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "events",
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
2015-09-19 05:01:59 +07:00
|
|
|
.file_offset = offsetof(struct mem_cgroup, events_file),
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
.seq_show = memory_events_show,
|
|
|
|
},
|
2019-07-12 10:55:55 +07:00
|
|
|
{
|
|
|
|
.name = "events.local",
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
|
|
|
.file_offset = offsetof(struct mem_cgroup, events_local_file),
|
|
|
|
.seq_show = memory_events_local_show,
|
|
|
|
},
|
2016-01-21 06:03:19 +07:00
|
|
|
{
|
|
|
|
.name = "stat",
|
|
|
|
.seq_show = memory_stat_show,
|
|
|
|
},
|
2020-10-14 06:52:59 +07:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
{
|
|
|
|
.name = "numa_stat",
|
|
|
|
.seq_show = memory_numa_stat_show,
|
|
|
|
},
|
|
|
|
#endif
|
2018-08-22 11:53:54 +07:00
|
|
|
{
|
|
|
|
.name = "oom.group",
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
|
|
|
|
.seq_show = memory_oom_group_show,
|
|
|
|
.write = memory_oom_group_write,
|
|
|
|
},
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
{ } /* terminate */
|
|
|
|
};
|
|
|
|
|
2014-02-08 22:36:58 +07:00
|
|
|
struct cgroup_subsys memory_cgrp_subsys = {
|
2012-11-19 23:13:38 +07:00
|
|
|
.css_alloc = mem_cgroup_css_alloc,
|
2013-02-23 07:34:52 +07:00
|
|
|
.css_online = mem_cgroup_css_online,
|
2012-11-19 23:13:38 +07:00
|
|
|
.css_offline = mem_cgroup_css_offline,
|
2015-12-30 05:54:10 +07:00
|
|
|
.css_released = mem_cgroup_css_released,
|
2012-11-19 23:13:38 +07:00
|
|
|
.css_free = mem_cgroup_css_free,
|
2014-07-09 05:02:57 +07:00
|
|
|
.css_reset = mem_cgroup_css_reset,
|
2010-03-11 06:22:13 +07:00
|
|
|
.can_attach = mem_cgroup_can_attach,
|
|
|
|
.cancel_attach = mem_cgroup_cancel_attach,
|
2016-04-22 06:09:02 +07:00
|
|
|
.post_attach = mem_cgroup_move_task,
|
2013-04-16 03:41:15 +07:00
|
|
|
.bind = mem_cgroup_bind,
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
.dfl_cftypes = memory_files,
|
|
|
|
.legacy_cftypes = mem_cgroup_legacy_files,
|
2008-02-07 15:14:31 +07:00
|
|
|
.early_init = 0,
|
2008-02-07 15:13:50 +07:00
|
|
|
};
|
2009-01-08 09:07:57 +07:00
|
|
|
|
2020-04-02 11:07:03 +07:00
|
|
|
/*
|
|
|
|
* This function calculates an individual cgroup's effective
|
|
|
|
* protection which is derived from its own memory.min/low, its
|
|
|
|
* parent's and siblings' settings, as well as the actual memory
|
|
|
|
* distribution in the tree.
|
|
|
|
*
|
|
|
|
* The following rules apply to the effective protection values:
|
|
|
|
*
|
|
|
|
* 1. At the first level of reclaim, effective protection is equal to
|
|
|
|
* the declared protection in memory.min and memory.low.
|
|
|
|
*
|
|
|
|
* 2. To enable safe delegation of the protection configuration, at
|
|
|
|
* subsequent levels the effective protection is capped to the
|
|
|
|
* parent's effective protection.
|
|
|
|
*
|
|
|
|
* 3. To make complex and dynamic subtrees easier to configure, the
|
|
|
|
* user is allowed to overcommit the declared protection at a given
|
|
|
|
* level. If that is the case, the parent's effective protection is
|
|
|
|
* distributed to the children in proportion to how much protection
|
|
|
|
* they have declared and how much of it they are utilizing.
|
|
|
|
*
|
|
|
|
* This makes distribution proportional, but also work-conserving:
|
|
|
|
* if one cgroup claims much more protection than it uses memory,
|
|
|
|
* the unused remainder is available to its siblings.
|
|
|
|
*
|
|
|
|
* 4. Conversely, when the declared protection is undercommitted at a
|
|
|
|
* given level, the distribution of the larger parental protection
|
|
|
|
* budget is NOT proportional. A cgroup's protection from a sibling
|
|
|
|
* is capped to its own memory.min/low setting.
|
|
|
|
*
|
mm: memcontrol: recursive memory.low protection
Right now, the effective protection of any given cgroup is capped by its
own explicit memory.low setting, regardless of what the parent says. The
reasons for this are mostly historical and ease of implementation: to make
delegation of memory.low safe, effective protection is the min() of all
memory.low up the tree.
Unfortunately, this limitation makes it impossible to protect an entire
subtree from another without forcing the user to make explicit protection
allocations all the way to the leaf cgroups - something that is highly
undesirable in real life scenarios.
Consider memory in a data center host. At the cgroup top level, we have a
distinction between system management software and the actual workload the
system is executing. Both branches are further subdivided into individual
services, job components etc.
We want to protect the workload as a whole from the system management
software, but that doesn't mean we want to protect and prioritize
individual workload wrt each other. Their memory demand can vary over
time, and we'd want the VM to simply cache the hottest data within the
workload subtree. Yet, the current memory.low limitations force us to
allocate a fixed amount of protection to each workload component in order
to get protection from system management software in general. This
results in very inefficient resource distribution.
Another concern with mandating downward allocation is that, as the
complexity of the cgroup tree grows, it gets harder for the lower levels
to be informed about decisions made at the host-level. Consider a
container inside a namespace that in turn creates its own nested tree of
cgroups to run multiple workloads. It'd be extremely difficult to
configure memory.low parameters in those leaf cgroups that on one hand
balance pressure among siblings as the container desires, while also
reflecting the host-level protection from e.g. rpm upgrades, that lie
beyond one or more delegation and namespacing points in the tree.
It's highly unusual from a cgroup interface POV that nested levels have to
be aware of and reflect decisions made at higher levels for them to be
effective.
To enable such use cases and scale configurability for complex trees, this
patch implements a resource inheritance model for memory that is similar
to how the CPU and the IO controller implement work-conserving resource
allocations: a share of a resource allocated to a subree always applies to
the entire subtree recursively, while allowing, but not mandating,
children to further specify distribution rules.
That means that if protection is explicitly allocated among siblings,
those configured shares are being followed during page reclaim just like
they are now. However, if the memory.low set at a higher level is not
fully claimed by the children in that subtree, the "floating" remainder is
applied to each cgroup in the tree in proportion to its size. Since
reclaim pressure is applied in proportion to size as well, each child in
that tree gets the same boost, and the effect is neutral among siblings -
with respect to each other, they behave as if no memory control was
enabled at all, and the VM simply balances the memory demands optimally
within the subtree. But collectively those cgroups enjoy a boost over the
cgroups in neighboring trees.
E.g. a leaf cgroup with a memory.low setting of 0 no longer means that
it's not getting a share of the hierarchically assigned resource, just
that it doesn't claim a fixed amount of it to protect from its siblings.
This allows us to recursively protect one subtree (workload) from another
(system management), while letting subgroups compete freely among each
other - without having to assign fixed shares to each leaf, and without
nested groups having to echo higher-level settings.
The floating protection composes naturally with fixed protection.
Consider the following example tree:
A A: low = 2G
/ \ A1: low = 1G
A1 A2 A2: low = 0G
As outside pressure is applied to this tree, A1 will enjoy a fixed
protection from A2 of 1G, but the remaining, unclaimed 1G from A is split
evenly among A1 and A2, coming out to 1.5G and 0.5G.
There is a slight risk of regressing theoretical setups where the
top-level cgroups don't know about the true budgeting and set bogusly high
"bypass" values that are meaningfully allocated down the tree. Such
setups would rely on unclaimed protection to be discarded, and
distributing it would change the intended behavior. Be safe and hide the
new behavior behind a mount option, 'memory_recursiveprot'.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Chris Down <chris@chrisdown.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Link: http://lkml.kernel.org/r/20200227195606.46212-4-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-02 11:07:07 +07:00
|
|
|
* 5. However, to allow protecting recursive subtrees from each other
|
|
|
|
* without having to declare each individual cgroup's fixed share
|
|
|
|
* of the ancestor's claim to protection, any unutilized -
|
|
|
|
* "floating" - protection from up the tree is distributed in
|
|
|
|
* proportion to each cgroup's *usage*. This makes the protection
|
|
|
|
* neutral wrt sibling cgroups and lets them compete freely over
|
|
|
|
* the shared parental protection budget, but it protects the
|
|
|
|
* subtree as a whole from neighboring subtrees.
|
|
|
|
*
|
|
|
|
* Note that 4. and 5. are not in conflict: 4. is about protecting
|
|
|
|
* against immediate siblings whereas 5. is about protecting against
|
|
|
|
* neighboring subtrees.
|
2020-04-02 11:07:03 +07:00
|
|
|
*/
|
|
|
|
static unsigned long effective_protection(unsigned long usage,
|
mm: memcontrol: recursive memory.low protection
Right now, the effective protection of any given cgroup is capped by its
own explicit memory.low setting, regardless of what the parent says. The
reasons for this are mostly historical and ease of implementation: to make
delegation of memory.low safe, effective protection is the min() of all
memory.low up the tree.
Unfortunately, this limitation makes it impossible to protect an entire
subtree from another without forcing the user to make explicit protection
allocations all the way to the leaf cgroups - something that is highly
undesirable in real life scenarios.
Consider memory in a data center host. At the cgroup top level, we have a
distinction between system management software and the actual workload the
system is executing. Both branches are further subdivided into individual
services, job components etc.
We want to protect the workload as a whole from the system management
software, but that doesn't mean we want to protect and prioritize
individual workload wrt each other. Their memory demand can vary over
time, and we'd want the VM to simply cache the hottest data within the
workload subtree. Yet, the current memory.low limitations force us to
allocate a fixed amount of protection to each workload component in order
to get protection from system management software in general. This
results in very inefficient resource distribution.
Another concern with mandating downward allocation is that, as the
complexity of the cgroup tree grows, it gets harder for the lower levels
to be informed about decisions made at the host-level. Consider a
container inside a namespace that in turn creates its own nested tree of
cgroups to run multiple workloads. It'd be extremely difficult to
configure memory.low parameters in those leaf cgroups that on one hand
balance pressure among siblings as the container desires, while also
reflecting the host-level protection from e.g. rpm upgrades, that lie
beyond one or more delegation and namespacing points in the tree.
It's highly unusual from a cgroup interface POV that nested levels have to
be aware of and reflect decisions made at higher levels for them to be
effective.
To enable such use cases and scale configurability for complex trees, this
patch implements a resource inheritance model for memory that is similar
to how the CPU and the IO controller implement work-conserving resource
allocations: a share of a resource allocated to a subree always applies to
the entire subtree recursively, while allowing, but not mandating,
children to further specify distribution rules.
That means that if protection is explicitly allocated among siblings,
those configured shares are being followed during page reclaim just like
they are now. However, if the memory.low set at a higher level is not
fully claimed by the children in that subtree, the "floating" remainder is
applied to each cgroup in the tree in proportion to its size. Since
reclaim pressure is applied in proportion to size as well, each child in
that tree gets the same boost, and the effect is neutral among siblings -
with respect to each other, they behave as if no memory control was
enabled at all, and the VM simply balances the memory demands optimally
within the subtree. But collectively those cgroups enjoy a boost over the
cgroups in neighboring trees.
E.g. a leaf cgroup with a memory.low setting of 0 no longer means that
it's not getting a share of the hierarchically assigned resource, just
that it doesn't claim a fixed amount of it to protect from its siblings.
This allows us to recursively protect one subtree (workload) from another
(system management), while letting subgroups compete freely among each
other - without having to assign fixed shares to each leaf, and without
nested groups having to echo higher-level settings.
The floating protection composes naturally with fixed protection.
Consider the following example tree:
A A: low = 2G
/ \ A1: low = 1G
A1 A2 A2: low = 0G
As outside pressure is applied to this tree, A1 will enjoy a fixed
protection from A2 of 1G, but the remaining, unclaimed 1G from A is split
evenly among A1 and A2, coming out to 1.5G and 0.5G.
There is a slight risk of regressing theoretical setups where the
top-level cgroups don't know about the true budgeting and set bogusly high
"bypass" values that are meaningfully allocated down the tree. Such
setups would rely on unclaimed protection to be discarded, and
distributing it would change the intended behavior. Be safe and hide the
new behavior behind a mount option, 'memory_recursiveprot'.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Chris Down <chris@chrisdown.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Link: http://lkml.kernel.org/r/20200227195606.46212-4-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-02 11:07:07 +07:00
|
|
|
unsigned long parent_usage,
|
2020-04-02 11:07:03 +07:00
|
|
|
unsigned long setting,
|
|
|
|
unsigned long parent_effective,
|
|
|
|
unsigned long siblings_protected)
|
|
|
|
{
|
|
|
|
unsigned long protected;
|
mm: memcontrol: recursive memory.low protection
Right now, the effective protection of any given cgroup is capped by its
own explicit memory.low setting, regardless of what the parent says. The
reasons for this are mostly historical and ease of implementation: to make
delegation of memory.low safe, effective protection is the min() of all
memory.low up the tree.
Unfortunately, this limitation makes it impossible to protect an entire
subtree from another without forcing the user to make explicit protection
allocations all the way to the leaf cgroups - something that is highly
undesirable in real life scenarios.
Consider memory in a data center host. At the cgroup top level, we have a
distinction between system management software and the actual workload the
system is executing. Both branches are further subdivided into individual
services, job components etc.
We want to protect the workload as a whole from the system management
software, but that doesn't mean we want to protect and prioritize
individual workload wrt each other. Their memory demand can vary over
time, and we'd want the VM to simply cache the hottest data within the
workload subtree. Yet, the current memory.low limitations force us to
allocate a fixed amount of protection to each workload component in order
to get protection from system management software in general. This
results in very inefficient resource distribution.
Another concern with mandating downward allocation is that, as the
complexity of the cgroup tree grows, it gets harder for the lower levels
to be informed about decisions made at the host-level. Consider a
container inside a namespace that in turn creates its own nested tree of
cgroups to run multiple workloads. It'd be extremely difficult to
configure memory.low parameters in those leaf cgroups that on one hand
balance pressure among siblings as the container desires, while also
reflecting the host-level protection from e.g. rpm upgrades, that lie
beyond one or more delegation and namespacing points in the tree.
It's highly unusual from a cgroup interface POV that nested levels have to
be aware of and reflect decisions made at higher levels for them to be
effective.
To enable such use cases and scale configurability for complex trees, this
patch implements a resource inheritance model for memory that is similar
to how the CPU and the IO controller implement work-conserving resource
allocations: a share of a resource allocated to a subree always applies to
the entire subtree recursively, while allowing, but not mandating,
children to further specify distribution rules.
That means that if protection is explicitly allocated among siblings,
those configured shares are being followed during page reclaim just like
they are now. However, if the memory.low set at a higher level is not
fully claimed by the children in that subtree, the "floating" remainder is
applied to each cgroup in the tree in proportion to its size. Since
reclaim pressure is applied in proportion to size as well, each child in
that tree gets the same boost, and the effect is neutral among siblings -
with respect to each other, they behave as if no memory control was
enabled at all, and the VM simply balances the memory demands optimally
within the subtree. But collectively those cgroups enjoy a boost over the
cgroups in neighboring trees.
E.g. a leaf cgroup with a memory.low setting of 0 no longer means that
it's not getting a share of the hierarchically assigned resource, just
that it doesn't claim a fixed amount of it to protect from its siblings.
This allows us to recursively protect one subtree (workload) from another
(system management), while letting subgroups compete freely among each
other - without having to assign fixed shares to each leaf, and without
nested groups having to echo higher-level settings.
The floating protection composes naturally with fixed protection.
Consider the following example tree:
A A: low = 2G
/ \ A1: low = 1G
A1 A2 A2: low = 0G
As outside pressure is applied to this tree, A1 will enjoy a fixed
protection from A2 of 1G, but the remaining, unclaimed 1G from A is split
evenly among A1 and A2, coming out to 1.5G and 0.5G.
There is a slight risk of regressing theoretical setups where the
top-level cgroups don't know about the true budgeting and set bogusly high
"bypass" values that are meaningfully allocated down the tree. Such
setups would rely on unclaimed protection to be discarded, and
distributing it would change the intended behavior. Be safe and hide the
new behavior behind a mount option, 'memory_recursiveprot'.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Chris Down <chris@chrisdown.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Link: http://lkml.kernel.org/r/20200227195606.46212-4-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-02 11:07:07 +07:00
|
|
|
unsigned long ep;
|
2020-04-02 11:07:03 +07:00
|
|
|
|
|
|
|
protected = min(usage, setting);
|
|
|
|
/*
|
|
|
|
* If all cgroups at this level combined claim and use more
|
|
|
|
* protection then what the parent affords them, distribute
|
|
|
|
* shares in proportion to utilization.
|
|
|
|
*
|
|
|
|
* We are using actual utilization rather than the statically
|
|
|
|
* claimed protection in order to be work-conserving: claimed
|
|
|
|
* but unused protection is available to siblings that would
|
|
|
|
* otherwise get a smaller chunk than what they claimed.
|
|
|
|
*/
|
|
|
|
if (siblings_protected > parent_effective)
|
|
|
|
return protected * parent_effective / siblings_protected;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Ok, utilized protection of all children is within what the
|
|
|
|
* parent affords them, so we know whatever this child claims
|
|
|
|
* and utilizes is effectively protected.
|
|
|
|
*
|
|
|
|
* If there is unprotected usage beyond this value, reclaim
|
|
|
|
* will apply pressure in proportion to that amount.
|
|
|
|
*
|
|
|
|
* If there is unutilized protection, the cgroup will be fully
|
|
|
|
* shielded from reclaim, but we do return a smaller value for
|
|
|
|
* protection than what the group could enjoy in theory. This
|
|
|
|
* is okay. With the overcommit distribution above, effective
|
|
|
|
* protection is always dependent on how memory is actually
|
|
|
|
* consumed among the siblings anyway.
|
|
|
|
*/
|
mm: memcontrol: recursive memory.low protection
Right now, the effective protection of any given cgroup is capped by its
own explicit memory.low setting, regardless of what the parent says. The
reasons for this are mostly historical and ease of implementation: to make
delegation of memory.low safe, effective protection is the min() of all
memory.low up the tree.
Unfortunately, this limitation makes it impossible to protect an entire
subtree from another without forcing the user to make explicit protection
allocations all the way to the leaf cgroups - something that is highly
undesirable in real life scenarios.
Consider memory in a data center host. At the cgroup top level, we have a
distinction between system management software and the actual workload the
system is executing. Both branches are further subdivided into individual
services, job components etc.
We want to protect the workload as a whole from the system management
software, but that doesn't mean we want to protect and prioritize
individual workload wrt each other. Their memory demand can vary over
time, and we'd want the VM to simply cache the hottest data within the
workload subtree. Yet, the current memory.low limitations force us to
allocate a fixed amount of protection to each workload component in order
to get protection from system management software in general. This
results in very inefficient resource distribution.
Another concern with mandating downward allocation is that, as the
complexity of the cgroup tree grows, it gets harder for the lower levels
to be informed about decisions made at the host-level. Consider a
container inside a namespace that in turn creates its own nested tree of
cgroups to run multiple workloads. It'd be extremely difficult to
configure memory.low parameters in those leaf cgroups that on one hand
balance pressure among siblings as the container desires, while also
reflecting the host-level protection from e.g. rpm upgrades, that lie
beyond one or more delegation and namespacing points in the tree.
It's highly unusual from a cgroup interface POV that nested levels have to
be aware of and reflect decisions made at higher levels for them to be
effective.
To enable such use cases and scale configurability for complex trees, this
patch implements a resource inheritance model for memory that is similar
to how the CPU and the IO controller implement work-conserving resource
allocations: a share of a resource allocated to a subree always applies to
the entire subtree recursively, while allowing, but not mandating,
children to further specify distribution rules.
That means that if protection is explicitly allocated among siblings,
those configured shares are being followed during page reclaim just like
they are now. However, if the memory.low set at a higher level is not
fully claimed by the children in that subtree, the "floating" remainder is
applied to each cgroup in the tree in proportion to its size. Since
reclaim pressure is applied in proportion to size as well, each child in
that tree gets the same boost, and the effect is neutral among siblings -
with respect to each other, they behave as if no memory control was
enabled at all, and the VM simply balances the memory demands optimally
within the subtree. But collectively those cgroups enjoy a boost over the
cgroups in neighboring trees.
E.g. a leaf cgroup with a memory.low setting of 0 no longer means that
it's not getting a share of the hierarchically assigned resource, just
that it doesn't claim a fixed amount of it to protect from its siblings.
This allows us to recursively protect one subtree (workload) from another
(system management), while letting subgroups compete freely among each
other - without having to assign fixed shares to each leaf, and without
nested groups having to echo higher-level settings.
The floating protection composes naturally with fixed protection.
Consider the following example tree:
A A: low = 2G
/ \ A1: low = 1G
A1 A2 A2: low = 0G
As outside pressure is applied to this tree, A1 will enjoy a fixed
protection from A2 of 1G, but the remaining, unclaimed 1G from A is split
evenly among A1 and A2, coming out to 1.5G and 0.5G.
There is a slight risk of regressing theoretical setups where the
top-level cgroups don't know about the true budgeting and set bogusly high
"bypass" values that are meaningfully allocated down the tree. Such
setups would rely on unclaimed protection to be discarded, and
distributing it would change the intended behavior. Be safe and hide the
new behavior behind a mount option, 'memory_recursiveprot'.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Chris Down <chris@chrisdown.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Link: http://lkml.kernel.org/r/20200227195606.46212-4-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-02 11:07:07 +07:00
|
|
|
ep = protected;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the children aren't claiming (all of) the protection
|
|
|
|
* afforded to them by the parent, distribute the remainder in
|
|
|
|
* proportion to the (unprotected) memory of each cgroup. That
|
|
|
|
* way, cgroups that aren't explicitly prioritized wrt each
|
|
|
|
* other compete freely over the allowance, but they are
|
|
|
|
* collectively protected from neighboring trees.
|
|
|
|
*
|
|
|
|
* We're using unprotected memory for the weight so that if
|
|
|
|
* some cgroups DO claim explicit protection, we don't protect
|
|
|
|
* the same bytes twice.
|
2020-06-26 10:30:16 +07:00
|
|
|
*
|
|
|
|
* Check both usage and parent_usage against the respective
|
|
|
|
* protected values. One should imply the other, but they
|
|
|
|
* aren't read atomically - make sure the division is sane.
|
mm: memcontrol: recursive memory.low protection
Right now, the effective protection of any given cgroup is capped by its
own explicit memory.low setting, regardless of what the parent says. The
reasons for this are mostly historical and ease of implementation: to make
delegation of memory.low safe, effective protection is the min() of all
memory.low up the tree.
Unfortunately, this limitation makes it impossible to protect an entire
subtree from another without forcing the user to make explicit protection
allocations all the way to the leaf cgroups - something that is highly
undesirable in real life scenarios.
Consider memory in a data center host. At the cgroup top level, we have a
distinction between system management software and the actual workload the
system is executing. Both branches are further subdivided into individual
services, job components etc.
We want to protect the workload as a whole from the system management
software, but that doesn't mean we want to protect and prioritize
individual workload wrt each other. Their memory demand can vary over
time, and we'd want the VM to simply cache the hottest data within the
workload subtree. Yet, the current memory.low limitations force us to
allocate a fixed amount of protection to each workload component in order
to get protection from system management software in general. This
results in very inefficient resource distribution.
Another concern with mandating downward allocation is that, as the
complexity of the cgroup tree grows, it gets harder for the lower levels
to be informed about decisions made at the host-level. Consider a
container inside a namespace that in turn creates its own nested tree of
cgroups to run multiple workloads. It'd be extremely difficult to
configure memory.low parameters in those leaf cgroups that on one hand
balance pressure among siblings as the container desires, while also
reflecting the host-level protection from e.g. rpm upgrades, that lie
beyond one or more delegation and namespacing points in the tree.
It's highly unusual from a cgroup interface POV that nested levels have to
be aware of and reflect decisions made at higher levels for them to be
effective.
To enable such use cases and scale configurability for complex trees, this
patch implements a resource inheritance model for memory that is similar
to how the CPU and the IO controller implement work-conserving resource
allocations: a share of a resource allocated to a subree always applies to
the entire subtree recursively, while allowing, but not mandating,
children to further specify distribution rules.
That means that if protection is explicitly allocated among siblings,
those configured shares are being followed during page reclaim just like
they are now. However, if the memory.low set at a higher level is not
fully claimed by the children in that subtree, the "floating" remainder is
applied to each cgroup in the tree in proportion to its size. Since
reclaim pressure is applied in proportion to size as well, each child in
that tree gets the same boost, and the effect is neutral among siblings -
with respect to each other, they behave as if no memory control was
enabled at all, and the VM simply balances the memory demands optimally
within the subtree. But collectively those cgroups enjoy a boost over the
cgroups in neighboring trees.
E.g. a leaf cgroup with a memory.low setting of 0 no longer means that
it's not getting a share of the hierarchically assigned resource, just
that it doesn't claim a fixed amount of it to protect from its siblings.
This allows us to recursively protect one subtree (workload) from another
(system management), while letting subgroups compete freely among each
other - without having to assign fixed shares to each leaf, and without
nested groups having to echo higher-level settings.
The floating protection composes naturally with fixed protection.
Consider the following example tree:
A A: low = 2G
/ \ A1: low = 1G
A1 A2 A2: low = 0G
As outside pressure is applied to this tree, A1 will enjoy a fixed
protection from A2 of 1G, but the remaining, unclaimed 1G from A is split
evenly among A1 and A2, coming out to 1.5G and 0.5G.
There is a slight risk of regressing theoretical setups where the
top-level cgroups don't know about the true budgeting and set bogusly high
"bypass" values that are meaningfully allocated down the tree. Such
setups would rely on unclaimed protection to be discarded, and
distributing it would change the intended behavior. Be safe and hide the
new behavior behind a mount option, 'memory_recursiveprot'.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Chris Down <chris@chrisdown.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Link: http://lkml.kernel.org/r/20200227195606.46212-4-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-02 11:07:07 +07:00
|
|
|
*/
|
|
|
|
if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
|
|
|
|
return ep;
|
2020-06-26 10:30:16 +07:00
|
|
|
if (parent_effective > siblings_protected &&
|
|
|
|
parent_usage > siblings_protected &&
|
|
|
|
usage > protected) {
|
mm: memcontrol: recursive memory.low protection
Right now, the effective protection of any given cgroup is capped by its
own explicit memory.low setting, regardless of what the parent says. The
reasons for this are mostly historical and ease of implementation: to make
delegation of memory.low safe, effective protection is the min() of all
memory.low up the tree.
Unfortunately, this limitation makes it impossible to protect an entire
subtree from another without forcing the user to make explicit protection
allocations all the way to the leaf cgroups - something that is highly
undesirable in real life scenarios.
Consider memory in a data center host. At the cgroup top level, we have a
distinction between system management software and the actual workload the
system is executing. Both branches are further subdivided into individual
services, job components etc.
We want to protect the workload as a whole from the system management
software, but that doesn't mean we want to protect and prioritize
individual workload wrt each other. Their memory demand can vary over
time, and we'd want the VM to simply cache the hottest data within the
workload subtree. Yet, the current memory.low limitations force us to
allocate a fixed amount of protection to each workload component in order
to get protection from system management software in general. This
results in very inefficient resource distribution.
Another concern with mandating downward allocation is that, as the
complexity of the cgroup tree grows, it gets harder for the lower levels
to be informed about decisions made at the host-level. Consider a
container inside a namespace that in turn creates its own nested tree of
cgroups to run multiple workloads. It'd be extremely difficult to
configure memory.low parameters in those leaf cgroups that on one hand
balance pressure among siblings as the container desires, while also
reflecting the host-level protection from e.g. rpm upgrades, that lie
beyond one or more delegation and namespacing points in the tree.
It's highly unusual from a cgroup interface POV that nested levels have to
be aware of and reflect decisions made at higher levels for them to be
effective.
To enable such use cases and scale configurability for complex trees, this
patch implements a resource inheritance model for memory that is similar
to how the CPU and the IO controller implement work-conserving resource
allocations: a share of a resource allocated to a subree always applies to
the entire subtree recursively, while allowing, but not mandating,
children to further specify distribution rules.
That means that if protection is explicitly allocated among siblings,
those configured shares are being followed during page reclaim just like
they are now. However, if the memory.low set at a higher level is not
fully claimed by the children in that subtree, the "floating" remainder is
applied to each cgroup in the tree in proportion to its size. Since
reclaim pressure is applied in proportion to size as well, each child in
that tree gets the same boost, and the effect is neutral among siblings -
with respect to each other, they behave as if no memory control was
enabled at all, and the VM simply balances the memory demands optimally
within the subtree. But collectively those cgroups enjoy a boost over the
cgroups in neighboring trees.
E.g. a leaf cgroup with a memory.low setting of 0 no longer means that
it's not getting a share of the hierarchically assigned resource, just
that it doesn't claim a fixed amount of it to protect from its siblings.
This allows us to recursively protect one subtree (workload) from another
(system management), while letting subgroups compete freely among each
other - without having to assign fixed shares to each leaf, and without
nested groups having to echo higher-level settings.
The floating protection composes naturally with fixed protection.
Consider the following example tree:
A A: low = 2G
/ \ A1: low = 1G
A1 A2 A2: low = 0G
As outside pressure is applied to this tree, A1 will enjoy a fixed
protection from A2 of 1G, but the remaining, unclaimed 1G from A is split
evenly among A1 and A2, coming out to 1.5G and 0.5G.
There is a slight risk of regressing theoretical setups where the
top-level cgroups don't know about the true budgeting and set bogusly high
"bypass" values that are meaningfully allocated down the tree. Such
setups would rely on unclaimed protection to be discarded, and
distributing it would change the intended behavior. Be safe and hide the
new behavior behind a mount option, 'memory_recursiveprot'.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Chris Down <chris@chrisdown.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Link: http://lkml.kernel.org/r/20200227195606.46212-4-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-02 11:07:07 +07:00
|
|
|
unsigned long unclaimed;
|
|
|
|
|
|
|
|
unclaimed = parent_effective - siblings_protected;
|
|
|
|
unclaimed *= usage - protected;
|
|
|
|
unclaimed /= parent_usage - siblings_protected;
|
|
|
|
|
|
|
|
ep += unclaimed;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ep;
|
2020-04-02 11:07:03 +07:00
|
|
|
}
|
|
|
|
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
/**
|
2018-06-08 07:07:46 +07:00
|
|
|
* mem_cgroup_protected - check if memory consumption is in the normal range
|
mm/memcontrol: exclude @root from checks in mem_cgroup_low
Make @root exclusive in mem_cgroup_low; it is never considered low when
looked at directly and is not checked when traversing the tree. In
effect, @root is handled identically to how root_mem_cgroup was
previously handled by mem_cgroup_low.
If @root is not excluded from the checks, a cgroup underneath @root will
never be considered low during targeted reclaim of @root, e.g. due to
memory.current > memory.high, unless @root is misconfigured to have
memory.low > memory.high.
Excluding @root enables using memory.low to prioritize memory usage
between cgroups within a subtree of the hierarchy that is limited by
memory.high or memory.max, e.g. when ROOT owns @root's controls but
delegates the @root directory to a USER so that USER can create and
administer children of @root.
For example, given cgroup A with children B and C:
A
/ \
B C
and
1. A/memory.current > A/memory.high
2. A/B/memory.current < A/B/memory.low
3. A/C/memory.current >= A/C/memory.low
As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
should reclaim from 'C' until 'A' is no longer high or until we can no
longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by
mem_cgroup_low when reclaming from 'A', then 'B' won't be considered low
and we will reclaim indiscriminately from both 'B' and 'C'.
Here is the test I used to confirm the bug and the patch.
20:00:55@sjchrist-vm ? ~ $ cat ~/.bin/memcg_low_test
#!/bin/bash
x62mb=$((62<<20))
x66mb=$((66<<20))
x94mb=$((94<<20))
x98mb=$((98<<20))
setup() {
set -e
if [[ -n $DEBUG ]]; then
set -x
fi
trap teardown EXIT HUP INT TERM
if [[ ! -e /mnt/1gb.swap ]]; then
sudo fallocate -l 1G /mnt/1gb.swap > /dev/null
sudo mkswap /mnt/1gb.swap > /dev/null
fi
if ! swapon --show=NAME | grep -q "/mnt/1gb.swap"; then
sudo swapon /mnt/1gb.swap
fi
if [[ ! -e /cgroup/cgroup.controllers ]]; then
sudo mount -t cgroup2 none /cgroup
fi
grep -q memory /cgroup/cgroup.controllers
sudo sh -c "echo '+memory' > /cgroup/cgroup.subtree_control"
sudo mkdir /cgroup/A && sudo chown $USER:$USER /cgroup/A
sudo sh -c "echo '+memory' > /cgroup/A/cgroup.subtree_control"
sudo sh -c "echo '96m' > /cgroup/A/memory.high"
mkdir /cgroup/A/0
mkdir /cgroup/A/1
echo 64m > /cgroup/A/0/memory.low
}
teardown() {
set +e
trap - EXIT HUP INT TERM
if [[ -z $1 ]]; then
printf "\n"
printf "%0.s*" {1..35}
printf "\nFAILED!\n\n"
tail /cgroup/A/**/memory.current
printf "%0.s*" {1..35}
printf "\n\n"
fi
ps | grep stress | tr -s ' ' | cut -f 2 -d ' ' | xargs -I % kill %
sleep 2
if [[ -e /cgroup/A/0 ]]; then
rmdir /cgroup/A/0
fi
if [[ -e /cgroup/A/1 ]]; then
rmdir /cgroup/A/1
fi
if [[ -e /cgroup/A ]]; then
sudo rmdir /cgroup/A
fi
}
stress_test() {
sudo sh -c "echo $$ > /cgroup/A/$1/cgroup.procs"
stress --vm 1 --vm-bytes 64M --vm-keep > /dev/null &
sudo sh -c "echo $$ > /cgroup/A/$2/cgroup.procs"
stress --vm 1 --vm-bytes 64M --vm-keep > /dev/null &
sudo sh -c "echo $$ > /cgroup/cgroup.procs"
sleep 1
# A/0 should be consuming more memory than A/1
[[ $(cat /cgroup/A/0/memory.current) -ge $(cat /cgroup/A/1/memory.current) ]]
# A/0 should be consuming ~64mb
[[ $(cat /cgroup/A/0/memory.current) -ge $x62mb ]] && [[ $(cat /cgroup/A/0/memory.current) -le $x66mb ]]
# A should cumulatively be consuming ~96mb
[[ $(cat /cgroup/A/memory.current) -ge $x94mb ]] && [[ $(cat /cgroup/A/memory.current) -le $x98mb ]]
# Stop the stressors
ps | grep stress | tr -s ' ' | cut -f 2 -d ' ' | xargs -I % kill %
}
teardown 1
setup
for ((i=1;i<=$1;i++)); do
printf "ITERATION $i of $1 - stress_test 0 1"
stress_test 0 1
printf "\x1b[2K\r"
printf "ITERATION $i of $1 - stress_test 1 0"
stress_test 1 0
printf "\x1b[2K\r"
printf "ITERATION $i of $1 - PASSED\n"
done
teardown 1
echo PASSED!
20:11:26@sjchrist-vm ? ~ $ memcg_low_test 10
Link: http://lkml.kernel.org/r/1496434412-21005-1-git-send-email-sean.j.christopherson@intel.com
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-11 05:48:05 +07:00
|
|
|
* @root: the top ancestor of the sub-tree being checked
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
* @memcg: the memory cgroup to check
|
|
|
|
*
|
mm: memory.low hierarchical behavior
This patch aims to address an issue in current memory.low semantics,
which makes it hard to use it in a hierarchy, where some leaf memory
cgroups are more valuable than others.
For example, there are memcgs A, A/B, A/C, A/D and A/E:
A A/memory.low = 2G, A/memory.current = 6G
//\\
BC DE B/memory.low = 3G B/memory.current = 2G
C/memory.low = 1G C/memory.current = 2G
D/memory.low = 0 D/memory.current = 2G
E/memory.low = 10G E/memory.current = 0
If we apply memory pressure, B, C and D are reclaimed at the same pace
while A's usage exceeds 2G. This is obviously wrong, as B's usage is
fully below B's memory.low, and C has 1G of protection as well. Also, A
is pushed to the size, which is less than A's 2G memory.low, which is
also wrong.
A simple bash script (provided below) can be used to reproduce
the problem. Current results are:
A: 1430097920
A/B: 711929856
A/C: 717426688
A/D: 741376
A/E: 0
To address the issue a concept of effective memory.low is introduced.
Effective memory.low is always equal or less than original memory.low.
In a case, when there is no memory.low overcommittment (and also for
top-level cgroups), these two values are equal.
Otherwise it's a part of parent's effective memory.low, calculated as a
cgroup's memory.low usage divided by sum of sibling's memory.low usages
(under memory.low usage I mean the size of actually protected memory:
memory.current if memory.current < memory.low, 0 otherwise). It's
necessary to track the actual usage, because otherwise an empty cgroup
with memory.low set (A/E in my example) will affect actual memory
distribution, which makes no sense. To avoid traversing the cgroup tree
twice, page_counters code is reused.
Calculating effective memory.low can be done in the reclaim path, as we
conveniently traversing the cgroup tree from top to bottom and check
memory.low on each level. So, it's a perfect place to calculate
effective memory low and save it to use it for children cgroups.
This also eliminates a need to traverse the cgroup tree from bottom to
top each time to check if parent's guarantee is not exceeded.
Setting/resetting effective memory.low is intentionally racy, but it's
fine and shouldn't lead to any significant differences in actual memory
distribution.
With this patch applied results are matching the expectations:
A: 2147930112
A/B: 1428721664
A/C: 718393344
A/D: 815104
A/E: 0
Test script:
#!/bin/bash
CGPATH="/sys/fs/cgroup"
truncate /file1 --size 2G
truncate /file2 --size 2G
truncate /file3 --size 2G
truncate /file4 --size 50G
mkdir "${CGPATH}/A"
echo "+memory" > "${CGPATH}/A/cgroup.subtree_control"
mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
echo 2G > "${CGPATH}/A/memory.low"
echo 3G > "${CGPATH}/A/B/memory.low"
echo 1G > "${CGPATH}/A/C/memory.low"
echo 0 > "${CGPATH}/A/D/memory.low"
echo 10G > "${CGPATH}/A/E/memory.low"
echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1
echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2
echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3
echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4
echo "A: " `cat "${CGPATH}/A/memory.current"`
echo "A/B: " `cat "${CGPATH}/A/B/memory.current"`
echo "A/C: " `cat "${CGPATH}/A/C/memory.current"`
echo "A/D: " `cat "${CGPATH}/A/D/memory.current"`
echo "A/E: " `cat "${CGPATH}/A/E/memory.current"`
rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
rmdir "${CGPATH}/A"
rm /file1 /file2 /file3 /file4
Link: http://lkml.kernel.org/r/20180405185921.4942-2-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-06-08 07:06:22 +07:00
|
|
|
* WARNING: This function is not stateless! It can only be used as part
|
|
|
|
* of a top-down tree iteration, not for isolated queries.
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
*/
|
2020-08-07 13:22:05 +07:00
|
|
|
void mem_cgroup_calculate_protection(struct mem_cgroup *root,
|
|
|
|
struct mem_cgroup *memcg)
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
{
|
mm: memcontrol: recursive memory.low protection
Right now, the effective protection of any given cgroup is capped by its
own explicit memory.low setting, regardless of what the parent says. The
reasons for this are mostly historical and ease of implementation: to make
delegation of memory.low safe, effective protection is the min() of all
memory.low up the tree.
Unfortunately, this limitation makes it impossible to protect an entire
subtree from another without forcing the user to make explicit protection
allocations all the way to the leaf cgroups - something that is highly
undesirable in real life scenarios.
Consider memory in a data center host. At the cgroup top level, we have a
distinction between system management software and the actual workload the
system is executing. Both branches are further subdivided into individual
services, job components etc.
We want to protect the workload as a whole from the system management
software, but that doesn't mean we want to protect and prioritize
individual workload wrt each other. Their memory demand can vary over
time, and we'd want the VM to simply cache the hottest data within the
workload subtree. Yet, the current memory.low limitations force us to
allocate a fixed amount of protection to each workload component in order
to get protection from system management software in general. This
results in very inefficient resource distribution.
Another concern with mandating downward allocation is that, as the
complexity of the cgroup tree grows, it gets harder for the lower levels
to be informed about decisions made at the host-level. Consider a
container inside a namespace that in turn creates its own nested tree of
cgroups to run multiple workloads. It'd be extremely difficult to
configure memory.low parameters in those leaf cgroups that on one hand
balance pressure among siblings as the container desires, while also
reflecting the host-level protection from e.g. rpm upgrades, that lie
beyond one or more delegation and namespacing points in the tree.
It's highly unusual from a cgroup interface POV that nested levels have to
be aware of and reflect decisions made at higher levels for them to be
effective.
To enable such use cases and scale configurability for complex trees, this
patch implements a resource inheritance model for memory that is similar
to how the CPU and the IO controller implement work-conserving resource
allocations: a share of a resource allocated to a subree always applies to
the entire subtree recursively, while allowing, but not mandating,
children to further specify distribution rules.
That means that if protection is explicitly allocated among siblings,
those configured shares are being followed during page reclaim just like
they are now. However, if the memory.low set at a higher level is not
fully claimed by the children in that subtree, the "floating" remainder is
applied to each cgroup in the tree in proportion to its size. Since
reclaim pressure is applied in proportion to size as well, each child in
that tree gets the same boost, and the effect is neutral among siblings -
with respect to each other, they behave as if no memory control was
enabled at all, and the VM simply balances the memory demands optimally
within the subtree. But collectively those cgroups enjoy a boost over the
cgroups in neighboring trees.
E.g. a leaf cgroup with a memory.low setting of 0 no longer means that
it's not getting a share of the hierarchically assigned resource, just
that it doesn't claim a fixed amount of it to protect from its siblings.
This allows us to recursively protect one subtree (workload) from another
(system management), while letting subgroups compete freely among each
other - without having to assign fixed shares to each leaf, and without
nested groups having to echo higher-level settings.
The floating protection composes naturally with fixed protection.
Consider the following example tree:
A A: low = 2G
/ \ A1: low = 1G
A1 A2 A2: low = 0G
As outside pressure is applied to this tree, A1 will enjoy a fixed
protection from A2 of 1G, but the remaining, unclaimed 1G from A is split
evenly among A1 and A2, coming out to 1.5G and 0.5G.
There is a slight risk of regressing theoretical setups where the
top-level cgroups don't know about the true budgeting and set bogusly high
"bypass" values that are meaningfully allocated down the tree. Such
setups would rely on unclaimed protection to be discarded, and
distributing it would change the intended behavior. Be safe and hide the
new behavior behind a mount option, 'memory_recursiveprot'.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Chris Down <chris@chrisdown.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Link: http://lkml.kernel.org/r/20200227195606.46212-4-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-02 11:07:07 +07:00
|
|
|
unsigned long usage, parent_usage;
|
mm: memory.low hierarchical behavior
This patch aims to address an issue in current memory.low semantics,
which makes it hard to use it in a hierarchy, where some leaf memory
cgroups are more valuable than others.
For example, there are memcgs A, A/B, A/C, A/D and A/E:
A A/memory.low = 2G, A/memory.current = 6G
//\\
BC DE B/memory.low = 3G B/memory.current = 2G
C/memory.low = 1G C/memory.current = 2G
D/memory.low = 0 D/memory.current = 2G
E/memory.low = 10G E/memory.current = 0
If we apply memory pressure, B, C and D are reclaimed at the same pace
while A's usage exceeds 2G. This is obviously wrong, as B's usage is
fully below B's memory.low, and C has 1G of protection as well. Also, A
is pushed to the size, which is less than A's 2G memory.low, which is
also wrong.
A simple bash script (provided below) can be used to reproduce
the problem. Current results are:
A: 1430097920
A/B: 711929856
A/C: 717426688
A/D: 741376
A/E: 0
To address the issue a concept of effective memory.low is introduced.
Effective memory.low is always equal or less than original memory.low.
In a case, when there is no memory.low overcommittment (and also for
top-level cgroups), these two values are equal.
Otherwise it's a part of parent's effective memory.low, calculated as a
cgroup's memory.low usage divided by sum of sibling's memory.low usages
(under memory.low usage I mean the size of actually protected memory:
memory.current if memory.current < memory.low, 0 otherwise). It's
necessary to track the actual usage, because otherwise an empty cgroup
with memory.low set (A/E in my example) will affect actual memory
distribution, which makes no sense. To avoid traversing the cgroup tree
twice, page_counters code is reused.
Calculating effective memory.low can be done in the reclaim path, as we
conveniently traversing the cgroup tree from top to bottom and check
memory.low on each level. So, it's a perfect place to calculate
effective memory low and save it to use it for children cgroups.
This also eliminates a need to traverse the cgroup tree from bottom to
top each time to check if parent's guarantee is not exceeded.
Setting/resetting effective memory.low is intentionally racy, but it's
fine and shouldn't lead to any significant differences in actual memory
distribution.
With this patch applied results are matching the expectations:
A: 2147930112
A/B: 1428721664
A/C: 718393344
A/D: 815104
A/E: 0
Test script:
#!/bin/bash
CGPATH="/sys/fs/cgroup"
truncate /file1 --size 2G
truncate /file2 --size 2G
truncate /file3 --size 2G
truncate /file4 --size 50G
mkdir "${CGPATH}/A"
echo "+memory" > "${CGPATH}/A/cgroup.subtree_control"
mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
echo 2G > "${CGPATH}/A/memory.low"
echo 3G > "${CGPATH}/A/B/memory.low"
echo 1G > "${CGPATH}/A/C/memory.low"
echo 0 > "${CGPATH}/A/D/memory.low"
echo 10G > "${CGPATH}/A/E/memory.low"
echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1
echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2
echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3
echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4
echo "A: " `cat "${CGPATH}/A/memory.current"`
echo "A/B: " `cat "${CGPATH}/A/B/memory.current"`
echo "A/C: " `cat "${CGPATH}/A/C/memory.current"`
echo "A/D: " `cat "${CGPATH}/A/D/memory.current"`
echo "A/E: " `cat "${CGPATH}/A/E/memory.current"`
rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
rmdir "${CGPATH}/A"
rm /file1 /file2 /file3 /file4
Link: http://lkml.kernel.org/r/20180405185921.4942-2-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-06-08 07:06:22 +07:00
|
|
|
struct mem_cgroup *parent;
|
|
|
|
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
if (mem_cgroup_disabled())
|
2020-08-07 13:22:05 +07:00
|
|
|
return;
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
|
mm/memcontrol: exclude @root from checks in mem_cgroup_low
Make @root exclusive in mem_cgroup_low; it is never considered low when
looked at directly and is not checked when traversing the tree. In
effect, @root is handled identically to how root_mem_cgroup was
previously handled by mem_cgroup_low.
If @root is not excluded from the checks, a cgroup underneath @root will
never be considered low during targeted reclaim of @root, e.g. due to
memory.current > memory.high, unless @root is misconfigured to have
memory.low > memory.high.
Excluding @root enables using memory.low to prioritize memory usage
between cgroups within a subtree of the hierarchy that is limited by
memory.high or memory.max, e.g. when ROOT owns @root's controls but
delegates the @root directory to a USER so that USER can create and
administer children of @root.
For example, given cgroup A with children B and C:
A
/ \
B C
and
1. A/memory.current > A/memory.high
2. A/B/memory.current < A/B/memory.low
3. A/C/memory.current >= A/C/memory.low
As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
should reclaim from 'C' until 'A' is no longer high or until we can no
longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by
mem_cgroup_low when reclaming from 'A', then 'B' won't be considered low
and we will reclaim indiscriminately from both 'B' and 'C'.
Here is the test I used to confirm the bug and the patch.
20:00:55@sjchrist-vm ? ~ $ cat ~/.bin/memcg_low_test
#!/bin/bash
x62mb=$((62<<20))
x66mb=$((66<<20))
x94mb=$((94<<20))
x98mb=$((98<<20))
setup() {
set -e
if [[ -n $DEBUG ]]; then
set -x
fi
trap teardown EXIT HUP INT TERM
if [[ ! -e /mnt/1gb.swap ]]; then
sudo fallocate -l 1G /mnt/1gb.swap > /dev/null
sudo mkswap /mnt/1gb.swap > /dev/null
fi
if ! swapon --show=NAME | grep -q "/mnt/1gb.swap"; then
sudo swapon /mnt/1gb.swap
fi
if [[ ! -e /cgroup/cgroup.controllers ]]; then
sudo mount -t cgroup2 none /cgroup
fi
grep -q memory /cgroup/cgroup.controllers
sudo sh -c "echo '+memory' > /cgroup/cgroup.subtree_control"
sudo mkdir /cgroup/A && sudo chown $USER:$USER /cgroup/A
sudo sh -c "echo '+memory' > /cgroup/A/cgroup.subtree_control"
sudo sh -c "echo '96m' > /cgroup/A/memory.high"
mkdir /cgroup/A/0
mkdir /cgroup/A/1
echo 64m > /cgroup/A/0/memory.low
}
teardown() {
set +e
trap - EXIT HUP INT TERM
if [[ -z $1 ]]; then
printf "\n"
printf "%0.s*" {1..35}
printf "\nFAILED!\n\n"
tail /cgroup/A/**/memory.current
printf "%0.s*" {1..35}
printf "\n\n"
fi
ps | grep stress | tr -s ' ' | cut -f 2 -d ' ' | xargs -I % kill %
sleep 2
if [[ -e /cgroup/A/0 ]]; then
rmdir /cgroup/A/0
fi
if [[ -e /cgroup/A/1 ]]; then
rmdir /cgroup/A/1
fi
if [[ -e /cgroup/A ]]; then
sudo rmdir /cgroup/A
fi
}
stress_test() {
sudo sh -c "echo $$ > /cgroup/A/$1/cgroup.procs"
stress --vm 1 --vm-bytes 64M --vm-keep > /dev/null &
sudo sh -c "echo $$ > /cgroup/A/$2/cgroup.procs"
stress --vm 1 --vm-bytes 64M --vm-keep > /dev/null &
sudo sh -c "echo $$ > /cgroup/cgroup.procs"
sleep 1
# A/0 should be consuming more memory than A/1
[[ $(cat /cgroup/A/0/memory.current) -ge $(cat /cgroup/A/1/memory.current) ]]
# A/0 should be consuming ~64mb
[[ $(cat /cgroup/A/0/memory.current) -ge $x62mb ]] && [[ $(cat /cgroup/A/0/memory.current) -le $x66mb ]]
# A should cumulatively be consuming ~96mb
[[ $(cat /cgroup/A/memory.current) -ge $x94mb ]] && [[ $(cat /cgroup/A/memory.current) -le $x98mb ]]
# Stop the stressors
ps | grep stress | tr -s ' ' | cut -f 2 -d ' ' | xargs -I % kill %
}
teardown 1
setup
for ((i=1;i<=$1;i++)); do
printf "ITERATION $i of $1 - stress_test 0 1"
stress_test 0 1
printf "\x1b[2K\r"
printf "ITERATION $i of $1 - stress_test 1 0"
stress_test 1 0
printf "\x1b[2K\r"
printf "ITERATION $i of $1 - PASSED\n"
done
teardown 1
echo PASSED!
20:11:26@sjchrist-vm ? ~ $ memcg_low_test 10
Link: http://lkml.kernel.org/r/1496434412-21005-1-git-send-email-sean.j.christopherson@intel.com
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-11 05:48:05 +07:00
|
|
|
if (!root)
|
|
|
|
root = root_mem_cgroup;
|
mm, memcg: avoid stale protection values when cgroup is above protection
Patch series "mm, memcg: memory.{low,min} reclaim fix & cleanup", v4.
This series contains a fix for a edge case in my earlier protection
calculation patches, and a patch to make the area overall a little more
robust to hopefully help avoid this in future.
This patch (of 2):
A cgroup can have both memory protection and a memory limit to isolate it
from its siblings in both directions - for example, to prevent it from
being shrunk below 2G under high pressure from outside, but also from
growing beyond 4G under low pressure.
Commit 9783aa9917f8 ("mm, memcg: proportional memory.{low,min} reclaim")
implemented proportional scan pressure so that multiple siblings in excess
of their protection settings don't get reclaimed equally but instead in
accordance to their unprotected portion.
During limit reclaim, this proportionality shouldn't apply of course:
there is no competition, all pressure is from within the cgroup and should
be applied as such. Reclaim should operate at full efficiency.
However, mem_cgroup_protected() never expected anybody to look at the
effective protection values when it indicated that the cgroup is above its
protection. As a result, a query during limit reclaim may return stale
protection values that were calculated by a previous reclaim cycle in
which the cgroup did have siblings.
When this happens, reclaim is unnecessarily hesitant and potentially slow
to meet the desired limit. In theory this could lead to premature OOM
kills, although it's not obvious this has occurred in practice.
Workaround the problem by special casing reclaim roots in
mem_cgroup_protection. These memcgs are never participating in the
reclaim protection because the reclaim is internal.
We have to ignore effective protection values for reclaim roots because
mem_cgroup_protected might be called from racing reclaim contexts with
different roots. Calculation is relying on root -> leaf tree traversal
therefore top-down reclaim protection invariants should hold. The only
exception is the reclaim root which should have effective protection set
to 0 but that would be problematic for the following setup:
Let's have global and A's reclaim in parallel:
|
A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
|\
| C (low = 1G, usage = 2.5G)
B (low = 1G, usage = 0.5G)
for A reclaim we have
B.elow = B.low
C.elow = C.low
For the global reclaim
A.elow = A.low
B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
C.elow = min(C.usage, C.low)
With the effective values resetting we have A reclaim
A.elow = 0
B.elow = B.low
C.elow = C.low
and global reclaim could see the above and then
B.elow = C.elow = 0 because children_low_usage > A.elow
Which means that protected memcgs would get reclaimed.
In future we would like to make mem_cgroup_protected more robust against
racing reclaim contexts but that is likely more complex solution than this
simple workaround.
[hannes@cmpxchg.org - large part of the changelog]
[mhocko@suse.com - workaround explanation]
[chris@chrisdown.name - retitle]
Fixes: 9783aa9917f8 ("mm, memcg: proportional memory.{low,min} reclaim")
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Chris Down <chris@chrisdown.name>
Acked-by: Roman Gushchin <guro@fb.com>
Link: http://lkml.kernel.org/r/cover.1594638158.git.chris@chrisdown.name
Link: http://lkml.kernel.org/r/044fb8ecffd001c7905d27c0c2ad998069fdc396.1594638158.git.chris@chrisdown.name
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 13:22:01 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Effective values of the reclaim targets are ignored so they
|
|
|
|
* can be stale. Have a look at mem_cgroup_protection for more
|
|
|
|
* details.
|
|
|
|
* TODO: calculation should be more robust so that we do not need
|
|
|
|
* that special casing.
|
|
|
|
*/
|
mm/memcontrol: exclude @root from checks in mem_cgroup_low
Make @root exclusive in mem_cgroup_low; it is never considered low when
looked at directly and is not checked when traversing the tree. In
effect, @root is handled identically to how root_mem_cgroup was
previously handled by mem_cgroup_low.
If @root is not excluded from the checks, a cgroup underneath @root will
never be considered low during targeted reclaim of @root, e.g. due to
memory.current > memory.high, unless @root is misconfigured to have
memory.low > memory.high.
Excluding @root enables using memory.low to prioritize memory usage
between cgroups within a subtree of the hierarchy that is limited by
memory.high or memory.max, e.g. when ROOT owns @root's controls but
delegates the @root directory to a USER so that USER can create and
administer children of @root.
For example, given cgroup A with children B and C:
A
/ \
B C
and
1. A/memory.current > A/memory.high
2. A/B/memory.current < A/B/memory.low
3. A/C/memory.current >= A/C/memory.low
As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
should reclaim from 'C' until 'A' is no longer high or until we can no
longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by
mem_cgroup_low when reclaming from 'A', then 'B' won't be considered low
and we will reclaim indiscriminately from both 'B' and 'C'.
Here is the test I used to confirm the bug and the patch.
20:00:55@sjchrist-vm ? ~ $ cat ~/.bin/memcg_low_test
#!/bin/bash
x62mb=$((62<<20))
x66mb=$((66<<20))
x94mb=$((94<<20))
x98mb=$((98<<20))
setup() {
set -e
if [[ -n $DEBUG ]]; then
set -x
fi
trap teardown EXIT HUP INT TERM
if [[ ! -e /mnt/1gb.swap ]]; then
sudo fallocate -l 1G /mnt/1gb.swap > /dev/null
sudo mkswap /mnt/1gb.swap > /dev/null
fi
if ! swapon --show=NAME | grep -q "/mnt/1gb.swap"; then
sudo swapon /mnt/1gb.swap
fi
if [[ ! -e /cgroup/cgroup.controllers ]]; then
sudo mount -t cgroup2 none /cgroup
fi
grep -q memory /cgroup/cgroup.controllers
sudo sh -c "echo '+memory' > /cgroup/cgroup.subtree_control"
sudo mkdir /cgroup/A && sudo chown $USER:$USER /cgroup/A
sudo sh -c "echo '+memory' > /cgroup/A/cgroup.subtree_control"
sudo sh -c "echo '96m' > /cgroup/A/memory.high"
mkdir /cgroup/A/0
mkdir /cgroup/A/1
echo 64m > /cgroup/A/0/memory.low
}
teardown() {
set +e
trap - EXIT HUP INT TERM
if [[ -z $1 ]]; then
printf "\n"
printf "%0.s*" {1..35}
printf "\nFAILED!\n\n"
tail /cgroup/A/**/memory.current
printf "%0.s*" {1..35}
printf "\n\n"
fi
ps | grep stress | tr -s ' ' | cut -f 2 -d ' ' | xargs -I % kill %
sleep 2
if [[ -e /cgroup/A/0 ]]; then
rmdir /cgroup/A/0
fi
if [[ -e /cgroup/A/1 ]]; then
rmdir /cgroup/A/1
fi
if [[ -e /cgroup/A ]]; then
sudo rmdir /cgroup/A
fi
}
stress_test() {
sudo sh -c "echo $$ > /cgroup/A/$1/cgroup.procs"
stress --vm 1 --vm-bytes 64M --vm-keep > /dev/null &
sudo sh -c "echo $$ > /cgroup/A/$2/cgroup.procs"
stress --vm 1 --vm-bytes 64M --vm-keep > /dev/null &
sudo sh -c "echo $$ > /cgroup/cgroup.procs"
sleep 1
# A/0 should be consuming more memory than A/1
[[ $(cat /cgroup/A/0/memory.current) -ge $(cat /cgroup/A/1/memory.current) ]]
# A/0 should be consuming ~64mb
[[ $(cat /cgroup/A/0/memory.current) -ge $x62mb ]] && [[ $(cat /cgroup/A/0/memory.current) -le $x66mb ]]
# A should cumulatively be consuming ~96mb
[[ $(cat /cgroup/A/memory.current) -ge $x94mb ]] && [[ $(cat /cgroup/A/memory.current) -le $x98mb ]]
# Stop the stressors
ps | grep stress | tr -s ' ' | cut -f 2 -d ' ' | xargs -I % kill %
}
teardown 1
setup
for ((i=1;i<=$1;i++)); do
printf "ITERATION $i of $1 - stress_test 0 1"
stress_test 0 1
printf "\x1b[2K\r"
printf "ITERATION $i of $1 - stress_test 1 0"
stress_test 1 0
printf "\x1b[2K\r"
printf "ITERATION $i of $1 - PASSED\n"
done
teardown 1
echo PASSED!
20:11:26@sjchrist-vm ? ~ $ memcg_low_test 10
Link: http://lkml.kernel.org/r/1496434412-21005-1-git-send-email-sean.j.christopherson@intel.com
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-11 05:48:05 +07:00
|
|
|
if (memcg == root)
|
2020-08-07 13:22:05 +07:00
|
|
|
return;
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
|
mm: memory.low hierarchical behavior
This patch aims to address an issue in current memory.low semantics,
which makes it hard to use it in a hierarchy, where some leaf memory
cgroups are more valuable than others.
For example, there are memcgs A, A/B, A/C, A/D and A/E:
A A/memory.low = 2G, A/memory.current = 6G
//\\
BC DE B/memory.low = 3G B/memory.current = 2G
C/memory.low = 1G C/memory.current = 2G
D/memory.low = 0 D/memory.current = 2G
E/memory.low = 10G E/memory.current = 0
If we apply memory pressure, B, C and D are reclaimed at the same pace
while A's usage exceeds 2G. This is obviously wrong, as B's usage is
fully below B's memory.low, and C has 1G of protection as well. Also, A
is pushed to the size, which is less than A's 2G memory.low, which is
also wrong.
A simple bash script (provided below) can be used to reproduce
the problem. Current results are:
A: 1430097920
A/B: 711929856
A/C: 717426688
A/D: 741376
A/E: 0
To address the issue a concept of effective memory.low is introduced.
Effective memory.low is always equal or less than original memory.low.
In a case, when there is no memory.low overcommittment (and also for
top-level cgroups), these two values are equal.
Otherwise it's a part of parent's effective memory.low, calculated as a
cgroup's memory.low usage divided by sum of sibling's memory.low usages
(under memory.low usage I mean the size of actually protected memory:
memory.current if memory.current < memory.low, 0 otherwise). It's
necessary to track the actual usage, because otherwise an empty cgroup
with memory.low set (A/E in my example) will affect actual memory
distribution, which makes no sense. To avoid traversing the cgroup tree
twice, page_counters code is reused.
Calculating effective memory.low can be done in the reclaim path, as we
conveniently traversing the cgroup tree from top to bottom and check
memory.low on each level. So, it's a perfect place to calculate
effective memory low and save it to use it for children cgroups.
This also eliminates a need to traverse the cgroup tree from bottom to
top each time to check if parent's guarantee is not exceeded.
Setting/resetting effective memory.low is intentionally racy, but it's
fine and shouldn't lead to any significant differences in actual memory
distribution.
With this patch applied results are matching the expectations:
A: 2147930112
A/B: 1428721664
A/C: 718393344
A/D: 815104
A/E: 0
Test script:
#!/bin/bash
CGPATH="/sys/fs/cgroup"
truncate /file1 --size 2G
truncate /file2 --size 2G
truncate /file3 --size 2G
truncate /file4 --size 50G
mkdir "${CGPATH}/A"
echo "+memory" > "${CGPATH}/A/cgroup.subtree_control"
mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
echo 2G > "${CGPATH}/A/memory.low"
echo 3G > "${CGPATH}/A/B/memory.low"
echo 1G > "${CGPATH}/A/C/memory.low"
echo 0 > "${CGPATH}/A/D/memory.low"
echo 10G > "${CGPATH}/A/E/memory.low"
echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1
echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2
echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3
echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4
echo "A: " `cat "${CGPATH}/A/memory.current"`
echo "A/B: " `cat "${CGPATH}/A/B/memory.current"`
echo "A/C: " `cat "${CGPATH}/A/C/memory.current"`
echo "A/D: " `cat "${CGPATH}/A/D/memory.current"`
echo "A/E: " `cat "${CGPATH}/A/E/memory.current"`
rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
rmdir "${CGPATH}/A"
rm /file1 /file2 /file3 /file4
Link: http://lkml.kernel.org/r/20180405185921.4942-2-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-06-08 07:06:22 +07:00
|
|
|
usage = page_counter_read(&memcg->memory);
|
2018-06-08 07:07:46 +07:00
|
|
|
if (!usage)
|
2020-08-07 13:22:05 +07:00
|
|
|
return;
|
2018-06-08 07:07:46 +07:00
|
|
|
|
|
|
|
parent = parent_mem_cgroup(memcg);
|
2018-06-15 05:26:17 +07:00
|
|
|
/* No parent means a non-hierarchical mode on v1 memcg */
|
|
|
|
if (!parent)
|
2020-08-07 13:22:05 +07:00
|
|
|
return;
|
2018-06-15 05:26:17 +07:00
|
|
|
|
2020-04-02 11:07:03 +07:00
|
|
|
if (parent == root) {
|
2020-04-02 11:07:27 +07:00
|
|
|
memcg->memory.emin = READ_ONCE(memcg->memory.min);
|
2020-06-26 10:30:22 +07:00
|
|
|
memcg->memory.elow = READ_ONCE(memcg->memory.low);
|
2020-08-07 13:22:05 +07:00
|
|
|
return;
|
2018-06-08 07:07:46 +07:00
|
|
|
}
|
|
|
|
|
mm: memcontrol: recursive memory.low protection
Right now, the effective protection of any given cgroup is capped by its
own explicit memory.low setting, regardless of what the parent says. The
reasons for this are mostly historical and ease of implementation: to make
delegation of memory.low safe, effective protection is the min() of all
memory.low up the tree.
Unfortunately, this limitation makes it impossible to protect an entire
subtree from another without forcing the user to make explicit protection
allocations all the way to the leaf cgroups - something that is highly
undesirable in real life scenarios.
Consider memory in a data center host. At the cgroup top level, we have a
distinction between system management software and the actual workload the
system is executing. Both branches are further subdivided into individual
services, job components etc.
We want to protect the workload as a whole from the system management
software, but that doesn't mean we want to protect and prioritize
individual workload wrt each other. Their memory demand can vary over
time, and we'd want the VM to simply cache the hottest data within the
workload subtree. Yet, the current memory.low limitations force us to
allocate a fixed amount of protection to each workload component in order
to get protection from system management software in general. This
results in very inefficient resource distribution.
Another concern with mandating downward allocation is that, as the
complexity of the cgroup tree grows, it gets harder for the lower levels
to be informed about decisions made at the host-level. Consider a
container inside a namespace that in turn creates its own nested tree of
cgroups to run multiple workloads. It'd be extremely difficult to
configure memory.low parameters in those leaf cgroups that on one hand
balance pressure among siblings as the container desires, while also
reflecting the host-level protection from e.g. rpm upgrades, that lie
beyond one or more delegation and namespacing points in the tree.
It's highly unusual from a cgroup interface POV that nested levels have to
be aware of and reflect decisions made at higher levels for them to be
effective.
To enable such use cases and scale configurability for complex trees, this
patch implements a resource inheritance model for memory that is similar
to how the CPU and the IO controller implement work-conserving resource
allocations: a share of a resource allocated to a subree always applies to
the entire subtree recursively, while allowing, but not mandating,
children to further specify distribution rules.
That means that if protection is explicitly allocated among siblings,
those configured shares are being followed during page reclaim just like
they are now. However, if the memory.low set at a higher level is not
fully claimed by the children in that subtree, the "floating" remainder is
applied to each cgroup in the tree in proportion to its size. Since
reclaim pressure is applied in proportion to size as well, each child in
that tree gets the same boost, and the effect is neutral among siblings -
with respect to each other, they behave as if no memory control was
enabled at all, and the VM simply balances the memory demands optimally
within the subtree. But collectively those cgroups enjoy a boost over the
cgroups in neighboring trees.
E.g. a leaf cgroup with a memory.low setting of 0 no longer means that
it's not getting a share of the hierarchically assigned resource, just
that it doesn't claim a fixed amount of it to protect from its siblings.
This allows us to recursively protect one subtree (workload) from another
(system management), while letting subgroups compete freely among each
other - without having to assign fixed shares to each leaf, and without
nested groups having to echo higher-level settings.
The floating protection composes naturally with fixed protection.
Consider the following example tree:
A A: low = 2G
/ \ A1: low = 1G
A1 A2 A2: low = 0G
As outside pressure is applied to this tree, A1 will enjoy a fixed
protection from A2 of 1G, but the remaining, unclaimed 1G from A is split
evenly among A1 and A2, coming out to 1.5G and 0.5G.
There is a slight risk of regressing theoretical setups where the
top-level cgroups don't know about the true budgeting and set bogusly high
"bypass" values that are meaningfully allocated down the tree. Such
setups would rely on unclaimed protection to be discarded, and
distributing it would change the intended behavior. Be safe and hide the
new behavior behind a mount option, 'memory_recursiveprot'.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Chris Down <chris@chrisdown.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Link: http://lkml.kernel.org/r/20200227195606.46212-4-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-02 11:07:07 +07:00
|
|
|
parent_usage = page_counter_read(&parent->memory);
|
|
|
|
|
2020-04-02 11:07:33 +07:00
|
|
|
WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
|
2020-04-02 11:07:27 +07:00
|
|
|
READ_ONCE(memcg->memory.min),
|
|
|
|
READ_ONCE(parent->memory.emin),
|
2020-04-02 11:07:33 +07:00
|
|
|
atomic_long_read(&parent->memory.children_min_usage)));
|
mm: memory.low hierarchical behavior
This patch aims to address an issue in current memory.low semantics,
which makes it hard to use it in a hierarchy, where some leaf memory
cgroups are more valuable than others.
For example, there are memcgs A, A/B, A/C, A/D and A/E:
A A/memory.low = 2G, A/memory.current = 6G
//\\
BC DE B/memory.low = 3G B/memory.current = 2G
C/memory.low = 1G C/memory.current = 2G
D/memory.low = 0 D/memory.current = 2G
E/memory.low = 10G E/memory.current = 0
If we apply memory pressure, B, C and D are reclaimed at the same pace
while A's usage exceeds 2G. This is obviously wrong, as B's usage is
fully below B's memory.low, and C has 1G of protection as well. Also, A
is pushed to the size, which is less than A's 2G memory.low, which is
also wrong.
A simple bash script (provided below) can be used to reproduce
the problem. Current results are:
A: 1430097920
A/B: 711929856
A/C: 717426688
A/D: 741376
A/E: 0
To address the issue a concept of effective memory.low is introduced.
Effective memory.low is always equal or less than original memory.low.
In a case, when there is no memory.low overcommittment (and also for
top-level cgroups), these two values are equal.
Otherwise it's a part of parent's effective memory.low, calculated as a
cgroup's memory.low usage divided by sum of sibling's memory.low usages
(under memory.low usage I mean the size of actually protected memory:
memory.current if memory.current < memory.low, 0 otherwise). It's
necessary to track the actual usage, because otherwise an empty cgroup
with memory.low set (A/E in my example) will affect actual memory
distribution, which makes no sense. To avoid traversing the cgroup tree
twice, page_counters code is reused.
Calculating effective memory.low can be done in the reclaim path, as we
conveniently traversing the cgroup tree from top to bottom and check
memory.low on each level. So, it's a perfect place to calculate
effective memory low and save it to use it for children cgroups.
This also eliminates a need to traverse the cgroup tree from bottom to
top each time to check if parent's guarantee is not exceeded.
Setting/resetting effective memory.low is intentionally racy, but it's
fine and shouldn't lead to any significant differences in actual memory
distribution.
With this patch applied results are matching the expectations:
A: 2147930112
A/B: 1428721664
A/C: 718393344
A/D: 815104
A/E: 0
Test script:
#!/bin/bash
CGPATH="/sys/fs/cgroup"
truncate /file1 --size 2G
truncate /file2 --size 2G
truncate /file3 --size 2G
truncate /file4 --size 50G
mkdir "${CGPATH}/A"
echo "+memory" > "${CGPATH}/A/cgroup.subtree_control"
mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
echo 2G > "${CGPATH}/A/memory.low"
echo 3G > "${CGPATH}/A/B/memory.low"
echo 1G > "${CGPATH}/A/C/memory.low"
echo 0 > "${CGPATH}/A/D/memory.low"
echo 10G > "${CGPATH}/A/E/memory.low"
echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1
echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2
echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3
echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4
echo "A: " `cat "${CGPATH}/A/memory.current"`
echo "A/B: " `cat "${CGPATH}/A/B/memory.current"`
echo "A/C: " `cat "${CGPATH}/A/C/memory.current"`
echo "A/D: " `cat "${CGPATH}/A/D/memory.current"`
echo "A/E: " `cat "${CGPATH}/A/E/memory.current"`
rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
rmdir "${CGPATH}/A"
rm /file1 /file2 /file3 /file4
Link: http://lkml.kernel.org/r/20180405185921.4942-2-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-06-08 07:06:22 +07:00
|
|
|
|
2020-04-02 11:07:33 +07:00
|
|
|
WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
|
2020-06-26 10:30:22 +07:00
|
|
|
READ_ONCE(memcg->memory.low),
|
|
|
|
READ_ONCE(parent->memory.elow),
|
2020-04-02 11:07:33 +07:00
|
|
|
atomic_long_read(&parent->memory.children_low_usage)));
|
mm: memcontrol: default hierarchy interface for memory
Introduce the basic control files to account, partition, and limit
memory using cgroups in default hierarchy mode.
This interface versioning allows us to address fundamental design
issues in the existing memory cgroup interface, further explained
below. The old interface will be maintained indefinitely, but a
clearer model and improved workload performance should encourage
existing users to switch over to the new one eventually.
The control files are thus:
- memory.current shows the current consumption of the cgroup and its
descendants, in bytes.
- memory.low configures the lower end of the cgroup's expected
memory consumption range. The kernel considers memory below that
boundary to be a reserve - the minimum that the workload needs in
order to make forward progress - and generally avoids reclaiming
it, unless there is an imminent risk of entering an OOM situation.
- memory.high configures the upper end of the cgroup's expected
memory consumption range. A cgroup whose consumption grows beyond
this threshold is forced into direct reclaim, to work off the
excess and to throttle new allocations heavily, but is generally
allowed to continue and the OOM killer is not invoked.
- memory.max configures the hard maximum amount of memory that the
cgroup is allowed to consume before the OOM killer is invoked.
- memory.events shows event counters that indicate how often the
cgroup was reclaimed while below memory.low, how often it was
forced to reclaim excess beyond memory.high, how often it hit
memory.max, and how often it entered OOM due to memory.max. This
allows users to identify configuration problems when observing a
degradation in workload performance. An overcommitted system will
have an increased rate of low boundary breaches, whereas increased
rates of high limit breaches, maximum hits, or even OOM situations
will indicate internally overcommitted cgroups.
For existing users of memory cgroups, the following deviations from
the current interface are worth pointing out and explaining:
- The original lower boundary, the soft limit, is defined as a limit
that is per default unset. As a result, the set of cgroups that
global reclaim prefers is opt-in, rather than opt-out. The costs
for optimizing these mostly negative lookups are so high that the
implementation, despite its enormous size, does not even provide
the basic desirable behavior. First off, the soft limit has no
hierarchical meaning. All configured groups are organized in a
global rbtree and treated like equal peers, regardless where they
are located in the hierarchy. This makes subtree delegation
impossible. Second, the soft limit reclaim pass is so aggressive
that it not just introduces high allocation latencies into the
system, but also impacts system performance due to overreclaim, to
the point where the feature becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries, which makes delegation
of subtrees possible. Secondly, new cgroups have no reserve per
default and in the common case most cgroups are eligible for the
preferred reclaim pass. This allows the new low boundary to be
efficiently implemented with just a minor addition to the generic
reclaim code, without the need for out-of-band data structures and
reclaim passes. Because the generic reclaim code considers all
cgroups except for the ones running low in the preferred first
reclaim pass, overreclaim of individual groups is eliminated as
well, resulting in much better overall workload performance.
- The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
But this generally goes against the goal of making the most out of
the available memory. The memory consumption of workloads varies
during runtime, and that requires users to overcommit. But doing
that with a strict upper limit requires either a fairly accurate
prediction of the working set size or adding slack to the limit.
Since working set size estimation is hard and error prone, and
getting it wrong results in OOM kills, most users tend to err on
the side of a looser limit and end up wasting precious resources.
The memory.high boundary on the other hand can be set much more
conservatively. When hit, it throttles allocations by forcing
them into direct reclaim to work off the excess, but it never
invokes the OOM killer. As a result, a high boundary that is
chosen too aggressively will not terminate the processes, but
instead it will lead to gradual performance degradation. The user
can monitor this and make corrections until the minimal memory
footprint that still gives acceptable performance is found.
In extreme cases, with many concurrent allocations and a complete
breakdown of reclaim progress within the group, the high boundary
can be exceeded. But even then it's mostly better to satisfy the
allocation from the slack available in other groups or the rest of
the system than killing the group. Otherwise, memory.max is there
to limit this type of spillover and ultimately contain buggy or
even malicious applications.
- The original control file names are unwieldy and inconsistent in
many different ways. For example, the upper boundary hit count is
exported in the memory.failcnt file, but an OOM event count has to
be manually counted by listening to memory.oom_control events, and
lower boundary / soft limit events have to be counted by first
setting a threshold for that value and then counting those events.
Also, usage and limit files encode their units in the filename.
That makes the filenames very long, even though this is not
information that a user needs to be reminded of every time they
type out those names.
To address these naming issues, as well as to signal clearly that
the new interface carries a new configuration model, the naming
conventions in it necessarily differ from the old interface.
- The original limit files indicate the state of an unset limit with
a very high number, and a configured limit can be unset by echoing
-1 into those files. But that very high number is implementation
and architecture dependent and not very descriptive. And while -1
can be understood as an underflow into the highest possible value,
-2 or -10M etc. do not work, so it's not inconsistent.
memory.low, memory.high, and memory.max will use the string
"infinity" to indicate and set the highest possible value.
[akpm@linux-foundation.org: use seq_puts() for basic strings]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 06:26:06 +07:00
|
|
|
}
|
|
|
|
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
/**
|
2020-06-04 06:02:07 +07:00
|
|
|
* mem_cgroup_charge - charge a newly allocated page to a cgroup
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
* @page: page to charge
|
|
|
|
* @mm: mm context of the victim
|
|
|
|
* @gfp_mask: reclaim mode
|
|
|
|
*
|
|
|
|
* Try to charge @page to the memcg that @mm belongs to, reclaiming
|
|
|
|
* pages according to @gfp_mask if necessary.
|
|
|
|
*
|
2020-06-04 06:02:07 +07:00
|
|
|
* Returns 0 on success. Otherwise, an error code is returned.
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
*/
|
2020-06-04 06:02:24 +07:00
|
|
|
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
{
|
2020-08-15 07:30:37 +07:00
|
|
|
unsigned int nr_pages = thp_nr_pages(page);
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
struct mem_cgroup *memcg = NULL;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (PageSwapCache(page)) {
|
2020-06-04 06:02:14 +07:00
|
|
|
swp_entry_t ent = { .val = page_private(page), };
|
|
|
|
unsigned short id;
|
|
|
|
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
/*
|
|
|
|
* Every swap fault against a single page tries to charge the
|
|
|
|
* page, bail as early as possible. shmem_unuse() encounters
|
2020-06-04 06:02:11 +07:00
|
|
|
* already charged pages, too. page->mem_cgroup is protected
|
|
|
|
* by the page lock, which serializes swap cache removal, which
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
* in turn serializes uncharging.
|
|
|
|
*/
|
2015-09-10 05:35:35 +07:00
|
|
|
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
2017-09-07 06:22:41 +07:00
|
|
|
if (compound_head(page)->mem_cgroup)
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
goto out;
|
2015-09-10 05:35:35 +07:00
|
|
|
|
2020-06-04 06:02:14 +07:00
|
|
|
id = lookup_swap_cgroup_id(ent);
|
|
|
|
rcu_read_lock();
|
|
|
|
memcg = mem_cgroup_from_id(id);
|
|
|
|
if (memcg && !css_tryget_online(&memcg->css))
|
|
|
|
memcg = NULL;
|
|
|
|
rcu_read_unlock();
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!memcg)
|
|
|
|
memcg = get_mem_cgroup_from_mm(mm);
|
|
|
|
|
|
|
|
ret = try_charge(memcg, gfp_mask, nr_pages);
|
2020-06-04 06:02:07 +07:00
|
|
|
if (ret)
|
|
|
|
goto out_put;
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
|
2020-08-07 13:20:45 +07:00
|
|
|
css_get(&memcg->css);
|
2020-06-04 06:02:24 +07:00
|
|
|
commit_charge(page, memcg);
|
2014-08-09 04:19:33 +07:00
|
|
|
|
|
|
|
local_irq_disable();
|
2020-06-04 06:01:31 +07:00
|
|
|
mem_cgroup_charge_statistics(memcg, page, nr_pages);
|
2014-08-09 04:19:33 +07:00
|
|
|
memcg_check_events(memcg, page);
|
|
|
|
local_irq_enable();
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
|
2020-06-04 06:02:14 +07:00
|
|
|
if (PageSwapCache(page)) {
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
swp_entry_t entry = { .val = page_private(page) };
|
|
|
|
/*
|
|
|
|
* The swap entry might not get freed for a long time,
|
|
|
|
* let's not wait for it. The page already received a
|
|
|
|
* memory+swap charge, drop the swap entry duplicate.
|
|
|
|
*/
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
mem_cgroup_uncharge_swap(entry, nr_pages);
|
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:20 +07:00
|
|
|
}
|
|
|
|
|
2020-06-04 06:02:07 +07:00
|
|
|
out_put:
|
|
|
|
css_put(&memcg->css);
|
|
|
|
out:
|
|
|
|
return ret;
|
mm: memcontrol: convert page cache to a new mem_cgroup_charge() API
The try/commit/cancel protocol that memcg uses dates back to when pages
used to be uncharged upon removal from the page cache, and thus couldn't
be committed before the insertion had succeeded. Nowadays, pages are
uncharged when they are physically freed; it doesn't matter whether the
insertion was successful or not. For the page cache, the transaction
dance has become unnecessary.
Introduce a mem_cgroup_charge() function that simply charges a newly
allocated page to a cgroup and sets up page->mem_cgroup in one single
step. If the insertion fails, the caller doesn't have to do anything but
free/put the page.
Then switch the page cache over to this new API.
Subsequent patches will also convert anon pages, but it needs a bit more
prep work. Right now, memcg depends on page->mapping being already set up
at the time of charging, so that it can maintain its own MEMCG_CACHE and
MEMCG_RSS counters. For anon, page->mapping is set under the same pte
lock under which the page is publishd, so a single charge point that can
block doesn't work there just yet.
The following prep patches will replace the private memcg counters with
the generic vmstat counters, thus removing the page->mapping dependency,
then complete the transition to the new single-point charge API and delete
the old transactional scheme.
v2: leave shmem swapcache when charging fails to avoid double IO (Joonsoo)
v3: rebase on preceeding shmem simplification patch
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Link: http://lkml.kernel.org/r/20200508183105.225460-6-hannes@cmpxchg.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-04 06:01:41 +07:00
|
|
|
}
|
|
|
|
|
2017-09-09 06:11:50 +07:00
|
|
|
struct uncharge_gather {
|
|
|
|
struct mem_cgroup *memcg;
|
2020-06-04 06:01:44 +07:00
|
|
|
unsigned long nr_pages;
|
2017-09-09 06:11:50 +07:00
|
|
|
unsigned long pgpgout;
|
|
|
|
unsigned long nr_kmem;
|
|
|
|
struct page *dummy_page;
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline void uncharge_gather_clear(struct uncharge_gather *ug)
|
2014-08-09 04:19:24 +07:00
|
|
|
{
|
2017-09-09 06:11:50 +07:00
|
|
|
memset(ug, 0, sizeof(*ug));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void uncharge_batch(const struct uncharge_gather *ug)
|
|
|
|
{
|
2014-08-09 04:19:24 +07:00
|
|
|
unsigned long flags;
|
|
|
|
|
2017-09-09 06:11:50 +07:00
|
|
|
if (!mem_cgroup_is_root(ug->memcg)) {
|
2020-06-04 06:01:44 +07:00
|
|
|
page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
|
2016-01-15 06:21:23 +07:00
|
|
|
if (do_memsw_account())
|
2020-06-04 06:01:44 +07:00
|
|
|
page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
|
2017-09-09 06:11:50 +07:00
|
|
|
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
|
|
|
|
page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
|
|
|
|
memcg_oom_recover(ug->memcg);
|
2014-09-05 19:43:57 +07:00
|
|
|
}
|
2014-08-09 04:19:24 +07:00
|
|
|
|
|
|
|
local_irq_save(flags);
|
2018-02-01 07:16:37 +07:00
|
|
|
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
|
2020-06-04 06:01:44 +07:00
|
|
|
__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
|
2017-09-09 06:11:50 +07:00
|
|
|
memcg_check_events(ug->memcg, ug->dummy_page);
|
2014-08-09 04:19:24 +07:00
|
|
|
local_irq_restore(flags);
|
2020-09-05 06:35:24 +07:00
|
|
|
|
|
|
|
/* drop reference from uncharge_page */
|
|
|
|
css_put(&ug->memcg->css);
|
2017-09-09 06:11:50 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
|
|
|
|
{
|
2020-06-04 06:01:44 +07:00
|
|
|
unsigned long nr_pages;
|
|
|
|
|
2017-09-09 06:11:50 +07:00
|
|
|
VM_BUG_ON_PAGE(PageLRU(page), page);
|
|
|
|
|
|
|
|
if (!page->mem_cgroup)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Nobody should be changing or seriously looking at
|
|
|
|
* page->mem_cgroup at this point, we have fully
|
|
|
|
* exclusive access to the page.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (ug->memcg != page->mem_cgroup) {
|
|
|
|
if (ug->memcg) {
|
|
|
|
uncharge_batch(ug);
|
|
|
|
uncharge_gather_clear(ug);
|
|
|
|
}
|
|
|
|
ug->memcg = page->mem_cgroup;
|
2020-09-05 06:35:24 +07:00
|
|
|
|
|
|
|
/* pairs with css_put in uncharge_batch */
|
|
|
|
css_get(&ug->memcg->css);
|
2017-09-09 06:11:50 +07:00
|
|
|
}
|
|
|
|
|
2020-06-04 06:01:44 +07:00
|
|
|
nr_pages = compound_nr(page);
|
|
|
|
ug->nr_pages += nr_pages;
|
2017-09-09 06:11:50 +07:00
|
|
|
|
2020-06-04 06:01:44 +07:00
|
|
|
if (!PageKmemcg(page)) {
|
2017-09-09 06:11:50 +07:00
|
|
|
ug->pgpgout++;
|
|
|
|
} else {
|
2020-06-04 06:01:44 +07:00
|
|
|
ug->nr_kmem += nr_pages;
|
2017-09-09 06:11:50 +07:00
|
|
|
__ClearPageKmemcg(page);
|
|
|
|
}
|
|
|
|
|
|
|
|
ug->dummy_page = page;
|
|
|
|
page->mem_cgroup = NULL;
|
2020-08-07 13:20:45 +07:00
|
|
|
css_put(&ug->memcg->css);
|
2014-08-09 04:19:24 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void uncharge_list(struct list_head *page_list)
|
|
|
|
{
|
2017-09-09 06:11:50 +07:00
|
|
|
struct uncharge_gather ug;
|
2014-08-09 04:19:24 +07:00
|
|
|
struct list_head *next;
|
2017-09-09 06:11:50 +07:00
|
|
|
|
|
|
|
uncharge_gather_clear(&ug);
|
2014-08-09 04:19:24 +07:00
|
|
|
|
2016-03-18 04:20:31 +07:00
|
|
|
/*
|
|
|
|
* Note that the list can be a single page->lru; hence the
|
|
|
|
* do-while loop instead of a simple list_for_each_entry().
|
|
|
|
*/
|
2014-08-09 04:19:24 +07:00
|
|
|
next = page_list->next;
|
|
|
|
do {
|
2017-09-09 06:11:50 +07:00
|
|
|
struct page *page;
|
|
|
|
|
2014-08-09 04:19:24 +07:00
|
|
|
page = list_entry(next, struct page, lru);
|
|
|
|
next = page->lru.next;
|
|
|
|
|
2017-09-09 06:11:50 +07:00
|
|
|
uncharge_page(page, &ug);
|
2014-08-09 04:19:24 +07:00
|
|
|
} while (next != page_list);
|
|
|
|
|
2017-09-09 06:11:50 +07:00
|
|
|
if (ug.memcg)
|
|
|
|
uncharge_batch(&ug);
|
2014-08-09 04:19:24 +07:00
|
|
|
}
|
|
|
|
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_uncharge - uncharge a page
|
|
|
|
* @page: page to uncharge
|
|
|
|
*
|
2020-06-04 06:02:07 +07:00
|
|
|
* Uncharge a page previously charged with mem_cgroup_charge().
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
*/
|
|
|
|
void mem_cgroup_uncharge(struct page *page)
|
|
|
|
{
|
2017-09-09 06:11:50 +07:00
|
|
|
struct uncharge_gather ug;
|
|
|
|
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
|
2014-08-09 04:19:24 +07:00
|
|
|
/* Don't touch page->lru of any random page, pre-check: */
|
2014-12-11 06:44:52 +07:00
|
|
|
if (!page->mem_cgroup)
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
return;
|
|
|
|
|
2017-09-09 06:11:50 +07:00
|
|
|
uncharge_gather_clear(&ug);
|
|
|
|
uncharge_page(page, &ug);
|
|
|
|
uncharge_batch(&ug);
|
2014-08-09 04:19:24 +07:00
|
|
|
}
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
|
2014-08-09 04:19:24 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_uncharge_list - uncharge a list of page
|
|
|
|
* @page_list: list of pages to uncharge
|
|
|
|
*
|
|
|
|
* Uncharge a list of pages previously charged with
|
2020-06-04 06:02:07 +07:00
|
|
|
* mem_cgroup_charge().
|
2014-08-09 04:19:24 +07:00
|
|
|
*/
|
|
|
|
void mem_cgroup_uncharge_list(struct list_head *page_list)
|
|
|
|
{
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
|
2014-08-09 04:19:24 +07:00
|
|
|
if (!list_empty(page_list))
|
|
|
|
uncharge_list(page_list);
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2016-03-16 04:57:19 +07:00
|
|
|
* mem_cgroup_migrate - charge a page's replacement
|
|
|
|
* @oldpage: currently circulating page
|
|
|
|
* @newpage: replacement page
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
*
|
2016-03-16 04:57:19 +07:00
|
|
|
* Charge @newpage as a replacement page for @oldpage. @oldpage will
|
|
|
|
* be uncharged upon free.
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
*
|
|
|
|
* Both pages must be locked, @newpage->mapping must be set up.
|
|
|
|
*/
|
2016-03-16 04:57:19 +07:00
|
|
|
void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
{
|
2014-12-11 06:44:02 +07:00
|
|
|
struct mem_cgroup *memcg;
|
2016-01-21 06:03:16 +07:00
|
|
|
unsigned int nr_pages;
|
2016-06-25 04:49:54 +07:00
|
|
|
unsigned long flags;
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
|
|
|
|
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
|
|
|
|
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
|
|
|
|
VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
|
2014-08-09 04:19:33 +07:00
|
|
|
VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
|
|
|
|
newpage);
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Page cache replacement: new page already charged? */
|
2014-12-11 06:44:52 +07:00
|
|
|
if (newpage->mem_cgroup)
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
return;
|
|
|
|
|
2015-11-06 09:49:40 +07:00
|
|
|
/* Swapcache readahead pages can get replaced before being charged */
|
2014-12-11 06:44:52 +07:00
|
|
|
memcg = oldpage->mem_cgroup;
|
2014-12-11 06:44:02 +07:00
|
|
|
if (!memcg)
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
return;
|
|
|
|
|
2016-01-21 06:03:16 +07:00
|
|
|
/* Force-charge the new page. The old one will be freed soon */
|
2020-08-15 07:30:37 +07:00
|
|
|
nr_pages = thp_nr_pages(newpage);
|
2016-01-21 06:03:16 +07:00
|
|
|
|
|
|
|
page_counter_charge(&memcg->memory, nr_pages);
|
|
|
|
if (do_memsw_account())
|
|
|
|
page_counter_charge(&memcg->memsw, nr_pages);
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
|
2020-08-07 13:20:45 +07:00
|
|
|
css_get(&memcg->css);
|
2020-06-04 06:02:24 +07:00
|
|
|
commit_charge(newpage, memcg);
|
2016-01-21 06:03:16 +07:00
|
|
|
|
2016-06-25 04:49:54 +07:00
|
|
|
local_irq_save(flags);
|
2020-06-04 06:01:31 +07:00
|
|
|
mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
|
2016-01-21 06:03:16 +07:00
|
|
|
memcg_check_events(memcg, newpage);
|
2016-06-25 04:49:54 +07:00
|
|
|
local_irq_restore(flags);
|
mm: memcontrol: rewrite uncharge API
The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.
Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages. However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:
- Charging, uncharging, page migration, and charge migration all need
to take a per-page bit spinlock as they could race with uncharging.
- Swap cache truncation happens during both swap-in and swap-out, and
possibly repeatedly before the page is actually freed. This means
that the memcg swapout code is called from many contexts that make
no sense and it has to figure out the direction from page state to
make sure memory and memory+swap are always correctly charged.
- On page migration, the old page might be unmapped but then reused,
so memcg code has to prevent untimely uncharging in that case.
Because this code - which should be a simple charge transfer - is so
special-cased, it is not reusable for replace_page_cache().
But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped. Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge. The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.
mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache(). However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration. Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.
Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration. Remove the very costly page_cgroup
lock and set pc->flags non-atomically.
[mhocko@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavydov@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Tested-by: Jet Chen <jet.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 04:19:22 +07:00
|
|
|
}
|
|
|
|
|
2016-01-15 06:21:34 +07:00
|
|
|
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
|
2016-01-15 06:21:26 +07:00
|
|
|
EXPORT_SYMBOL(memcg_sockets_enabled_key);
|
|
|
|
|
2016-10-08 07:00:58 +07:00
|
|
|
void mem_cgroup_sk_alloc(struct sock *sk)
|
2016-01-15 06:21:26 +07:00
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
2016-10-08 07:00:58 +07:00
|
|
|
if (!mem_cgroup_sockets_enabled)
|
|
|
|
return;
|
|
|
|
|
2020-03-10 12:16:05 +07:00
|
|
|
/* Do not associate the sock with unrelated interrupted task's memcg. */
|
|
|
|
if (in_interrupt())
|
|
|
|
return;
|
|
|
|
|
2016-01-15 06:21:26 +07:00
|
|
|
rcu_read_lock();
|
|
|
|
memcg = mem_cgroup_from_task(current);
|
2016-01-15 06:21:29 +07:00
|
|
|
if (memcg == root_mem_cgroup)
|
|
|
|
goto out;
|
2016-01-21 06:02:50 +07:00
|
|
|
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
|
2016-01-15 06:21:29 +07:00
|
|
|
goto out;
|
2020-04-02 11:07:10 +07:00
|
|
|
if (css_tryget(&memcg->css))
|
2016-01-15 06:21:26 +07:00
|
|
|
sk->sk_memcg = memcg;
|
2016-01-15 06:21:29 +07:00
|
|
|
out:
|
2016-01-15 06:21:26 +07:00
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2016-10-08 07:00:58 +07:00
|
|
|
void mem_cgroup_sk_free(struct sock *sk)
|
2016-01-15 06:21:26 +07:00
|
|
|
{
|
2016-10-08 07:00:58 +07:00
|
|
|
if (sk->sk_memcg)
|
|
|
|
css_put(&sk->sk_memcg->css);
|
2016-01-15 06:21:26 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* mem_cgroup_charge_skmem - charge socket memory
|
|
|
|
* @memcg: memcg to charge
|
|
|
|
* @nr_pages: number of pages to charge
|
|
|
|
*
|
|
|
|
* Charges @nr_pages to @memcg. Returns %true if the charge fit within
|
|
|
|
* @memcg's configured limit, %false if the charge had to be forced.
|
|
|
|
*/
|
|
|
|
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
|
|
|
|
{
|
2016-01-15 06:21:29 +07:00
|
|
|
gfp_t gfp_mask = GFP_KERNEL;
|
2016-01-15 06:21:26 +07:00
|
|
|
|
2016-01-15 06:21:29 +07:00
|
|
|
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
|
2016-01-21 06:02:50 +07:00
|
|
|
struct page_counter *fail;
|
2016-01-15 06:21:29 +07:00
|
|
|
|
2016-01-21 06:02:50 +07:00
|
|
|
if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
|
|
|
|
memcg->tcpmem_pressure = 0;
|
2016-01-15 06:21:29 +07:00
|
|
|
return true;
|
|
|
|
}
|
2016-01-21 06:02:50 +07:00
|
|
|
page_counter_charge(&memcg->tcpmem, nr_pages);
|
|
|
|
memcg->tcpmem_pressure = 1;
|
2016-01-15 06:21:29 +07:00
|
|
|
return false;
|
2016-01-15 06:21:26 +07:00
|
|
|
}
|
2016-01-21 06:02:47 +07:00
|
|
|
|
2016-01-15 06:21:29 +07:00
|
|
|
/* Don't block in the packet receive path */
|
|
|
|
if (in_softirq())
|
|
|
|
gfp_mask = GFP_NOWAIT;
|
|
|
|
|
2018-02-01 07:16:37 +07:00
|
|
|
mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
|
2016-01-21 06:03:22 +07:00
|
|
|
|
2016-01-15 06:21:29 +07:00
|
|
|
if (try_charge(memcg, gfp_mask, nr_pages) == 0)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
|
2016-01-15 06:21:26 +07:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* mem_cgroup_uncharge_skmem - uncharge socket memory
|
2018-02-07 06:42:13 +07:00
|
|
|
* @memcg: memcg to uncharge
|
|
|
|
* @nr_pages: number of pages to uncharge
|
2016-01-15 06:21:26 +07:00
|
|
|
*/
|
|
|
|
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
|
|
|
|
{
|
2016-01-15 06:21:29 +07:00
|
|
|
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
|
2016-01-21 06:02:50 +07:00
|
|
|
page_counter_uncharge(&memcg->tcpmem, nr_pages);
|
2016-01-15 06:21:29 +07:00
|
|
|
return;
|
|
|
|
}
|
2016-01-21 06:02:47 +07:00
|
|
|
|
2018-02-01 07:16:37 +07:00
|
|
|
mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
|
2016-01-21 06:03:22 +07:00
|
|
|
|
2017-09-09 06:13:09 +07:00
|
|
|
refill_stock(memcg, nr_pages);
|
2016-01-15 06:21:26 +07:00
|
|
|
}
|
|
|
|
|
2016-01-15 06:21:29 +07:00
|
|
|
static int __init cgroup_memory(char *s)
|
|
|
|
{
|
|
|
|
char *token;
|
|
|
|
|
|
|
|
while ((token = strsep(&s, ",")) != NULL) {
|
|
|
|
if (!*token)
|
|
|
|
continue;
|
|
|
|
if (!strcmp(token, "nosocket"))
|
|
|
|
cgroup_memory_nosocket = true;
|
2016-01-21 06:02:38 +07:00
|
|
|
if (!strcmp(token, "nokmem"))
|
|
|
|
cgroup_memory_nokmem = true;
|
2016-01-15 06:21:29 +07:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
__setup("cgroup.memory=", cgroup_memory);
|
2016-01-15 06:21:26 +07:00
|
|
|
|
2013-02-23 07:34:43 +07:00
|
|
|
/*
|
2013-02-23 07:35:41 +07:00
|
|
|
* subsys_initcall() for memory controller.
|
|
|
|
*
|
2016-11-03 21:49:59 +07:00
|
|
|
* Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
|
|
|
|
* context because of lock dependencies (cgroup_lock -> cpu hotplug) but
|
|
|
|
* basically everything that doesn't depend on a specific mem_cgroup structure
|
|
|
|
* should be initialized from here.
|
2013-02-23 07:34:43 +07:00
|
|
|
*/
|
|
|
|
static int __init mem_cgroup_init(void)
|
|
|
|
{
|
2015-02-12 06:26:33 +07:00
|
|
|
int cpu, node;
|
|
|
|
|
2016-11-03 21:49:59 +07:00
|
|
|
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
|
|
|
|
memcg_hotplug_cpu_dead);
|
2015-02-12 06:26:33 +07:00
|
|
|
|
|
|
|
for_each_possible_cpu(cpu)
|
|
|
|
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
|
|
|
|
drain_local_stock);
|
|
|
|
|
|
|
|
for_each_node(node) {
|
|
|
|
struct mem_cgroup_tree_per_node *rtpn;
|
|
|
|
|
|
|
|
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
|
|
|
|
node_online(node) ? node : NUMA_NO_NODE);
|
|
|
|
|
2016-07-29 05:46:05 +07:00
|
|
|
rtpn->rb_root = RB_ROOT;
|
2017-09-09 06:15:21 +07:00
|
|
|
rtpn->rb_rightmost = NULL;
|
2016-07-29 05:46:05 +07:00
|
|
|
spin_lock_init(&rtpn->lock);
|
2015-02-12 06:26:33 +07:00
|
|
|
soft_limit_tree.rb_tree_per_node[node] = rtpn;
|
|
|
|
}
|
|
|
|
|
2013-02-23 07:34:43 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
subsys_initcall(mem_cgroup_init);
|
2015-02-12 06:26:36 +07:00
|
|
|
|
|
|
|
#ifdef CONFIG_MEMCG_SWAP
|
2016-08-26 05:17:08 +07:00
|
|
|
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
|
|
|
|
{
|
2018-10-27 05:09:28 +07:00
|
|
|
while (!refcount_inc_not_zero(&memcg->id.ref)) {
|
2016-08-26 05:17:08 +07:00
|
|
|
/*
|
|
|
|
* The root cgroup cannot be destroyed, so it's refcount must
|
|
|
|
* always be >= 1.
|
|
|
|
*/
|
|
|
|
if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
|
|
|
|
VM_BUG_ON(1);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
memcg = parent_mem_cgroup(memcg);
|
|
|
|
if (!memcg)
|
|
|
|
memcg = root_mem_cgroup;
|
|
|
|
}
|
|
|
|
return memcg;
|
|
|
|
}
|
|
|
|
|
2015-02-12 06:26:36 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_swapout - transfer a memsw charge to swap
|
|
|
|
* @page: page whose memsw charge to transfer
|
|
|
|
* @entry: swap entry to move the charge to
|
|
|
|
*
|
|
|
|
* Transfer the memsw charge of @page to @entry.
|
|
|
|
*/
|
|
|
|
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
|
|
|
|
{
|
2016-08-12 05:33:00 +07:00
|
|
|
struct mem_cgroup *memcg, *swap_memcg;
|
2017-09-07 06:22:45 +07:00
|
|
|
unsigned int nr_entries;
|
2015-02-12 06:26:36 +07:00
|
|
|
unsigned short oldid;
|
|
|
|
|
|
|
|
VM_BUG_ON_PAGE(PageLRU(page), page);
|
|
|
|
VM_BUG_ON_PAGE(page_count(page), page);
|
|
|
|
|
2020-06-04 06:02:14 +07:00
|
|
|
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
2015-02-12 06:26:36 +07:00
|
|
|
return;
|
|
|
|
|
|
|
|
memcg = page->mem_cgroup;
|
|
|
|
|
|
|
|
/* Readahead page, never charged */
|
|
|
|
if (!memcg)
|
|
|
|
return;
|
|
|
|
|
2016-08-12 05:33:00 +07:00
|
|
|
/*
|
|
|
|
* In case the memcg owning these pages has been offlined and doesn't
|
|
|
|
* have an ID allocated to it anymore, charge the closest online
|
|
|
|
* ancestor for the swap instead and transfer the memory+swap charge.
|
|
|
|
*/
|
|
|
|
swap_memcg = mem_cgroup_id_get_online(memcg);
|
2020-08-15 07:30:37 +07:00
|
|
|
nr_entries = thp_nr_pages(page);
|
2017-09-07 06:22:45 +07:00
|
|
|
/* Get references for the tail pages, too */
|
|
|
|
if (nr_entries > 1)
|
|
|
|
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
|
|
|
|
oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
|
|
|
|
nr_entries);
|
2015-02-12 06:26:36 +07:00
|
|
|
VM_BUG_ON_PAGE(oldid, page);
|
2018-02-01 07:16:37 +07:00
|
|
|
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
|
2015-02-12 06:26:36 +07:00
|
|
|
|
|
|
|
page->mem_cgroup = NULL;
|
|
|
|
|
|
|
|
if (!mem_cgroup_is_root(memcg))
|
2017-09-07 06:22:45 +07:00
|
|
|
page_counter_uncharge(&memcg->memory, nr_entries);
|
2015-02-12 06:26:36 +07:00
|
|
|
|
2020-06-04 06:02:14 +07:00
|
|
|
if (!cgroup_memory_noswap && memcg != swap_memcg) {
|
2016-08-12 05:33:00 +07:00
|
|
|
if (!mem_cgroup_is_root(swap_memcg))
|
2017-09-07 06:22:45 +07:00
|
|
|
page_counter_charge(&swap_memcg->memsw, nr_entries);
|
|
|
|
page_counter_uncharge(&memcg->memsw, nr_entries);
|
2016-08-12 05:33:00 +07:00
|
|
|
}
|
|
|
|
|
2015-09-05 05:47:50 +07:00
|
|
|
/*
|
|
|
|
* Interrupts should be disabled here because the caller holds the
|
2018-04-11 06:36:56 +07:00
|
|
|
* i_pages lock which is taken with interrupts-off. It is
|
2015-09-05 05:47:50 +07:00
|
|
|
* important here to have the interrupts disabled because it is the
|
2018-04-11 06:36:56 +07:00
|
|
|
* only synchronisation we have for updating the per-CPU variables.
|
2015-09-05 05:47:50 +07:00
|
|
|
*/
|
|
|
|
VM_BUG_ON(!irqs_disabled());
|
2020-06-04 06:01:31 +07:00
|
|
|
mem_cgroup_charge_statistics(memcg, page, -nr_entries);
|
2015-02-12 06:26:36 +07:00
|
|
|
memcg_check_events(memcg, page);
|
2016-07-21 05:44:57 +07:00
|
|
|
|
2020-08-07 13:20:45 +07:00
|
|
|
css_put(&memcg->css);
|
2015-02-12 06:26:36 +07:00
|
|
|
}
|
|
|
|
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
/**
|
|
|
|
* mem_cgroup_try_charge_swap - try charging swap space for a page
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
* @page: page being added to swap
|
|
|
|
* @entry: swap entry to charge
|
|
|
|
*
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
* Try to charge @page's memcg for the swap space at @entry.
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
*
|
|
|
|
* Returns 0 on success, -ENOMEM on failure.
|
|
|
|
*/
|
|
|
|
int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
|
|
|
|
{
|
2020-08-15 07:30:37 +07:00
|
|
|
unsigned int nr_pages = thp_nr_pages(page);
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
struct page_counter *counter;
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
struct mem_cgroup *memcg;
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
unsigned short oldid;
|
|
|
|
|
2020-06-04 06:02:14 +07:00
|
|
|
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
memcg = page->mem_cgroup;
|
|
|
|
|
|
|
|
/* Readahead page, never charged */
|
|
|
|
if (!memcg)
|
|
|
|
return 0;
|
|
|
|
|
2018-06-08 07:05:35 +07:00
|
|
|
if (!entry.val) {
|
|
|
|
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
|
2018-06-08 07:05:31 +07:00
|
|
|
return 0;
|
2018-06-08 07:05:35 +07:00
|
|
|
}
|
2018-06-08 07:05:31 +07:00
|
|
|
|
2016-08-12 05:33:00 +07:00
|
|
|
memcg = mem_cgroup_id_get_online(memcg);
|
|
|
|
|
2020-06-04 06:02:14 +07:00
|
|
|
if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
|
2018-06-08 07:05:35 +07:00
|
|
|
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
|
|
|
|
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
|
2016-08-12 05:33:00 +07:00
|
|
|
mem_cgroup_id_put(memcg);
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
return -ENOMEM;
|
2016-08-12 05:33:00 +07:00
|
|
|
}
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
/* Get references for the tail pages, too */
|
|
|
|
if (nr_pages > 1)
|
|
|
|
mem_cgroup_id_get_many(memcg, nr_pages - 1);
|
|
|
|
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
VM_BUG_ON_PAGE(oldid, page);
|
2018-02-01 07:16:37 +07:00
|
|
|
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-02-12 06:26:36 +07:00
|
|
|
/**
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
* mem_cgroup_uncharge_swap - uncharge swap space
|
2015-02-12 06:26:36 +07:00
|
|
|
* @entry: swap entry to uncharge
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
* @nr_pages: the amount of swap space to uncharge
|
2015-02-12 06:26:36 +07:00
|
|
|
*/
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
|
2015-02-12 06:26:36 +07:00
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
unsigned short id;
|
|
|
|
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
id = swap_cgroup_record(entry, 0, nr_pages);
|
2015-02-12 06:26:36 +07:00
|
|
|
rcu_read_lock();
|
2015-04-16 06:13:00 +07:00
|
|
|
memcg = mem_cgroup_from_id(id);
|
2015-02-12 06:26:36 +07:00
|
|
|
if (memcg) {
|
2020-06-04 06:02:14 +07:00
|
|
|
if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
page_counter_uncharge(&memcg->swap, nr_pages);
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
else
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
page_counter_uncharge(&memcg->memsw, nr_pages);
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
}
|
2018-02-01 07:16:37 +07:00
|
|
|
mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
|
mm, THP, swap: delay splitting THP during swap out
Patch series "THP swap: Delay splitting THP during swapping out", v11.
This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.
Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine. Because the
performance of the storage device improved faster than that of single
logical CPU. And it seems that the trend will not change in the near
future. On the other hand, the THP becomes more and more popular
because of increased memory size. So it becomes necessary to optimize
THP swap performance.
The advantages of the THP swap support include:
- Batch the swap operations for the THP to reduce lock
acquiring/releasing, including allocating/freeing the swap space,
adding/deleting to/from the swap cache, and writing/reading the swap
space, etc. This will help improve the performance of the THP swap.
- The THP swap space read/write will be 2M sequential IO. It is
particularly helpful for the swap read, which are usually 4k random
IO. This will improve the performance of the THP swap too.
- It will help the memory fragmentation, especially when the THP is
heavily used by the applications. The 2M continuous pages will be
free up after THP swapping out.
- It will improve the THP utilization on the system with the swap
turned on. Because the speed for khugepaged to collapse the normal
pages into the THP is quite slow. After the THP is split during the
swapping out, it will take quite long time for the normal pages to
collapse back into the THP after being swapped in. The high THP
utilization helps the efficiency of the page based memory management
too.
There are some concerns regarding THP swap in, mainly because possible
enlarged read/write IO size (for swap in/out) may put more overhead on
the storage device. To deal with that, the THP swap in should be turned
on only when necessary. For example, it can be selected via
"always/never/madvise" logic, to be turned on globally, turned off
globally, or turned on only for VMA with MADV_HUGEPAGE, etc.
This patchset is the first step for the THP swap support. The plan is
to delay splitting THP step by step, finally avoid splitting THP during
the THP swapping out and swap out/in the THP as a whole.
As the first step, in this patchset, the splitting huge page is delayed
from almost the first step of swapping out to after allocating the swap
space for the THP and adding the THP into the swap cache. This will
reduce lock acquiring/releasing for the locks used for the swap cache
management.
With the patchset, the swap out throughput improves 15.5% (from about
3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case
with 8 processes. The test is done on a Xeon E5 v3 system. The swap
device used is a RAM simulated PMEM (persistent memory) device. To test
the sequential swapping out, the test case creates 8 processes, which
sequentially allocate and write to the anonymous pages until the RAM and
part of the swap device is used up.
This patch (of 5):
In this patch, splitting huge page is delayed from almost the first step
of swapping out to after allocating the swap space for the THP
(Transparent Huge Page) and adding the THP into the swap cache. This
will batch the corresponding operation, thus improve THP swap out
throughput.
This is the first step for the THP swap optimization. The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.
In this patch, one swap cluster is used to hold the contents of each THP
swapped out. So, the size of the swap cluster is changed to that of the
THP (Transparent Huge Page) on x86_64 architecture (512). For other
architectures which want such THP swap optimization,
ARCH_USES_THP_SWAP_CLUSTER needs to be selected in the Kconfig file for
the architecture. In effect, this will enlarge swap cluster size by 2
times on x86_64. Which may make it harder to find a free cluster when
the swap space becomes fragmented. So that, this may reduce the
continuous swap space allocation and sequential write in theory. The
performance test in 0day shows no regressions caused by this.
In the future of THP swap optimization, some information of the swapped
out THP (such as compound map count) will be recorded in the
swap_cluster_info data structure.
The mem cgroup swap accounting functions are enhanced to support charge
or uncharge a swap cluster backing a THP as a whole.
The swap cluster allocate/free functions are added to allocate/free a
swap cluster for a THP. A fair simple algorithm is used for swap
cluster allocation, that is, only the first swap device in priority list
will be tried to allocate the swap cluster. The function will fail if
the trying is not successful, and the caller will fallback to allocate a
single swap slot instead. This works good enough for normal cases. If
the difference of the number of the free swap clusters among multiple
swap devices is significant, it is possible that some THPs are split
earlier than necessary. For example, this could be caused by big size
difference among multiple swap devices.
The swap cache functions is enhanced to support add/delete THP to/from
the swap cache as a set of (HPAGE_PMD_NR) sub-pages. This may be
enhanced in the future with multi-order radix tree. But because we will
split the THP soon during swapping out, that optimization doesn't make
much sense for this first step.
The THP splitting functions are enhanced to support to split THP in swap
cache during swapping out. The page lock will be held during allocating
the swap cluster, adding the THP into the swap cache and splitting the
THP. So in the code path other than swapping out, if the THP need to be
split, the PageSwapCache(THP) will be always false.
The swap cluster is only available for SSD, so the THP swap optimization
in this patchset has no effect for HDD.
[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org> [for config option]
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> [for changes in huge_memory.c and huge_mm.h]
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-07 05:37:18 +07:00
|
|
|
mem_cgroup_id_put_many(memcg, nr_pages);
|
2015-02-12 06:26:36 +07:00
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2016-01-21 06:03:07 +07:00
|
|
|
long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
long nr_swap_pages = get_nr_swap_pages();
|
|
|
|
|
2020-06-04 06:02:11 +07:00
|
|
|
if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
2016-01-21 06:03:07 +07:00
|
|
|
return nr_swap_pages;
|
|
|
|
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
|
|
|
|
nr_swap_pages = min_t(long, nr_swap_pages,
|
2018-06-08 07:06:18 +07:00
|
|
|
READ_ONCE(memcg->swap.max) -
|
2016-01-21 06:03:07 +07:00
|
|
|
page_counter_read(&memcg->swap));
|
|
|
|
return nr_swap_pages;
|
|
|
|
}
|
|
|
|
|
2016-01-21 06:03:10 +07:00
|
|
|
bool mem_cgroup_swap_full(struct page *page)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg;
|
|
|
|
|
|
|
|
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
|
|
|
|
|
|
|
if (vm_swap_full())
|
|
|
|
return true;
|
2020-06-04 06:02:11 +07:00
|
|
|
if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
2016-01-21 06:03:10 +07:00
|
|
|
return false;
|
|
|
|
|
|
|
|
memcg = page->mem_cgroup;
|
|
|
|
if (!memcg)
|
|
|
|
return false;
|
|
|
|
|
2020-06-02 11:49:52 +07:00
|
|
|
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
|
|
|
|
unsigned long usage = page_counter_read(&memcg->swap);
|
|
|
|
|
|
|
|
if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
|
|
|
|
usage * 2 >= READ_ONCE(memcg->swap.max))
|
2016-01-21 06:03:10 +07:00
|
|
|
return true;
|
2020-06-02 11:49:52 +07:00
|
|
|
}
|
2016-01-21 06:03:10 +07:00
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-06-04 06:02:11 +07:00
|
|
|
static int __init setup_swap_account(char *s)
|
2015-02-12 06:26:36 +07:00
|
|
|
{
|
|
|
|
if (!strcmp(s, "1"))
|
2020-06-04 06:02:11 +07:00
|
|
|
cgroup_memory_noswap = 0;
|
2015-02-12 06:26:36 +07:00
|
|
|
else if (!strcmp(s, "0"))
|
2020-06-04 06:02:11 +07:00
|
|
|
cgroup_memory_noswap = 1;
|
2015-02-12 06:26:36 +07:00
|
|
|
return 1;
|
|
|
|
}
|
2020-06-04 06:02:11 +07:00
|
|
|
__setup("swapaccount=", setup_swap_account);
|
2015-02-12 06:26:36 +07:00
|
|
|
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
static u64 swap_current_read(struct cgroup_subsys_state *css,
|
|
|
|
struct cftype *cft)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
|
|
|
|
|
|
|
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
|
|
|
|
}
|
|
|
|
|
2020-06-02 11:49:52 +07:00
|
|
|
static int swap_high_show(struct seq_file *m, void *v)
|
|
|
|
{
|
|
|
|
return seq_puts_memcg_tunable(m,
|
|
|
|
READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t swap_high_write(struct kernfs_open_file *of,
|
|
|
|
char *buf, size_t nbytes, loff_t off)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
|
|
|
unsigned long high;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
buf = strstrip(buf);
|
|
|
|
err = page_counter_memparse(buf, "max", &high);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
page_counter_set_high(&memcg->swap, high);
|
|
|
|
|
|
|
|
return nbytes;
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
static int swap_max_show(struct seq_file *m, void *v)
|
|
|
|
{
|
2019-03-06 06:45:55 +07:00
|
|
|
return seq_puts_memcg_tunable(m,
|
|
|
|
READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t swap_max_write(struct kernfs_open_file *of,
|
|
|
|
char *buf, size_t nbytes, loff_t off)
|
|
|
|
{
|
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
|
|
|
unsigned long max;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
buf = strstrip(buf);
|
|
|
|
err = page_counter_memparse(buf, "max", &max);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2018-06-08 07:09:21 +07:00
|
|
|
xchg(&memcg->swap.max, max);
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
|
|
|
|
return nbytes;
|
|
|
|
}
|
|
|
|
|
2018-06-08 07:05:35 +07:00
|
|
|
static int swap_events_show(struct seq_file *m, void *v)
|
|
|
|
{
|
2019-03-06 06:45:52 +07:00
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
|
2018-06-08 07:05:35 +07:00
|
|
|
|
2020-06-02 11:49:52 +07:00
|
|
|
seq_printf(m, "high %lu\n",
|
|
|
|
atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
|
2018-06-08 07:05:35 +07:00
|
|
|
seq_printf(m, "max %lu\n",
|
|
|
|
atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
|
|
|
|
seq_printf(m, "fail %lu\n",
|
|
|
|
atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
static struct cftype swap_files[] = {
|
|
|
|
{
|
|
|
|
.name = "swap.current",
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
|
|
|
.read_u64 = swap_current_read,
|
|
|
|
},
|
2020-06-02 11:49:52 +07:00
|
|
|
{
|
|
|
|
.name = "swap.high",
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
|
|
|
.seq_show = swap_high_show,
|
|
|
|
.write = swap_high_write,
|
|
|
|
},
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
{
|
|
|
|
.name = "swap.max",
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
|
|
|
.seq_show = swap_max_show,
|
|
|
|
.write = swap_max_write,
|
|
|
|
},
|
2018-06-08 07:05:35 +07:00
|
|
|
{
|
|
|
|
.name = "swap.events",
|
|
|
|
.flags = CFTYPE_NOT_ON_ROOT,
|
|
|
|
.file_offset = offsetof(struct mem_cgroup, swap_events_file),
|
|
|
|
.seq_show = swap_events_show,
|
|
|
|
},
|
mm: memcontrol: charge swap to cgroup2
This patchset introduces swap accounting to cgroup2.
This patch (of 7):
In the legacy hierarchy we charge memsw, which is dubious, because:
- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.
- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.
That said, we should provide a different swap limiting mechanism for
cgroup2.
This patch adds mem_cgroup->swap counter, which charges the actual number
of swap entries used by a cgroup. It is only charged in the unified
hierarchy, while the legacy hierarchy memsw logic is left intact.
The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.
Note, to charge swap resource properly in the unified hierarchy, we have
to make swap_entry_free uncharge swap only when ->usage reaches zero, not
just ->count, i.e. when all references to a swap entry, including the one
taken by swap cache, are gone. This is necessary, because otherwise
swap-in could result in uncharging swap even if the page is still in swap
cache and hence still occupies a swap entry. At the same time, this
shouldn't break memsw counter logic, where a page is never charged twice
for using both memory and swap, because in case of legacy hierarchy we
uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-21 06:02:56 +07:00
|
|
|
{ } /* terminate */
|
|
|
|
};
|
|
|
|
|
2020-06-04 06:02:11 +07:00
|
|
|
static struct cftype memsw_files[] = {
|
2015-02-12 06:26:36 +07:00
|
|
|
{
|
|
|
|
.name = "memsw.usage_in_bytes",
|
|
|
|
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
|
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "memsw.max_usage_in_bytes",
|
|
|
|
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
|
|
|
|
.write = mem_cgroup_reset,
|
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "memsw.limit_in_bytes",
|
|
|
|
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
|
|
|
|
.write = mem_cgroup_write,
|
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "memsw.failcnt",
|
|
|
|
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
|
|
|
|
.write = mem_cgroup_reset,
|
|
|
|
.read_u64 = mem_cgroup_read_u64,
|
|
|
|
},
|
|
|
|
{ }, /* terminate */
|
|
|
|
};
|
|
|
|
|
2020-07-24 11:15:21 +07:00
|
|
|
/*
|
|
|
|
* If mem_cgroup_swap_init() is implemented as a subsys_initcall()
|
|
|
|
* instead of a core_initcall(), this could mean cgroup_memory_noswap still
|
|
|
|
* remains set to false even when memcg is disabled via "cgroup_disable=memory"
|
|
|
|
* boot parameter. This may result in premature OOPS inside
|
|
|
|
* mem_cgroup_get_nr_swap_pages() function in corner cases.
|
|
|
|
*/
|
2015-02-12 06:26:36 +07:00
|
|
|
static int __init mem_cgroup_swap_init(void)
|
|
|
|
{
|
2020-06-04 06:02:14 +07:00
|
|
|
/* No memory control -> no swap control */
|
|
|
|
if (mem_cgroup_disabled())
|
|
|
|
cgroup_memory_noswap = true;
|
|
|
|
|
|
|
|
if (cgroup_memory_noswap)
|
2020-06-04 06:02:11 +07:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
|
|
|
|
WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
|
|
|
|
|
2015-02-12 06:26:36 +07:00
|
|
|
return 0;
|
|
|
|
}
|
2020-07-24 11:15:21 +07:00
|
|
|
core_initcall(mem_cgroup_swap_init);
|
2015-02-12 06:26:36 +07:00
|
|
|
|
|
|
|
#endif /* CONFIG_MEMCG_SWAP */
|