2005-04-17 05:20:36 +07:00
|
|
|
#ifndef __LINUX_GFP_H
|
|
|
|
#define __LINUX_GFP_H
|
|
|
|
|
2014-01-24 06:52:54 +07:00
|
|
|
#include <linux/mmdebug.h>
|
2005-04-17 05:20:36 +07:00
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/stddef.h>
|
|
|
|
#include <linux/linkage.h>
|
2009-03-13 20:13:37 +07:00
|
|
|
#include <linux/topology.h>
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
struct vm_area_struct;
|
|
|
|
|
2016-03-16 04:55:45 +07:00
|
|
|
/*
|
|
|
|
* In case of changes, please don't forget to update
|
mm, tracing: unify mm flags handling in tracepoints and printk
In tracepoints, it's possible to print gfp flags in a human-friendly
format through a macro show_gfp_flags(), which defines a translation
array and passes is to __print_flags(). Since the following patch will
introduce support for gfp flags printing in printk(), it would be nice
to reuse the array. This is not straightforward, since __print_flags()
can't simply reference an array defined in a .c file such as mm/debug.c
- it has to be a macro to allow the macro magic to communicate the
format to userspace tools such as trace-cmd.
The solution is to create a macro __def_gfpflag_names which is used both
in show_gfp_flags(), and to define the gfpflag_names[] array in
mm/debug.c.
On the other hand, mm/debug.c also defines translation tables for page
flags and vma flags, and desire was expressed (but not implemented in
this series) to use these also from tracepoints. Thus, this patch also
renames the events/gfpflags.h file to events/mmflags.h and moves the
table definitions there, using the same macro approach as for gfpflags.
This allows translating all three kinds of mm-specific flags both in
tracepoints and printk.
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Michal Hocko <mhocko@suse.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-16 04:55:52 +07:00
|
|
|
* include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
|
2016-03-16 04:55:45 +07:00
|
|
|
*/
|
|
|
|
|
2010-10-27 04:22:04 +07:00
|
|
|
/* Plain integer GFP bitmasks. Do not use these directly. */
|
|
|
|
#define ___GFP_DMA 0x01u
|
|
|
|
#define ___GFP_HIGHMEM 0x02u
|
|
|
|
#define ___GFP_DMA32 0x04u
|
|
|
|
#define ___GFP_MOVABLE 0x08u
|
2015-11-07 07:28:18 +07:00
|
|
|
#define ___GFP_RECLAIMABLE 0x10u
|
2010-10-27 04:22:04 +07:00
|
|
|
#define ___GFP_HIGH 0x20u
|
|
|
|
#define ___GFP_IO 0x40u
|
|
|
|
#define ___GFP_FS 0x80u
|
|
|
|
#define ___GFP_COLD 0x100u
|
|
|
|
#define ___GFP_NOWARN 0x200u
|
2017-07-13 04:36:45 +07:00
|
|
|
#define ___GFP_RETRY_MAYFAIL 0x400u
|
2010-10-27 04:22:04 +07:00
|
|
|
#define ___GFP_NOFAIL 0x800u
|
|
|
|
#define ___GFP_NORETRY 0x1000u
|
2012-08-01 06:44:03 +07:00
|
|
|
#define ___GFP_MEMALLOC 0x2000u
|
2010-10-27 04:22:04 +07:00
|
|
|
#define ___GFP_COMP 0x4000u
|
|
|
|
#define ___GFP_ZERO 0x8000u
|
|
|
|
#define ___GFP_NOMEMALLOC 0x10000u
|
|
|
|
#define ___GFP_HARDWALL 0x20000u
|
|
|
|
#define ___GFP_THISNODE 0x40000u
|
2015-11-07 07:28:21 +07:00
|
|
|
#define ___GFP_ATOMIC 0x80000u
|
2016-01-15 06:18:12 +07:00
|
|
|
#define ___GFP_ACCOUNT 0x100000u
|
Revert "revert "Revert "mm: remove __GFP_NO_KSWAPD""" and associated damage
This reverts commits a50915394f1fc02c2861d3b7ce7014788aa5066e and
d7c3b937bdf45f0b844400b7bf6fd3ed50bac604.
This is a revert of a revert of a revert. In addition, it reverts the
even older i915 change to stop using the __GFP_NO_KSWAPD flag due to the
original commits in linux-next.
It turns out that the original patch really was bogus, and that the
original revert was the correct thing to do after all. We thought we
had fixed the problem, and then reverted the revert, but the problem
really is fundamental: waking up kswapd simply isn't the right thing to
do, and direct reclaim sometimes simply _is_ the right thing to do.
When certain allocations fail, we simply should try some direct reclaim,
and if that fails, fail the allocation. That's the right thing to do
for THP allocations, which can easily fail, and the GPU allocations want
to do that too.
So starting kswapd is sometimes simply wrong, and removing the flag that
said "don't start kswapd" was a mistake. Let's hope we never revisit
this mistake again - and certainly not this many times ;)
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-11 01:51:16 +07:00
|
|
|
#define ___GFP_NOTRACK 0x200000u
|
2015-11-07 07:28:21 +07:00
|
|
|
#define ___GFP_DIRECT_RECLAIM 0x400000u
|
2017-01-11 07:57:42 +07:00
|
|
|
#define ___GFP_WRITE 0x800000u
|
|
|
|
#define ___GFP_KSWAPD_RECLAIM 0x1000000u
|
2017-05-04 04:53:09 +07:00
|
|
|
#ifdef CONFIG_LOCKDEP
|
2017-06-03 04:46:13 +07:00
|
|
|
#define ___GFP_NOLOCKDEP 0x2000000u
|
2017-05-04 04:53:09 +07:00
|
|
|
#else
|
|
|
|
#define ___GFP_NOLOCKDEP 0
|
|
|
|
#endif
|
2012-12-13 04:51:56 +07:00
|
|
|
/* If the above are modified, __GFP_BITS_SHIFT may need updating */
|
2010-10-27 04:22:04 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
2015-11-07 07:28:43 +07:00
|
|
|
* Physical address zone modifiers (see linux/mmzone.h - low four bits)
|
2006-09-26 13:31:14 +07:00
|
|
|
*
|
|
|
|
* Do not put any conditional on these. If necessary modify the definitions
|
2010-05-25 04:32:44 +07:00
|
|
|
* without the underscores and use them consistently. The definitions here may
|
2006-09-26 13:31:14 +07:00
|
|
|
* be used in bit comparisons.
|
2005-04-17 05:20:36 +07:00
|
|
|
*/
|
2010-10-27 04:22:04 +07:00
|
|
|
#define __GFP_DMA ((__force gfp_t)___GFP_DMA)
|
|
|
|
#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM)
|
|
|
|
#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
|
2015-11-07 07:28:43 +07:00
|
|
|
#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */
|
2009-06-17 05:32:46 +07:00
|
|
|
/* Union of all physical address zone modifier bits defined above. */
#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
|
2015-11-07 07:28:43 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
2015-11-07 07:28:43 +07:00
|
|
|
* Page mobility and placement hints
|
2005-04-17 05:20:36 +07:00
|
|
|
*
|
2015-11-07 07:28:43 +07:00
|
|
|
* These flags provide hints about how mobile the page is. Pages with similar
|
|
|
|
* mobility are placed within the same pageblocks to minimise problems due
|
|
|
|
* to external fragmentation.
|
2005-04-17 05:20:36 +07:00
|
|
|
*
|
2015-11-07 07:28:43 +07:00
|
|
|
* __GFP_MOVABLE (also a zone modifier) indicates that the page can be
|
|
|
|
* moved by page migration during memory compaction or can be reclaimed.
|
2005-04-17 05:20:36 +07:00
|
|
|
*
|
2015-11-07 07:28:43 +07:00
|
|
|
* __GFP_RECLAIMABLE is used for slab allocations that specify
|
|
|
|
* SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
|
|
|
|
*
|
|
|
|
* __GFP_WRITE indicates the caller intends to dirty the page. Where possible,
|
|
|
|
* these pages will be spread between local zones to avoid all the dirty
|
|
|
|
* pages being in one zone (fair zone allocation policy).
|
2007-07-17 18:03:05 +07:00
|
|
|
*
|
2015-11-07 07:28:43 +07:00
|
|
|
* __GFP_HARDWALL enforces the cpuset memory allocation policy.
|
|
|
|
*
|
|
|
|
* __GFP_THISNODE forces the allocation to be satisfied from the requested
|
|
|
|
* node with no fallbacks or placement policy enforcements.
|
2016-01-15 06:18:12 +07:00
|
|
|
*
|
mm: charge/uncharge kmemcg from generic page allocator paths
Currently, to charge a non-slab allocation to kmemcg one has to use
alloc_kmem_pages helper with __GFP_ACCOUNT flag. A page allocated with
this helper should finally be freed using free_kmem_pages, otherwise it
won't be uncharged.
This API suits its current users fine, but it turns out to be impossible
to use along with page reference counting, i.e. when an allocation is
supposed to be freed with put_page, as it is the case with pipe or unix
socket buffers.
To overcome this limitation, this patch moves charging/uncharging to
generic page allocator paths, i.e. to __alloc_pages_nodemask and
free_pages_prepare, and zaps alloc/free_kmem_pages helpers. This way,
one can use any of the available page allocation functions to get the
allocated page charged to kmemcg - it's enough to pass __GFP_ACCOUNT,
just like in case of kmalloc and friends. A charged page will be
automatically uncharged on free.
To make it possible, we need to mark pages charged to kmemcg somehow.
To avoid introducing a new page flag, we make use of page->_mapcount for
marking such pages. Since pages charged to kmemcg are not supposed to
be mapped to userspace, it should work just fine. There are other
(ab)users of page->_mapcount - buddy and balloon pages - but we don't
conflict with them.
In case kmemcg is compiled out or not used at runtime, this patch
introduces no overhead to generic page allocator paths. If kmemcg is
used, it will be plus one gfp flags check on alloc and plus one
page->_mapcount check on free, which shouldn't hurt performance, because
the data accessed are hot.
Link: http://lkml.kernel.org/r/a9736d856f895bcb465d9f257b54efe32eda6f99.1464079538.git.vdavydov@virtuozzo.com
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-07-27 05:24:24 +07:00
|
|
|
* __GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
|
2005-04-17 05:20:36 +07:00
|
|
|
*/
|
2015-11-07 07:28:43 +07:00
|
|
|
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
|
|
|
|
#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE)
|
|
|
|
#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL)
|
|
|
|
#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)
|
2016-01-15 06:18:12 +07:00
|
|
|
#define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT)
|
2011-01-14 06:46:49 +07:00
|
|
|
|
2015-11-07 07:28:21 +07:00
|
|
|
/*
|
2015-11-07 07:28:43 +07:00
|
|
|
* Watermark modifiers -- controls access to emergency reserves
|
|
|
|
*
|
|
|
|
* __GFP_HIGH indicates that the caller is high-priority and that granting
|
|
|
|
* the request is necessary before the system can make forward progress.
|
|
|
|
* For example, creating an IO context to clean pages.
|
|
|
|
*
|
|
|
|
* __GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
|
|
|
|
* high priority. Users are typically interrupt handlers. This may be
|
|
|
|
* used in conjunction with __GFP_HIGH
|
|
|
|
*
|
|
|
|
* __GFP_MEMALLOC allows access to all memory. This should only be used when
|
|
|
|
* the caller guarantees the allocation will allow more memory to be freed
|
|
|
|
* very shortly e.g. process exiting or swapping. Users either should
|
|
|
|
* be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
|
|
|
|
*
|
|
|
|
* __GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
|
|
|
|
* This takes precedence over the __GFP_MEMALLOC flag if both are set.
|
2015-11-07 07:28:21 +07:00
|
|
|
*/
|
2015-11-07 07:28:43 +07:00
|
|
|
#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC)
|
|
|
|
#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH)
|
|
|
|
#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)
|
|
|
|
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reclaim modifiers
|
|
|
|
*
|
|
|
|
* __GFP_IO can start physical IO.
|
|
|
|
*
|
|
|
|
* __GFP_FS can call down to the low-level FS. Clearing the flag avoids the
|
|
|
|
* allocator recursing into the filesystem which might already be holding
|
|
|
|
* locks.
|
|
|
|
*
|
|
|
|
* __GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
|
|
|
|
* This flag can be cleared to avoid unnecessary delays when a fallback
|
|
|
|
* option is available.
|
|
|
|
*
|
|
|
|
* __GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
|
|
|
|
* the low watermark is reached and have it reclaim pages until the high
|
|
|
|
* watermark is reached. A caller may wish to clear this flag when fallback
|
|
|
|
* options are available and the reclaim is likely to disrupt the system. The
|
|
|
|
* canonical example is THP allocation where a fallback is cheap but
|
|
|
|
* reclaim/compaction may cause indirect stalls.
|
|
|
|
*
|
|
|
|
* __GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
|
|
|
|
*
|
2017-07-13 04:36:45 +07:00
|
|
|
* The default allocator behavior depends on the request size. We have a concept
|
|
|
|
* of so called costly allocations (with order > PAGE_ALLOC_COSTLY_ORDER).
|
|
|
|
* !costly allocations are too essential to fail so they are implicitly
|
|
|
|
* non-failing by default (with some exceptions like OOM victims might fail so
|
|
|
|
* the caller still has to check for failures) while costly requests try to be
|
|
|
|
* not disruptive and back off even without invoking the OOM killer.
|
|
|
|
* The following three modifiers might be used to override some of these
|
|
|
|
* implicit rules
|
|
|
|
*
|
|
|
|
* __GFP_NORETRY: The VM implementation will try only very lightweight
|
|
|
|
* memory direct reclaim to get some memory under memory pressure (thus
|
|
|
|
* it can sleep). It will avoid disruptive actions like OOM killer. The
|
|
|
|
* caller must handle the failure which is quite likely to happen under
|
|
|
|
* heavy memory pressure. The flag is suitable when failure can easily be
|
|
|
|
* handled at small cost, such as reduced throughput
|
|
|
|
*
|
|
|
|
* __GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
|
|
|
|
* procedures that have previously failed if there is some indication
|
|
|
|
* that progress has been made elsewhere. It can wait for other
|
|
|
|
* tasks to attempt high level approaches to freeing memory such as
|
|
|
|
* compaction (which removes fragmentation) and page-out.
|
|
|
|
* There is still a definite limit to the number of retries, but it is
|
|
|
|
* a larger limit than with __GFP_NORETRY.
|
|
|
|
* Allocations with this flag may fail, but only when there is
|
|
|
|
* genuinely little unused memory. While these allocations do not
|
|
|
|
* directly trigger the OOM killer, their failure indicates that
|
|
|
|
* the system is likely to need to use the OOM killer soon. The
|
|
|
|
* caller must handle failure, but can reasonably do so by failing
|
|
|
|
* a higher-level request, or completing it only in a much less
|
|
|
|
* efficient manner.
|
|
|
|
* If the allocation does fail, and the caller is in a position to
|
|
|
|
* free some non-essential memory, doing so could benefit the system
|
|
|
|
* as a whole.
|
2015-11-07 07:28:43 +07:00
|
|
|
*
|
|
|
|
* __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
|
2017-07-13 04:36:45 +07:00
|
|
|
* cannot handle allocation failures. The allocation could block
|
|
|
|
* indefinitely but will never return with failure. Testing for
|
|
|
|
* failure is pointless.
|
|
|
|
* New users should be evaluated carefully (and the flag should be
|
|
|
|
* used only when there is no reasonable failure policy) but it is
|
|
|
|
* definitely preferable to use the flag rather than opencode endless
|
|
|
|
* loop around allocator.
|
|
|
|
* Using this flag for costly allocations is _highly_ discouraged.
|
2015-11-07 07:28:43 +07:00
|
|
|
*/
|
|
|
|
#define __GFP_IO ((__force gfp_t)___GFP_IO)
|
|
|
|
#define __GFP_FS ((__force gfp_t)___GFP_FS)
|
2015-11-07 07:28:21 +07:00
|
|
|
#define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */
|
|
|
|
#define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
|
2015-11-07 07:28:43 +07:00
|
|
|
#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
|
2017-07-13 04:36:45 +07:00
|
|
|
#define __GFP_RETRY_MAYFAIL ((__force gfp_t)___GFP_RETRY_MAYFAIL)
|
2015-11-07 07:28:43 +07:00
|
|
|
#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL)
|
|
|
|
#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY)
|
2015-11-07 07:28:21 +07:00
|
|
|
|
kmemcheck: add mm functions
With kmemcheck enabled, the slab allocator needs to do this:
1. Tell kmemcheck to allocate the shadow memory which stores the status of
each byte in the allocation proper, e.g. whether it is initialized or
uninitialized.
2. Tell kmemcheck which parts of memory that should be marked uninitialized.
There are actually a few more states, such as "not yet allocated" and
"recently freed".
If a slab cache is set up using the SLAB_NOTRACK flag, it will never return
memory that can take page faults because of kmemcheck.
If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still
request memory with the __GFP_NOTRACK flag. This does not prevent the page
faults from occuring, however, but marks the object in question as being
initialized so that no warnings will ever be produced for this object.
In addition to (and in contrast to) __GFP_NOTRACK, the
__GFP_NOTRACK_FALSE_POSITIVE flag indicates that the allocation should
not be tracked _because_ it would produce a false positive. Their values
are identical, but need not be so in the future (for example, we could now
enable/disable false positives with a config option).
Parts of this patch were contributed by Pekka Enberg but merged for
atomicity.
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
[rebased for mainline inclusion]
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
2008-05-31 20:56:17 +07:00
|
|
|
/*
|
2015-11-07 07:28:43 +07:00
|
|
|
* Action modifiers
|
|
|
|
*
|
|
|
|
* __GFP_COLD indicates that the page is not expected to be used in the near
|
|
|
|
* future. Where possible, a cache-cold page will be returned.
|
|
|
|
*
|
|
|
|
* __GFP_NOWARN suppresses allocation failure reports.
|
|
|
|
*
|
|
|
|
* __GFP_COMP addresses compound page metadata.
|
|
|
|
*
|
|
|
|
* __GFP_ZERO returns a zeroed page on success.
|
|
|
|
*
|
|
|
|
* __GFP_NOTRACK avoids tracking with kmemcheck.
|
|
|
|
*
|
|
|
|
* __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of
|
|
|
|
* distinguishing in the source between false positives and allocations that
|
|
|
|
* cannot be supported (e.g. page tables).
|
kmemcheck: add mm functions
With kmemcheck enabled, the slab allocator needs to do this:
1. Tell kmemcheck to allocate the shadow memory which stores the status of
each byte in the allocation proper, e.g. whether it is initialized or
uninitialized.
2. Tell kmemcheck which parts of memory that should be marked uninitialized.
There are actually a few more states, such as "not yet allocated" and
"recently freed".
If a slab cache is set up using the SLAB_NOTRACK flag, it will never return
memory that can take page faults because of kmemcheck.
If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still
request memory with the __GFP_NOTRACK flag. This does not prevent the page
faults from occuring, however, but marks the object in question as being
initialized so that no warnings will ever be produced for this object.
In addition to (and in contrast to) __GFP_NOTRACK, the
__GFP_NOTRACK_FALSE_POSITIVE flag indicates that the allocation should
not be tracked _because_ it would produce a false positive. Their values
are identical, but need not be so in the future (for example, we could now
enable/disable false positives with a config option).
Parts of this patch were contributed by Pekka Enberg but merged for
atomicity.
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
[rebased for mainline inclusion]
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
2008-05-31 20:56:17 +07:00
|
|
|
*/
|
2015-11-07 07:28:43 +07:00
|
|
|
#define __GFP_COLD ((__force gfp_t)___GFP_COLD)
|
|
|
|
#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN)
|
|
|
|
#define __GFP_COMP ((__force gfp_t)___GFP_COMP)
|
|
|
|
#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO)
|
|
|
|
#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK)
|
kmemcheck: add mm functions
With kmemcheck enabled, the slab allocator needs to do this:
1. Tell kmemcheck to allocate the shadow memory which stores the status of
each byte in the allocation proper, e.g. whether it is initialized or
uninitialized.
2. Tell kmemcheck which parts of memory that should be marked uninitialized.
There are actually a few more states, such as "not yet allocated" and
"recently freed".
If a slab cache is set up using the SLAB_NOTRACK flag, it will never return
memory that can take page faults because of kmemcheck.
If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still
request memory with the __GFP_NOTRACK flag. This does not prevent the page
faults from occuring, however, but marks the object in question as being
initialized so that no warnings will ever be produced for this object.
In addition to (and in contrast to) __GFP_NOTRACK, the
__GFP_NOTRACK_FALSE_POSITIVE flag indicates that the allocation should
not be tracked _because_ it would produce a false positive. Their values
are identical, but need not be so in the future (for example, we could now
enable/disable false positives with a config option).
Parts of this patch were contributed by Pekka Enberg but merged for
atomicity.
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
[rebased for mainline inclusion]
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
2008-05-31 20:56:17 +07:00
|
|
|
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
|
|
|
|
|
2017-05-04 04:53:09 +07:00
|
|
|
/* Disable lockdep for GFP context tracking */
|
|
|
|
#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
|
|
|
|
|
2015-11-07 07:28:43 +07:00
|
|
|
/* Room for N __GFP_FOO bits */
|
2017-05-04 04:53:09 +07:00
|
|
|
/*
 * The unconditional ___GFP_* flags occupy bits 0-24 (the highest being
 * ___GFP_KSWAPD_RECLAIM, 0x1000000u), hence 25. ___GFP_NOLOCKDEP
 * (0x2000000u, bit 25) only exists under CONFIG_LOCKDEP, so one more bit
 * is counted in that configuration.
 */
#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
|
2005-10-21 13:55:38 +07:00
|
|
|
/* Mask with every bit below __GFP_BITS_SHIFT set, typed as gfp_t. */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2015-11-07 07:28:21 +07:00
|
|
|
/*
|
2015-11-07 07:28:43 +07:00
|
|
|
* Useful GFP flag combinations that are commonly used. It is recommended
|
|
|
|
* that subsystems start with one of these combinations and then set/clear
|
|
|
|
* __GFP_FOO flags as necessary.
|
|
|
|
*
|
|
|
|
* GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
|
|
|
|
* watermark is applied to allow access to "atomic reserves"
|
|
|
|
*
|
|
|
|
* GFP_KERNEL is typical for kernel-internal allocations. The caller requires
|
|
|
|
* ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
|
|
|
|
*
|
2016-01-15 06:18:12 +07:00
|
|
|
* GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
|
|
|
|
* accounted to kmemcg.
|
|
|
|
*
|
2015-11-07 07:28:43 +07:00
|
|
|
* GFP_NOWAIT is for kernel allocations that should not stall for direct
|
|
|
|
* reclaim, start physical IO or use any filesystem callback.
|
|
|
|
*
|
|
|
|
* GFP_NOIO will use direct reclaim to discard clean pages or slab pages
|
|
|
|
* that do not require the starting of any physical IO.
|
mm: introduce memalloc_nofs_{save,restore} API
GFP_NOFS context is used for the following 5 reasons currently:
- to prevent from deadlocks when the lock held by the allocation
context would be needed during the memory reclaim
- to prevent from stack overflows during the reclaim because the
allocation is performed from a deep context already
- to prevent lockups when the allocation context depends on other
reclaimers to make a forward progress indirectly
- just in case because this would be safe from the fs POV
- silence lockdep false positives
Unfortunately overuse of this allocation context brings some problems to
the MM. Memory reclaim is much weaker (especially during heavy FS
metadata workloads), OOM killer cannot be invoked because the MM layer
doesn't have enough information about how much memory is freeable by the
FS layer.
In many cases it is far from clear why the weaker context is even used
and so it might be used unnecessarily. We would like to get rid of
those as much as possible. One way to do that is to use the flag in
scopes rather than isolated cases. Such a scope is declared when really
necessary, tracked per task and all the allocation requests from within
the context will simply inherit the GFP_NOFS semantic.
Not only this is easier to understand and maintain because there are
much less problematic contexts than specific allocation requests, this
also helps code paths where FS layer interacts with other layers (e.g.
crypto, security modules, MM etc...) and there is no easy way to convey
the allocation context between the layers.
Introduce memalloc_nofs_{save,restore} API to control the scope of
GFP_NOFS allocation context. This is basically copying
memalloc_noio_{save,restore} API we have for other restricted allocation
context GFP_NOIO. The PF_MEMALLOC_NOFS flag already exists and it is
just an alias for PF_FSTRANS which has been xfs specific until recently.
There are no more PF_FSTRANS users anymore so let's just drop it.
PF_MEMALLOC_NOFS is now checked in the MM layer and drops __GFP_FS
implicitly same as PF_MEMALLOC_NOIO drops __GFP_IO. memalloc_noio_flags
is renamed to current_gfp_context because it now cares about both
PF_MEMALLOC_NOFS and PF_MEMALLOC_NOIO contexts. Xfs code paths preserve
their semantic. kmem_flags_convert() doesn't need to evaluate the flag
anymore.
This patch shouldn't introduce any functional changes.
Let's hope that filesystems will drop direct GFP_NOFS (resp. ~__GFP_FS)
usage as much as possible and only use a properly documented
memalloc_nofs_{save,restore} checkpoints where they are appropriate.
[akpm@linux-foundation.org: fix comment typo, reflow comment]
Link: http://lkml.kernel.org/r/20170306131408.9828-5-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Chris Mason <clm@fb.com>
Cc: David Sterba <dsterba@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Brian Foster <bfoster@redhat.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Nikolay Borisov <nborisov@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-04 04:53:15 +07:00
|
|
|
* Please try to avoid using this flag directly and instead use
|
|
|
|
* memalloc_noio_{save,restore} to mark the whole scope which cannot
|
|
|
|
* perform any IO with a short explanation why. All allocation requests
|
|
|
|
* will inherit GFP_NOIO implicitly.
|
2015-11-07 07:28:43 +07:00
|
|
|
*
|
|
|
|
* GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
|
mm: introduce memalloc_nofs_{save,restore} API
GFP_NOFS context is used for the following 5 reasons currently:
- to prevent from deadlocks when the lock held by the allocation
context would be needed during the memory reclaim
- to prevent from stack overflows during the reclaim because the
allocation is performed from a deep context already
- to prevent lockups when the allocation context depends on other
reclaimers to make a forward progress indirectly
- just in case because this would be safe from the fs POV
- silence lockdep false positives
Unfortunately overuse of this allocation context brings some problems to
the MM. Memory reclaim is much weaker (especially during heavy FS
metadata workloads), OOM killer cannot be invoked because the MM layer
doesn't have enough information about how much memory is freeable by the
FS layer.
In many cases it is far from clear why the weaker context is even used
and so it might be used unnecessarily. We would like to get rid of
those as much as possible. One way to do that is to use the flag in
scopes rather than isolated cases. Such a scope is declared when really
necessary, tracked per task and all the allocation requests from within
the context will simply inherit the GFP_NOFS semantic.
Not only this is easier to understand and maintain because there are
much less problematic contexts than specific allocation requests, this
also helps code paths where FS layer interacts with other layers (e.g.
crypto, security modules, MM etc...) and there is no easy way to convey
the allocation context between the layers.
Introduce memalloc_nofs_{save,restore} API to control the scope of
GFP_NOFS allocation context. This is basically copying
memalloc_noio_{save,restore} API we have for other restricted allocation
context GFP_NOIO. The PF_MEMALLOC_NOFS flag already exists and it is
just an alias for PF_FSTRANS which has been xfs specific until recently.
There are no more PF_FSTRANS users anymore so let's just drop it.
PF_MEMALLOC_NOFS is now checked in the MM layer and drops __GFP_FS
implicitly same as PF_MEMALLOC_NOIO drops __GFP_IO. memalloc_noio_flags
is renamed to current_gfp_context because it now cares about both
PF_MEMALLOC_NOFS and PF_MEMALLOC_NOIO contexts. Xfs code paths preserve
their semantic. kmem_flags_convert() doesn't need to evaluate the flag
anymore.
This patch shouldn't introduce any functional changes.
Let's hope that filesystems will drop direct GFP_NOFS (resp. ~__GFP_FS)
usage as much as possible and only use a properly documented
memalloc_nofs_{save,restore} checkpoints where they are appropriate.
[akpm@linux-foundation.org: fix comment typo, reflow comment]
Link: http://lkml.kernel.org/r/20170306131408.9828-5-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Chris Mason <clm@fb.com>
Cc: David Sterba <dsterba@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Brian Foster <bfoster@redhat.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Nikolay Borisov <nborisov@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-04 04:53:15 +07:00
|
|
|
* Please try to avoid using this flag directly and instead use
|
|
|
|
* memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
|
|
|
|
* recurse into the FS layer with a short explanation why. All allocation
|
|
|
|
* requests will inherit GFP_NOFS implicitly.
|
2015-11-07 07:28:43 +07:00
|
|
|
*
|
|
|
|
* GFP_USER is for userspace allocations that also need to be directly
|
|
|
|
* accessible by the kernel or hardware. It is typically used by hardware
|
|
|
|
* for buffers that are mapped to userspace (e.g. graphics) that hardware
|
|
|
|
* still must DMA to. cpuset limits are enforced for these allocations.
|
|
|
|
*
|
|
|
|
* GFP_DMA exists for historical reasons and should be avoided where possible.
|
|
|
|
* The flag indicates that the caller requires that the lowest zone be
|
|
|
|
* used (ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
|
|
|
|
* it would require careful auditing as some users really require it and
|
|
|
|
* others use the flag to avoid lowmem reserves in ZONE_DMA and treat the
|
|
|
|
* lowest zone as a type of emergency reserve.
|
|
|
|
*
|
|
|
|
* GFP_DMA32 is similar to GFP_DMA except that the caller requires a 32-bit
|
|
|
|
* address.
|
|
|
|
*
|
|
|
|
* GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
|
|
|
|
* do not need to be directly accessible by the kernel but that cannot
|
|
|
|
* move once in use. An example may be a hardware allocation that maps
|
|
|
|
* data directly into userspace but has no addressing limitations.
|
|
|
|
*
|
|
|
|
* GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
|
|
|
|
* need direct access to but can use kmap() when access is required. They
|
|
|
|
* are expected to be movable via page reclaim or page migration. Typically,
|
|
|
|
* pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE.
|
|
|
|
*
|
mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations
After the previous patch, we can distinguish costly allocations that
should be really lightweight, such as THP page faults, with
__GFP_NORETRY. This means we don't need to recognize khugepaged
allocations via PF_KTHREAD anymore. We can also change THP page faults
in areas where madvise(MADV_HUGEPAGE) was used to try as hard as
khugepaged, as the process has indicated that it benefits from THP's and
is willing to pay some initial latency costs.
We can also make the flags handling less cryptic by distinguishing
GFP_TRANSHUGE_LIGHT (no reclaim at all, default mode in page fault) from
GFP_TRANSHUGE (only direct reclaim, khugepaged default). Adding
__GFP_NORETRY or __GFP_KSWAPD_RECLAIM is done where needed.
The patch effectively changes the current GFP_TRANSHUGE users as
follows:
* get_huge_zero_page() - the zero page lifetime should be relatively
long and it's shared by multiple users, so it's worth spending some
effort on it. We use GFP_TRANSHUGE, and __GFP_NORETRY is not added.
This also restores direct reclaim to this allocation, which was
unintentionally removed by commit e4a49efe4e7e ("mm: thp: set THP defrag
by default to madvise and add a stall-free defrag option")
* alloc_hugepage_khugepaged_gfpmask() - this is khugepaged, so latency
is not an issue. So if khugepaged "defrag" is enabled (the default), do
reclaim via GFP_TRANSHUGE without __GFP_NORETRY. We can remove the
PF_KTHREAD check from page alloc.
As a side-effect, khugepaged will now no longer check if the initial
compaction was deferred or contended. This is OK, as khugepaged sleep
times between collapsion attempts are long enough to prevent noticeable
disruption, so we should allow it to spend some effort.
* migrate_misplaced_transhuge_page() - already was masking out
__GFP_RECLAIM, so just convert to GFP_TRANSHUGE_LIGHT which is
equivalent.
* alloc_hugepage_direct_gfpmask() - vma's with VM_HUGEPAGE (via madvise)
are now allocating without __GFP_NORETRY. Other vma's keep using
__GFP_NORETRY if direct reclaim/compaction is at all allowed (by default
it's allowed only for madvised vma's). The rest is conversion to
GFP_TRANSHUGE(_LIGHT).
[mhocko@suse.com: suggested GFP_TRANSHUGE_LIGHT]
Link: http://lkml.kernel.org/r/20160721073614.24395-7-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-07-29 05:49:25 +07:00
|
|
|
* GFP_TRANSHUGE and GFP_TRANSHUGE_LIGHT are used for THP allocations. They are
|
|
|
|
* compound allocations that will generally fail quickly if memory is not
|
|
|
|
* available and will not wake kswapd/kcompactd on failure. The _LIGHT
|
|
|
|
* version does not attempt reclaim/compaction at all and is by default used
|
|
|
|
* in page fault path, while the non-light is used by khugepaged.
|
2015-11-07 07:28:21 +07:00
|
|
|
*/
|
|
|
|
#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
|
2015-11-07 07:28:43 +07:00
|
|
|
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
|
2016-01-15 06:18:12 +07:00
|
|
|
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
|
2015-11-07 07:28:21 +07:00
|
|
|
#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
|
2015-11-07 07:28:28 +07:00
|
|
|
#define GFP_NOIO (__GFP_RECLAIM)
|
|
|
|
#define GFP_NOFS (__GFP_RECLAIM | __GFP_IO)
|
|
|
|
#define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
|
2015-11-07 07:28:43 +07:00
|
|
|
#define GFP_DMA __GFP_DMA
|
|
|
|
#define GFP_DMA32 __GFP_DMA32
|
2014-12-13 07:55:43 +07:00
|
|
|
#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM)
|
|
|
|
#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE)
|
mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations
After the previous patch, we can distinguish costly allocations that
should be really lightweight, such as THP page faults, with
__GFP_NORETRY. This means we don't need to recognize khugepaged
allocations via PF_KTHREAD anymore. We can also change THP page faults
in areas where madvise(MADV_HUGEPAGE) was used to try as hard as
khugepaged, as the process has indicated that it benefits from THP's and
is willing to pay some initial latency costs.
We can also make the flags handling less cryptic by distinguishing
GFP_TRANSHUGE_LIGHT (no reclaim at all, default mode in page fault) from
GFP_TRANSHUGE (only direct reclaim, khugepaged default). Adding
__GFP_NORETRY or __GFP_KSWAPD_RECLAIM is done where needed.
The patch effectively changes the current GFP_TRANSHUGE users as
follows:
* get_huge_zero_page() - the zero page lifetime should be relatively
long and it's shared by multiple users, so it's worth spending some
effort on it. We use GFP_TRANSHUGE, and __GFP_NORETRY is not added.
This also restores direct reclaim to this allocation, which was
unintentionally removed by commit e4a49efe4e7e ("mm: thp: set THP defrag
by default to madvise and add a stall-free defrag option")
* alloc_hugepage_khugepaged_gfpmask() - this is khugepaged, so latency
is not an issue. So if khugepaged "defrag" is enabled (the default), do
reclaim via GFP_TRANSHUGE without __GFP_NORETRY. We can remove the
PF_KTHREAD check from page alloc.
As a side-effect, khugepaged will now no longer check if the initial
compaction was deferred or contended. This is OK, as khugepaged sleep
times between collapse attempts are long enough to prevent noticeable
disruption, so we should allow it to spend some effort.
* migrate_misplaced_transhuge_page() - already was masking out
__GFP_RECLAIM, so just convert to GFP_TRANSHUGE_LIGHT which is
equivalent.
* alloc_hugepage_direct_gfpmask() - vma's with VM_HUGEPAGE (via madvise)
are now allocating without __GFP_NORETRY. Other vma's keep using
__GFP_NORETRY if direct reclaim/compaction is at all allowed (by default
it's allowed only for madvised vma's). The rest is conversion to
GFP_TRANSHUGE(_LIGHT).
[mhocko@suse.com: suggested GFP_TRANSHUGE_LIGHT]
Link: http://lkml.kernel.org/r/20160721073614.24395-7-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-07-29 05:49:25 +07:00
|
|
|
#define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
|
|
|
|
__GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
|
|
|
|
#define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2015-11-07 07:28:43 +07:00
|
|
|
/* Convert GFP flags to their corresponding migrate type */
|
2007-10-16 15:25:52 +07:00
|
|
|
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
|
2015-11-07 07:28:18 +07:00
|
|
|
#define GFP_MOVABLE_SHIFT 3
|
2007-10-16 15:25:41 +07:00
|
|
|
|
2014-10-10 05:27:25 +07:00
|
|
|
static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 15:26:02 +07:00
|
|
|
{
|
2015-11-07 07:28:18 +07:00
|
|
|
VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
|
|
|
|
BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
|
|
|
|
BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 15:26:02 +07:00
|
|
|
|
|
|
|
if (unlikely(page_group_by_mobility_disabled))
|
|
|
|
return MIGRATE_UNMOVABLE;
|
|
|
|
|
|
|
|
/* Group based on mobility */
|
2015-11-07 07:28:18 +07:00
|
|
|
return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 15:26:02 +07:00
|
|
|
}
|
2015-11-07 07:28:43 +07:00
|
|
|
#undef GFP_MOVABLE_MASK
|
|
|
|
#undef GFP_MOVABLE_SHIFT
|
2005-11-05 23:25:53 +07:00
|
|
|
|
2015-11-07 07:28:21 +07:00
|
|
|
static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
|
|
|
|
{
|
2016-01-15 06:22:10 +07:00
|
|
|
return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
|
2015-11-07 07:28:21 +07:00
|
|
|
}
|
|
|
|
|
2009-06-17 05:32:46 +07:00
|
|
|
#ifdef CONFIG_HIGHMEM
|
|
|
|
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
|
|
|
|
#else
|
|
|
|
#define OPT_ZONE_HIGHMEM ZONE_NORMAL
|
|
|
|
#endif
|
|
|
|
|
2007-02-10 16:43:10 +07:00
|
|
|
#ifdef CONFIG_ZONE_DMA
|
2009-06-17 05:32:46 +07:00
|
|
|
#define OPT_ZONE_DMA ZONE_DMA
|
|
|
|
#else
|
|
|
|
#define OPT_ZONE_DMA ZONE_NORMAL
|
2007-02-10 16:43:10 +07:00
|
|
|
#endif
|
2009-06-17 05:32:46 +07:00
|
|
|
|
2006-09-26 13:31:17 +07:00
|
|
|
#ifdef CONFIG_ZONE_DMA32
|
2009-06-17 05:32:46 +07:00
|
|
|
#define OPT_ZONE_DMA32 ZONE_DMA32
|
|
|
|
#else
|
|
|
|
#define OPT_ZONE_DMA32 ZONE_NORMAL
|
2006-09-26 13:31:17 +07:00
|
|
|
#endif
|
2009-06-17 05:32:46 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
|
2017-05-04 04:54:51 +07:00
|
|
|
* zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT
|
|
|
|
* bits long and there are 16 of them to cover all possible combinations of
|
2010-05-25 04:32:44 +07:00
|
|
|
* __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
|
2009-06-17 05:32:46 +07:00
|
|
|
*
|
|
|
|
* The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
|
|
|
|
* But GFP_MOVABLE is not only a zone specifier but also an allocation
|
|
|
|
* policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
|
2010-05-25 04:32:44 +07:00
|
|
|
* Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1".
|
2009-06-17 05:32:46 +07:00
|
|
|
*
|
|
|
|
* bit result
|
|
|
|
* =================
|
|
|
|
* 0x0 => NORMAL
|
|
|
|
* 0x1 => DMA or NORMAL
|
|
|
|
* 0x2 => HIGHMEM or NORMAL
|
|
|
|
* 0x3 => BAD (DMA+HIGHMEM)
|
|
|
|
* 0x4 => DMA32 or NORMAL
|
|
|
|
* 0x5 => BAD (DMA+DMA32)
|
|
|
|
* 0x6 => BAD (HIGHMEM+DMA32)
|
|
|
|
* 0x7 => BAD (HIGHMEM+DMA32+DMA)
|
|
|
|
* 0x8 => NORMAL (MOVABLE+0)
|
|
|
|
* 0x9 => DMA or NORMAL (MOVABLE+DMA)
|
|
|
|
* 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too)
|
|
|
|
* 0xb => BAD (MOVABLE+HIGHMEM+DMA)
|
2013-07-09 06:00:02 +07:00
|
|
|
* 0xc => DMA32 or NORMAL (MOVABLE+DMA32)
|
2009-06-17 05:32:46 +07:00
|
|
|
* 0xd => BAD (MOVABLE+DMA32+DMA)
|
|
|
|
* 0xe => BAD (MOVABLE+DMA32+HIGHMEM)
|
|
|
|
* 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
|
|
|
|
*
|
2016-03-18 04:19:41 +07:00
|
|
|
* GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
|
2009-06-17 05:32:46 +07:00
|
|
|
*/
|
|
|
|
|
2016-03-18 04:19:41 +07:00
|
|
|
#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
|
|
|
|
/* ZONE_DEVICE is not a valid GFP zone specifier */
|
|
|
|
#define GFP_ZONES_SHIFT 2
|
|
|
|
#else
|
|
|
|
#define GFP_ZONES_SHIFT ZONES_SHIFT
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
|
|
|
|
#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
|
2009-06-17 05:32:46 +07:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#define GFP_ZONE_TABLE ( \
|
2016-03-18 04:19:41 +07:00
|
|
|
(ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \
|
|
|
|
| (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \
|
|
|
|
| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \
|
|
|
|
| (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \
|
|
|
|
| (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \
|
|
|
|
| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \
|
|
|
|
| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\
|
|
|
|
| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\
|
2009-06-17 05:32:46 +07:00
|
|
|
)
|
|
|
|
|
|
|
|
/*
|
2010-05-25 04:32:44 +07:00
|
|
|
* GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32
|
2009-06-17 05:32:46 +07:00
|
|
|
* __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per
|
|
|
|
* entry starting with bit 0. Bit is set if the combination is not
|
|
|
|
* allowed.
|
|
|
|
*/
|
|
|
|
#define GFP_ZONE_BAD ( \
|
2010-10-27 04:22:04 +07:00
|
|
|
1 << (___GFP_DMA | ___GFP_HIGHMEM) \
|
|
|
|
| 1 << (___GFP_DMA | ___GFP_DMA32) \
|
|
|
|
| 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \
|
|
|
|
| 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \
|
|
|
|
| 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \
|
|
|
|
| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \
|
|
|
|
| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \
|
|
|
|
| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \
|
2009-06-17 05:32:46 +07:00
|
|
|
)
|
|
|
|
|
|
|
|
/*
 * Look up the zone selected by the zone bits of @flags.
 *
 * The bits of @flags under GFP_ZONEMASK form an index into GFP_ZONE_TABLE,
 * whose entries are GFP_ZONES_SHIFT bits wide.  The same index is checked
 * against the GFP_ZONE_BAD bitmap with VM_BUG_ON to catch selector
 * combinations that are not permitted.
 */
static inline enum zone_type gfp_zone(gfp_t flags)
{
	const int bit = (__force int) (flags & GFP_ZONEMASK);
	const int entry_shift = bit * GFP_ZONES_SHIFT;
	const int entry_mask = (1 << GFP_ZONES_SHIFT) - 1;
	enum zone_type z = (GFP_ZONE_TABLE >> entry_shift) & entry_mask;

	VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
	return z;
}
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* There is only one page-allocator function, and two main namespaces to
|
|
|
|
* it. The alloc_page*() variants return 'struct page *' and as such
|
|
|
|
* can allocate highmem pages, the *get*page*() variants return
|
|
|
|
* virtual kernel addresses to the allocated page(s).
|
|
|
|
*/
|
|
|
|
|
2008-04-28 16:12:16 +07:00
|
|
|
/*
 * Pick the zonelist index for @flags: ZONELIST_NOFALLBACK when the kernel
 * is built with CONFIG_NUMA and __GFP_THISNODE is set, ZONELIST_FALLBACK in
 * every other case.
 */
static inline int gfp_zonelist(gfp_t flags)
{
	int idx = ZONELIST_FALLBACK;

#ifdef CONFIG_NUMA
	if (unlikely(flags & __GFP_THISNODE))
		idx = ZONELIST_NOFALLBACK;
#endif
	return idx;
}
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* We get the zone list from the current node and the gfp_mask.
|
|
|
|
* This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
|
2008-04-28 16:12:16 +07:00
|
|
|
* There are two zonelists per node, one for all zones with memory and
|
|
|
|
* one containing just zones from the node the zonelist belongs to.
|
2005-04-17 05:20:36 +07:00
|
|
|
*
|
|
|
|
* For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
|
|
|
|
* optimized to &contig_page_data at compile-time.
|
|
|
|
*/
|
2008-04-28 16:12:14 +07:00
|
|
|
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
	/* Index into the node's zonelists as chosen by gfp_zonelist(). */
	const int idx = gfp_zonelist(flags);

	return &NODE_DATA(nid)->node_zonelists[idx];
}
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
#ifndef HAVE_ARCH_FREE_PAGE
/* Empty default, compiled only when HAVE_ARCH_FREE_PAGE is not defined. */
static inline void arch_free_page(struct page *page, int order)
{
}
#endif
|
2006-12-07 11:32:00 +07:00
|
|
|
#ifndef HAVE_ARCH_ALLOC_PAGE
/* Empty default, compiled only when HAVE_ARCH_ALLOC_PAGE is not defined. */
static inline void arch_alloc_page(struct page *page, int order)
{
}
#endif
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2008-07-24 11:27:01 +07:00
|
|
|
struct page *
|
2017-07-07 05:40:03 +07:00
|
|
|
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
|
|
|
|
nodemask_t *nodemask);
|
2008-07-24 11:27:01 +07:00
|
|
|
|
|
|
|
static inline struct page *
|
2017-07-07 05:40:03 +07:00
|
|
|
__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
|
2008-07-24 11:27:01 +07:00
|
|
|
{
|
2017-07-07 05:40:03 +07:00
|
|
|
return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
|
2008-07-24 11:27:01 +07:00
|
|
|
}
|
|
|
|
|
mm: rename alloc_pages_exact_node() to __alloc_pages_node()
alloc_pages_exact_node() was introduced in commit 6484eb3e2a81 ("page
allocator: do not check NUMA node ID when the caller knows the node is
valid") as an optimized variant of alloc_pages_node(), that doesn't
fallback to current node for nid == NUMA_NO_NODE. Unfortunately the
name of the function can easily suggest that the allocation is
restricted to the given node and fails otherwise. In truth, the node is
only preferred, unless __GFP_THISNODE is passed among the gfp flags.
The misleading name has led to mistakes in the past, see for example
commits 5265047ac301 ("mm, thp: really limit transparent hugepage
allocation to local node") and b360edb43f8e ("mm, mempolicy:
migrate_to_node should only migrate to node").
Another issue with the name is that there's a family of
alloc_pages_exact*() functions where 'exact' means exact size (instead
of page order), which leads to more confusion.
To prevent further mistakes, this patch effectively renames
alloc_pages_exact_node() to __alloc_pages_node() to better convey that
it's an optimized variant of alloc_pages_node() not intended for general
usage. Both functions get described in comments.
It has been also considered to really provide a convenience function for
allocations restricted to a node, but the major opinion seems to be that
__GFP_THISNODE already provides that functionality and we shouldn't
duplicate the API needlessly. The number of users would be small
anyway.
Existing callers of alloc_pages_exact_node() are simply converted to
call __alloc_pages_node(), with the exception of sba_alloc_coherent()
which open-codes the check for NUMA_NO_NODE, so it is converted to use
alloc_pages_node() instead. This means it no longer performs some
VM_BUG_ON checks, and since the current check for nid in
alloc_pages_node() uses a 'nid < 0' comparison (which includes
NUMA_NO_NODE), it may hide wrong values which would be previously
exposed.
Both differences will be rectified by the next patch.
To sum up, this patch makes no functional changes, except temporarily
hiding potentially buggy callers. Restricting the checks in
alloc_pages_node() is left for the next patch which can in turn expose
more existing buggy callers.
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Robin Holt <robinmholt@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mel Gorman <mgorman@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Gleb Natapov <gleb@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Cliff Whickman <cpw@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-09 05:03:50 +07:00
|
|
|
/*
|
|
|
|
* Allocate pages, preferring the node given as nid. The node must be valid and
|
|
|
|
* online. For more general interface, see alloc_pages_node().
|
|
|
|
*/
|
|
|
|
static inline struct page *
|
|
|
|
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2015-09-09 05:03:53 +07:00
|
|
|
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
|
|
|
|
VM_WARN_ON(!node_online(nid));
|
2006-01-12 04:43:45 +07:00
|
|
|
|
2017-07-07 05:40:03 +07:00
|
|
|
return __alloc_pages(gfp_mask, order, nid);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
mm: rename alloc_pages_exact_node() to __alloc_pages_node()
alloc_pages_exact_node() was introduced in commit 6484eb3e2a81 ("page
allocator: do not check NUMA node ID when the caller knows the node is
valid") as an optimized variant of alloc_pages_node(), that doesn't
fallback to current node for nid == NUMA_NO_NODE. Unfortunately the
name of the function can easily suggest that the allocation is
restricted to the given node and fails otherwise. In truth, the node is
only preferred, unless __GFP_THISNODE is passed among the gfp flags.
The misleading name has led to mistakes in the past, see for example
commits 5265047ac301 ("mm, thp: really limit transparent hugepage
allocation to local node") and b360edb43f8e ("mm, mempolicy:
migrate_to_node should only migrate to node").
Another issue with the name is that there's a family of
alloc_pages_exact*() functions where 'exact' means exact size (instead
of page order), which leads to more confusion.
To prevent further mistakes, this patch effectively renames
alloc_pages_exact_node() to __alloc_pages_node() to better convey that
it's an optimized variant of alloc_pages_node() not intended for general
usage. Both functions get described in comments.
It has been also considered to really provide a convenience function for
allocations restricted to a node, but the major opinion seems to be that
__GFP_THISNODE already provides that functionality and we shouldn't
duplicate the API needlessly. The number of users would be small
anyway.
Existing callers of alloc_pages_exact_node() are simply converted to
call __alloc_pages_node(), with the exception of sba_alloc_coherent()
which open-codes the check for NUMA_NO_NODE, so it is converted to use
alloc_pages_node() instead. This means it no longer performs some
VM_BUG_ON checks, and since the current check for nid in
alloc_pages_node() uses a 'nid < 0' comparison (which includes
NUMA_NO_NODE), it may hide wrong values which would be previously
exposed.
Both differences will be rectified by the next patch.
To sum up, this patch makes no functional changes, except temporarily
hiding potentially buggy callers. Restricting the checks in
alloc_pages_node() is left for the next patch which can in turn expose
more existing buggy callers.
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Robin Holt <robinmholt@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mel Gorman <mgorman@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Gleb Natapov <gleb@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Cliff Whickman <cpw@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-09 05:03:50 +07:00
|
|
|
/*
|
|
|
|
* Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
|
2015-09-09 05:03:56 +07:00
|
|
|
* prefer the current CPU's closest node. Otherwise node must be valid and
|
|
|
|
* online.
|
mm: rename alloc_pages_exact_node() to __alloc_pages_node()
alloc_pages_exact_node() was introduced in commit 6484eb3e2a81 ("page
allocator: do not check NUMA node ID when the caller knows the node is
valid") as an optimized variant of alloc_pages_node(), that doesn't
fallback to current node for nid == NUMA_NO_NODE. Unfortunately the
name of the function can easily suggest that the allocation is
restricted to the given node and fails otherwise. In truth, the node is
only preferred, unless __GFP_THISNODE is passed among the gfp flags.
The misleading name has led to mistakes in the past, see for example
commits 5265047ac301 ("mm, thp: really limit transparent hugepage
allocation to local node") and b360edb43f8e ("mm, mempolicy:
migrate_to_node should only migrate to node").
Another issue with the name is that there's a family of
alloc_pages_exact*() functions where 'exact' means exact size (instead
of page order), which leads to more confusion.
To prevent further mistakes, this patch effectively renames
alloc_pages_exact_node() to __alloc_pages_node() to better convey that
it's an optimized variant of alloc_pages_node() not intended for general
usage. Both functions get described in comments.
It has been also considered to really provide a convenience function for
allocations restricted to a node, but the major opinion seems to be that
__GFP_THISNODE already provides that functionality and we shouldn't
duplicate the API needlessly. The number of users would be small
anyway.
Existing callers of alloc_pages_exact_node() are simply converted to
call __alloc_pages_node(), with the exception of sba_alloc_coherent()
which open-codes the check for NUMA_NO_NODE, so it is converted to use
alloc_pages_node() instead. This means it no longer performs some
VM_BUG_ON checks, and since the current check for nid in
alloc_pages_node() uses a 'nid < 0' comparison (which includes
NUMA_NO_NODE), it may hide wrong values which would be previously
exposed.
Both differences will be rectified by the next patch.
To sum up, this patch makes no functional changes, except temporarily
hiding potentially buggy callers. Restricting the checks in
alloc_pages_node() is left for the next patch which can in turn expose
more existing buggy callers.
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Robin Holt <robinmholt@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mel Gorman <mgorman@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Gleb Natapov <gleb@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Cliff Whickman <cpw@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-09 05:03:50 +07:00
|
|
|
*/
|
|
|
|
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
					    unsigned int order)
{
	/* NUMA_NO_NODE is replaced by numa_mem_id(); other ids pass through. */
	const int target_nid = (nid == NUMA_NO_NODE) ? numa_mem_id() : nid;

	return __alloc_pages_node(target_nid, gfp_mask, order);
}
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
#ifdef CONFIG_NUMA
|
2005-10-07 13:46:04 +07:00
|
|
|
extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
static inline struct page *
|
2005-10-07 13:46:04 +07:00
|
|
|
alloc_pages(gfp_t gfp_mask, unsigned int order)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
|
|
|
return alloc_pages_current(gfp_mask, order);
|
|
|
|
}
|
2011-01-14 06:47:05 +07:00
|
|
|
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
|
2011-03-05 08:36:29 +07:00
|
|
|
struct vm_area_struct *vma, unsigned long addr,
|
2015-02-12 06:27:15 +07:00
|
|
|
int node, bool hugepage);
|
|
|
|
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
|
|
|
|
alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
|
2005-04-17 05:20:36 +07:00
|
|
|
#else
|
|
|
|
/*
 * !CONFIG_NUMA variants.
 *
 * alloc_pages() allocates via alloc_pages_node() using numa_node_id() as the
 * node.  alloc_pages_vma() and alloc_hugepage_vma() keep the argument lists
 * of their CONFIG_NUMA counterparts so callers compile unchanged, but the
 * vma, addr, node and hugepage arguments are ignored and never evaluated;
 * only gfp_mask and order reach alloc_pages().
 */
#define alloc_pages(gfp_mask, order) \
		alloc_pages_node(numa_node_id(), gfp_mask, order)
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, hugepage)	\
	alloc_pages(gfp_mask, order)
#define alloc_hugepage_vma(gfp_mask, vma, addr, order)	\
	alloc_pages(gfp_mask, order)
|
2005-04-17 05:20:36 +07:00
|
|
|
#endif
|
|
|
|
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
|
2011-03-05 08:36:29 +07:00
|
|
|
#define alloc_page_vma(gfp_mask, vma, addr) \
|
2015-02-12 06:27:15 +07:00
|
|
|
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
|
2011-03-05 08:36:30 +07:00
|
|
|
#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
|
2015-02-12 06:27:15 +07:00
|
|
|
alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2008-02-14 06:03:15 +07:00
|
|
|
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
|
|
|
|
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2008-07-24 11:28:11 +07:00
|
|
|
void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
|
|
|
|
void free_pages_exact(void *virt, size_t size);
|
2014-08-07 06:04:59 +07:00
|
|
|
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
|
2008-07-24 11:28:11 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
#define __get_free_page(gfp_mask) \
|
2010-05-25 04:32:45 +07:00
|
|
|
__get_free_pages((gfp_mask), 0)
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
#define __get_dma_pages(gfp_mask, order) \
|
2010-05-25 04:32:45 +07:00
|
|
|
__get_free_pages((gfp_mask) | GFP_DMA, (order))
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2008-02-14 06:03:15 +07:00
|
|
|
extern void __free_pages(struct page *page, unsigned int order);
|
|
|
|
extern void free_pages(unsigned long addr, unsigned int order);
|
2014-06-05 06:10:22 +07:00
|
|
|
extern void free_hot_cold_page(struct page *page, bool cold);
|
|
|
|
extern void free_hot_cold_page_list(struct list_head *list, bool cold);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2015-05-07 11:11:57 +07:00
|
|
|
struct page_frag_cache;
|
2017-01-11 07:58:09 +07:00
|
|
|
extern void __page_frag_cache_drain(struct page *page, unsigned int count);
|
2017-01-11 07:58:06 +07:00
|
|
|
extern void *page_frag_alloc(struct page_frag_cache *nc,
|
|
|
|
unsigned int fragsz, gfp_t gfp_mask);
|
|
|
|
extern void page_frag_free(void *addr);
|
2015-05-07 11:11:57 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/* Order-0 shorthand: hands the page to __free_pages() with order 0. */
#define __free_page(page) \
	__free_pages((page), 0)
|
2010-05-25 04:32:45 +07:00
|
|
|
/* Order-0 shorthand: hands the address to free_pages() with order 0. */
#define free_page(addr) \
	free_pages((addr), 0)
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
/*
 * Page allocator init hooks and page-draining entry points (prototypes
 * only).  drain_zone_pages() works on a zone together with a
 * struct per_cpu_pages; drain_all_pages() and drain_local_pages() take
 * only the zone.
 */
void page_alloc_init(void);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);

void page_alloc_init_late(void);
|
|
|
|
|
2012-01-11 06:07:15 +07:00
|
|
|
/*
|
|
|
|
* gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
|
|
|
|
* GFP flags are used before interrupts are enabled. Once interrupts are
|
|
|
|
* enabled, it is set to __GFP_BITS_MASK while the system is running. During
|
|
|
|
* hibernation, it is used by PM to avoid I/O during memory allocation while
|
|
|
|
* devices are suspended.
|
|
|
|
*/
|
2009-06-18 10:24:12 +07:00
|
|
|
extern gfp_t gfp_allowed_mask;
|
|
|
|
|
2012-08-01 06:44:19 +07:00
|
|
|
/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
|
|
|
|
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
|
|
|
|
|
2010-12-04 04:57:45 +07:00
|
|
|
/*
 * Power-management hooks for the allowed gfp mask.
 * NOTE(review): judging by the names, the first narrows gfp_allowed_mask
 * and the second puts the previous value back -- confirm in the definitions.
 */
extern void pm_restrict_gfp_mask(void);
extern void pm_restore_gfp_mask(void);
|
2009-06-18 10:24:12 +07:00
|
|
|
|
2012-01-11 06:07:15 +07:00
|
|
|
/*
 * pm_suspended_storage() - with CONFIG_PM_SLEEP only the prototype is
 * provided here; without it, this inline stub always reports false.
 */
#ifdef CONFIG_PM_SLEEP
extern bool pm_suspended_storage(void);
#else
static inline bool pm_suspended_storage(void)
{
	return false;
}
#endif /* CONFIG_PM_SLEEP */
|
|
|
|
|
2016-02-06 06:36:41 +07:00
|
|
|
#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range(unsigned long start, unsigned long end,
			      unsigned migratetype, gfp_t gfp_mask);
extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
#endif /* (CONFIG_MEMORY_ISOLATION && CONFIG_COMPACTION) || CONFIG_CMA */
|
2011-12-29 19:09:50 +07:00
|
|
|
|
2016-02-06 06:36:41 +07:00
|
|
|
#ifdef CONFIG_CMA
/* CMA stuff */
extern void init_cma_reserved_pageblock(struct page *page);
#endif /* CONFIG_CMA */
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
#endif /* __LINUX_GFP_H */
|