Merge branch 'akpm' (Andrew's patch-bomb)
Merge Andrew's second set of patches:
 - MM
 - a few random fixes
 - a couple of RTC leftovers

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (120 commits)
  rtc/rtc-88pm80x: remove unneeded devm_kfree
  rtc/rtc-88pm80x: assign ret only when rtc_register_driver fails
  mm: hugetlbfs: close race during teardown of hugetlbfs shared page tables
  tmpfs: distribute interleave better across nodes
  mm: remove redundant initialization
  mm: warn if pg_data_t isn't initialized with zero
  mips: zero out pg_data_t when it's allocated
  memcg: fix memory accounting scalability in shrink_page_list
  mm/sparse: remove index_init_lock
  mm/sparse: more checks on mem_section number
  mm/sparse: optimize sparse_index_alloc
  memcg: add mem_cgroup_from_css() helper
  memcg: further prevent OOM with too many dirty pages
  memcg: prevent OOM with too many dirty pages
  mm: mmu_notifier: fix freed page still mapped in secondary MMU
  mm: memcg: only check anon swapin page charges for swap cache
  mm: memcg: only check swap cache pages for repeated charging
  mm: memcg: split swapin charge function into private and public part
  mm: memcg: remove needless !mm fixup to init_mm when charging
  mm: memcg: remove unneeded shmem charge type
  ...
commit ac694dbdbc
@@ -0,0 +1,5 @@
What:		/proc/sys/vm/nr_pdflush_threads
Date:		June 2012
Contact:	Wanpeng Li <liwp@linux.vnet.ibm.com>
Description:	Since pdflush has been replaced by per-BDI flushers, the old pdflush
		interface exported in /proc/sys/vm/ should be removed.
Documentation/cgroups/hugetlb.txt (new file, 45 lines)
@@ -0,0 +1,45 @@
HugeTLB Controller
-------------------

The HugeTLB controller allows limiting HugeTLB usage per control group and
enforces the limit during page faults. Since HugeTLB does not support page
reclaim, enforcing the limit at page-fault time implies that the application
will get a SIGBUS signal if it tries to access HugeTLB pages beyond its
limit. This requires the application to know beforehand how many HugeTLB
pages it will need.

The HugeTLB controller is enabled by first mounting the cgroup filesystem.

 # mount -t cgroup -o hugetlb none /sys/fs/cgroup

With the above step, the initial or the parent HugeTLB group becomes
visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.

New groups can be created under the parent group /sys/fs/cgroup.

 # cd /sys/fs/cgroup
 # mkdir g1
 # echo $$ > g1/tasks

The above steps create a new group g1 and move the current shell
process (bash) into it.

Brief summary of control files

 hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
 hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
 hugetlb.<hugepagesize>.usage_in_bytes     # show current res_counter usage for "hugepagesize" hugetlb
 hugetlb.<hugepagesize>.failcnt            # show the number of allocation failures due to the HugeTLB limit

For a system supporting two hugepage sizes (16M and 16G) the control
files include:

hugetlb.16GB.limit_in_bytes
hugetlb.16GB.max_usage_in_bytes
hugetlb.16GB.usage_in_bytes
hugetlb.16GB.failcnt
hugetlb.16MB.limit_in_bytes
hugetlb.16MB.max_usage_in_bytes
hugetlb.16MB.usage_in_bytes
hugetlb.16MB.failcnt
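To make the control-file interface above concrete, here is a small userspace sketch (not part of the patch) that sets a limit for the g1 group created earlier and reads the current usage back. The 2MB hugepage size and the /sys/fs/cgroup/g1 path are illustrative assumptions; on the 16M/16G example system above the files would be named hugetlb.16MB.* instead.

/* Illustrative only: limit group g1 to 64 huge pages of 2MB (128 MiB)
 * and print the usage currently charged to the group. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_string(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd;

	if (write_string("/sys/fs/cgroup/g1/hugetlb.2MB.limit_in_bytes",
			 "134217728") < 0) {
		perror("set hugetlb limit");
		return 1;
	}

	fd = open("/sys/fs/cgroup/g1/hugetlb.2MB.usage_in_bytes", O_RDONLY);
	if (fd < 0) {
		perror("read hugetlb usage");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n > 0) {
		buf[n] = '\0';
		printf("current hugetlb usage: %s", buf);
	}
	return 0;
}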
@@ -73,6 +73,8 @@ Brief summary of control files.

 memory.kmem.tcp.limit_in_bytes     # set/show hard limit for tcp buf memory
 memory.kmem.tcp.usage_in_bytes     # show current tcp buf memory allocation
 memory.kmem.tcp.failcnt            # show the number of times tcp buf memory usage hit the limit
 memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded

1. History

@@ -187,12 +189,12 @@ the cgroup that brought it in -- this will happen on memory pressure).
But see section 8.2: when moving a task to another cgroup, its pages may
be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.

Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used.
Exception: If CONFIG_MEMCG_SWAP is not used.
When you do swapoff and force swapped-out pages of shmem (tmpfs) back
into memory, charges for those pages are accounted against the
caller of swapoff rather than the users of shmem.

2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP)
2.4 Swap Extension (CONFIG_MEMCG_SWAP)

The Swap Extension allows you to record charges for swap. A swapped-in page is
charged back to the original page allocator if possible.
@@ -259,7 +261,7 @@ When oom event notifier is registered, event will be delivered.
per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
zone->lru_lock, it has no lock of its own.

2.7 Kernel Memory Extension (CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)

With the Kernel memory extension, the Memory Controller is able to limit
the amount of kernel memory used by the system. Kernel memory is fundamentally
@@ -286,8 +288,8 @@ per cgroup, instead of globally.

a. Enable CONFIG_CGROUPS
b. Enable CONFIG_RESOURCE_COUNTERS
c. Enable CONFIG_CGROUP_MEM_RES_CTLR
d. Enable CONFIG_CGROUP_MEM_RES_CTLR_SWAP (to use swap extension)
c. Enable CONFIG_MEMCG
d. Enable CONFIG_MEMCG_SWAP (to use swap extension)

1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
# mount -t tmpfs none /sys/fs/cgroup

@@ -13,6 +13,14 @@ Who:	Jim Cromie <jim.cromie@gmail.com>, Jason Baron <jbaron@redhat.com>

---------------------------

What:	/proc/sys/vm/nr_pdflush_threads
When:	2012
Why:	Since pdflush is deprecated, the interface exported in /proc/sys/vm/
	should be removed.
Who:	Wanpeng Li <liwp@linux.vnet.ibm.com>

---------------------------

What:	CONFIG_APM_CPU_IDLE, and its ability to call APM BIOS in idle
When:	2012
Why:	This optional sub-feature of APM is of dubious reliability,

@@ -206,6 +206,8 @@ prototypes:
	int (*launder_page)(struct page *);
	int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
	int (*error_remove_page)(struct address_space *, struct page *);
	int (*swap_activate)(struct file *);
	int (*swap_deactivate)(struct file *);

locking rules:
	All except set_page_dirty and freepage may block
@@ -229,6 +231,8 @@ migratepage:		yes (both)
launder_page:		yes
is_partially_uptodate:	yes
error_remove_page:	yes
swap_activate:		no
swap_deactivate:	no

	->write_begin(), ->write_end(), ->sync_page() and ->readpage()
may be called from the request handler (/dev/loop).
@@ -330,6 +334,15 @@ cleaned, or an error value if not. Note that in order to prevent the page
getting mapped back in and redirtied, it needs to be kept locked
across the entire operation.

	->swap_activate will be called with a non-zero argument on
files backing (non block device backed) swapfiles. A return value
of zero indicates success, in which case this file can be used for
backing swapspace. The swapspace operations will be proxied to the
address space operations.

	->swap_deactivate() will be called in the sys_swapoff()
path after ->swap_activate() returned success.

----------------------- file_lock_operations ------------------------------
prototypes:
	void (*fl_copy_lock)(struct file_lock *, struct file_lock *);

@@ -592,6 +592,8 @@ struct address_space_operations {
	int (*migratepage) (struct page *, struct page *);
	int (*launder_page) (struct page *);
	int (*error_remove_page) (struct mapping *mapping, struct page *page);
	int (*swap_activate)(struct file *);
	int (*swap_deactivate)(struct file *);
};

  writepage: called by the VM to write a dirty page to backing store.
@@ -760,6 +762,16 @@ struct address_space_operations {
	Setting this implies you deal with pages going away under you,
	unless you have them locked or reference counts increased.

  swap_activate: Called when swapon is used on a file to allocate
	space if necessary and pin the block lookup information in
	memory. A return value of zero indicates success,
	in which case this file can be used to back swapspace. The
	swapspace operations will be proxied to this address space's
	->swap_{out,in} methods.

  swap_deactivate: Called during swapoff on files where swap_activate
	was successful.
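For readers wiring a filesystem up to these new hooks, a minimal sketch follows; it is not part of the patch. The prototypes follow the include/linux/fs.h change in this series (the documentation lines above still show an older single-argument form), the myfs_* names are hypothetical, and a real implementation would also pin its block-lookup information as described above, much like the NFS code later in this commit.

/* Hypothetical example: the simplest possible ->swap_activate() and
 * ->swap_deactivate() pair for a filesystem "myfs".  It only reports how
 * many pages back the swapfile; pinning block-lookup state is omitted. */
#include <linux/fs.h>
#include <linux/swap.h>

static int myfs_swap_activate(struct swap_info_struct *sis,
			      struct file *file, sector_t *span)
{
	*span = sis->pages;	/* the whole file may be used as swap */
	return 0;		/* zero: the file can back swapspace */
}

static void myfs_swap_deactivate(struct file *file)
{
	/* nothing was pinned in this sketch, so nothing to release */
}

static const struct address_space_operations myfs_aops = {
	/* ... the usual readpage/writepage methods go here ... */
	.swap_activate   = myfs_swap_activate,
	.swap_deactivate = myfs_swap_deactivate,
};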

The File Object
===============

@@ -42,7 +42,6 @@ Currently, these files are in /proc/sys/vm:
- mmap_min_addr
- nr_hugepages
- nr_overcommit_hugepages
- nr_pdflush_threads
- nr_trim_pages         (only if CONFIG_MMU=n)
- numa_zonelist_order
- oom_dump_tasks
@@ -426,16 +425,6 @@ See Documentation/vm/hugetlbpage.txt

==============================================================

nr_pdflush_threads

The current number of pdflush threads. This value is read-only.
The value changes according to the number of dirty pages in the system.

When necessary, additional pdflush threads are created, one per second, up to
nr_pdflush_threads_max.

==============================================================

nr_trim_pages

This is available only on NOMMU kernels.
@@ -502,9 +491,10 @@ oom_dump_tasks

Enables a system-wide task dump (excluding kernel threads) to be
produced when the kernel performs an OOM-killing and includes such
information as pid, uid, tgid, vm size, rss, cpu, oom_adj score, and
name. This is helpful to determine why the OOM killer was invoked
and to identify the rogue task that caused it.
information as pid, uid, tgid, vm size, rss, nr_ptes, swapents,
oom_score_adj score, and name. This is helpful to determine why the
OOM killer was invoked, to identify the rogue task that caused it,
and to determine why the OOM killer chose the task it did to kill.

If this is set to zero, this information is suppressed. On very
large systems with thousands of tasks it may not be feasible to dump
@@ -574,16 +564,24 @@ of physical RAM. See above.

page-cluster

page-cluster controls the number of pages which are written to swap in
a single attempt. The swap I/O size.
page-cluster controls the number of pages up to which consecutive pages
are read in from swap in a single attempt. This is the swap counterpart
to page cache readahead.
"Consecutive" here is not in terms of virtual or physical addresses, but
consecutive in swap space, meaning that the pages were swapped out together.

It is a logarithmic value: setting it to zero means "1 page", setting
it to 1 means "2 pages", setting it to 2 means "4 pages", etc.
Zero disables swap readahead completely.

The default value is three (eight pages at a time). There may be some
small benefits in tuning this to a different value if your workload is
swap-intensive.

Lower values mean lower latencies for the initial fault, but at the same time
extra faults and I/O delays for the following faults if they would have been
part of the consecutive pages that readahead would have brought in.
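Since page-cluster is logarithmic, a quick way to see the readahead window it implies is to read the sysctl and shift. The following userspace sketch (not part of the patch) does exactly that.

/* Illustrative only: print the swap readahead window implied by the
 * current vm.page-cluster setting (pages = 2^page-cluster). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/page-cluster", "r");
	int cluster;

	if (!f) {
		perror("/proc/sys/vm/page-cluster");
		return 1;
	}
	if (fscanf(f, "%d", &cluster) != 1) {
		fclose(f);
		fprintf(stderr, "unexpected file format\n");
		return 1;
	}
	fclose(f);

	if (cluster == 0)
		printf("page-cluster=0: swap readahead disabled (one page per fault)\n");
	else
		printf("page-cluster=%d: up to %d consecutive pages per swap-in\n",
		       cluster, 1 << cluster);
	return 0;
}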

=============================================================

panic_on_oom

@ -2353,7 +2353,6 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
|
||||
*/
|
||||
insert_vm_struct(mm, vma);
|
||||
|
||||
mm->total_vm += size >> PAGE_SHIFT;
|
||||
vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
|
||||
vma_pages(vma));
|
||||
up_write(&task->mm->mmap_sem);
|
||||
|
@ -401,6 +401,7 @@ static void __init node_mem_init(cnodeid_t node)
|
||||
* Allocate the node data structures on the node first.
|
||||
*/
|
||||
__node_data[node] = __va(slot_freepfn << PAGE_SHIFT);
|
||||
memset(__node_data[node], 0, PAGE_SIZE);
|
||||
|
||||
NODE_DATA(node)->bdata = &bootmem_node_data[node];
|
||||
NODE_DATA(node)->node_start_pfn = start_pfn;
|
||||
|
@ -21,8 +21,8 @@ CONFIG_CGROUP_DEVICE=y
|
||||
CONFIG_CPUSETS=y
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
CONFIG_RESOURCE_COUNTERS=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y
|
||||
CONFIG_CGROUP_MEMCG=y
|
||||
CONFIG_CGROUP_MEMCG_SWAP=y
|
||||
CONFIG_NAMESPACES=y
|
||||
CONFIG_RELAY=y
|
||||
CONFIG_BLK_DEV_INITRD=y
|
||||
|
@ -16,7 +16,7 @@ CONFIG_CGROUPS=y
|
||||
CONFIG_CPUSETS=y
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
CONFIG_RESOURCE_COUNTERS=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR=y
|
||||
CONFIG_CGROUP_MEMCG=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y
|
||||
CONFIG_CGROUP_SCHED=y
|
||||
CONFIG_RT_GROUP_SCHED=y
|
||||
|
@ -11,7 +11,7 @@ CONFIG_CGROUP_FREEZER=y
|
||||
CONFIG_CGROUP_DEVICE=y
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
CONFIG_RESOURCE_COUNTERS=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR=y
|
||||
CONFIG_CGROUP_MEMCG=y
|
||||
CONFIG_BLK_CGROUP=y
|
||||
CONFIG_NAMESPACES=y
|
||||
CONFIG_BLK_DEV_INITRD=y
|
||||
|
@ -18,8 +18,8 @@ CONFIG_CPUSETS=y
|
||||
# CONFIG_PROC_PID_CPUSET is not set
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
CONFIG_RESOURCE_COUNTERS=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y
|
||||
CONFIG_CGROUP_MEMCG=y
|
||||
CONFIG_CGROUP_MEMCG_SWAP=y
|
||||
CONFIG_CGROUP_SCHED=y
|
||||
CONFIG_RT_GROUP_SCHED=y
|
||||
CONFIG_BLK_CGROUP=y
|
||||
|
@ -11,7 +11,7 @@ CONFIG_CGROUP_DEBUG=y
|
||||
CONFIG_CGROUP_DEVICE=y
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
CONFIG_RESOURCE_COUNTERS=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR=y
|
||||
CONFIG_CGROUP_MEMCG=y
|
||||
CONFIG_RELAY=y
|
||||
CONFIG_NAMESPACES=y
|
||||
CONFIG_UTS_NS=y
|
||||
|
@ -13,7 +13,7 @@ CONFIG_CGROUP_FREEZER=y
|
||||
CONFIG_CGROUP_DEVICE=y
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
CONFIG_RESOURCE_COUNTERS=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR=y
|
||||
CONFIG_CGROUP_MEMCG=y
|
||||
CONFIG_RELAY=y
|
||||
CONFIG_NAMESPACES=y
|
||||
CONFIG_UTS_NS=y
|
||||
|
@ -15,8 +15,8 @@ CONFIG_CPUSETS=y
|
||||
# CONFIG_PROC_PID_CPUSET is not set
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
CONFIG_RESOURCE_COUNTERS=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y
|
||||
CONFIG_CGROUP_MEMCG=y
|
||||
CONFIG_CGROUP_MEMCG_SWAP=y
|
||||
CONFIG_CGROUP_SCHED=y
|
||||
CONFIG_RT_GROUP_SCHED=y
|
||||
CONFIG_BLK_DEV_INITRD=y
|
||||
|
@ -18,8 +18,8 @@ CONFIG_CGROUP_DEVICE=y
|
||||
CONFIG_CPUSETS=y
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
CONFIG_RESOURCE_COUNTERS=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y
|
||||
CONFIG_CGROUP_MEMCG=y
|
||||
CONFIG_CGROUP_MEMCG_SWAP=y
|
||||
CONFIG_CGROUP_SCHED=y
|
||||
CONFIG_RT_GROUP_SCHED=y
|
||||
CONFIG_BLK_CGROUP=y
|
||||
|
@ -17,8 +17,8 @@ CONFIG_CGROUP_DEVICE=y
|
||||
CONFIG_CPUSETS=y
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
CONFIG_RESOURCE_COUNTERS=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y
|
||||
CONFIG_CGROUP_MEMCG=y
|
||||
CONFIG_CGROUP_MEMCG_SWAP=y
|
||||
CONFIG_CGROUP_SCHED=y
|
||||
CONFIG_RT_GROUP_SCHED=y
|
||||
CONFIG_BLK_CGROUP=y
|
||||
|
@ -155,10 +155,10 @@ CONFIG_CPUSETS=y
|
||||
CONFIG_PROC_PID_CPUSET=y
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
CONFIG_RESOURCE_COUNTERS=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR=y
|
||||
CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y
|
||||
# CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED is not set
|
||||
# CONFIG_CGROUP_MEM_RES_CTLR_KMEM is not set
|
||||
CONFIG_CGROUP_MEMCG=y
|
||||
CONFIG_CGROUP_MEMCG_SWAP=y
|
||||
# CONFIG_CGROUP_MEMCG_SWAP_ENABLED is not set
|
||||
# CONFIG_CGROUP_MEMCG_KMEM is not set
|
||||
CONFIG_CGROUP_SCHED=y
|
||||
CONFIG_FAIR_GROUP_SCHED=y
|
||||
# CONFIG_CFS_BANDWIDTH is not set
|
||||
|
@ -7,6 +7,7 @@ config ZONE_DMA
|
||||
config XTENSA
|
||||
def_bool y
|
||||
select HAVE_IDE
|
||||
select GENERIC_ATOMIC64
|
||||
select HAVE_GENERIC_HARDIRQS
|
||||
select GENERIC_IRQ_SHOW
|
||||
select GENERIC_CPU_DEVICES
|
||||
|
@ -196,6 +196,7 @@ config CMA
|
||||
bool "Contiguous Memory Allocator (EXPERIMENTAL)"
|
||||
depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL
|
||||
select MIGRATION
|
||||
select MEMORY_ISOLATION
|
||||
help
|
||||
This enables the Contiguous Memory Allocator which allows drivers
|
||||
to allocate big physically-contiguous blocks of memory for use with
|
||||
|
@ -154,6 +154,7 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
|
||||
struct msghdr msg;
|
||||
struct kvec iov;
|
||||
sigset_t blocked, oldset;
|
||||
unsigned long pflags = current->flags;
|
||||
|
||||
if (unlikely(!sock)) {
|
||||
dev_err(disk_to_dev(nbd->disk),
|
||||
@ -167,8 +168,9 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
|
||||
siginitsetinv(&blocked, sigmask(SIGKILL));
|
||||
sigprocmask(SIG_SETMASK, &blocked, &oldset);
|
||||
|
||||
current->flags |= PF_MEMALLOC;
|
||||
do {
|
||||
sock->sk->sk_allocation = GFP_NOIO;
|
||||
sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
|
||||
iov.iov_base = buf;
|
||||
iov.iov_len = size;
|
||||
msg.msg_name = NULL;
|
||||
@ -214,6 +216,7 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
|
||||
} while (size > 0);
|
||||
|
||||
sigprocmask(SIG_SETMASK, &oldset, NULL);
|
||||
tsk_restore_flags(current, pflags, PF_MEMALLOC);
|
||||
|
||||
return result;
|
||||
}
|
||||
@ -405,6 +408,7 @@ static int nbd_do_it(struct nbd_device *nbd)
|
||||
|
||||
BUG_ON(nbd->magic != NBD_MAGIC);
|
||||
|
||||
sk_set_memalloc(nbd->sock->sk);
|
||||
nbd->pid = task_pid_nr(current);
|
||||
ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
|
||||
if (ret) {
|
||||
|
@ -528,7 +528,7 @@ static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n,
|
||||
#endif
|
||||
|
||||
while (n--) {
|
||||
pg = alloc_page(gfp);
|
||||
pg = __skb_alloc_page(gfp, NULL);
|
||||
if (unlikely(!pg)) {
|
||||
q->alloc_failed++;
|
||||
break;
|
||||
|
@ -653,7 +653,7 @@ static unsigned int refill_fl(struct adapter *adapter, struct sge_fl *fl,
|
||||
|
||||
alloc_small_pages:
|
||||
while (n--) {
|
||||
page = alloc_page(gfp | __GFP_NOWARN | __GFP_COLD);
|
||||
page = __skb_alloc_page(gfp | __GFP_NOWARN, NULL);
|
||||
if (unlikely(!page)) {
|
||||
fl->alloc_failed++;
|
||||
break;
|
||||
|
@ -6235,7 +6235,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
|
||||
return true;
|
||||
|
||||
if (!page) {
|
||||
page = alloc_page(GFP_ATOMIC | __GFP_COLD);
|
||||
page = __skb_alloc_page(GFP_ATOMIC, bi->skb);
|
||||
bi->page = page;
|
||||
if (unlikely(!page)) {
|
||||
rx_ring->rx_stats.alloc_failed++;
|
||||
|
@ -1141,8 +1141,8 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring,
|
||||
|
||||
/* alloc new page for storage */
|
||||
if (likely(!page)) {
|
||||
page = alloc_pages(GFP_ATOMIC | __GFP_COLD | __GFP_COMP,
|
||||
ixgbe_rx_pg_order(rx_ring));
|
||||
page = __skb_alloc_pages(GFP_ATOMIC | __GFP_COLD | __GFP_COMP,
|
||||
bi->skb, ixgbe_rx_pg_order(rx_ring));
|
||||
if (unlikely(!page)) {
|
||||
rx_ring->rx_stats.alloc_rx_page_failed++;
|
||||
return false;
|
||||
|
@ -352,7 +352,6 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_adapter *adapter,
|
||||
adapter->alloc_rx_buff_failed++;
|
||||
goto no_buffers;
|
||||
}
|
||||
|
||||
bi->skb = skb;
|
||||
}
|
||||
if (!bi->dma) {
|
||||
|
@ -130,7 +130,7 @@ static int rx_submit(struct usbpn_dev *pnd, struct urb *req, gfp_t gfp_flags)
|
||||
struct page *page;
|
||||
int err;
|
||||
|
||||
page = alloc_page(gfp_flags);
|
||||
page = __skb_alloc_page(gfp_flags | __GFP_NOMEMALLOC, NULL);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
|
||||
|
@ -314,8 +314,8 @@ static int __devinit pm80x_rtc_probe(struct platform_device *pdev)
|
||||
|
||||
info->rtc_dev = rtc_device_register("88pm80x-rtc", &pdev->dev,
|
||||
&pm80x_rtc_ops, THIS_MODULE);
|
||||
ret = PTR_ERR(info->rtc_dev);
|
||||
if (IS_ERR(info->rtc_dev)) {
|
||||
ret = PTR_ERR(info->rtc_dev);
|
||||
dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret);
|
||||
goto out_rtc;
|
||||
}
|
||||
@ -339,7 +339,6 @@ static int __devinit pm80x_rtc_probe(struct platform_device *pdev)
|
||||
out_rtc:
|
||||
pm80x_free_irq(chip, info->irq, info);
|
||||
out:
|
||||
devm_kfree(&pdev->dev, info);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -349,7 +348,6 @@ static int __devexit pm80x_rtc_remove(struct platform_device *pdev)
|
||||
platform_set_drvdata(pdev, NULL);
|
||||
rtc_device_unregister(info->rtc_dev);
|
||||
pm80x_free_irq(info->chip, info->irq, info);
|
||||
devm_kfree(&pdev->dev, info);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -301,7 +301,7 @@ pn_rx_submit(struct f_phonet *fp, struct usb_request *req, gfp_t gfp_flags)
|
||||
struct page *page;
|
||||
int err;
|
||||
|
||||
page = alloc_page(gfp_flags);
|
||||
page = __skb_alloc_page(gfp_flags | __GFP_NOMEMALLOC, NULL);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
|
||||
|
@ -52,11 +52,6 @@ struct wb_writeback_work {
|
||||
struct completion *done; /* set if the caller waits */
|
||||
};
|
||||
|
||||
/*
|
||||
* We don't actually have pdflush, but this one is exported though /proc...
|
||||
*/
|
||||
int nr_pdflush_threads;
|
||||
|
||||
/**
|
||||
* writeback_in_progress - determine whether there is writeback in progress
|
||||
* @bdi: the device's backing_dev_info structure.
|
||||
|
@ -416,8 +416,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff)
|
||||
else
|
||||
v_offset = 0;
|
||||
|
||||
__unmap_hugepage_range(vma,
|
||||
vma->vm_start + v_offset, vma->vm_end, NULL);
|
||||
unmap_hugepage_range(vma, vma->vm_start + v_offset,
|
||||
vma->vm_end, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -86,6 +86,14 @@ config NFS_V4
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config NFS_SWAP
|
||||
bool "Provide swap over NFS support"
|
||||
default n
|
||||
depends on NFS_FS
|
||||
select SUNRPC_SWAP
|
||||
help
|
||||
This option enables swapon to work on files located on NFS mounts.
|
||||
|
||||
config NFS_V4_1
|
||||
bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
|
||||
depends on NFS_V4 && EXPERIMENTAL
|
||||
|
@ -115,17 +115,28 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
|
||||
* @nr_segs: size of iovec array
|
||||
*
|
||||
* The presence of this routine in the address space ops vector means
|
||||
* the NFS client supports direct I/O. However, we shunt off direct
|
||||
* read and write requests before the VFS gets them, so this method
|
||||
* should never be called.
|
||||
* the NFS client supports direct I/O. However, for most direct IO, we
|
||||
* shunt off direct read and write requests before the VFS gets them,
|
||||
* so this method is only ever called for swap.
|
||||
*/
|
||||
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
|
||||
{
|
||||
#ifndef CONFIG_NFS_SWAP
|
||||
dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
|
||||
iocb->ki_filp->f_path.dentry->d_name.name,
|
||||
(long long) pos, nr_segs);
|
||||
|
||||
return -EINVAL;
|
||||
#else
|
||||
VM_BUG_ON(iocb->ki_left != PAGE_SIZE);
|
||||
VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
|
||||
|
||||
if (rw == READ || rw == KERNEL_READ)
|
||||
return nfs_file_direct_read(iocb, iov, nr_segs, pos,
|
||||
rw == READ ? true : false);
|
||||
return nfs_file_direct_write(iocb, iov, nr_segs, pos,
|
||||
rw == WRITE ? true : false);
|
||||
#endif /* CONFIG_NFS_SWAP */
|
||||
}
|
||||
|
||||
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
|
||||
@ -303,7 +314,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
|
||||
*/
|
||||
static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
|
||||
const struct iovec *iov,
|
||||
loff_t pos)
|
||||
loff_t pos, bool uio)
|
||||
{
|
||||
struct nfs_direct_req *dreq = desc->pg_dreq;
|
||||
struct nfs_open_context *ctx = dreq->ctx;
|
||||
@ -331,12 +342,20 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
|
||||
GFP_KERNEL);
|
||||
if (!pagevec)
|
||||
break;
|
||||
down_read(¤t->mm->mmap_sem);
|
||||
result = get_user_pages(current, current->mm, user_addr,
|
||||
if (uio) {
|
||||
down_read(¤t->mm->mmap_sem);
|
||||
result = get_user_pages(current, current->mm, user_addr,
|
||||
npages, 1, 0, pagevec, NULL);
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
if (result < 0)
|
||||
break;
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
if (result < 0)
|
||||
break;
|
||||
} else {
|
||||
WARN_ON(npages != 1);
|
||||
result = get_kernel_page(user_addr, 1, pagevec);
|
||||
if (WARN_ON(result != 1))
|
||||
break;
|
||||
}
|
||||
|
||||
if ((unsigned)result < npages) {
|
||||
bytes = result * PAGE_SIZE;
|
||||
if (bytes <= pgbase) {
|
||||
@ -386,7 +405,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
|
||||
static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
|
||||
const struct iovec *iov,
|
||||
unsigned long nr_segs,
|
||||
loff_t pos)
|
||||
loff_t pos, bool uio)
|
||||
{
|
||||
struct nfs_pageio_descriptor desc;
|
||||
ssize_t result = -EINVAL;
|
||||
@ -400,7 +419,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
|
||||
|
||||
for (seg = 0; seg < nr_segs; seg++) {
|
||||
const struct iovec *vec = &iov[seg];
|
||||
result = nfs_direct_read_schedule_segment(&desc, vec, pos);
|
||||
result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
|
||||
if (result < 0)
|
||||
break;
|
||||
requested_bytes += result;
|
||||
@ -426,7 +445,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
|
||||
}
|
||||
|
||||
static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
|
||||
unsigned long nr_segs, loff_t pos)
|
||||
unsigned long nr_segs, loff_t pos, bool uio)
|
||||
{
|
||||
ssize_t result = -ENOMEM;
|
||||
struct inode *inode = iocb->ki_filp->f_mapping->host;
|
||||
@ -444,7 +463,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
|
||||
if (!is_sync_kiocb(iocb))
|
||||
dreq->iocb = iocb;
|
||||
|
||||
result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
|
||||
result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
|
||||
if (!result)
|
||||
result = nfs_direct_wait(dreq);
|
||||
NFS_I(inode)->read_io += result;
|
||||
@ -610,7 +629,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
|
||||
*/
|
||||
static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
|
||||
const struct iovec *iov,
|
||||
loff_t pos)
|
||||
loff_t pos, bool uio)
|
||||
{
|
||||
struct nfs_direct_req *dreq = desc->pg_dreq;
|
||||
struct nfs_open_context *ctx = dreq->ctx;
|
||||
@ -638,12 +657,19 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
|
||||
if (!pagevec)
|
||||
break;
|
||||
|
||||
down_read(¤t->mm->mmap_sem);
|
||||
result = get_user_pages(current, current->mm, user_addr,
|
||||
npages, 0, 0, pagevec, NULL);
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
if (result < 0)
|
||||
break;
|
||||
if (uio) {
|
||||
down_read(¤t->mm->mmap_sem);
|
||||
result = get_user_pages(current, current->mm, user_addr,
|
||||
npages, 0, 0, pagevec, NULL);
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
if (result < 0)
|
||||
break;
|
||||
} else {
|
||||
WARN_ON(npages != 1);
|
||||
result = get_kernel_page(user_addr, 0, pagevec);
|
||||
if (WARN_ON(result != 1))
|
||||
break;
|
||||
}
|
||||
|
||||
if ((unsigned)result < npages) {
|
||||
bytes = result * PAGE_SIZE;
|
||||
@ -774,7 +800,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
|
||||
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
|
||||
const struct iovec *iov,
|
||||
unsigned long nr_segs,
|
||||
loff_t pos)
|
||||
loff_t pos, bool uio)
|
||||
{
|
||||
struct nfs_pageio_descriptor desc;
|
||||
struct inode *inode = dreq->inode;
|
||||
@ -790,7 +816,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
|
||||
|
||||
for (seg = 0; seg < nr_segs; seg++) {
|
||||
const struct iovec *vec = &iov[seg];
|
||||
result = nfs_direct_write_schedule_segment(&desc, vec, pos);
|
||||
result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
|
||||
if (result < 0)
|
||||
break;
|
||||
requested_bytes += result;
|
||||
@ -818,7 +844,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
|
||||
|
||||
static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
unsigned long nr_segs, loff_t pos,
|
||||
size_t count)
|
||||
size_t count, bool uio)
|
||||
{
|
||||
ssize_t result = -ENOMEM;
|
||||
struct inode *inode = iocb->ki_filp->f_mapping->host;
|
||||
@ -836,7 +862,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
if (!is_sync_kiocb(iocb))
|
||||
dreq->iocb = iocb;
|
||||
|
||||
result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos);
|
||||
result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
|
||||
if (!result)
|
||||
result = nfs_direct_wait(dreq);
|
||||
out_release:
|
||||
@ -867,7 +893,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
* cache.
|
||||
*/
|
||||
ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
|
||||
unsigned long nr_segs, loff_t pos)
|
||||
unsigned long nr_segs, loff_t pos, bool uio)
|
||||
{
|
||||
ssize_t retval = -EINVAL;
|
||||
struct file *file = iocb->ki_filp;
|
||||
@ -892,7 +918,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
|
||||
|
||||
task_io_account_read(count);
|
||||
|
||||
retval = nfs_direct_read(iocb, iov, nr_segs, pos);
|
||||
retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio);
|
||||
if (retval > 0)
|
||||
iocb->ki_pos = pos + retval;
|
||||
|
||||
@ -923,7 +949,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
|
||||
* is no atomic O_APPEND write facility in the NFS protocol.
|
||||
*/
|
||||
ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
unsigned long nr_segs, loff_t pos)
|
||||
unsigned long nr_segs, loff_t pos, bool uio)
|
||||
{
|
||||
ssize_t retval = -EINVAL;
|
||||
struct file *file = iocb->ki_filp;
|
||||
@ -955,7 +981,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
|
||||
task_io_account_write(count);
|
||||
|
||||
retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
|
||||
retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio);
|
||||
if (retval > 0) {
|
||||
struct inode *inode = mapping->host;
|
||||
|
||||
|
@ -180,7 +180,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
|
||||
ssize_t result;
|
||||
|
||||
if (iocb->ki_filp->f_flags & O_DIRECT)
|
||||
return nfs_file_direct_read(iocb, iov, nr_segs, pos);
|
||||
return nfs_file_direct_read(iocb, iov, nr_segs, pos, true);
|
||||
|
||||
dprintk("NFS: read(%s/%s, %lu@%lu)\n",
|
||||
dentry->d_parent->d_name.name, dentry->d_name.name,
|
||||
@ -439,7 +439,7 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
|
||||
if (offset != 0)
|
||||
return;
|
||||
/* Cancel any unstarted writes on this page */
|
||||
nfs_wb_page_cancel(page->mapping->host, page);
|
||||
nfs_wb_page_cancel(page_file_mapping(page)->host, page);
|
||||
|
||||
nfs_fscache_invalidate_page(page, page->mapping->host);
|
||||
}
|
||||
@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
|
||||
*/
|
||||
static int nfs_launder_page(struct page *page)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct inode *inode = page_file_mapping(page)->host;
|
||||
struct nfs_inode *nfsi = NFS_I(inode);
|
||||
|
||||
dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
|
||||
@ -494,6 +494,20 @@ static int nfs_launder_page(struct page *page)
|
||||
return nfs_wb_page(inode, page);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NFS_SWAP
|
||||
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
sector_t *span)
|
||||
{
|
||||
*span = sis->pages;
|
||||
return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
|
||||
}
|
||||
|
||||
static void nfs_swap_deactivate(struct file *file)
|
||||
{
|
||||
xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
const struct address_space_operations nfs_file_aops = {
|
||||
.readpage = nfs_readpage,
|
||||
.readpages = nfs_readpages,
|
||||
@ -508,6 +522,10 @@ const struct address_space_operations nfs_file_aops = {
|
||||
.migratepage = nfs_migrate_page,
|
||||
.launder_page = nfs_launder_page,
|
||||
.error_remove_page = generic_error_remove_page,
|
||||
#ifdef CONFIG_NFS_SWAP
|
||||
.swap_activate = nfs_swap_activate,
|
||||
.swap_deactivate = nfs_swap_deactivate,
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
@ -533,7 +551,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
|
||||
|
||||
lock_page(page);
|
||||
mapping = page->mapping;
|
||||
mapping = page_file_mapping(page);
|
||||
if (mapping != dentry->d_inode->i_mapping)
|
||||
goto out_unlock;
|
||||
|
||||
@ -582,7 +600,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
|
||||
size_t count = iov_length(iov, nr_segs);
|
||||
|
||||
if (iocb->ki_filp->f_flags & O_DIRECT)
|
||||
return nfs_file_direct_write(iocb, iov, nr_segs, pos);
|
||||
return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
|
||||
|
||||
dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
|
||||
dentry->d_parent->d_name.name, dentry->d_name.name,
|
||||
|
@ -897,6 +897,10 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
|
||||
struct nfs_inode *nfsi = NFS_I(inode);
|
||||
int ret = 0;
|
||||
|
||||
/* swapfiles are not supposed to be shared. */
|
||||
if (IS_SWAPFILE(inode))
|
||||
goto out;
|
||||
|
||||
if (nfs_mapping_need_revalidate_inode(inode)) {
|
||||
ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
|
||||
if (ret < 0)
|
||||
|
@ -554,13 +554,14 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
|
||||
static inline
|
||||
unsigned int nfs_page_length(struct page *page)
|
||||
{
|
||||
loff_t i_size = i_size_read(page->mapping->host);
|
||||
loff_t i_size = i_size_read(page_file_mapping(page)->host);
|
||||
|
||||
if (i_size > 0) {
|
||||
pgoff_t page_index = page_file_index(page);
|
||||
pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
|
||||
if (page->index < end_index)
|
||||
if (page_index < end_index)
|
||||
return PAGE_CACHE_SIZE;
|
||||
if (page->index == end_index)
|
||||
if (page_index == end_index)
|
||||
return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
|
||||
}
|
||||
return 0;
|
||||
|
@ -71,7 +71,7 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
|
||||
static inline struct nfs_page *
|
||||
nfs_page_alloc(void)
|
||||
{
|
||||
struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
|
||||
struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO);
|
||||
if (p)
|
||||
INIT_LIST_HEAD(&p->wb_list);
|
||||
return p;
|
||||
@ -118,7 +118,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
|
||||
* long write-back delay. This will be adjusted in
|
||||
* update_nfs_request below if the region is not locked. */
|
||||
req->wb_page = page;
|
||||
req->wb_index = page->index;
|
||||
req->wb_index = page_file_index(page);
|
||||
page_cache_get(page);
|
||||
req->wb_offset = offset;
|
||||
req->wb_pgbase = offset;
|
||||
|
@ -527,11 +527,11 @@ static const struct rpc_call_ops nfs_read_common_ops = {
|
||||
int nfs_readpage(struct file *file, struct page *page)
|
||||
{
|
||||
struct nfs_open_context *ctx;
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct inode *inode = page_file_mapping(page)->host;
|
||||
int error;
|
||||
|
||||
dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
|
||||
page, PAGE_CACHE_SIZE, page->index);
|
||||
page, PAGE_CACHE_SIZE, page_file_index(page));
|
||||
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
|
||||
nfs_add_stats(inode, NFSIOS_READPAGES, 1);
|
||||
|
||||
@ -585,7 +585,7 @@ static int
|
||||
readpage_async_filler(void *data, struct page *page)
|
||||
{
|
||||
struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct inode *inode = page_file_mapping(page)->host;
|
||||
struct nfs_page *new;
|
||||
unsigned int len;
|
||||
int error;
|
||||
|
@ -52,7 +52,7 @@ static mempool_t *nfs_commit_mempool;
|
||||
|
||||
struct nfs_commit_data *nfs_commitdata_alloc(void)
|
||||
{
|
||||
struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
|
||||
struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
|
||||
|
||||
if (p) {
|
||||
memset(p, 0, sizeof(*p));
|
||||
@ -70,7 +70,7 @@ EXPORT_SYMBOL_GPL(nfs_commit_free);
|
||||
|
||||
struct nfs_write_header *nfs_writehdr_alloc(void)
|
||||
{
|
||||
struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
|
||||
struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
|
||||
|
||||
if (p) {
|
||||
struct nfs_pgio_header *hdr = &p->header;
|
||||
@ -142,25 +142,38 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
|
||||
set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
|
||||
}
|
||||
|
||||
static struct nfs_page *nfs_page_find_request_locked(struct page *page)
|
||||
static struct nfs_page *
|
||||
nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page)
|
||||
{
|
||||
struct nfs_page *req = NULL;
|
||||
|
||||
if (PagePrivate(page)) {
|
||||
if (PagePrivate(page))
|
||||
req = (struct nfs_page *)page_private(page);
|
||||
if (req != NULL)
|
||||
kref_get(&req->wb_kref);
|
||||
else if (unlikely(PageSwapCache(page))) {
|
||||
struct nfs_page *freq, *t;
|
||||
|
||||
/* Linearly search the commit list for the correct req */
|
||||
list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
|
||||
if (freq->wb_page == page) {
|
||||
req = freq;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (req)
|
||||
kref_get(&req->wb_kref);
|
||||
|
||||
return req;
|
||||
}
|
||||
|
||||
static struct nfs_page *nfs_page_find_request(struct page *page)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct inode *inode = page_file_mapping(page)->host;
|
||||
struct nfs_page *req = NULL;
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
req = nfs_page_find_request_locked(page);
|
||||
req = nfs_page_find_request_locked(NFS_I(inode), page);
|
||||
spin_unlock(&inode->i_lock);
|
||||
return req;
|
||||
}
|
||||
@ -168,16 +181,16 @@ static struct nfs_page *nfs_page_find_request(struct page *page)
|
||||
/* Adjust the file length if we're writing beyond the end */
|
||||
static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct inode *inode = page_file_mapping(page)->host;
|
||||
loff_t end, i_size;
|
||||
pgoff_t end_index;
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
i_size = i_size_read(inode);
|
||||
end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
|
||||
if (i_size > 0 && page->index < end_index)
|
||||
if (i_size > 0 && page_file_index(page) < end_index)
|
||||
goto out;
|
||||
end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count);
|
||||
end = page_file_offset(page) + ((loff_t)offset+count);
|
||||
if (i_size >= end)
|
||||
goto out;
|
||||
i_size_write(inode, end);
|
||||
@ -190,7 +203,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
|
||||
static void nfs_set_pageerror(struct page *page)
|
||||
{
|
||||
SetPageError(page);
|
||||
nfs_zap_mapping(page->mapping->host, page->mapping);
|
||||
nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
|
||||
}
|
||||
|
||||
/* We can set the PG_uptodate flag if we see that a write request
|
||||
@ -231,7 +244,7 @@ static int nfs_set_page_writeback(struct page *page)
|
||||
int ret = test_set_page_writeback(page);
|
||||
|
||||
if (!ret) {
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct inode *inode = page_file_mapping(page)->host;
|
||||
struct nfs_server *nfss = NFS_SERVER(inode);
|
||||
|
||||
if (atomic_long_inc_return(&nfss->writeback) >
|
||||
@ -245,7 +258,7 @@ static int nfs_set_page_writeback(struct page *page)
|
||||
|
||||
static void nfs_end_page_writeback(struct page *page)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct inode *inode = page_file_mapping(page)->host;
|
||||
struct nfs_server *nfss = NFS_SERVER(inode);
|
||||
|
||||
end_page_writeback(page);
|
||||
@ -255,13 +268,13 @@ static void nfs_end_page_writeback(struct page *page)
|
||||
|
||||
static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct inode *inode = page_file_mapping(page)->host;
|
||||
struct nfs_page *req;
|
||||
int ret;
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
for (;;) {
|
||||
req = nfs_page_find_request_locked(page);
|
||||
req = nfs_page_find_request_locked(NFS_I(inode), page);
|
||||
if (req == NULL)
|
||||
break;
|
||||
if (nfs_lock_request(req))
|
||||
@ -316,13 +329,13 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
|
||||
|
||||
static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct inode *inode = page_file_mapping(page)->host;
|
||||
int ret;
|
||||
|
||||
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
|
||||
nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
|
||||
|
||||
nfs_pageio_cond_complete(pgio, page->index);
|
||||
nfs_pageio_cond_complete(pgio, page_file_index(page));
|
||||
ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
|
||||
if (ret == -EAGAIN) {
|
||||
redirty_page_for_writepage(wbc, page);
|
||||
@ -339,7 +352,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
|
||||
struct nfs_pageio_descriptor pgio;
|
||||
int err;
|
||||
|
||||
NFS_PROTO(page->mapping->host)->write_pageio_init(&pgio,
|
||||
NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio,
|
||||
page->mapping->host,
|
||||
wb_priority(wbc),
|
||||
&nfs_async_write_completion_ops);
|
||||
@ -416,9 +429,15 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
|
||||
spin_lock(&inode->i_lock);
|
||||
if (!nfsi->npages && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
|
||||
inode->i_version++;
|
||||
set_bit(PG_MAPPED, &req->wb_flags);
|
||||
SetPagePrivate(req->wb_page);
|
||||
set_page_private(req->wb_page, (unsigned long)req);
|
||||
/*
|
||||
* Swap-space should not get truncated. Hence no need to plug the race
|
||||
* with invalidate/truncate.
|
||||
*/
|
||||
if (likely(!PageSwapCache(req->wb_page))) {
|
||||
set_bit(PG_MAPPED, &req->wb_flags);
|
||||
SetPagePrivate(req->wb_page);
|
||||
set_page_private(req->wb_page, (unsigned long)req);
|
||||
}
|
||||
nfsi->npages++;
|
||||
kref_get(&req->wb_kref);
|
||||
spin_unlock(&inode->i_lock);
|
||||
@ -435,9 +454,11 @@ static void nfs_inode_remove_request(struct nfs_page *req)
|
||||
BUG_ON (!NFS_WBACK_BUSY(req));
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
set_page_private(req->wb_page, 0);
|
||||
ClearPagePrivate(req->wb_page);
|
||||
clear_bit(PG_MAPPED, &req->wb_flags);
|
||||
if (likely(!PageSwapCache(req->wb_page))) {
|
||||
set_page_private(req->wb_page, 0);
|
||||
ClearPagePrivate(req->wb_page);
|
||||
clear_bit(PG_MAPPED, &req->wb_flags);
|
||||
}
|
||||
nfsi->npages--;
|
||||
spin_unlock(&inode->i_lock);
|
||||
nfs_release_request(req);
|
||||
@ -474,7 +495,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
|
||||
spin_unlock(cinfo->lock);
|
||||
if (!cinfo->dreq) {
|
||||
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
|
||||
inc_bdi_stat(req->wb_page->mapping->backing_dev_info,
|
||||
inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
|
||||
BDI_RECLAIMABLE);
|
||||
__mark_inode_dirty(req->wb_context->dentry->d_inode,
|
||||
I_DIRTY_DATASYNC);
|
||||
@ -541,7 +562,7 @@ static void
|
||||
nfs_clear_page_commit(struct page *page)
|
||||
{
|
||||
dec_zone_page_state(page, NR_UNSTABLE_NFS);
|
||||
dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
|
||||
dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
|
||||
}
|
||||
|
||||
static void
|
||||
@ -733,7 +754,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
|
||||
spin_lock(&inode->i_lock);
|
||||
|
||||
for (;;) {
|
||||
req = nfs_page_find_request_locked(page);
|
||||
req = nfs_page_find_request_locked(NFS_I(inode), page);
|
||||
if (req == NULL)
|
||||
goto out_unlock;
|
||||
|
||||
@ -792,7 +813,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
|
||||
static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
|
||||
struct page *page, unsigned int offset, unsigned int bytes)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct inode *inode = page_file_mapping(page)->host;
|
||||
struct nfs_page *req;
|
||||
|
||||
req = nfs_try_to_update_request(inode, page, offset, bytes);
|
||||
@ -845,7 +866,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
|
||||
nfs_release_request(req);
|
||||
if (!do_flush)
|
||||
return 0;
|
||||
status = nfs_wb_page(page->mapping->host, page);
|
||||
status = nfs_wb_page(page_file_mapping(page)->host, page);
|
||||
} while (status == 0);
|
||||
return status;
|
||||
}
|
||||
@ -875,7 +896,7 @@ int nfs_updatepage(struct file *file, struct page *page,
|
||||
unsigned int offset, unsigned int count)
|
||||
{
|
||||
struct nfs_open_context *ctx = nfs_file_open_context(file);
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct inode *inode = page_file_mapping(page)->host;
|
||||
int status = 0;
|
||||
|
||||
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
|
||||
@ -883,7 +904,7 @@ int nfs_updatepage(struct file *file, struct page *page,
|
||||
dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n",
|
||||
file->f_path.dentry->d_parent->d_name.name,
|
||||
file->f_path.dentry->d_name.name, count,
|
||||
(long long)(page_offset(page) + offset));
|
||||
(long long)(page_file_offset(page) + offset));
|
||||
|
||||
/* If we're not using byte range locks, and we know the page
|
||||
* is up to date, it may be more efficient to extend the write
|
||||
@ -1474,7 +1495,7 @@ void nfs_retry_commit(struct list_head *page_list,
|
||||
nfs_mark_request_commit(req, lseg, cinfo);
|
||||
if (!cinfo->dreq) {
|
||||
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
|
||||
dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
|
||||
dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
|
||||
BDI_RECLAIMABLE);
|
||||
}
|
||||
nfs_unlock_and_release_request(req);
|
||||
@ -1731,7 +1752,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
|
||||
*/
|
||||
int nfs_wb_page(struct inode *inode, struct page *page)
|
||||
{
|
||||
loff_t range_start = page_offset(page);
|
||||
loff_t range_start = page_file_offset(page);
|
||||
loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
|
||||
struct writeback_control wbc = {
|
||||
.sync_mode = WB_SYNC_ALL,
|
||||
|
@ -62,7 +62,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
|
||||
return -1;
|
||||
|
||||
if (!grab_super_passive(sb))
|
||||
return !sc->nr_to_scan ? 0 : -1;
|
||||
return -1;
|
||||
|
||||
if (sb->s_op && sb->s_op->nr_cached_objects)
|
||||
fs_objects = sb->s_op->nr_cached_objects(sb);
|
||||
|
@ -17,6 +17,7 @@
|
||||
#include <linux/timer.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/sysctl.h>
|
||||
|
||||
struct page;
|
||||
struct device;
|
||||
@ -304,6 +305,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
|
||||
void set_bdi_congested(struct backing_dev_info *bdi, int sync);
|
||||
long congestion_wait(int sync, long timeout);
|
||||
long wait_iff_congested(struct zone *zone, int sync, long timeout);
|
||||
int pdflush_proc_obsolete(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos);
|
||||
|
||||
static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
|
||||
{
|
||||
|
@ -160,6 +160,7 @@ enum rq_flag_bits {
|
||||
__REQ_FLUSH_SEQ, /* request for flush sequence */
|
||||
__REQ_IO_STAT, /* account I/O stat */
|
||||
__REQ_MIXED_MERGE, /* merge of different types, fail separately */
|
||||
__REQ_KERNEL, /* direct IO to kernel pages */
|
||||
__REQ_NR_BITS, /* stops here */
|
||||
};
|
||||
|
||||
@ -201,5 +202,6 @@ enum rq_flag_bits {
|
||||
#define REQ_IO_STAT (1 << __REQ_IO_STAT)
|
||||
#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE)
|
||||
#define REQ_SECURE (1 << __REQ_SECURE)
|
||||
#define REQ_KERNEL (1 << __REQ_KERNEL)
|
||||
|
||||
#endif /* __LINUX_BLK_TYPES_H */
|
||||
|
@ -31,7 +31,7 @@ SUBSYS(cpuacct)
|
||||
|
||||
/* */
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
#ifdef CONFIG_MEMCG
|
||||
SUBSYS(mem_cgroup)
|
||||
#endif
|
||||
|
||||
@ -72,3 +72,9 @@ SUBSYS(net_prio)
|
||||
#endif
|
||||
|
||||
/* */
|
||||
|
||||
#ifdef CONFIG_CGROUP_HUGETLB
|
||||
SUBSYS(hugetlb)
|
||||
#endif
|
||||
|
||||
/* */
|
||||
|
@ -58,7 +58,7 @@ static inline bool compaction_deferred(struct zone *zone, int order)
|
||||
if (++zone->compact_considered > defer_limit)
|
||||
zone->compact_considered = defer_limit;
|
||||
|
||||
return zone->compact_considered < (1UL << zone->compact_defer_shift);
|
||||
return zone->compact_considered < defer_limit;
|
||||
}
|
||||
|
||||
#else
|
||||
@ -85,7 +85,7 @@ static inline void defer_compaction(struct zone *zone, int order)
|
||||
|
||||
static inline bool compaction_deferred(struct zone *zone, int order)
|
||||
{
|
||||
return 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_COMPACTION */
|
||||
|
@ -165,6 +165,8 @@ struct inodes_stat_t {
|
||||
#define READ 0
|
||||
#define WRITE RW_MASK
|
||||
#define READA RWA_MASK
|
||||
#define KERNEL_READ (READ|REQ_KERNEL)
|
||||
#define KERNEL_WRITE (WRITE|REQ_KERNEL)
|
||||
|
||||
#define READ_SYNC (READ | REQ_SYNC)
|
||||
#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE)
|
||||
@ -427,6 +429,7 @@ struct kstatfs;
|
||||
struct vm_area_struct;
|
||||
struct vfsmount;
|
||||
struct cred;
|
||||
struct swap_info_struct;
|
||||
|
||||
extern void __init inode_init(void);
|
||||
extern void __init inode_init_early(void);
|
||||
@ -636,6 +639,11 @@ struct address_space_operations {
|
||||
int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
|
||||
unsigned long);
|
||||
int (*error_remove_page)(struct address_space *, struct page *);
|
||||
|
||||
/* swapfile support */
|
||||
int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
|
||||
sector_t *span);
|
||||
void (*swap_deactivate)(struct file *file);
|
||||
};
|
||||
|
||||
extern const struct address_space_operations empty_aops;
|
||||
|
@ -23,6 +23,7 @@ struct vm_area_struct;
|
||||
#define ___GFP_REPEAT 0x400u
|
||||
#define ___GFP_NOFAIL 0x800u
|
||||
#define ___GFP_NORETRY 0x1000u
|
||||
#define ___GFP_MEMALLOC 0x2000u
|
||||
#define ___GFP_COMP 0x4000u
|
||||
#define ___GFP_ZERO 0x8000u
|
||||
#define ___GFP_NOMEMALLOC 0x10000u
|
||||
@ -76,9 +77,14 @@ struct vm_area_struct;
|
||||
#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */
|
||||
#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */
|
||||
#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */
|
||||
#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */
|
||||
#define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */
|
||||
#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */
|
||||
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */
|
||||
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves.
|
||||
* This takes precedence over the
|
||||
* __GFP_MEMALLOC flag if both are
|
||||
* set
|
||||
*/
|
||||
#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */
|
||||
#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */
|
||||
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
|
||||
@ -129,7 +135,7 @@ struct vm_area_struct;
|
||||
/* Control page allocator reclaim behavior */
|
||||
#define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
|
||||
__GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
|
||||
__GFP_NORETRY|__GFP_NOMEMALLOC)
|
||||
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
|
||||
|
||||
/* Control slab gfp mask during early boot */
|
||||
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS))
|
||||
@ -379,6 +385,9 @@ void drain_local_pages(void *dummy);
|
||||
*/
|
||||
extern gfp_t gfp_allowed_mask;
|
||||
|
||||
/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
|
||||
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
|
||||
|
||||
extern void pm_restrict_gfp_mask(void);
|
||||
extern void pm_restore_gfp_mask(void);
|
||||
|
||||
|
@ -39,10 +39,17 @@ extern unsigned long totalhigh_pages;
|
||||
|
||||
void kmap_flush_unused(void);
|
||||
|
||||
struct page *kmap_to_page(void *addr);
|
||||
|
||||
#else /* CONFIG_HIGHMEM */
|
||||
|
||||
static inline unsigned int nr_free_highpages(void) { return 0; }
|
||||
|
||||
static inline struct page *kmap_to_page(void *addr)
|
||||
{
|
||||
return virt_to_page(addr);
|
||||
}
|
||||
|
||||
#define totalhigh_pages 0UL
|
||||
|
||||
#ifndef ARCH_HAS_KMAP
|
||||
|
@ -4,9 +4,11 @@
|
||||
#include <linux/mm_types.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/hugetlb_inline.h>
|
||||
#include <linux/cgroup.h>
|
||||
|
||||
struct ctl_table;
|
||||
struct user_struct;
|
||||
struct mmu_gather;
|
||||
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
|
||||
@ -20,6 +22,11 @@ struct hugepage_subpool {
|
||||
long max_hpages, used_hpages;
|
||||
};
|
||||
|
||||
extern spinlock_t hugetlb_lock;
|
||||
extern int hugetlb_max_hstate __read_mostly;
|
||||
#define for_each_hstate(h) \
|
||||
for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)
|
||||
|
||||
struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
|
||||
void hugepage_put_subpool(struct hugepage_subpool *spool);
|
||||
|
||||
@ -40,9 +47,14 @@ int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
|
||||
struct page **, struct vm_area_struct **,
|
||||
unsigned long *, int *, int, unsigned int flags);
|
||||
void unmap_hugepage_range(struct vm_area_struct *,
|
||||
unsigned long, unsigned long, struct page *);
|
||||
void __unmap_hugepage_range(struct vm_area_struct *,
|
||||
unsigned long, unsigned long, struct page *);
|
||||
unsigned long, unsigned long, struct page *);
|
||||
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end,
|
||||
struct page *ref_page);
|
||||
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end,
|
||||
struct page *ref_page);
|
||||
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
|
||||
void hugetlb_report_meminfo(struct seq_file *);
|
||||
int hugetlb_report_node_meminfo(int, char *);
|
||||
@ -98,7 +110,6 @@ static inline unsigned long hugetlb_total_pages(void)
|
||||
#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
|
||||
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
|
||||
#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
|
||||
#define unmap_hugepage_range(vma, start, end, page) BUG()
|
||||
static inline void hugetlb_report_meminfo(struct seq_file *m)
|
||||
{
|
||||
}
|
||||
@ -112,13 +123,31 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
|
||||
#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
|
||||
#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
|
||||
#define huge_pte_offset(mm, address) 0
|
||||
#define dequeue_hwpoisoned_huge_page(page) 0
|
||||
static inline int dequeue_hwpoisoned_huge_page(struct page *page)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void copy_huge_page(struct page *dst, struct page *src)
|
||||
{
|
||||
}
|
||||
|
||||
#define hugetlb_change_protection(vma, address, end, newprot)
|
||||
|
||||
static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
|
||||
struct vm_area_struct *vma, unsigned long start,
|
||||
unsigned long end, struct page *ref_page)
|
||||
{
|
||||
BUG();
|
||||
}
|
||||
|
||||
static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
|
||||
struct vm_area_struct *vma, unsigned long start,
|
||||
unsigned long end, struct page *ref_page)
|
||||
{
|
||||
BUG();
|
||||
}
|
||||
|
||||
#endif /* !CONFIG_HUGETLB_PAGE */
|
||||
|
||||
#define HUGETLB_ANON_FILE "anon_hugepage"
|
||||
@ -199,10 +228,15 @@ struct hstate {
|
||||
unsigned long resv_huge_pages;
|
||||
unsigned long surplus_huge_pages;
|
||||
unsigned long nr_overcommit_huge_pages;
|
||||
struct list_head hugepage_activelist;
|
||||
struct list_head hugepage_freelists[MAX_NUMNODES];
|
||||
unsigned int nr_huge_pages_node[MAX_NUMNODES];
|
||||
unsigned int free_huge_pages_node[MAX_NUMNODES];
|
||||
unsigned int surplus_huge_pages_node[MAX_NUMNODES];
|
||||
#ifdef CONFIG_CGROUP_HUGETLB
|
||||
/* cgroup control files */
|
||||
struct cftype cgroup_files[5];
|
||||
#endif
|
||||
char name[HSTATE_NAME_LEN];
|
||||
};
|
||||
|
||||
@ -302,6 +336,11 @@ static inline unsigned hstate_index_to_shift(unsigned index)
|
||||
return hstates[index].order + PAGE_SHIFT;
|
||||
}
|
||||
|
||||
static inline int hstate_index(struct hstate *h)
|
||||
{
|
||||
return h - hstates;
|
||||
}
|
||||
|
||||
#else
|
||||
struct hstate {};
|
||||
#define alloc_huge_page_node(h, nid) NULL
|
||||
@ -320,6 +359,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
|
||||
return 1;
|
||||
}
|
||||
#define hstate_index_to_shift(index) 0
|
||||
#define hstate_index(h) 0
|
||||
#endif
|
||||
|
||||
#endif /* _LINUX_HUGETLB_H */
|
||||
|
126
include/linux/hugetlb_cgroup.h
Normal file
@ -0,0 +1,126 @@
/*
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */
|
||||
|
||||
#ifndef _LINUX_HUGETLB_CGROUP_H
|
||||
#define _LINUX_HUGETLB_CGROUP_H
|
||||
|
||||
#include <linux/res_counter.h>
|
||||
|
||||
struct hugetlb_cgroup;
|
||||
/*
|
||||
* Minimum page order trackable by hugetlb cgroup.
|
||||
* At least 3 pages are necessary for all the tracking information.
|
||||
*/
|
||||
#define HUGETLB_CGROUP_MIN_ORDER 2
|
||||
|
||||
#ifdef CONFIG_CGROUP_HUGETLB
|
||||
|
||||
static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
|
||||
{
|
||||
VM_BUG_ON(!PageHuge(page));
|
||||
|
||||
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
|
||||
return NULL;
|
||||
return (struct hugetlb_cgroup *)page[2].lru.next;
|
||||
}
|
||||
|
||||
static inline
|
||||
int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
|
||||
{
|
||||
VM_BUG_ON(!PageHuge(page));
|
||||
|
||||
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
|
||||
return -1;
|
||||
page[2].lru.next = (void *)h_cg;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool hugetlb_cgroup_disabled(void)
|
||||
{
|
||||
if (hugetlb_subsys.disabled)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
|
||||
struct hugetlb_cgroup **ptr);
|
||||
extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
|
||||
struct hugetlb_cgroup *h_cg,
|
||||
struct page *page);
|
||||
extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
|
||||
struct page *page);
|
||||
extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
|
||||
struct hugetlb_cgroup *h_cg);
|
||||
extern int hugetlb_cgroup_file_init(int idx) __init;
|
||||
extern void hugetlb_cgroup_migrate(struct page *oldhpage,
|
||||
struct page *newhpage);
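As a reading aid, a hedged sketch of how a caller could drive the two-phase charge API declared above; the function below and its use of alloc_huge_page_node() are hypothetical, the real consumer being alloc_huge_page() in mm/hugetlb.c.

static struct page *example_charge_then_alloc(struct hstate *h, int nid)
{
	int idx = hstate_index(h);
	unsigned long nr = pages_per_huge_page(h);
	struct hugetlb_cgroup *h_cg;
	struct page *page;

	/* Phase 1: reserve the charge against the current task's cgroup. */
	if (hugetlb_cgroup_charge_cgroup(idx, nr, &h_cg))
		return NULL;

	page = alloc_huge_page_node(h, nid);
	if (!page) {
		/* Allocation failed: roll the charge back. */
		hugetlb_cgroup_uncharge_cgroup(idx, nr, h_cg);
		return NULL;
	}

	/* Phase 2: bind the charge to the page that was actually allocated. */
	hugetlb_cgroup_commit_charge(idx, nr, h_cg, page);
	return page;
}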
|
||||
|
||||
#else
|
||||
static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline
|
||||
int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool hugetlb_cgroup_disabled(void)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline int
|
||||
hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
|
||||
struct hugetlb_cgroup **ptr)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void
|
||||
hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
|
||||
struct hugetlb_cgroup *h_cg,
|
||||
struct page *page)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
static inline void
|
||||
hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
static inline void
|
||||
hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
|
||||
struct hugetlb_cgroup *h_cg)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
static inline int __init hugetlb_cgroup_file_init(int idx)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void hugetlb_cgroup_migrate(struct page *oldhpage,
|
||||
struct page *newhpage)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_MEM_RES_CTLR_HUGETLB */
|
||||
#endif
|
@ -38,7 +38,7 @@ struct mem_cgroup_reclaim_cookie {
|
||||
unsigned int generation;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
#ifdef CONFIG_MEMCG
|
||||
/*
|
||||
* All "charge" functions with gfp_mask should use GFP_KERNEL or
|
||||
* (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't
|
||||
@ -72,8 +72,6 @@ extern void mem_cgroup_uncharge_end(void);
|
||||
extern void mem_cgroup_uncharge_page(struct page *page);
|
||||
extern void mem_cgroup_uncharge_cache_page(struct page *page);
|
||||
|
||||
extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
int order);
|
||||
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
|
||||
struct mem_cgroup *memcg);
|
||||
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
|
||||
@ -100,9 +98,9 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
|
||||
|
||||
extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
|
||||
|
||||
extern int
|
||||
mem_cgroup_prepare_migration(struct page *page,
|
||||
struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask);
|
||||
extern void
|
||||
mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
|
||||
struct mem_cgroup **memcgp);
|
||||
extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
|
||||
struct page *oldpage, struct page *newpage, bool migration_ok);
|
||||
|
||||
@ -124,7 +122,7 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
|
||||
extern void mem_cgroup_replace_page_cache(struct page *oldpage,
|
||||
struct page *newpage);
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
extern int do_swap_account;
|
||||
#endif
|
||||
|
||||
@ -182,7 +180,6 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
|
||||
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
||||
gfp_t gfp_mask,
|
||||
unsigned long *total_scanned);
|
||||
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg);
|
||||
|
||||
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
@ -193,7 +190,7 @@ void mem_cgroup_split_huge_fixup(struct page *head);
|
||||
bool mem_cgroup_bad_page_check(struct page *page);
|
||||
void mem_cgroup_print_bad_page(struct page *page);
|
||||
#endif
|
||||
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
|
||||
#else /* CONFIG_MEMCG */
|
||||
struct mem_cgroup;
|
||||
|
||||
static inline int mem_cgroup_newpage_charge(struct page *page,
|
||||
@ -279,11 +276,10 @@ static inline struct cgroup_subsys_state
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline int
|
||||
static inline void
|
||||
mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
|
||||
struct mem_cgroup **memcgp, gfp_t gfp_mask)
|
||||
struct mem_cgroup **memcgp)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
|
||||
@ -366,12 +362,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline
|
||||
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_split_huge_fixup(struct page *head)
|
||||
{
|
||||
}
|
||||
@ -384,9 +374,9 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
|
||||
struct page *newpage)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
|
||||
#endif /* CONFIG_MEMCG */
|
||||
|
||||
#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
|
||||
#if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
|
||||
static inline bool
|
||||
mem_cgroup_bad_page_check(struct page *page)
|
||||
{
|
||||
@ -406,7 +396,7 @@ enum {
|
||||
};
|
||||
|
||||
struct sock;
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
void sock_update_memcg(struct sock *sk);
|
||||
void sock_release_memcg(struct sock *sk);
|
||||
#else
|
||||
@ -416,6 +406,6 @@ static inline void sock_update_memcg(struct sock *sk)
|
||||
static inline void sock_release_memcg(struct sock *sk)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
#endif /* _LINUX_MEMCONTROL_H */
|
||||
|
||||
|
@ -15,7 +15,7 @@ extern int migrate_page(struct address_space *,
|
||||
extern int migrate_pages(struct list_head *l, new_page_t x,
|
||||
unsigned long private, bool offlining,
|
||||
enum migrate_mode mode);
|
||||
extern int migrate_huge_pages(struct list_head *l, new_page_t x,
|
||||
extern int migrate_huge_page(struct page *, new_page_t x,
|
||||
unsigned long private, bool offlining,
|
||||
enum migrate_mode mode);
|
||||
|
||||
@ -36,7 +36,7 @@ static inline void putback_lru_pages(struct list_head *l) {}
|
||||
static inline int migrate_pages(struct list_head *l, new_page_t x,
|
||||
unsigned long private, bool offlining,
|
||||
enum migrate_mode mode) { return -ENOSYS; }
|
||||
static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
|
||||
static inline int migrate_huge_page(struct page *page, new_page_t x,
|
||||
unsigned long private, bool offlining,
|
||||
enum migrate_mode mode) { return -ENOSYS; }
|
||||
|
||||
|
@ -805,6 +805,17 @@ static inline void *page_rmapping(struct page *page)
|
||||
return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS);
|
||||
}
|
||||
|
||||
extern struct address_space *__page_file_mapping(struct page *);
|
||||
|
||||
static inline
|
||||
struct address_space *page_file_mapping(struct page *page)
|
||||
{
|
||||
if (unlikely(PageSwapCache(page)))
|
||||
return __page_file_mapping(page);
|
||||
|
||||
return page->mapping;
|
||||
}
|
||||
|
||||
static inline int PageAnon(struct page *page)
|
||||
{
|
||||
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
|
||||
@ -821,6 +832,20 @@ static inline pgoff_t page_index(struct page *page)
|
||||
return page->index;
|
||||
}
|
||||
|
||||
extern pgoff_t __page_file_index(struct page *page);
|
||||
|
||||
/*
|
||||
* Return the file index of the page. Regular pagecache pages use ->index
|
||||
* whereas swapcache pages use swp_offset(->private)
|
||||
*/
|
||||
static inline pgoff_t page_file_index(struct page *page)
|
||||
{
|
||||
if (unlikely(PageSwapCache(page)))
|
||||
return __page_file_index(page);
|
||||
|
||||
return page->index;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true if this page is mapped into pagetables.
|
||||
*/
|
||||
@ -994,6 +1019,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
struct page **pages, struct vm_area_struct **vmas);
|
||||
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
|
||||
struct page **pages);
|
||||
struct kvec;
|
||||
int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
|
||||
struct page **pages);
|
||||
int get_kernel_page(unsigned long start, int write, struct page **pages);
|
||||
struct page *get_dump_page(unsigned long addr);
|
||||
|
||||
extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
|
||||
@ -1331,6 +1360,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
|
||||
extern void setup_per_cpu_pageset(void);
|
||||
|
||||
extern void zone_pcp_update(struct zone *zone);
|
||||
extern void zone_pcp_reset(struct zone *zone);
|
||||
|
||||
/* nommu.c */
|
||||
extern atomic_long_t mmap_pages_allocated;
|
||||
@ -1528,6 +1558,7 @@ void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
|
||||
static inline void vm_stat_account(struct mm_struct *mm,
|
||||
unsigned long flags, struct file *file, long pages)
|
||||
{
|
||||
mm->total_vm += pages;
|
||||
}
|
||||
#endif /* CONFIG_PROC_FS */
|
||||
|
||||
|
@ -54,6 +54,15 @@ struct page {
|
||||
union {
|
||||
pgoff_t index; /* Our offset within mapping. */
|
||||
void *freelist; /* slub/slob first free object */
|
||||
bool pfmemalloc; /* If set by the page allocator,
|
||||
* ALLOC_NO_WATERMARKS was set
|
||||
* and the low watermark was not
|
||||
* met implying that the system
|
||||
* is under some pressure. The
|
||||
* caller should try to ensure
|
||||
* this page is only used to
|
||||
* free other pages.
|
||||
*/
|
||||
};
|
||||
|
||||
union {
|
||||
|
@ -201,7 +201,7 @@ struct zone_reclaim_stat {
|
||||
struct lruvec {
|
||||
struct list_head lists[NR_LRU_LISTS];
|
||||
struct zone_reclaim_stat reclaim_stat;
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
#ifdef CONFIG_MEMCG
|
||||
struct zone *zone;
|
||||
#endif
|
||||
};
|
||||
@ -209,7 +209,6 @@ struct lruvec {
|
||||
/* Mask used at gathering information at once (see memcontrol.c) */
|
||||
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
|
||||
#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
|
||||
#define LRU_ALL_EVICTABLE (LRU_ALL_FILE | LRU_ALL_ANON)
|
||||
#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
|
||||
|
||||
/* Isolate clean file */
|
||||
@ -369,6 +368,10 @@ struct zone {
|
||||
*/
|
||||
spinlock_t lock;
|
||||
int all_unreclaimable; /* All pages pinned */
|
||||
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
|
||||
/* pfn where the last incremental compaction isolated free pages */
|
||||
unsigned long compact_cached_free_pfn;
|
||||
#endif
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
/* see spanned/present_pages for more description */
|
||||
seqlock_t span_seqlock;
|
||||
@ -475,6 +478,14 @@ struct zone {
|
||||
* rarely used fields:
|
||||
*/
|
||||
const char *name;
|
||||
#ifdef CONFIG_MEMORY_ISOLATION
|
||||
/*
|
||||
* the number of MIGRATE_ISOLATE *pageblock*.
|
||||
* We need this for free page counting. Look at zone_watermark_ok_safe.
|
||||
* It's protected by zone->lock
|
||||
*/
|
||||
int nr_pageblock_isolate;
|
||||
#endif
|
||||
} ____cacheline_internodealigned_in_smp;
|
||||
|
||||
typedef enum {
|
||||
@ -671,7 +682,7 @@ typedef struct pglist_data {
|
||||
int nr_zones;
|
||||
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
|
||||
struct page *node_mem_map;
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
#ifdef CONFIG_MEMCG
|
||||
struct page_cgroup *node_page_cgroup;
|
||||
#endif
|
||||
#endif
|
||||
@ -694,6 +705,7 @@ typedef struct pglist_data {
|
||||
range, including holes */
|
||||
int node_id;
|
||||
wait_queue_head_t kswapd_wait;
|
||||
wait_queue_head_t pfmemalloc_wait;
|
||||
struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */
|
||||
int kswapd_max_order;
|
||||
enum zone_type classzone_idx;
|
||||
@ -718,7 +730,7 @@ typedef struct pglist_data {
|
||||
#include <linux/memory_hotplug.h>
|
||||
|
||||
extern struct mutex zonelists_mutex;
|
||||
void build_all_zonelists(void *data);
|
||||
void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
|
||||
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
|
||||
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
|
||||
int classzone_idx, int alloc_flags);
|
||||
@ -736,7 +748,7 @@ extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
|
||||
|
||||
static inline struct zone *lruvec_zone(struct lruvec *lruvec)
|
||||
{
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
#ifdef CONFIG_MEMCG
|
||||
return lruvec->zone;
|
||||
#else
|
||||
return container_of(lruvec, struct zone, lruvec);
|
||||
@ -773,7 +785,7 @@ extern int movable_zone;
|
||||
|
||||
static inline int zone_movable_is_highmem(void)
|
||||
{
|
||||
#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE)
|
||||
#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
|
||||
return movable_zone == ZONE_HIGHMEM;
|
||||
#else
|
||||
return 0;
|
||||
@ -1052,7 +1064,7 @@ struct mem_section {
|
||||
|
||||
/* See declaration of similar field in struct zone */
|
||||
unsigned long *pageblock_flags;
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
#ifdef CONFIG_MEMCG
|
||||
/*
|
||||
* If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
|
||||
* section. (see memcontrol.h/page_cgroup.h about this.)
|
||||
|
@ -473,10 +473,10 @@ extern ssize_t nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t,
|
||||
unsigned long);
|
||||
extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
|
||||
const struct iovec *iov, unsigned long nr_segs,
|
||||
loff_t pos);
|
||||
loff_t pos, bool uio);
|
||||
extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
|
||||
const struct iovec *iov, unsigned long nr_segs,
|
||||
loff_t pos);
|
||||
loff_t pos, bool uio);
|
||||
|
||||
/*
|
||||
* linux/fs/nfs/dir.c
|
||||
|
@ -40,15 +40,36 @@ enum oom_constraint {
|
||||
CONSTRAINT_MEMCG,
|
||||
};
|
||||
|
||||
enum oom_scan_t {
	OOM_SCAN_OK,		/* scan thread and find its badness */
	OOM_SCAN_CONTINUE,	/* do not consider thread for oom kill */
	OOM_SCAN_ABORT,		/* abort the iteration and return */
	OOM_SCAN_SELECT,	/* always select this thread first */
};
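To show how these return values are meant to be consumed, a simplified, hypothetical selection loop follows (locking elided); it uses oom_scan_process_thread() and oom_badness() declared below and is only a sketch of what the in-tree consumer, select_bad_process() in mm/oom_kill.c, does.

static struct task_struct *example_pick_victim(unsigned long totalpages,
					       const nodemask_t *nodemask)
{
	struct task_struct *p, *chosen = NULL;
	unsigned long points, chosen_points = 0;

	for_each_process(p) {
		switch (oom_scan_process_thread(p, totalpages, nodemask, false)) {
		case OOM_SCAN_SELECT:
			return p;		/* always prefer this task */
		case OOM_SCAN_CONTINUE:
			continue;		/* never consider this task */
		case OOM_SCAN_ABORT:
			return NULL;		/* abandon the whole scan */
		case OOM_SCAN_OK:
			break;			/* fall through to scoring */
		}
		points = oom_badness(p, NULL, nodemask, totalpages);
		if (points > chosen_points) {
			chosen = p;
			chosen_points = points;
		}
	}
	return chosen;
}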
|
||||
|
||||
extern void compare_swap_oom_score_adj(int old_val, int new_val);
|
||||
extern int test_set_oom_score_adj(int new_val);
|
||||
|
||||
extern unsigned long oom_badness(struct task_struct *p,
|
||||
struct mem_cgroup *memcg, const nodemask_t *nodemask,
|
||||
unsigned long totalpages);
|
||||
extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
unsigned int points, unsigned long totalpages,
|
||||
struct mem_cgroup *memcg, nodemask_t *nodemask,
|
||||
const char *message);
|
||||
|
||||
extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
|
||||
extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
|
||||
|
||||
extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
|
||||
int order, const nodemask_t *nodemask);
|
||||
|
||||
extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
||||
unsigned long totalpages, const nodemask_t *nodemask,
|
||||
bool force_kill);
|
||||
extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
int order);
|
||||
|
||||
extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
||||
int order, nodemask_t *mask, bool force_kill);
|
||||
extern int register_oom_notifier(struct notifier_block *nb);
|
||||
|
@ -7,6 +7,7 @@
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/bug.h>
|
||||
#include <linux/mmdebug.h>
|
||||
#ifndef __GENERATING_BOUNDS_H
|
||||
#include <linux/mm_types.h>
|
||||
#include <generated/bounds.h>
|
||||
@ -453,6 +454,34 @@ static inline int PageTransTail(struct page *page)
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* If network-based swap is enabled, sl*b must keep track of whether pages
|
||||
* were allocated from pfmemalloc reserves.
|
||||
*/
|
||||
static inline int PageSlabPfmemalloc(struct page *page)
|
||||
{
|
||||
VM_BUG_ON(!PageSlab(page));
|
||||
return PageActive(page);
|
||||
}
|
||||
|
||||
static inline void SetPageSlabPfmemalloc(struct page *page)
|
||||
{
|
||||
VM_BUG_ON(!PageSlab(page));
|
||||
SetPageActive(page);
|
||||
}
|
||||
|
||||
static inline void __ClearPageSlabPfmemalloc(struct page *page)
|
||||
{
|
||||
VM_BUG_ON(!PageSlab(page));
|
||||
__ClearPageActive(page);
|
||||
}
|
||||
|
||||
static inline void ClearPageSlabPfmemalloc(struct page *page)
|
||||
{
|
||||
VM_BUG_ON(!PageSlab(page));
|
||||
ClearPageActive(page);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
#define __PG_MLOCKED (1 << PG_mlocked)
|
||||
#else
|
||||
|
@ -1,6 +1,11 @@
|
||||
#ifndef __LINUX_PAGEISOLATION_H
|
||||
#define __LINUX_PAGEISOLATION_H
|
||||
|
||||
|
||||
bool has_unmovable_pages(struct zone *zone, struct page *page, int count);
|
||||
void set_pageblock_migratetype(struct page *page, int migratetype);
|
||||
int move_freepages_block(struct zone *zone, struct page *page,
|
||||
int migratetype);
|
||||
/*
|
||||
* Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
|
||||
* If specified range includes migrate types other than MOVABLE or CMA,
|
||||
@ -10,7 +15,7 @@
|
||||
* free all pages in the range. test_pages_isolated() can be used to
* test it.
|
||||
*/
|
||||
extern int
|
||||
int
|
||||
start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
|
||||
unsigned migratetype);
|
||||
|
||||
@ -18,7 +23,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
|
||||
* Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
|
||||
* target range is [start_pfn, end_pfn)
|
||||
*/
|
||||
extern int
|
||||
int
|
||||
undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
|
||||
unsigned migratetype);
|
||||
|
||||
@ -30,8 +35,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn);
|
||||
/*
|
||||
* Internal functions. Changes pageblock's migrate type.
|
||||
*/
|
||||
extern int set_migratetype_isolate(struct page *page);
|
||||
extern void unset_migratetype_isolate(struct page *page, unsigned migratetype);
|
||||
int set_migratetype_isolate(struct page *page);
|
||||
void unset_migratetype_isolate(struct page *page, unsigned migratetype);
|
||||
|
||||
|
||||
#endif
|
||||
|
@ -12,7 +12,7 @@ enum {
|
||||
#ifndef __GENERATING_BOUNDS_H
|
||||
#include <generated/bounds.h>
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
#ifdef CONFIG_MEMCG
|
||||
#include <linux/bit_spinlock.h>
|
||||
|
||||
/*
|
||||
@ -82,7 +82,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
|
||||
bit_spin_unlock(PCG_LOCK, &pc->flags);
|
||||
}
|
||||
|
||||
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
|
||||
#else /* CONFIG_MEMCG */
|
||||
struct page_cgroup;
|
||||
|
||||
static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
|
||||
@ -102,11 +102,11 @@ static inline void __init page_cgroup_init_flatmem(void)
|
||||
{
|
||||
}
|
||||
|
||||
#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
|
||||
#endif /* CONFIG_MEMCG */
|
||||
|
||||
#include <linux/swap.h>
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
|
||||
unsigned short old, unsigned short new);
|
||||
extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
|
||||
@ -138,7 +138,7 @@ static inline void swap_cgroup_swapoff(int type)
|
||||
return;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */
|
||||
#endif /* CONFIG_MEMCG_SWAP */
|
||||
|
||||
#endif /* !__GENERATING_BOUNDS_H */
|
||||
|
||||
|
@ -286,6 +286,11 @@ static inline loff_t page_offset(struct page *page)
|
||||
return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
|
||||
}
|
||||
|
||||
static inline loff_t page_file_offset(struct page *page)
|
||||
{
|
||||
return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT;
|
||||
}
|
||||
|
||||
extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
|
||||
unsigned long address);
|
||||
|
||||
|
@ -1584,7 +1584,7 @@ struct task_struct {
|
||||
/* bitmask and counter of trace recursion */
|
||||
unsigned long trace_recursion;
|
||||
#endif /* CONFIG_TRACING */
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
|
||||
#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
|
||||
struct memcg_batch_info {
|
||||
int do_batch; /* incremented when batch uncharge started */
|
||||
struct mem_cgroup *memcg; /* target memcg of uncharge */
|
||||
@ -1894,6 +1894,13 @@ static inline void rcu_copy_process(struct task_struct *p)
|
||||
|
||||
#endif
|
||||
|
||||
static inline void tsk_restore_flags(struct task_struct *task,
|
||||
unsigned long orig_flags, unsigned long flags)
|
||||
{
|
||||
task->flags &= ~flags;
|
||||
task->flags |= orig_flags & flags;
|
||||
}
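A hedged sketch of the save/mask/restore pattern this helper enables; the wrapper below is hypothetical, and the in-tree user added by this series is __do_softirq() (shown further down).

static void example_run_without_memalloc(void (*fn)(void))
{
	unsigned long old_flags = current->flags;

	current->flags &= ~PF_MEMALLOC;		/* mask for the duration of fn() */
	fn();
	/* Restore only the bit we masked; other flag changes made meanwhile stick. */
	tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}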
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
extern void do_set_cpus_allowed(struct task_struct *p,
|
||||
const struct cpumask *new_mask);
|
||||
|
@ -20,7 +20,6 @@ struct shrink_control {
|
||||
* 'nr_to_scan' entries and attempt to free them up. It should return
|
||||
* the number of objects which remain in the cache. If it returns -1, it means
|
||||
* it cannot do any scanning at this time (eg. there is a risk of deadlock).
|
||||
* The callback must not return -1 if nr_to_scan is zero.
|
||||
*
|
||||
* The 'gfpmask' refers to the allocation we are currently trying to
|
||||
* fulfil.
|
||||
|
@ -462,6 +462,7 @@ struct sk_buff {
|
||||
#ifdef CONFIG_IPV6_NDISC_NODETYPE
|
||||
__u8 ndisc_nodetype:2;
|
||||
#endif
|
||||
__u8 pfmemalloc:1;
|
||||
__u8 ooo_okay:1;
|
||||
__u8 l4_rxhash:1;
|
||||
__u8 wifi_acked_valid:1;
|
||||
@ -502,6 +503,15 @@ struct sk_buff {
|
||||
#include <linux/slab.h>
|
||||
|
||||
|
||||
#define SKB_ALLOC_FCLONE 0x01
|
||||
#define SKB_ALLOC_RX 0x02
|
||||
|
||||
/* Returns true if the skb was allocated from PFMEMALLOC reserves */
|
||||
static inline bool skb_pfmemalloc(const struct sk_buff *skb)
|
||||
{
|
||||
return unlikely(skb->pfmemalloc);
|
||||
}
|
||||
|
||||
/*
|
||||
* skb might have a dst pointer attached, refcounted or not.
|
||||
* _skb_refdst low order bit is set if refcount was _not_ taken
|
||||
@ -565,7 +575,7 @@ extern bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
|
||||
bool *fragstolen, int *delta_truesize);
|
||||
|
||||
extern struct sk_buff *__alloc_skb(unsigned int size,
|
||||
gfp_t priority, int fclone, int node);
|
||||
gfp_t priority, int flags, int node);
|
||||
extern struct sk_buff *build_skb(void *data, unsigned int frag_size);
|
||||
static inline struct sk_buff *alloc_skb(unsigned int size,
|
||||
gfp_t priority)
|
||||
@ -576,7 +586,7 @@ static inline struct sk_buff *alloc_skb(unsigned int size,
|
||||
static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
|
||||
gfp_t priority)
|
||||
{
|
||||
return __alloc_skb(size, priority, 1, NUMA_NO_NODE);
|
||||
return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
|
||||
}
|
||||
|
||||
extern void skb_recycle(struct sk_buff *skb);
|
||||
@ -1237,6 +1247,17 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
|
||||
{
|
||||
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
|
||||
|
||||
/*
|
||||
* Propagate page->pfmemalloc to the skb if we can. The problem is
|
||||
* that not all callers have unique ownership of the page. If
|
||||
* pfmemalloc is set, we check the mapping as a mapping implies
|
||||
* page->index is set (index and pfmemalloc share space).
|
||||
* If it's a valid mapping, we cannot use page->pfmemalloc but we
|
||||
* do not lose pfmemalloc information as the pages would not be
|
||||
* allocated using __GFP_MEMALLOC.
|
||||
*/
|
||||
if (page->pfmemalloc && !page->mapping)
|
||||
skb->pfmemalloc = true;
|
||||
frag->page.p = page;
|
||||
frag->page_offset = off;
|
||||
skb_frag_size_set(frag, size);
|
||||
@ -1753,6 +1774,61 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
|
||||
return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
|
||||
}
|
||||
|
||||
/*
|
||||
* __skb_alloc_pages - allocate pages for ps-rx on a skb and preserve pfmemalloc data
|
||||
* @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX
|
||||
* @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used
|
||||
* @order: size of the allocation
|
||||
*
|
||||
* Allocate a new page.
|
||||
*
|
||||
* %NULL is returned if there is no free memory.
|
||||
*/
|
||||
static inline struct page *__skb_alloc_pages(gfp_t gfp_mask,
|
||||
struct sk_buff *skb,
|
||||
unsigned int order)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
gfp_mask |= __GFP_COLD;
|
||||
|
||||
if (!(gfp_mask & __GFP_NOMEMALLOC))
|
||||
gfp_mask |= __GFP_MEMALLOC;
|
||||
|
||||
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
|
||||
if (skb && page && page->pfmemalloc)
|
||||
skb->pfmemalloc = true;
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
/**
|
||||
* __skb_alloc_page - allocate a page for ps-rx for a given skb and preserve pfmemalloc data
|
||||
* @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX
|
||||
* @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used
|
||||
*
|
||||
* Allocate a new page.
|
||||
*
|
||||
* %NULL is returned if there is no free memory.
|
||||
*/
|
||||
static inline struct page *__skb_alloc_page(gfp_t gfp_mask,
|
||||
struct sk_buff *skb)
|
||||
{
|
||||
return __skb_alloc_pages(gfp_mask, skb, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page
|
||||
* @page: The page that was allocated from skb_alloc_page
|
||||
* @skb: The skb that may need pfmemalloc set
|
||||
*/
|
||||
static inline void skb_propagate_pfmemalloc(struct page *page,
|
||||
struct sk_buff *skb)
|
||||
{
|
||||
if (page && page->pfmemalloc)
|
||||
skb->pfmemalloc = true;
|
||||
}
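A hedged sketch of the intended call sequence in a receive path; the driver function and buffer sizes below are hypothetical. The page is allocated before any skb exists, attached later, and the pfmemalloc state is then propagated so memalloc sockets can still be serviced under memory pressure.

static struct sk_buff *example_rx_build_skb(struct net_device *dev)
{
	struct page *page;
	struct sk_buff *skb;

	page = __skb_alloc_page(GFP_ATOMIC, NULL);	/* no skb to mark yet */
	if (!page)
		return NULL;

	skb = netdev_alloc_skb_ip_align(dev, 128);	/* hypothetical headroom */
	if (!skb) {
		put_page(page);
		return NULL;
	}

	skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
	skb_propagate_pfmemalloc(page, skb);		/* carry reserve state over */
	return skb;
}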
|
||||
|
||||
/**
|
||||
* skb_frag_page - retrieve the page referred to by a paged fragment
|
||||
* @frag: the paged fragment
|
||||
|
@ -174,6 +174,8 @@ struct rpc_xprt {
|
||||
unsigned long state; /* transport state */
|
||||
unsigned char shutdown : 1, /* being shut down */
|
||||
resvport : 1; /* use a reserved port */
|
||||
unsigned int swapper; /* we're swapping over this
|
||||
transport */
|
||||
unsigned int bind_index; /* bind function index */
|
||||
|
||||
/*
|
||||
@ -316,6 +318,7 @@ void xprt_release_rqst_cong(struct rpc_task *task);
|
||||
void xprt_disconnect_done(struct rpc_xprt *xprt);
|
||||
void xprt_force_disconnect(struct rpc_xprt *xprt);
|
||||
void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
|
||||
int xs_swapper(struct rpc_xprt *xprt, int enable);
|
||||
|
||||
/*
|
||||
* Reserved bit positions in xprt->state
|
||||
|
@ -151,6 +151,7 @@ enum {
|
||||
SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
|
||||
SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
|
||||
SWP_BLKDEV = (1 << 6), /* its a block device */
|
||||
SWP_FILE = (1 << 7), /* set after swap_activate success */
|
||||
/* add others here before... */
|
||||
SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
|
||||
};
|
||||
@ -301,7 +302,7 @@ static inline void scan_unevictable_unregister_node(struct node *node)
|
||||
|
||||
extern int kswapd_run(int nid);
|
||||
extern void kswapd_stop(int nid);
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
#ifdef CONFIG_MEMCG
|
||||
extern int mem_cgroup_swappiness(struct mem_cgroup *mem);
|
||||
#else
|
||||
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
|
||||
@ -309,7 +310,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
|
||||
return vm_swappiness;
|
||||
}
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
extern void mem_cgroup_uncharge_swap(swp_entry_t ent);
|
||||
#else
|
||||
static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
|
||||
@ -320,8 +321,14 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
|
||||
/* linux/mm/page_io.c */
|
||||
extern int swap_readpage(struct page *);
|
||||
extern int swap_writepage(struct page *page, struct writeback_control *wbc);
|
||||
extern int swap_set_page_dirty(struct page *page);
|
||||
extern void end_swap_bio_read(struct bio *bio, int err);
|
||||
|
||||
int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
|
||||
unsigned long nr_pages, sector_t start_block);
|
||||
int generic_swapfile_activate(struct swap_info_struct *, struct file *,
|
||||
sector_t *);
|
||||
|
||||
/* linux/mm/swap_state.c */
|
||||
extern struct address_space swapper_space;
|
||||
#define total_swapcache_pages swapper_space.nrpages
|
||||
@ -356,11 +363,12 @@ extern unsigned int count_swap_pages(int, int);
|
||||
extern sector_t map_swap_page(struct page *, struct block_device **);
|
||||
extern sector_t swapdev_block(int, pgoff_t);
|
||||
extern int page_swapcount(struct page *);
|
||||
extern struct swap_info_struct *page_swap_info(struct page *);
|
||||
extern int reuse_swap_page(struct page *);
|
||||
extern int try_to_free_swap(struct page *);
|
||||
struct backing_dev_info;
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
#ifdef CONFIG_MEMCG
|
||||
extern void
|
||||
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
|
||||
#else
|
||||
|
@ -30,6 +30,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
|
||||
FOR_ALL_ZONES(PGSTEAL_DIRECT),
|
||||
FOR_ALL_ZONES(PGSCAN_KSWAPD),
|
||||
FOR_ALL_ZONES(PGSCAN_DIRECT),
|
||||
PGSCAN_DIRECT_THROTTLE,
|
||||
#ifdef CONFIG_NUMA
|
||||
PGSCAN_ZONE_RECLAIM_FAILED,
|
||||
#endif
|
||||
|
@ -179,11 +179,6 @@ extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
|
||||
#define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d)
|
||||
#define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d))
|
||||
|
||||
static inline void zap_zone_vm_stats(struct zone *zone)
|
||||
{
|
||||
memset(zone->vm_stat, 0, sizeof(zone->vm_stat));
|
||||
}
|
||||
|
||||
extern void inc_zone_state(struct zone *, enum zone_stat_item);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
@ -189,9 +189,4 @@ void tag_pages_for_writeback(struct address_space *mapping,
|
||||
|
||||
void account_page_redirty(struct page *page);
|
||||
|
||||
/* pdflush.c */
|
||||
extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
|
||||
read-only. */
|
||||
|
||||
|
||||
#endif /* WRITEBACK_H */
|
||||
|
@ -621,6 +621,7 @@ enum sock_flags {
|
||||
SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
|
||||
SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
|
||||
SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
|
||||
SOCK_MEMALLOC, /* VM depends on this socket for swapping */
|
||||
SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */
|
||||
SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */
|
||||
SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */
|
||||
@ -658,6 +659,26 @@ static inline bool sock_flag(const struct sock *sk, enum sock_flags flag)
|
||||
return test_bit(flag, &sk->sk_flags);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NET
|
||||
extern struct static_key memalloc_socks;
|
||||
static inline int sk_memalloc_socks(void)
|
||||
{
|
||||
return static_key_false(&memalloc_socks);
|
||||
}
|
||||
#else
|
||||
|
||||
static inline int sk_memalloc_socks(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static inline gfp_t sk_gfp_atomic(struct sock *sk, gfp_t gfp_mask)
|
||||
{
|
||||
return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC);
|
||||
}
|
||||
|
||||
static inline void sk_acceptq_removed(struct sock *sk)
|
||||
{
|
||||
sk->sk_ack_backlog--;
|
||||
@ -733,8 +754,13 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
|
||||
|
||||
static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
if (sk_memalloc_socks() && skb_pfmemalloc(skb))
|
||||
return __sk_backlog_rcv(sk, skb);
|
||||
|
||||
return sk->sk_backlog_rcv(sk, skb);
|
||||
}
|
||||
|
||||
@ -798,6 +824,8 @@ extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
|
||||
extern void sk_stream_wait_close(struct sock *sk, long timeo_p);
|
||||
extern int sk_stream_error(struct sock *sk, int flags, int err);
|
||||
extern void sk_stream_kill_queues(struct sock *sk);
|
||||
extern void sk_set_memalloc(struct sock *sk);
|
||||
extern void sk_clear_memalloc(struct sock *sk);
|
||||
|
||||
extern int sk_wait_data(struct sock *sk, long *timeo);
|
||||
|
||||
@ -913,7 +941,7 @@ struct proto {
|
||||
#ifdef SOCK_REFCNT_DEBUG
|
||||
atomic_t socks;
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
/*
|
||||
* cgroup specific init/deinit functions. Called once for all
|
||||
* protocols that implement it, from cgroups populate function.
|
||||
@ -994,7 +1022,7 @@ inline void sk_refcnt_debug_release(const struct sock *sk)
|
||||
#define sk_refcnt_debug_release(sk) do { } while (0)
|
||||
#endif /* SOCK_REFCNT_DEBUG */
|
||||
|
||||
#if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_NET)
|
||||
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_NET)
|
||||
extern struct static_key memcg_socket_limit_enabled;
|
||||
static inline struct cg_proto *parent_cg_proto(struct proto *proto,
|
||||
struct cg_proto *cg_proto)
|
||||
@ -1301,12 +1329,14 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size)
|
||||
__sk_mem_schedule(sk, size, SK_MEM_SEND);
|
||||
}
|
||||
|
||||
static inline bool sk_rmem_schedule(struct sock *sk, int size)
|
||||
static inline bool
|
||||
sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size)
|
||||
{
|
||||
if (!sk_has_account(sk))
|
||||
return true;
|
||||
return size <= sk->sk_forward_alloc ||
|
||||
__sk_mem_schedule(sk, size, SK_MEM_RECV);
|
||||
return size<= sk->sk_forward_alloc ||
|
||||
__sk_mem_schedule(sk, size, SK_MEM_RECV) ||
|
||||
skb_pfmemalloc(skb);
|
||||
}
|
||||
|
||||
static inline void sk_mem_reclaim(struct sock *sk)
|
||||
|
@ -30,6 +30,7 @@
|
||||
{(unsigned long)__GFP_COMP, "GFP_COMP"}, \
|
||||
{(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \
|
||||
{(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \
|
||||
{(unsigned long)__GFP_MEMALLOC, "GFP_MEMALLOC"}, \
|
||||
{(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \
|
||||
{(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \
|
||||
{(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
|
||||
|
29
init/Kconfig
@ -686,7 +686,7 @@ config RESOURCE_COUNTERS
|
||||
This option enables controller independent resource accounting
|
||||
infrastructure that works with cgroups.
|
||||
|
||||
config CGROUP_MEM_RES_CTLR
|
||||
config MEMCG
|
||||
bool "Memory Resource Controller for Control Groups"
|
||||
depends on RESOURCE_COUNTERS
|
||||
select MM_OWNER
|
||||
@ -709,9 +709,9 @@ config CGROUP_MEM_RES_CTLR
|
||||
This config option also selects MM_OWNER config option, which
|
||||
could in turn add some fork/exit overhead.
|
||||
|
||||
config CGROUP_MEM_RES_CTLR_SWAP
|
||||
config MEMCG_SWAP
|
||||
bool "Memory Resource Controller Swap Extension"
|
||||
depends on CGROUP_MEM_RES_CTLR && SWAP
|
||||
depends on MEMCG && SWAP
|
||||
help
|
||||
Add swap management feature to memory resource controller. When you
|
||||
enable this, you can limit mem+swap usage per cgroup. In other words,
|
||||
@ -726,9 +726,9 @@ config CGROUP_MEM_RES_CTLR_SWAP
|
||||
if boot option "swapaccount=0" is set, swap will not be accounted.
|
||||
Now, memory usage of swap_cgroup is 2 bytes per entry. If the swap page
size is 4096 bytes, that is 512KB per 1GB of swap.
|
||||
config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
|
||||
config MEMCG_SWAP_ENABLED
|
||||
bool "Memory Resource Controller Swap Extension enabled by default"
|
||||
depends on CGROUP_MEM_RES_CTLR_SWAP
|
||||
depends on MEMCG_SWAP
|
||||
default y
|
||||
help
|
||||
Memory Resource Controller Swap Extension comes with its price in
|
||||
@ -739,9 +739,9 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
|
||||
For those who want to have the feature enabled by default should
|
||||
select this option (if, for some reason, they need to disable it
|
||||
then swapaccount=0 does the trick).
|
||||
config CGROUP_MEM_RES_CTLR_KMEM
|
||||
config MEMCG_KMEM
|
||||
bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)"
|
||||
depends on CGROUP_MEM_RES_CTLR && EXPERIMENTAL
|
||||
depends on MEMCG && EXPERIMENTAL
|
||||
default n
|
||||
help
|
||||
The Kernel Memory extension for Memory Resource Controller can limit
|
||||
@ -751,6 +751,21 @@ config CGROUP_MEM_RES_CTLR_KMEM
|
||||
the kmem extension can use it to guarantee that no group of processes
|
||||
will ever exhaust kernel resources alone.
|
||||
|
||||
config CGROUP_HUGETLB
|
||||
bool "HugeTLB Resource Controller for Control Groups"
|
||||
depends on RESOURCE_COUNTERS && HUGETLB_PAGE && EXPERIMENTAL
|
||||
default n
|
||||
help
|
||||
Provides a cgroup Resource Controller for HugeTLB pages.
|
||||
When you enable this, you can put a per cgroup limit on HugeTLB usage.
|
||||
The limit is enforced during page fault. Since HugeTLB doesn't
support page reclaim, enforcing the limit at page fault time implies
that the application will get a SIGBUS signal if it tries to access
HugeTLB pages beyond its limit. This requires the application to know
beforehand how many HugeTLB pages it would need. The control group is
tracked in the third page lru pointer, which means the controller
cannot be used with huge pages smaller than 3 pages.
|
||||
|
||||
config CGROUP_PERF
|
||||
bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
|
||||
depends on PERF_EVENTS && CGROUPS
|
||||
|
@ -506,7 +506,7 @@ asmlinkage void __init start_kernel(void)
|
||||
setup_per_cpu_areas();
|
||||
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
|
||||
|
||||
build_all_zonelists(NULL);
|
||||
build_all_zonelists(NULL, NULL);
|
||||
page_alloc_init();
|
||||
|
||||
printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
|
||||
|
@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu)
|
||||
|
||||
if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
|
||||
mutex_lock(&zonelists_mutex);
|
||||
build_all_zonelists(NULL);
|
||||
build_all_zonelists(NULL, NULL);
|
||||
mutex_unlock(&zonelists_mutex);
|
||||
}
|
||||
#endif
|
||||
|
@ -381,10 +381,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
|
||||
struct file *file;
|
||||
|
||||
if (mpnt->vm_flags & VM_DONTCOPY) {
|
||||
long pages = vma_pages(mpnt);
|
||||
mm->total_vm -= pages;
|
||||
vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
|
||||
-pages);
|
||||
-vma_pages(mpnt));
|
||||
continue;
|
||||
}
|
||||
charge = 0;
|
||||
@ -1308,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
||||
#ifdef CONFIG_DEBUG_MUTEXES
|
||||
p->blocked_on = NULL; /* not blocked yet */
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
#ifdef CONFIG_MEMCG
|
||||
p->memcg_batch.do_batch = 0;
|
||||
p->memcg_batch.memcg = NULL;
|
||||
#endif
|
||||
|
@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void)
|
||||
__u32 pending;
|
||||
int max_restart = MAX_SOFTIRQ_RESTART;
|
||||
int cpu;
|
||||
unsigned long old_flags = current->flags;
|
||||
|
||||
/*
 * Mask out PF_MEMALLOC as the current task context is borrowed for the
 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
 * again if the socket is related to swap.
 */
|
||||
current->flags &= ~PF_MEMALLOC;
|
||||
|
||||
pending = local_softirq_pending();
|
||||
account_system_vtime(current);
|
||||
@ -265,6 +273,7 @@ asmlinkage void __do_softirq(void)
|
||||
|
||||
account_system_vtime(current);
|
||||
__local_bh_enable(SOFTIRQ_OFFSET);
|
||||
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
|
||||
}
|
||||
|
||||
#ifndef __ARCH_HAS_DO_SOFTIRQ
|
||||
|
@ -1101,11 +1101,9 @@ static struct ctl_table vm_table[] = {
|
||||
.extra1 = &zero,
|
||||
},
|
||||
{
|
||||
.procname = "nr_pdflush_threads",
|
||||
.data = &nr_pdflush_threads,
|
||||
.maxlen = sizeof nr_pdflush_threads,
|
||||
.mode = 0444 /* read-only*/,
|
||||
.proc_handler = proc_dointvec,
|
||||
.procname = "nr_pdflush_threads",
|
||||
.mode = 0444 /* read-only */,
|
||||
.proc_handler = pdflush_proc_obsolete,
|
||||
},
|
||||
{
|
||||
.procname = "swappiness",
|
||||
|
@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = {
|
||||
{ CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
|
||||
/* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
|
||||
/* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
|
||||
{ CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
|
||||
/* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
|
||||
{ CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
|
||||
/* VM_PAGEBUF unused */
|
||||
/* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
|
||||
|
@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
|
||||
config NO_BOOTMEM
|
||||
boolean
|
||||
|
||||
config MEMORY_ISOLATION
|
||||
boolean
|
||||
|
||||
# eventually, we can have this option just 'select SPARSEMEM'
|
||||
config MEMORY_HOTPLUG
|
||||
bool "Allow for memory hot-add"
|
||||
select MEMORY_ISOLATION
|
||||
depends on SPARSEMEM || X86_64_ACPI_NUMA
|
||||
depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
|
||||
@ -272,6 +276,7 @@ config MEMORY_FAILURE
|
||||
depends on MMU
|
||||
depends on ARCH_SUPPORTS_MEMORY_FAILURE
|
||||
bool "Enable recovery from hardware memory errors"
|
||||
select MEMORY_ISOLATION
|
||||
help
|
||||
Enables code to recover from some memory failures on systems
|
||||
with MCA recovery. This allows a system to continue running
|
||||
|
@ -15,8 +15,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
|
||||
maccess.o page_alloc.o page-writeback.o \
|
||||
readahead.o swap.o truncate.o vmscan.o shmem.o \
|
||||
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
|
||||
page_isolation.o mm_init.o mmu_context.o percpu.o \
|
||||
compaction.o slab_common.o $(mmu-y)
|
||||
mm_init.o mmu_context.o percpu.o slab_common.o \
|
||||
compaction.o $(mmu-y)
|
||||
|
||||
obj-y += init-mm.o
|
||||
|
||||
@ -49,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
|
||||
obj-$(CONFIG_MIGRATION) += migrate.o
|
||||
obj-$(CONFIG_QUICKLIST) += quicklist.o
|
||||
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
|
||||
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
|
||||
obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
|
||||
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
|
||||
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
|
||||
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
|
||||
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
|
||||
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
|
||||
obj-$(CONFIG_CLEANCACHE) += cleancache.o
|
||||
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
|
||||
|
@ -886,3 +886,23 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(wait_iff_congested);
|
||||
|
||||
int pdflush_proc_obsolete(struct ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	char kbuf[] = "0\n";

	if (*ppos) {
		*lenp = 0;
		return 0;
	}

	if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
		return -EFAULT;
	printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
			table->procname);

	*lenp = 2;
	*ppos += *lenp;
	return 2;
}
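For completeness, a small user-space sketch of what the obsolete knob now looks like from the outside (path as wired up in the sysctl table change above; behaviour per the handler): reads always return "0\n" and trigger the one-time removal warning in the kernel log.

#include <stdio.h>

int main(void)
{
	char buf[16];
	FILE *f = fopen("/proc/sys/vm/nr_pdflush_threads", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("nr_pdflush_threads reads as: %s", buf);
	fclose(f);
	return 0;
}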
|
||||
|
@ -422,6 +422,17 @@ static void isolate_freepages(struct zone *zone,
|
||||
pfn -= pageblock_nr_pages) {
|
||||
unsigned long isolated;
|
||||
|
||||
/*
|
||||
* Skip ahead if another thread is compacting in the area
|
||||
* simultaneously. If we wrapped around, we can only skip
|
||||
* ahead if zone->compact_cached_free_pfn also wrapped to
|
||||
* above our starting point.
|
||||
*/
|
||||
if (cc->order > 0 && (!cc->wrapped ||
|
||||
zone->compact_cached_free_pfn >
|
||||
cc->start_free_pfn))
|
||||
pfn = min(pfn, zone->compact_cached_free_pfn);
|
||||
|
||||
if (!pfn_valid(pfn))
|
||||
continue;
|
||||
|
||||
@ -461,8 +472,11 @@ static void isolate_freepages(struct zone *zone,
|
||||
* looking for free pages, the search will restart here as
|
||||
* page migration may have returned some pages to the allocator
|
||||
*/
|
||||
if (isolated)
|
||||
if (isolated) {
|
||||
high_pfn = max(high_pfn, pfn);
|
||||
if (cc->order > 0)
|
||||
zone->compact_cached_free_pfn = high_pfn;
|
||||
}
|
||||
}
|
||||
|
||||
/* split_free_page does not map the pages */
|
||||
@ -556,6 +570,20 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||
return ISOLATE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the start pfn of the last page block in a zone. This is the starting
|
||||
* point for full compaction of a zone. Compaction searches for free pages from
|
||||
* the end of each zone, while isolate_freepages_block scans forward inside each
|
||||
* page block.
|
||||
*/
|
||||
static unsigned long start_free_pfn(struct zone *zone)
|
||||
{
|
||||
unsigned long free_pfn;
|
||||
free_pfn = zone->zone_start_pfn + zone->spanned_pages;
|
||||
free_pfn &= ~(pageblock_nr_pages-1);
|
||||
return free_pfn;
|
||||
}
|
||||
|
||||
static int compact_finished(struct zone *zone,
|
||||
struct compact_control *cc)
|
||||
{
|
||||
@ -565,8 +593,26 @@ static int compact_finished(struct zone *zone,
|
||||
if (fatal_signal_pending(current))
|
||||
return COMPACT_PARTIAL;
|
||||
|
||||
/* Compaction run completes if the migrate and free scanner meet */
|
||||
if (cc->free_pfn <= cc->migrate_pfn)
|
||||
/*
|
||||
* A full (order == -1) compaction run starts at the beginning and
|
||||
* end of a zone; it completes when the migrate and free scanner meet.
|
||||
* A partial (order > 0) compaction can start with the free scanner
|
||||
* at a random point in the zone, and may have to restart.
|
||||
*/
|
||||
if (cc->free_pfn <= cc->migrate_pfn) {
|
||||
if (cc->order > 0 && !cc->wrapped) {
|
||||
/* We started partway through; restart at the end. */
|
||||
unsigned long free_pfn = start_free_pfn(zone);
|
||||
zone->compact_cached_free_pfn = free_pfn;
|
||||
cc->free_pfn = free_pfn;
|
||||
cc->wrapped = 1;
|
||||
return COMPACT_CONTINUE;
|
||||
}
|
||||
return COMPACT_COMPLETE;
|
||||
}
|
||||
|
||||
/* We wrapped around and ended up where we started. */
|
||||
if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
|
||||
return COMPACT_COMPLETE;
|
||||
|
||||
/*
|
||||
@ -664,8 +710,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||
|
||||
/* Setup to move all movable pages to the end of the zone */
|
||||
cc->migrate_pfn = zone->zone_start_pfn;
|
||||
cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
|
||||
cc->free_pfn &= ~(pageblock_nr_pages-1);
|
||||
|
||||
if (cc->order > 0) {
|
||||
/* Incremental compaction. Start where the last one stopped. */
|
||||
cc->free_pfn = zone->compact_cached_free_pfn;
|
||||
cc->start_free_pfn = cc->free_pfn;
|
||||
} else {
|
||||
/* Order == -1 starts at the end of the zone. */
|
||||
cc->free_pfn = start_free_pfn(zone);
|
||||
}
|
||||
|
||||
migrate_prep_local();
|
||||
|
||||
|
18
mm/fadvise.c
@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
|
||||
spin_unlock(&file->f_lock);
|
||||
break;
|
||||
case POSIX_FADV_WILLNEED:
|
||||
if (!mapping->a_ops->readpage) {
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
|
||||
/* First and last PARTIAL page! */
|
||||
start_index = offset >> PAGE_CACHE_SHIFT;
|
||||
end_index = endbyte >> PAGE_CACHE_SHIFT;
|
||||
@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
|
||||
nrpages = end_index - start_index + 1;
|
||||
if (!nrpages)
|
||||
nrpages = ~0UL;
|
||||
|
||||
ret = force_page_cache_readahead(mapping, file,
|
||||
start_index,
|
||||
nrpages);
|
||||
if (ret > 0)
|
||||
ret = 0;
|
||||
|
||||
/*
|
||||
* Ignore return value because fadvise() shall return
|
||||
* success even if the filesystem can't retrieve a hint.
|
||||
*/
|
||||
force_page_cache_readahead(mapping, file, start_index,
|
||||
nrpages);
|
||||
break;
|
||||
case POSIX_FADV_NOREUSE:
|
||||
break;
|
||||
|
12
mm/highmem.c
@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
|
||||
do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
|
||||
#endif
|
||||
|
||||
struct page *kmap_to_page(void *vaddr)
|
||||
{
|
||||
unsigned long addr = (unsigned long)vaddr;
|
||||
|
||||
if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
|
||||
int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
|
||||
return pte_page(pkmap_page_table[i]);
|
||||
}
|
||||
|
||||
return virt_to_page(addr);
|
||||
}
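A hedged usage sketch (hypothetical caller): kmap_to_page() lets code that only holds a kernel virtual address, possibly obtained from kmap(), recover the backing struct page, which is what the swap-over-network I/O paths in this series rely on.

static void example_note_backing_page(void *buf)
{
	/* Works for both lowmem addresses and kmap()ed highmem (PKMAP) addresses. */
	struct page *page = kmap_to_page(buf);

	pr_info("buffer backed by pfn %lu\n", page_to_pfn(page));
}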
|
||||
|
||||
static void flush_all_zero_pkmaps(void)
|
||||
{
|
||||
int i;
|
||||
|
195
mm/hugetlb.c
@ -24,17 +24,20 @@
|
||||
|
||||
#include <asm/page.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <linux/io.h>
|
||||
#include <asm/tlb.h>
|
||||
|
||||
#include <linux/io.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/hugetlb_cgroup.h>
|
||||
#include <linux/node.h>
|
||||
#include <linux/hugetlb_cgroup.h>
|
||||
#include "internal.h"
|
||||
|
||||
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
|
||||
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
|
||||
unsigned long hugepages_treat_as_movable;
|
||||
|
||||
static int max_hstate;
|
||||
int hugetlb_max_hstate __read_mostly;
|
||||
unsigned int default_hstate_idx;
|
||||
struct hstate hstates[HUGE_MAX_HSTATE];
|
||||
|
||||
@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
|
||||
static unsigned long __initdata default_hstate_max_huge_pages;
|
||||
static unsigned long __initdata default_hstate_size;
|
||||
|
||||
#define for_each_hstate(h) \
|
||||
for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
|
||||
|
||||
/*
|
||||
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
|
||||
*/
|
||||
static DEFINE_SPINLOCK(hugetlb_lock);
|
||||
DEFINE_SPINLOCK(hugetlb_lock);
|
||||
|
||||
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
|
||||
{
|
||||
@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
|
||||
static void enqueue_huge_page(struct hstate *h, struct page *page)
|
||||
{
|
||||
int nid = page_to_nid(page);
|
||||
list_add(&page->lru, &h->hugepage_freelists[nid]);
|
||||
list_move(&page->lru, &h->hugepage_freelists[nid]);
|
||||
h->free_huge_pages++;
|
||||
h->free_huge_pages_node[nid]++;
|
||||
}
|
||||
@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
|
||||
if (list_empty(&h->hugepage_freelists[nid]))
|
||||
return NULL;
|
||||
page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
|
||||
list_del(&page->lru);
|
||||
list_move(&page->lru, &h->hugepage_activelist);
|
||||
set_page_refcounted(page);
|
||||
h->free_huge_pages--;
|
||||
h->free_huge_pages_node[nid]--;
|
||||
@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
|
||||
1 << PG_active | 1 << PG_reserved |
|
||||
1 << PG_private | 1 << PG_writeback);
|
||||
}
|
||||
VM_BUG_ON(hugetlb_cgroup_from_page(page));
|
||||
set_compound_page_dtor(page, NULL);
|
||||
set_page_refcounted(page);
|
||||
arch_release_hugepage(page);
|
||||
@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
|
||||
page->mapping = NULL;
|
||||
BUG_ON(page_count(page));
|
||||
BUG_ON(page_mapcount(page));
|
||||
INIT_LIST_HEAD(&page->lru);
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
hugetlb_cgroup_uncharge_page(hstate_index(h),
|
||||
pages_per_huge_page(h), page);
|
||||
if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
|
||||
/* remove the page from active list */
|
||||
list_del(&page->lru);
|
||||
update_and_free_page(h, page);
|
||||
h->surplus_huge_pages--;
|
||||
h->surplus_huge_pages_node[nid]--;
|
||||
@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
|
||||
|
||||
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
|
||||
{
|
||||
INIT_LIST_HEAD(&page->lru);
|
||||
set_compound_page_dtor(page, free_huge_page);
|
||||
spin_lock(&hugetlb_lock);
|
||||
set_hugetlb_cgroup(page, NULL);
|
||||
h->nr_huge_pages++;
|
||||
h->nr_huge_pages_node[nid]++;
|
||||
spin_unlock(&hugetlb_lock);
|
||||
@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
if (page) {
|
||||
INIT_LIST_HEAD(&page->lru);
|
||||
r_nid = page_to_nid(page);
|
||||
set_compound_page_dtor(page, free_huge_page);
|
||||
set_hugetlb_cgroup(page, NULL);
|
||||
/*
|
||||
* We incremented the global counters already
|
||||
*/
|
||||
@ -993,7 +1001,6 @@ static int gather_surplus_pages(struct hstate *h, int delta)
|
||||
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
|
||||
if ((--needed) < 0)
|
||||
break;
|
||||
list_del(&page->lru);
|
||||
/*
|
||||
* This page is now managed by the hugetlb allocator and has
|
||||
* no users -- drop the buddy allocator's reference.
|
||||
@ -1008,7 +1015,6 @@ static int gather_surplus_pages(struct hstate *h, int delta)
|
||||
/* Free unnecessary surplus pages to the buddy allocator */
|
||||
if (!list_empty(&surplus_list)) {
|
||||
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
|
||||
list_del(&page->lru);
|
||||
put_page(page);
|
||||
}
|
||||
}
|
||||
@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
struct page *page;
|
||||
long chg;
|
||||
int ret, idx;
|
||||
struct hugetlb_cgroup *h_cg;
|
||||
|
||||
idx = hstate_index(h);
|
||||
/*
|
||||
* Processes that did not create the mapping will have no
|
||||
* reserves and will not have accounted against subpool
|
||||
@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
*/
|
||||
chg = vma_needs_reservation(h, vma, addr);
|
||||
if (chg < 0)
|
||||
return ERR_PTR(-VM_FAULT_OOM);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
if (chg)
|
||||
if (hugepage_subpool_get_pages(spool, chg))
|
||||
return ERR_PTR(-VM_FAULT_SIGBUS);
|
||||
return ERR_PTR(-ENOSPC);
|
||||
|
||||
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
|
||||
if (ret) {
|
||||
hugepage_subpool_put_pages(spool, chg);
|
||||
return ERR_PTR(-ENOSPC);
|
||||
}
|
||||
spin_lock(&hugetlb_lock);
|
||||
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
|
||||
spin_unlock(&hugetlb_lock);
|
||||
|
||||
if (!page) {
|
||||
if (page) {
|
||||
/* update page cgroup details */
|
||||
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
|
||||
h_cg, page);
|
||||
spin_unlock(&hugetlb_lock);
|
||||
} else {
|
||||
spin_unlock(&hugetlb_lock);
|
||||
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
|
||||
if (!page) {
|
||||
hugetlb_cgroup_uncharge_cgroup(idx,
|
||||
pages_per_huge_page(h),
|
||||
h_cg);
|
||||
hugepage_subpool_put_pages(spool, chg);
|
||||
return ERR_PTR(-VM_FAULT_SIGBUS);
|
||||
return ERR_PTR(-ENOSPC);
|
||||
}
|
||||
spin_lock(&hugetlb_lock);
|
||||
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
|
||||
h_cg, page);
|
||||
list_move(&page->lru, &h->hugepage_activelist);
|
||||
spin_unlock(&hugetlb_lock);
|
||||
}
|
||||
|
||||
set_page_private(page, (unsigned long)spool);
|
||||
|
||||
vma_commit_reservation(h, vma, addr);
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
|
||||
struct attribute_group *hstate_attr_group)
|
||||
{
|
||||
int retval;
|
||||
int hi = h - hstates;
|
||||
int hi = hstate_index(h);
|
||||
|
||||
hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
|
||||
if (!hstate_kobjs[hi])
|
||||
@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
|
||||
if (!nhs->hugepages_kobj)
|
||||
return; /* no hstate attributes */
|
||||
|
||||
for_each_hstate(h)
|
||||
if (nhs->hstate_kobjs[h - hstates]) {
|
||||
kobject_put(nhs->hstate_kobjs[h - hstates]);
|
||||
nhs->hstate_kobjs[h - hstates] = NULL;
|
||||
for_each_hstate(h) {
|
||||
int idx = hstate_index(h);
|
||||
if (nhs->hstate_kobjs[idx]) {
|
||||
kobject_put(nhs->hstate_kobjs[idx]);
|
||||
nhs->hstate_kobjs[idx] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
kobject_put(nhs->hugepages_kobj);
|
||||
nhs->hugepages_kobj = NULL;
|
||||
@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
|
||||
hugetlb_unregister_all_nodes();
|
||||
|
||||
for_each_hstate(h) {
|
||||
kobject_put(hstate_kobjs[h - hstates]);
|
||||
kobject_put(hstate_kobjs[hstate_index(h)]);
|
||||
}
|
||||
|
||||
kobject_put(hugepages_kobj);
|
||||
@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
|
||||
if (!size_to_hstate(default_hstate_size))
|
||||
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
|
||||
}
|
||||
default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
|
||||
default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
|
||||
if (default_hstate_max_huge_pages)
|
||||
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
|
||||
|
||||
@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
|
||||
printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
|
||||
return;
|
||||
}
|
||||
BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
|
||||
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
|
||||
BUG_ON(order == 0);
|
||||
h = &hstates[max_hstate++];
|
||||
h = &hstates[hugetlb_max_hstate++];
|
||||
h->order = order;
|
||||
h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
|
||||
h->nr_huge_pages = 0;
|
||||
h->free_huge_pages = 0;
|
||||
for (i = 0; i < MAX_NUMNODES; ++i)
|
||||
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
|
||||
INIT_LIST_HEAD(&h->hugepage_activelist);
|
||||
h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
|
||||
h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
|
||||
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
|
||||
huge_page_size(h)/1024);
|
||||
/*
|
||||
* Add cgroup control files only if the huge page consists
|
||||
* of more than two normal pages. This is because we use
|
||||
* page[2].lru.next for storing cgoup details.
|
||||
*/
|
||||
if (order >= HUGETLB_CGROUP_MIN_ORDER)
|
||||
hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
|
||||
|
||||
parsed_hstate = h;
|
||||
}
|
||||
@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
|
||||
static unsigned long *last_mhp;
|
||||
|
||||
/*
|
||||
* !max_hstate means we haven't parsed a hugepagesz= parameter yet,
|
||||
* !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
|
||||
* so this hugepages= parameter goes to the "default hstate".
|
||||
*/
|
||||
if (!max_hstate)
|
||||
if (!hugetlb_max_hstate)
|
||||
mhp = &default_hstate_max_huge_pages;
|
||||
else
|
||||
mhp = &parsed_hstate->max_huge_pages;
|
||||
@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
|
||||
* But we need to allocate >= MAX_ORDER hstates here early to still
|
||||
* use the bootmem allocator.
|
||||
*/
|
||||
if (max_hstate && parsed_hstate->order >= MAX_ORDER)
|
||||
if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
|
||||
hugetlb_hstate_alloc_pages(parsed_hstate);
|
||||
|
||||
last_mhp = mhp;
|
||||
@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
|
||||
return 0;
|
||||
}
|
||||
|
||||
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
|
||||
unsigned long end, struct page *ref_page)
|
||||
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end,
|
||||
struct page *ref_page)
|
||||
{
|
||||
int force_flush = 0;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
unsigned long address;
|
||||
pte_t *ptep;
|
||||
pte_t pte;
|
||||
struct page *page;
|
||||
struct page *tmp;
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
unsigned long sz = huge_page_size(h);
|
||||
|
||||
/*
|
||||
* A page gathering list, protected by per file i_mmap_mutex. The
|
||||
* lock is used to avoid list corruption from multiple unmapping
|
||||
* of the same page since we are using page->lru.
|
||||
*/
|
||||
LIST_HEAD(page_list);
|
||||
|
||||
WARN_ON(!is_vm_hugetlb_page(vma));
|
||||
BUG_ON(start & ~huge_page_mask(h));
|
||||
BUG_ON(end & ~huge_page_mask(h));
|
||||
|
||||
tlb_start_vma(tlb, vma);
|
||||
mmu_notifier_invalidate_range_start(mm, start, end);
|
||||
again:
|
||||
spin_lock(&mm->page_table_lock);
|
||||
for (address = start; address < end; address += sz) {
|
||||
ptep = huge_pte_offset(mm, address);
|
||||
@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
|
||||
}
|
||||
|
||||
pte = huge_ptep_get_and_clear(mm, address, ptep);
|
||||
tlb_remove_tlb_entry(tlb, ptep, address);
|
||||
if (pte_dirty(pte))
|
||||
set_page_dirty(page);
|
||||
list_add(&page->lru, &page_list);
|
||||
|
||||
page_remove_rmap(page);
|
||||
force_flush = !__tlb_remove_page(tlb, page);
|
||||
if (force_flush)
|
||||
break;
|
||||
/* Bail out after unmapping reference page if supplied */
|
||||
if (ref_page)
|
||||
break;
|
||||
}
|
||||
flush_tlb_range(vma, start, end);
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
mmu_notifier_invalidate_range_end(mm, start, end);
|
||||
list_for_each_entry_safe(page, tmp, &page_list, lru) {
|
||||
page_remove_rmap(page);
|
||||
list_del(&page->lru);
|
||||
put_page(page);
|
||||
/*
|
||||
* mmu_gather ran out of room to batch pages, we break out of
|
||||
* the PTE lock to avoid doing the potential expensive TLB invalidate
|
||||
* and page-free while holding it.
|
||||
*/
|
||||
if (force_flush) {
|
||||
force_flush = 0;
|
||||
tlb_flush_mmu(tlb);
|
||||
if (address < end && !ref_page)
|
||||
goto again;
|
||||
}
|
||||
mmu_notifier_invalidate_range_end(mm, start, end);
|
||||
tlb_end_vma(tlb, vma);
|
||||
}
|
||||
|
||||
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
|
||||
struct vm_area_struct *vma, unsigned long start,
|
||||
unsigned long end, struct page *ref_page)
|
||||
{
|
||||
__unmap_hugepage_range(tlb, vma, start, end, ref_page);
|
||||
|
||||
/*
|
||||
* Clear this flag so that x86's huge_pmd_share page_table_shareable
|
||||
* test will fail on a vma being torn down, and not grab a page table
|
||||
* on its way out. We're lucky that the flag has such an appropriate
|
||||
* name, and can in fact be safely cleared here. We could clear it
|
||||
* before the __unmap_hugepage_range above, but all that's necessary
|
||||
* is to clear it before releasing the i_mmap_mutex. This works
|
||||
* because in the context this is called, the VMA is about to be
|
||||
* destroyed and the i_mmap_mutex is held.
|
||||
*/
|
||||
vma->vm_flags &= ~VM_MAYSHARE;
|
||||
}
|
||||
|
||||
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
|
||||
unsigned long end, struct page *ref_page)
|
||||
{
|
||||
mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
|
||||
__unmap_hugepage_range(vma, start, end, ref_page);
|
||||
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
|
||||
struct mm_struct *mm;
|
||||
struct mmu_gather tlb;
|
||||
|
||||
mm = vma->vm_mm;
|
||||
|
||||
tlb_gather_mmu(&tlb, mm, 0);
|
||||
__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
|
||||
tlb_finish_mmu(&tlb, start, end);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
* from the time of fork. This would look like data corruption
|
||||
*/
|
||||
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
|
||||
__unmap_hugepage_range(iter_vma,
|
||||
address, address + huge_page_size(h),
|
||||
page);
|
||||
unmap_hugepage_range(iter_vma, address,
|
||||
address + huge_page_size(h), page);
|
||||
}
|
||||
mutex_unlock(&mapping->i_mmap_mutex);
|
||||
|
||||
@ -2496,6 +2560,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
new_page = alloc_huge_page(vma, address, outside_reserve);
|
||||
|
||||
if (IS_ERR(new_page)) {
|
||||
long err = PTR_ERR(new_page);
|
||||
page_cache_release(old_page);
|
||||
|
||||
/*
|
||||
@ -2524,7 +2589,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
|
||||
/* Caller expects lock to be held */
|
||||
spin_lock(&mm->page_table_lock);
|
||||
return -PTR_ERR(new_page);
|
||||
if (err == -ENOMEM)
|
||||
return VM_FAULT_OOM;
|
||||
else
|
||||
return VM_FAULT_SIGBUS;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2642,7 +2710,11 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
goto out;
|
||||
page = alloc_huge_page(vma, address, 0);
|
||||
if (IS_ERR(page)) {
|
||||
ret = -PTR_ERR(page);
|
||||
ret = PTR_ERR(page);
|
||||
if (ret == -ENOMEM)
|
||||
ret = VM_FAULT_OOM;
|
||||
else
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
goto out;
|
||||
}
|
||||
clear_huge_page(page, address, pages_per_huge_page(h));
|
||||
@ -2679,7 +2751,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
*/
|
||||
if (unlikely(PageHWPoison(page))) {
|
||||
ret = VM_FAULT_HWPOISON |
|
||||
VM_FAULT_SET_HINDEX(h - hstates);
|
||||
VM_FAULT_SET_HINDEX(hstate_index(h));
|
||||
goto backout_unlocked;
|
||||
}
|
||||
}
|
||||
@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
return 0;
|
||||
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
|
||||
return VM_FAULT_HWPOISON_LARGE |
|
||||
VM_FAULT_SET_HINDEX(h - hstates);
|
||||
VM_FAULT_SET_HINDEX(hstate_index(h));
|
||||
}
|
||||
|
||||
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
|
||||
@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
|
||||
}
|
||||
}
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
|
||||
|
||||
/*
|
||||
* Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
|
||||
* may have cleared our pud entry and done put_page on the page table:
|
||||
* once we release i_mmap_mutex, another task can do the final put_page
|
||||
* and that page table be reused and filled with junk.
|
||||
*/
|
||||
flush_tlb_range(vma, start, end);
|
||||
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
|
||||
}
|
||||
|
||||
int hugetlb_reserve_pages(struct inode *inode,
|
||||
|
418
mm/hugetlb_cgroup.c
Normal file
418
mm/hugetlb_cgroup.c
Normal file
@ -0,0 +1,418 @@
|
||||
/*
|
||||
*
|
||||
* Copyright IBM Corporation, 2012
|
||||
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of version 2.1 of the GNU Lesser General Public License
|
||||
* as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it would be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/hugetlb_cgroup.h>
|
||||
|
||||
struct hugetlb_cgroup {
|
||||
struct cgroup_subsys_state css;
|
||||
/*
|
||||
* the counter to account for hugepages from hugetlb.
|
||||
*/
|
||||
struct res_counter hugepage[HUGE_MAX_HSTATE];
|
||||
};
|
||||
|
||||
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
|
||||
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
|
||||
#define MEMFILE_ATTR(val) ((val) & 0xffff)
|
||||
|
||||
struct cgroup_subsys hugetlb_subsys __read_mostly;
|
||||
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
|
||||
|
||||
static inline
|
||||
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
|
||||
{
|
||||
return container_of(s, struct hugetlb_cgroup, css);
|
||||
}
|
||||
|
||||
static inline
|
||||
struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
|
||||
{
|
||||
return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
|
||||
hugetlb_subsys_id));
|
||||
}
|
||||
|
||||
static inline
|
||||
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
|
||||
{
|
||||
return hugetlb_cgroup_from_css(task_subsys_state(task,
|
||||
hugetlb_subsys_id));
|
||||
}
|
||||
|
||||
static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
|
||||
{
|
||||
return (h_cg == root_h_cgroup);
|
||||
}
|
||||
|
||||
static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg)
|
||||
{
|
||||
if (!cg->parent)
|
||||
return NULL;
|
||||
return hugetlb_cgroup_from_cgroup(cg->parent);
|
||||
}
|
||||
|
||||
static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
|
||||
{
|
||||
int idx;
|
||||
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
|
||||
|
||||
for (idx = 0; idx < hugetlb_max_hstate; idx++) {
|
||||
if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
|
||||
{
|
||||
int idx;
|
||||
struct cgroup *parent_cgroup;
|
||||
struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
|
||||
|
||||
h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
|
||||
if (!h_cgroup)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
parent_cgroup = cgroup->parent;
|
||||
if (parent_cgroup) {
|
||||
parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
|
||||
for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
|
||||
res_counter_init(&h_cgroup->hugepage[idx],
|
||||
&parent_h_cgroup->hugepage[idx]);
|
||||
} else {
|
||||
root_h_cgroup = h_cgroup;
|
||||
for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
|
||||
res_counter_init(&h_cgroup->hugepage[idx], NULL);
|
||||
}
|
||||
return &h_cgroup->css;
|
||||
}
|
||||
|
||||
static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
|
||||
{
|
||||
struct hugetlb_cgroup *h_cgroup;
|
||||
|
||||
h_cgroup = hugetlb_cgroup_from_cgroup(cgroup);
|
||||
kfree(h_cgroup);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Should be called with hugetlb_lock held.
|
||||
* Since we are holding hugetlb_lock, pages cannot get moved from
|
||||
* active list or uncharged from the cgroup, So no need to get
|
||||
* page reference and test for page active here. This function
|
||||
* cannot fail.
|
||||
*/
|
||||
static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup,
|
||||
struct page *page)
|
||||
{
|
||||
int csize;
|
||||
struct res_counter *counter;
|
||||
struct res_counter *fail_res;
|
||||
struct hugetlb_cgroup *page_hcg;
|
||||
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
|
||||
struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
|
||||
|
||||
page_hcg = hugetlb_cgroup_from_page(page);
|
||||
/*
|
||||
* We can have pages in active list without any cgroup
|
||||
* ie, hugepage with less than 3 pages. We can safely
|
||||
* ignore those pages.
|
||||
*/
|
||||
if (!page_hcg || page_hcg != h_cg)
|
||||
goto out;
|
||||
|
||||
csize = PAGE_SIZE << compound_order(page);
|
||||
if (!parent) {
|
||||
parent = root_h_cgroup;
|
||||
/* root has no limit */
|
||||
res_counter_charge_nofail(&parent->hugepage[idx],
|
||||
csize, &fail_res);
|
||||
}
|
||||
counter = &h_cg->hugepage[idx];
|
||||
res_counter_uncharge_until(counter, counter->parent, csize);
|
||||
|
||||
set_hugetlb_cgroup(page, parent);
|
||||
out:
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Force the hugetlb cgroup to empty the hugetlb resources by moving them to
|
||||
* the parent cgroup.
|
||||
*/
|
||||
static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
|
||||
{
|
||||
struct hstate *h;
|
||||
struct page *page;
|
||||
int ret = 0, idx = 0;
|
||||
|
||||
do {
|
||||
if (cgroup_task_count(cgroup) ||
|
||||
!list_empty(&cgroup->children)) {
|
||||
ret = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
for_each_hstate(h) {
|
||||
spin_lock(&hugetlb_lock);
|
||||
list_for_each_entry(page, &h->hugepage_activelist, lru)
|
||||
hugetlb_cgroup_move_parent(idx, cgroup, page);
|
||||
|
||||
spin_unlock(&hugetlb_lock);
|
||||
idx++;
|
||||
}
|
||||
cond_resched();
|
||||
} while (hugetlb_cgroup_have_usage(cgroup));
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
|
||||
struct hugetlb_cgroup **ptr)
|
||||
{
|
||||
int ret = 0;
|
||||
struct res_counter *fail_res;
|
||||
struct hugetlb_cgroup *h_cg = NULL;
|
||||
unsigned long csize = nr_pages * PAGE_SIZE;
|
||||
|
||||
if (hugetlb_cgroup_disabled())
|
||||
goto done;
|
||||
/*
|
||||
* We don't charge any cgroup if the compound page have less
|
||||
* than 3 pages.
|
||||
*/
|
||||
if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
|
||||
goto done;
|
||||
again:
|
||||
rcu_read_lock();
|
||||
h_cg = hugetlb_cgroup_from_task(current);
|
||||
if (!css_tryget(&h_cg->css)) {
|
||||
rcu_read_unlock();
|
||||
goto again;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
|
||||
css_put(&h_cg->css);
|
||||
done:
|
||||
*ptr = h_cg;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Should be called with hugetlb_lock held */
|
||||
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
|
||||
struct hugetlb_cgroup *h_cg,
|
||||
struct page *page)
|
||||
{
|
||||
if (hugetlb_cgroup_disabled() || !h_cg)
|
||||
return;
|
||||
|
||||
set_hugetlb_cgroup(page, h_cg);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Should be called with hugetlb_lock held
|
||||
*/
|
||||
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
|
||||
struct page *page)
|
||||
{
|
||||
struct hugetlb_cgroup *h_cg;
|
||||
unsigned long csize = nr_pages * PAGE_SIZE;
|
||||
|
||||
if (hugetlb_cgroup_disabled())
|
||||
return;
|
||||
VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
|
||||
h_cg = hugetlb_cgroup_from_page(page);
|
||||
if (unlikely(!h_cg))
|
||||
return;
|
||||
set_hugetlb_cgroup(page, NULL);
|
||||
res_counter_uncharge(&h_cg->hugepage[idx], csize);
|
||||
return;
|
||||
}
|
||||
|
||||
void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
|
||||
struct hugetlb_cgroup *h_cg)
|
||||
{
|
||||
unsigned long csize = nr_pages * PAGE_SIZE;
|
||||
|
||||
if (hugetlb_cgroup_disabled() || !h_cg)
|
||||
return;
|
||||
|
||||
if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
|
||||
return;
|
||||
|
||||
res_counter_uncharge(&h_cg->hugepage[idx], csize);
|
||||
return;
|
||||
}
|
||||
|
||||
static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
|
||||
struct file *file, char __user *buf,
|
||||
size_t nbytes, loff_t *ppos)
|
||||
{
|
||||
u64 val;
|
||||
char str[64];
|
||||
int idx, name, len;
|
||||
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
|
||||
|
||||
idx = MEMFILE_IDX(cft->private);
|
||||
name = MEMFILE_ATTR(cft->private);
|
||||
|
||||
val = res_counter_read_u64(&h_cg->hugepage[idx], name);
|
||||
len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
|
||||
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
|
||||
}
|
||||
|
||||
static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
|
||||
const char *buffer)
|
||||
{
|
||||
int idx, name, ret;
|
||||
unsigned long long val;
|
||||
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
|
||||
|
||||
idx = MEMFILE_IDX(cft->private);
|
||||
name = MEMFILE_ATTR(cft->private);
|
||||
|
||||
switch (name) {
|
||||
case RES_LIMIT:
|
||||
if (hugetlb_cgroup_is_root(h_cg)) {
|
||||
/* Can't set limit on root */
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
/* This function does all necessary parse...reuse it */
|
||||
ret = res_counter_memparse_write_strategy(buffer, &val);
|
||||
if (ret)
|
||||
break;
|
||||
ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event)
|
||||
{
|
||||
int idx, name, ret = 0;
|
||||
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
|
||||
|
||||
idx = MEMFILE_IDX(event);
|
||||
name = MEMFILE_ATTR(event);
|
||||
|
||||
switch (name) {
|
||||
case RES_MAX_USAGE:
|
||||
res_counter_reset_max(&h_cg->hugepage[idx]);
|
||||
break;
|
||||
case RES_FAILCNT:
|
||||
res_counter_reset_failcnt(&h_cg->hugepage[idx]);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static char *mem_fmt(char *buf, int size, unsigned long hsize)
|
||||
{
|
||||
if (hsize >= (1UL << 30))
|
||||
snprintf(buf, size, "%luGB", hsize >> 30);
|
||||
else if (hsize >= (1UL << 20))
|
||||
snprintf(buf, size, "%luMB", hsize >> 20);
|
||||
else
|
||||
snprintf(buf, size, "%luKB", hsize >> 10);
|
||||
return buf;
|
||||
}
|
||||
|
||||
int __init hugetlb_cgroup_file_init(int idx)
|
||||
{
|
||||
char buf[32];
|
||||
struct cftype *cft;
|
||||
struct hstate *h = &hstates[idx];
|
||||
|
||||
/* format the size */
|
||||
mem_fmt(buf, 32, huge_page_size(h));
|
||||
|
||||
/* Add the limit file */
|
||||
cft = &h->cgroup_files[0];
|
||||
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
|
||||
cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
|
||||
cft->read = hugetlb_cgroup_read;
|
||||
cft->write_string = hugetlb_cgroup_write;
|
||||
|
||||
/* Add the usage file */
|
||||
cft = &h->cgroup_files[1];
|
||||
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
|
||||
cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
|
||||
cft->read = hugetlb_cgroup_read;
|
||||
|
||||
/* Add the MAX usage file */
|
||||
cft = &h->cgroup_files[2];
|
||||
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
|
||||
cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
|
||||
cft->trigger = hugetlb_cgroup_reset;
|
||||
cft->read = hugetlb_cgroup_read;
|
||||
|
||||
/* Add the failcntfile */
|
||||
cft = &h->cgroup_files[3];
|
||||
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
|
||||
cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
|
||||
cft->trigger = hugetlb_cgroup_reset;
|
||||
cft->read = hugetlb_cgroup_read;
|
||||
|
||||
/* NULL terminate the last cft */
|
||||
cft = &h->cgroup_files[4];
|
||||
memset(cft, 0, sizeof(*cft));
|
||||
|
||||
WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* hugetlb_lock will make sure a parallel cgroup rmdir won't happen
|
||||
* when we migrate hugepages
|
||||
*/
|
||||
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
|
||||
{
|
||||
struct hugetlb_cgroup *h_cg;
|
||||
struct hstate *h = page_hstate(oldhpage);
|
||||
|
||||
if (hugetlb_cgroup_disabled())
|
||||
return;
|
||||
|
||||
VM_BUG_ON(!PageHuge(oldhpage));
|
||||
spin_lock(&hugetlb_lock);
|
||||
h_cg = hugetlb_cgroup_from_page(oldhpage);
|
||||
set_hugetlb_cgroup(oldhpage, NULL);
|
||||
|
||||
/* move the h_cg details to new cgroup */
|
||||
set_hugetlb_cgroup(newhpage, h_cg);
|
||||
list_move(&newhpage->lru, &h->hugepage_activelist);
|
||||
spin_unlock(&hugetlb_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
struct cgroup_subsys hugetlb_subsys = {
|
||||
.name = "hugetlb",
|
||||
.create = hugetlb_cgroup_create,
|
||||
.pre_destroy = hugetlb_cgroup_pre_destroy,
|
||||
.destroy = hugetlb_cgroup_destroy,
|
||||
.subsys_id = hugetlb_subsys_id,
|
||||
};
|
@ -123,7 +123,7 @@ static int pfn_inject_init(void)
|
||||
if (!dentry)
|
||||
goto fail;
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
|
||||
hwpoison_dir, &hwpoison_filter_memcg);
|
||||
if (!dentry)
|
||||
|
@ -118,8 +118,14 @@ struct compact_control {
|
||||
unsigned long nr_freepages; /* Number of isolated free pages */
|
||||
unsigned long nr_migratepages; /* Number of pages to migrate */
|
||||
unsigned long free_pfn; /* isolate_freepages search base */
|
||||
unsigned long start_free_pfn; /* where we started the search */
|
||||
unsigned long migrate_pfn; /* isolate_migratepages search base */
|
||||
bool sync; /* Synchronous migration */
|
||||
bool wrapped; /* Order > 0 compactions are
|
||||
incremental, once free_pfn
|
||||
and migrate_pfn meet, we restart
|
||||
from the top of the zone;
|
||||
remember we wrapped around. */
|
||||
|
||||
int order; /* order a direct compactor needs */
|
||||
int migratetype; /* MOVABLE, RECLAIMABLE etc */
|
||||
@ -347,3 +353,5 @@ extern u32 hwpoison_filter_enable;
|
||||
extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
|
||||
unsigned long, unsigned long,
|
||||
unsigned long, unsigned long);
|
||||
|
||||
extern void set_pageblock_order(void);
|
||||
|
@ -222,13 +222,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
|
||||
/* Try to find some space for it.
|
||||
*
|
||||
* WARNING: We assume that either slab_is_available() and we use it or
|
||||
* we use MEMBLOCK for allocations. That means that this is unsafe to use
|
||||
* when bootmem is currently active (unless bootmem itself is implemented
|
||||
* on top of MEMBLOCK which isn't the case yet)
|
||||
* we use MEMBLOCK for allocations. That means that this is unsafe to
|
||||
* use when bootmem is currently active (unless bootmem itself is
|
||||
* implemented on top of MEMBLOCK which isn't the case yet)
|
||||
*
|
||||
* This should however not be an issue for now, as we currently only
|
||||
* call into MEMBLOCK while it's still active, or much later when slab is
|
||||
* active for memory hotplug operations
|
||||
* call into MEMBLOCK while it's still active, or much later when slab
|
||||
* is active for memory hotplug operations
|
||||
*/
|
||||
if (use_slab) {
|
||||
new_array = kmalloc(new_size, GFP_KERNEL);
|
||||
@ -243,8 +243,8 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
|
||||
new_alloc_size, PAGE_SIZE);
|
||||
if (!addr && new_area_size)
|
||||
addr = memblock_find_in_range(0,
|
||||
min(new_area_start, memblock.current_limit),
|
||||
new_alloc_size, PAGE_SIZE);
|
||||
min(new_area_start, memblock.current_limit),
|
||||
new_alloc_size, PAGE_SIZE);
|
||||
|
||||
new_array = addr ? __va(addr) : 0;
|
||||
}
|
||||
@ -254,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
|
||||
return -1;
|
||||
}
|
||||
|
||||
memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
|
||||
memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
|
||||
memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
|
||||
memblock_type_name(type), type->max * 2, (u64)addr,
|
||||
(u64)addr + new_size - 1);
|
||||
|
||||
/* Found space, we now need to move the array over before
|
||||
* we add the reserved region since it may be our reserved
|
||||
* array itself that is full.
|
||||
/*
|
||||
* Found space, we now need to move the array over before we add the
|
||||
* reserved region since it may be our reserved array itself that is
|
||||
* full.
|
||||
*/
|
||||
memcpy(new_array, type->regions, old_size);
|
||||
memset(new_array + type->max, 0, old_size);
|
||||
@ -267,17 +269,16 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
|
||||
type->regions = new_array;
|
||||
type->max <<= 1;
|
||||
|
||||
/* Free old array. We needn't free it if the array is the
|
||||
* static one
|
||||
*/
|
||||
/* Free old array. We needn't free it if the array is the static one */
|
||||
if (*in_slab)
|
||||
kfree(old_array);
|
||||
else if (old_array != memblock_memory_init_regions &&
|
||||
old_array != memblock_reserved_init_regions)
|
||||
memblock_free(__pa(old_array), old_alloc_size);
|
||||
|
||||
/* Reserve the new array if that comes from the memblock.
|
||||
* Otherwise, we needn't do it
|
||||
/*
|
||||
* Reserve the new array if that comes from the memblock. Otherwise, we
|
||||
* needn't do it
|
||||
*/
|
||||
if (!use_slab)
|
||||
BUG_ON(memblock_reserve(addr, new_alloc_size));
|
||||
|
390
mm/memcontrol.c
390
mm/memcontrol.c
@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
|
||||
#define MEM_CGROUP_RECLAIM_RETRIES 5
|
||||
static struct mem_cgroup *root_mem_cgroup __read_mostly;
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
|
||||
int do_swap_account __read_mostly;
|
||||
|
||||
/* for remember boot option*/
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
|
||||
#ifdef CONFIG_MEMCG_SWAP_ENABLED
|
||||
static int really_do_swap_account __initdata = 1;
|
||||
#else
|
||||
static int really_do_swap_account __initdata = 0;
|
||||
@ -87,7 +87,7 @@ enum mem_cgroup_stat_index {
|
||||
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
|
||||
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
|
||||
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
|
||||
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
|
||||
MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
|
||||
MEM_CGROUP_STAT_NSTATS,
|
||||
};
|
||||
|
||||
@ -378,9 +378,7 @@ static bool move_file(void)
|
||||
|
||||
enum charge_type {
|
||||
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
|
||||
MEM_CGROUP_CHARGE_TYPE_MAPPED,
|
||||
MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
|
||||
MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
|
||||
MEM_CGROUP_CHARGE_TYPE_ANON,
|
||||
MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
|
||||
MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
|
||||
NR_CHARGE_TYPE,
|
||||
@ -407,8 +405,14 @@ enum charge_type {
|
||||
static void mem_cgroup_get(struct mem_cgroup *memcg);
|
||||
static void mem_cgroup_put(struct mem_cgroup *memcg);
|
||||
|
||||
static inline
|
||||
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
|
||||
{
|
||||
return container_of(s, struct mem_cgroup, css);
|
||||
}
|
||||
|
||||
/* Writing them here to avoid exposing memcg's inner layout */
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
#include <net/sock.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
@ -467,9 +471,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
|
||||
}
|
||||
EXPORT_SYMBOL(tcp_proto_cgroup);
|
||||
#endif /* CONFIG_INET */
|
||||
#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
|
||||
#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
|
||||
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
|
||||
static void disarm_sock_keys(struct mem_cgroup *memcg)
|
||||
{
|
||||
if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
|
||||
@ -703,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
|
||||
bool charge)
|
||||
{
|
||||
int val = (charge) ? 1 : -1;
|
||||
this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
|
||||
this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
|
||||
}
|
||||
|
||||
static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
|
||||
@ -864,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
|
||||
|
||||
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
|
||||
{
|
||||
return container_of(cgroup_subsys_state(cont,
|
||||
mem_cgroup_subsys_id), struct mem_cgroup,
|
||||
css);
|
||||
return mem_cgroup_from_css(
|
||||
cgroup_subsys_state(cont, mem_cgroup_subsys_id));
|
||||
}
|
||||
|
||||
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
|
||||
@ -879,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
|
||||
if (unlikely(!p))
|
||||
return NULL;
|
||||
|
||||
return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
|
||||
struct mem_cgroup, css);
|
||||
return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
|
||||
}
|
||||
|
||||
struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
|
||||
@ -966,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
|
||||
css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
|
||||
if (css) {
|
||||
if (css == &root->css || css_tryget(css))
|
||||
memcg = container_of(css,
|
||||
struct mem_cgroup, css);
|
||||
memcg = mem_cgroup_from_css(css);
|
||||
} else
|
||||
id = 0;
|
||||
rcu_read_unlock();
|
||||
@ -1454,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
|
||||
/*
|
||||
* Return the memory (and swap, if configured) limit for a memcg.
|
||||
*/
|
||||
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
|
||||
static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
|
||||
{
|
||||
u64 limit;
|
||||
u64 memsw;
|
||||
@ -1470,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
|
||||
return min(limit, memsw);
|
||||
}
|
||||
|
||||
void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
int order)
|
||||
{
|
||||
struct mem_cgroup *iter;
|
||||
unsigned long chosen_points = 0;
|
||||
unsigned long totalpages;
|
||||
unsigned int points = 0;
|
||||
struct task_struct *chosen = NULL;
|
||||
|
||||
/*
|
||||
* If current has a pending SIGKILL, then automatically select it. The
|
||||
* goal is to allow it to allocate so that it may quickly exit and free
|
||||
* its memory.
|
||||
*/
|
||||
if (fatal_signal_pending(current)) {
|
||||
set_thread_flag(TIF_MEMDIE);
|
||||
return;
|
||||
}
|
||||
|
||||
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
|
||||
totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
|
||||
for_each_mem_cgroup_tree(iter, memcg) {
|
||||
struct cgroup *cgroup = iter->css.cgroup;
|
||||
struct cgroup_iter it;
|
||||
struct task_struct *task;
|
||||
|
||||
cgroup_iter_start(cgroup, &it);
|
||||
while ((task = cgroup_iter_next(cgroup, &it))) {
|
||||
switch (oom_scan_process_thread(task, totalpages, NULL,
|
||||
false)) {
|
||||
case OOM_SCAN_SELECT:
|
||||
if (chosen)
|
||||
put_task_struct(chosen);
|
||||
chosen = task;
|
||||
chosen_points = ULONG_MAX;
|
||||
get_task_struct(chosen);
|
||||
/* fall through */
|
||||
case OOM_SCAN_CONTINUE:
|
||||
continue;
|
||||
case OOM_SCAN_ABORT:
|
||||
cgroup_iter_end(cgroup, &it);
|
||||
mem_cgroup_iter_break(memcg, iter);
|
||||
if (chosen)
|
||||
put_task_struct(chosen);
|
||||
return;
|
||||
case OOM_SCAN_OK:
|
||||
break;
|
||||
};
|
||||
points = oom_badness(task, memcg, NULL, totalpages);
|
||||
if (points > chosen_points) {
|
||||
if (chosen)
|
||||
put_task_struct(chosen);
|
||||
chosen = task;
|
||||
chosen_points = points;
|
||||
get_task_struct(chosen);
|
||||
}
|
||||
}
|
||||
cgroup_iter_end(cgroup, &it);
|
||||
}
|
||||
|
||||
if (!chosen)
|
||||
return;
|
||||
points = chosen_points * 1000 / totalpages;
|
||||
oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
|
||||
NULL, "Memory cgroup out of memory");
|
||||
}
|
||||
|
||||
static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
|
||||
gfp_t gfp_mask,
|
||||
unsigned long flags)
|
||||
@ -1899,7 +1967,7 @@ void __mem_cgroup_begin_update_page_stat(struct page *page,
|
||||
return;
|
||||
/*
|
||||
* If this memory cgroup is not under account moving, we don't
|
||||
* need to take move_lock_page_cgroup(). Because we already hold
|
||||
* need to take move_lock_mem_cgroup(). Because we already hold
|
||||
* rcu_read_lock(), any calls to move_account will be delayed until
|
||||
* rcu_read_unlock() if mem_cgroup_stolen() == true.
|
||||
*/
|
||||
@ -1921,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
|
||||
/*
|
||||
* It's guaranteed that pc->mem_cgroup never changes while
|
||||
* lock is held because a routine modifies pc->mem_cgroup
|
||||
* should take move_lock_page_cgroup().
|
||||
* should take move_lock_mem_cgroup().
|
||||
*/
|
||||
move_unlock_mem_cgroup(pc->mem_cgroup, flags);
|
||||
}
|
||||
@ -2268,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
|
||||
* We always charge the cgroup the mm_struct belongs to.
|
||||
* The mm_struct's mem_cgroup changes on task migration if the
|
||||
* thread group leader migrates. It's possible that mm is not
|
||||
* set, if so charge the init_mm (happens for pagecache usage).
|
||||
* set, if so charge the root memcg (happens for pagecache usage).
|
||||
*/
|
||||
if (!*ptr && !mm)
|
||||
*ptr = root_mem_cgroup;
|
||||
@ -2429,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
|
||||
css = css_lookup(&mem_cgroup_subsys, id);
|
||||
if (!css)
|
||||
return NULL;
|
||||
return container_of(css, struct mem_cgroup, css);
|
||||
return mem_cgroup_from_css(css);
|
||||
}
|
||||
|
||||
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
|
||||
@ -2473,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
|
||||
bool anon;
|
||||
|
||||
lock_page_cgroup(pc);
|
||||
if (unlikely(PageCgroupUsed(pc))) {
|
||||
unlock_page_cgroup(pc);
|
||||
__mem_cgroup_cancel_charge(memcg, nr_pages);
|
||||
return;
|
||||
}
|
||||
VM_BUG_ON(PageCgroupUsed(pc));
|
||||
/*
|
||||
* we don't need page_cgroup_lock about tail pages, becase they are not
|
||||
* accessed by any other context at this point.
|
||||
@ -2519,7 +2583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
|
||||
spin_unlock_irq(&zone->lru_lock);
|
||||
}
|
||||
|
||||
if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
|
||||
if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
|
||||
anon = true;
|
||||
else
|
||||
anon = false;
|
||||
@ -2644,8 +2708,7 @@ static int mem_cgroup_move_account(struct page *page,
|
||||
|
||||
static int mem_cgroup_move_parent(struct page *page,
|
||||
struct page_cgroup *pc,
|
||||
struct mem_cgroup *child,
|
||||
gfp_t gfp_mask)
|
||||
struct mem_cgroup *child)
|
||||
{
|
||||
struct mem_cgroup *parent;
|
||||
unsigned int nr_pages;
|
||||
@ -2728,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page,
|
||||
VM_BUG_ON(page->mapping && !PageAnon(page));
|
||||
VM_BUG_ON(!mm);
|
||||
return mem_cgroup_charge_common(page, mm, gfp_mask,
|
||||
MEM_CGROUP_CHARGE_TYPE_MAPPED);
|
||||
}
|
||||
|
||||
static void
|
||||
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
|
||||
enum charge_type ctype);
|
||||
|
||||
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
|
||||
gfp_t gfp_mask)
|
||||
{
|
||||
struct mem_cgroup *memcg = NULL;
|
||||
enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
|
||||
int ret;
|
||||
|
||||
if (mem_cgroup_disabled())
|
||||
return 0;
|
||||
if (PageCompound(page))
|
||||
return 0;
|
||||
|
||||
if (unlikely(!mm))
|
||||
mm = &init_mm;
|
||||
if (!page_is_file_cache(page))
|
||||
type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
|
||||
|
||||
if (!PageSwapCache(page))
|
||||
ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
|
||||
else { /* page is swapcache/shmem */
|
||||
ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
|
||||
if (!ret)
|
||||
__mem_cgroup_commit_charge_swapin(page, memcg, type);
|
||||
}
|
||||
return ret;
|
||||
MEM_CGROUP_CHARGE_TYPE_ANON);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2768,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
|
||||
* struct page_cgroup is acquired. This refcnt will be consumed by
|
||||
* "commit()" or removed by "cancel()"
|
||||
*/
|
||||
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
|
||||
struct page *page,
|
||||
gfp_t mask, struct mem_cgroup **memcgp)
|
||||
static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
|
||||
struct page *page,
|
||||
gfp_t mask,
|
||||
struct mem_cgroup **memcgp)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
struct page_cgroup *pc;
|
||||
int ret;
|
||||
|
||||
*memcgp = NULL;
|
||||
|
||||
if (mem_cgroup_disabled())
|
||||
return 0;
|
||||
|
||||
if (!do_swap_account)
|
||||
goto charge_cur_mm;
|
||||
pc = lookup_page_cgroup(page);
|
||||
/*
|
||||
* A racing thread's fault, or swapoff, may have already updated
|
||||
* the pte, and even removed page from swap cache: in those cases
|
||||
* do_swap_page()'s pte_same() test will fail; but there's also a
|
||||
* KSM case which does need to charge the page.
|
||||
* Every swap fault against a single page tries to charge the
|
||||
* page, bail as early as possible. shmem_unuse() encounters
|
||||
* already charged pages, too. The USED bit is protected by
|
||||
* the page lock, which serializes swap cache removal, which
|
||||
* in turn serializes uncharging.
|
||||
*/
|
||||
if (!PageSwapCache(page))
|
||||
if (PageCgroupUsed(pc))
|
||||
return 0;
|
||||
if (!do_swap_account)
|
||||
goto charge_cur_mm;
|
||||
memcg = try_get_mem_cgroup_from_page(page);
|
||||
if (!memcg)
|
||||
@ -2800,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
|
||||
ret = 0;
|
||||
return ret;
|
||||
charge_cur_mm:
|
||||
if (unlikely(!mm))
|
||||
mm = &init_mm;
|
||||
ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
|
||||
if (ret == -EINTR)
|
||||
ret = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
|
||||
gfp_t gfp_mask, struct mem_cgroup **memcgp)
|
||||
{
|
||||
*memcgp = NULL;
|
||||
if (mem_cgroup_disabled())
|
||||
return 0;
|
||||
/*
|
||||
* A racing thread's fault, or swapoff, may have already
|
||||
* updated the pte, and even removed page from swap cache: in
|
||||
* those cases unuse_pte()'s pte_same() test will fail; but
|
||||
* there's also a KSM case which does need to charge the page.
|
||||
*/
|
||||
if (!PageSwapCache(page)) {
|
||||
int ret;
|
||||
|
||||
ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
|
||||
if (ret == -EINTR)
|
||||
ret = 0;
|
||||
return ret;
|
||||
}
|
||||
return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
|
||||
}
|
||||
|
||||
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
|
||||
{
|
||||
if (mem_cgroup_disabled())
|
||||
return;
|
||||
if (!memcg)
|
||||
return;
|
||||
__mem_cgroup_cancel_charge(memcg, 1);
|
||||
}
|
||||
|
||||
static void
|
||||
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
|
||||
enum charge_type ctype)
|
||||
@ -2842,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
|
||||
struct mem_cgroup *memcg)
|
||||
{
|
||||
__mem_cgroup_commit_charge_swapin(page, memcg,
|
||||
MEM_CGROUP_CHARGE_TYPE_MAPPED);
|
||||
MEM_CGROUP_CHARGE_TYPE_ANON);
|
||||
}
|
||||
|
||||
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
|
||||
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
|
||||
gfp_t gfp_mask)
|
||||
{
|
||||
struct mem_cgroup *memcg = NULL;
|
||||
enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
|
||||
int ret;
|
||||
|
||||
if (mem_cgroup_disabled())
|
||||
return;
|
||||
if (!memcg)
|
||||
return;
|
||||
__mem_cgroup_cancel_charge(memcg, 1);
|
||||
return 0;
|
||||
if (PageCompound(page))
|
||||
return 0;
|
||||
|
||||
if (!PageSwapCache(page))
|
||||
ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
|
||||
else { /* page is swapcache/shmem */
|
||||
ret = __mem_cgroup_try_charge_swapin(mm, page,
|
||||
gfp_mask, &memcg);
|
||||
if (!ret)
|
||||
__mem_cgroup_commit_charge_swapin(page, memcg, type);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
|
||||
@ -2911,7 +2986,8 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
|
||||
* uncharge if !page_mapped(page)
|
||||
*/
|
||||
static struct mem_cgroup *
|
||||
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
|
||||
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
|
||||
bool end_migration)
|
||||
{
|
||||
struct mem_cgroup *memcg = NULL;
|
||||
unsigned int nr_pages = 1;
|
||||
@ -2921,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
|
||||
if (mem_cgroup_disabled())
|
||||
return NULL;
|
||||
|
||||
if (PageSwapCache(page))
|
||||
return NULL;
|
||||
VM_BUG_ON(PageSwapCache(page));
|
||||
|
||||
if (PageTransHuge(page)) {
|
||||
nr_pages <<= compound_order(page);
|
||||
@ -2945,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
|
||||
anon = PageAnon(page);
|
||||
|
||||
switch (ctype) {
|
||||
case MEM_CGROUP_CHARGE_TYPE_MAPPED:
|
||||
case MEM_CGROUP_CHARGE_TYPE_ANON:
|
||||
/*
|
||||
* Generally PageAnon tells if it's the anon statistics to be
|
||||
* updated; but sometimes e.g. mem_cgroup_uncharge_page() is
|
||||
@ -2955,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
|
||||
/* fallthrough */
|
||||
case MEM_CGROUP_CHARGE_TYPE_DROP:
|
||||
/* See mem_cgroup_prepare_migration() */
|
||||
if (page_mapped(page) || PageCgroupMigration(pc))
|
||||
if (page_mapped(page))
|
||||
goto unlock_out;
|
||||
/*
|
||||
* Pages under migration may not be uncharged. But
|
||||
* end_migration() /must/ be the one uncharging the
|
||||
* unused post-migration page and so it has to call
|
||||
* here with the migration bit still set. See the
|
||||
* res_counter handling below.
|
||||
*/
|
||||
if (!end_migration && PageCgroupMigration(pc))
|
||||
goto unlock_out;
|
||||
break;
|
||||
case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
|
||||
@ -2989,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
|
||||
mem_cgroup_swap_statistics(memcg, true);
|
||||
mem_cgroup_get(memcg);
|
||||
}
|
||||
if (!mem_cgroup_is_root(memcg))
|
||||
/*
|
||||
* Migration does not charge the res_counter for the
|
||||
* replacement page, so leave it alone when phasing out the
|
||||
* page that is unused after the migration.
|
||||
*/
|
||||
if (!end_migration && !mem_cgroup_is_root(memcg))
|
||||
mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
|
||||
|
||||
return memcg;
|
||||
@ -3005,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page)
|
||||
if (page_mapped(page))
|
||||
return;
|
||||
VM_BUG_ON(page->mapping && !PageAnon(page));
|
||||
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
|
||||
if (PageSwapCache(page))
|
||||
return;
|
||||
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
|
||||
}
|
||||
|
||||
void mem_cgroup_uncharge_cache_page(struct page *page)
|
||||
{
|
||||
VM_BUG_ON(page_mapped(page));
|
||||
VM_BUG_ON(page->mapping);
|
||||
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
|
||||
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -3076,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
|
||||
if (!swapout) /* this was a swap cache but the swap is unused ! */
|
||||
ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
|
||||
|
||||
memcg = __mem_cgroup_uncharge_common(page, ctype);
|
||||
memcg = __mem_cgroup_uncharge_common(page, ctype, false);
|
||||
|
||||
/*
|
||||
* record memcg information, if swapout && memcg != NULL,
|
||||
@ -3087,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
/*
|
||||
* called from swap_entry_free(). remove record in swap_cgroup and
|
||||
* uncharge "memsw" account.
|
||||
@ -3166,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
|
||||
* Before starting migration, account PAGE_SIZE to mem_cgroup that the old
|
||||
* page belongs to.
|
||||
*/
|
||||
int mem_cgroup_prepare_migration(struct page *page,
|
||||
struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask)
|
||||
void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
|
||||
struct mem_cgroup **memcgp)
|
||||
{
|
||||
struct mem_cgroup *memcg = NULL;
|
||||
struct page_cgroup *pc;
|
||||
enum charge_type ctype;
|
||||
int ret = 0;
|
||||
|
||||
*memcgp = NULL;
|
||||
|
||||
VM_BUG_ON(PageTransHuge(page));
|
||||
if (mem_cgroup_disabled())
|
||||
return 0;
|
||||
return;
|
||||
|
||||
pc = lookup_page_cgroup(page);
|
||||
lock_page_cgroup(pc);
|
||||
@ -3223,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page,
|
||||
* we return here.
|
||||
*/
|
||||
if (!memcg)
|
||||
return 0;
|
||||
return;
|
||||
|
||||
*memcgp = memcg;
|
||||
ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
|
||||
css_put(&memcg->css);/* drop extra refcnt */
|
||||
if (ret) {
|
||||
if (PageAnon(page)) {
|
||||
lock_page_cgroup(pc);
|
||||
ClearPageCgroupMigration(pc);
|
||||
unlock_page_cgroup(pc);
|
||||
/*
|
||||
* The old page may be fully unmapped while we kept it.
|
||||
*/
|
||||
mem_cgroup_uncharge_page(page);
|
||||
}
|
||||
/* we'll need to revisit this error code (we have -EINTR) */
|
||||
return -ENOMEM;
|
||||
}
|
||||
/*
|
||||
* We charge new page before it's used/mapped. So, even if unlock_page()
|
||||
* is called before end_migration, we can catch all events on this new
|
||||
@ -3248,13 +3323,15 @@ int mem_cgroup_prepare_migration(struct page *page,
|
||||
* mapcount will be finally 0 and we call uncharge in end_migration().
|
||||
*/
|
||||
if (PageAnon(page))
|
||||
ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
|
||||
else if (page_is_file_cache(page))
|
||||
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
|
||||
ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
|
||||
else
|
||||
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
|
||||
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
|
||||
/*
|
||||
* The page is committed to the memcg, but it's not actually
|
||||
* charged to the res_counter since we plan on replacing the
|
||||
* old one and only one page is going to be left afterwards.
|
||||
*/
|
||||
__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* remove redundant charge if migration failed*/
|
||||
@ -3276,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
|
||||
used = newpage;
|
||||
unused = oldpage;
|
||||
}
|
||||
anon = PageAnon(used);
|
||||
__mem_cgroup_uncharge_common(unused,
|
||||
anon ? MEM_CGROUP_CHARGE_TYPE_ANON
|
||||
: MEM_CGROUP_CHARGE_TYPE_CACHE,
|
||||
true);
|
||||
css_put(&memcg->css);
|
||||
/*
|
||||
* We disallowed uncharge of pages under migration because mapcount
|
||||
* of the page goes down to zero, temporarly.
|
||||
@ -3285,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
|
||||
lock_page_cgroup(pc);
|
||||
ClearPageCgroupMigration(pc);
|
||||
unlock_page_cgroup(pc);
|
||||
anon = PageAnon(used);
|
||||
__mem_cgroup_uncharge_common(unused,
|
||||
anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
|
||||
: MEM_CGROUP_CHARGE_TYPE_CACHE);
|
||||
|
||||
/*
|
||||
* If a page is a file cache, radix-tree replacement is very atomic
|
||||
@ -3340,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
|
||||
*/
|
||||
if (!memcg)
|
||||
return;
|
||||
|
||||
if (PageSwapBacked(oldpage))
|
||||
type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
|
||||
|
||||
/*
|
||||
* Even if newpage->mapping was NULL before starting replacement,
|
||||
* the newpage may be on LRU(or pagevec for LRU) already. We lock
|
||||
@ -3418,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
|
||||
/*
|
||||
* Rather than hide all in some function, I do this in
|
||||
* open coded manner. You see what this really does.
|
||||
* We have to guarantee memcg->res.limit < memcg->memsw.limit.
|
||||
* We have to guarantee memcg->res.limit <= memcg->memsw.limit.
|
||||
*/
|
||||
mutex_lock(&set_limit_mutex);
|
||||
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
|
||||
@ -3479,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
|
||||
/*
|
||||
* Rather than hide all in some function, I do this in
|
||||
* open coded manner. You see what this really does.
|
||||
* We have to guarantee memcg->res.limit < memcg->memsw.limit.
|
||||
* We have to guarantee memcg->res.limit <= memcg->memsw.limit.
|
||||
*/
|
||||
mutex_lock(&set_limit_mutex);
|
||||
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
|
||||
@ -3611,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
||||
}
|
||||
|
||||
/*
|
||||
* This routine traverse page_cgroup in given list and drop them all.
|
||||
* *And* this routine doesn't reclaim page itself, just removes page_cgroup.
|
||||
* Traverse a specified page_cgroup list and try to drop them all. This doesn't
|
||||
* reclaim the pages page themselves - it just removes the page_cgroups.
|
||||
* Returns true if some page_cgroups were not freed, indicating that the caller
|
||||
* must retry this operation.
|
||||
*/
|
||||
static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
|
||||
static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
|
||||
int node, int zid, enum lru_list lru)
|
||||
{
|
||||
struct mem_cgroup_per_zone *mz;
|
||||
@ -3622,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
|
||||
struct list_head *list;
|
||||
struct page *busy;
|
||||
struct zone *zone;
|
||||
int ret = 0;
|
||||
|
||||
zone = &NODE_DATA(node)->node_zones[zid];
|
||||
mz = mem_cgroup_zoneinfo(memcg, node, zid);
|
||||
@ -3636,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
|
||||
struct page_cgroup *pc;
|
||||
struct page *page;
|
||||
|
||||
ret = 0;
|
||||
spin_lock_irqsave(&zone->lru_lock, flags);
|
||||
if (list_empty(list)) {
|
||||
spin_unlock_irqrestore(&zone->lru_lock, flags);
|
||||
@ -3653,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
|
||||
|
||||
pc = lookup_page_cgroup(page);
|
||||
|
||||
ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
|
||||
if (ret == -ENOMEM || ret == -EINTR)
|
||||
break;
|
||||
|
||||
if (ret == -EBUSY || ret == -EINVAL) {
|
||||
if (mem_cgroup_move_parent(page, pc, memcg)) {
|
||||
/* found lock contention or "pc" is obsolete. */
|
||||
busy = page;
|
||||
cond_resched();
|
||||
} else
|
||||
busy = NULL;
|
||||
}
|
||||
|
||||
if (!ret && !list_empty(list))
|
||||
return -EBUSY;
|
||||
return ret;
|
||||
return !list_empty(list);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -3692,9 +3760,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
|
||||
ret = -EBUSY;
|
||||
if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
|
||||
goto out;
|
||||
ret = -EINTR;
|
||||
if (signal_pending(current))
|
||||
goto out;
|
||||
/* This is for making all *used* pages to be on LRU. */
|
||||
lru_add_drain_all();
|
||||
drain_all_stock_sync(memcg);
|
||||
@ -3715,9 +3780,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
|
||||
}
|
||||
mem_cgroup_end_move(memcg);
|
||||
memcg_oom_recover(memcg);
|
||||
/* it seems parent cgroup doesn't have enough mem */
|
||||
if (ret == -ENOMEM)
|
||||
goto try_to_free;
|
||||
cond_resched();
|
||||
/* "ret" should also be checked to ensure all lists are empty. */
|
||||
} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
|
||||
@ -3779,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
|
||||
parent_memcg = mem_cgroup_from_cont(parent);
|
||||
|
||||
cgroup_lock();
|
||||
|
||||
if (memcg->use_hierarchy == val)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* If parent's use_hierarchy is set, we can't make any modifications
|
||||
* in the child subtrees. If it is unset, then the change can
|
||||
@ -3795,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
|
||||
retval = -EBUSY;
|
||||
} else
|
||||
retval = -EINVAL;
|
||||
|
||||
out:
|
||||
cgroup_unlock();
|
||||
|
||||
return retval;
|
||||
@ -3831,7 +3899,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
|
||||
val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
|
||||
|
||||
if (swap)
|
||||
val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
|
||||
val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
|
||||
|
||||
return val << PAGE_SHIFT;
|
||||
}
|
||||
@ -4015,7 +4083,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft,
|
||||
static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
|
||||
struct seq_file *m)
|
||||
{
|
||||
int nid;
|
||||
@ -4074,7 +4142,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
|
||||
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
|
||||
}
|
||||
|
||||
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
|
||||
static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
|
||||
struct seq_file *m)
|
||||
{
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
|
||||
@ -4082,7 +4150,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
|
||||
if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
|
||||
if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
|
||||
continue;
|
||||
seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
|
||||
mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
|
||||
@ -4109,7 +4177,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
|
||||
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
|
||||
long long val = 0;
|
||||
|
||||
if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
|
||||
if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
|
||||
continue;
|
||||
for_each_mem_cgroup_tree(mi, memcg)
|
||||
val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
|
||||
@ -4533,7 +4601,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
|
||||
{
|
||||
return mem_cgroup_sockets_init(memcg, ss);
|
||||
@ -4588,7 +4656,7 @@ static struct cftype mem_cgroup_files[] = {
|
||||
},
|
||||
{
|
||||
.name = "stat",
|
||||
.read_seq_string = mem_control_stat_show,
|
||||
.read_seq_string = memcg_stat_show,
|
||||
},
|
||||
{
|
||||
.name = "force_empty",
|
||||
@ -4620,10 +4688,10 @@ static struct cftype mem_cgroup_files[] = {
|
||||
#ifdef CONFIG_NUMA
|
||||
{
|
||||
.name = "numa_stat",
|
||||
.read_seq_string = mem_control_numa_stat_show,
|
||||
.read_seq_string = memcg_numa_stat_show,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
{
|
||||
.name = "memsw.usage_in_bytes",
|
||||
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
|
||||
@ -4810,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
|
||||
}
|
||||
EXPORT_SYMBOL(parent_mem_cgroup);
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
static void __init enable_swap_cgroup(void)
|
||||
{
|
||||
if (!mem_cgroup_disabled() && really_do_swap_account)
|
||||
@ -5541,7 +5609,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
|
||||
.__DEPRECATED_clear_css_refs = true,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
static int __init enable_swap_account(char *s)
|
||||
{
|
||||
/* consider enabled if no parameter or 1 is given */
|
||||
|
@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)
|
||||
* can only guarantee that the page either belongs to the memcg tasks, or is
|
||||
* a freed page.
|
||||
*/
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
u64 hwpoison_filter_memcg;
|
||||
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
|
||||
static int hwpoison_filter_task(struct page *p)
|
||||
@ -1416,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
|
||||
int ret;
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
struct page *hpage = compound_head(page);
|
||||
LIST_HEAD(pagelist);
|
||||
|
||||
ret = get_any_page(page, pfn, flags);
|
||||
if (ret < 0)
|
||||
@ -1431,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
|
||||
}
|
||||
|
||||
/* Keep page count to indicate a given hugepage is isolated. */
|
||||
|
||||
list_add(&hpage->lru, &pagelist);
|
||||
ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false,
|
||||
ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
|
||||
MIGRATE_SYNC);
|
||||
put_page(hpage);
|
||||
if (ret) {
|
||||
struct page *page1, *page2;
|
||||
list_for_each_entry_safe(page1, page2, &pagelist, lru)
|
||||
put_page(page1);
|
||||
|
||||
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
|
||||
pfn, ret, page->flags);
|
||||
if (ret > 0)
|
||||
ret = -EIO;
|
||||
return ret;
|
||||
}
|
||||
done:
|
||||
if (!PageHWPoison(hpage))
|
||||
atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
|
||||
atomic_long_add(1 << compound_trans_order(hpage),
|
||||
&mce_bad_pages);
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
dequeue_hwpoisoned_huge_page(hpage);
|
||||
/* keep elevated page count for bad page */
|
||||
|
@ -1343,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
|
||||
* Since no pte has actually been setup, it is
|
||||
* safe to do nothing in this case.
|
||||
*/
|
||||
if (vma->vm_file)
|
||||
unmap_hugepage_range(vma, start, end, NULL);
|
||||
if (vma->vm_file) {
|
||||
mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
|
||||
__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
|
||||
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
|
||||
}
|
||||
} else
|
||||
unmap_page_range(tlb, vma, start, end, details);
|
||||
}
|
||||
@ -3938,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
|
||||
free_page((unsigned long)buf);
|
||||
}
|
||||
}
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
up_read(&mm->mmap_sem);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PROVE_LOCKING
|
||||
|
@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
|
||||
|
||||
zone->present_pages += onlined_pages;
|
||||
zone->zone_pgdat->node_present_pages += onlined_pages;
|
||||
if (need_zonelists_rebuild)
|
||||
build_all_zonelists(zone);
|
||||
else
|
||||
zone_pcp_update(zone);
|
||||
if (onlined_pages) {
|
||||
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
|
||||
if (need_zonelists_rebuild)
|
||||
build_all_zonelists(NULL, zone);
|
||||
else
|
||||
zone_pcp_update(zone);
|
||||
}
|
||||
|
||||
mutex_unlock(&zonelists_mutex);
|
||||
|
||||
init_per_zone_wmark_min();
|
||||
|
||||
if (onlined_pages) {
|
||||
if (onlined_pages)
|
||||
kswapd_run(zone_to_nid(zone));
|
||||
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
|
||||
}
|
||||
|
||||
vm_total_pages = nr_free_pagecache_pages();
|
||||
|
||||
@ -562,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
|
||||
* to access not-initialized zonelist, build here.
|
||||
*/
|
||||
mutex_lock(&zonelists_mutex);
|
||||
build_all_zonelists(NULL);
|
||||
build_all_zonelists(pgdat, NULL);
|
||||
mutex_unlock(&zonelists_mutex);
|
||||
|
||||
return pgdat;
|
||||
@ -965,6 +966,9 @@ static int __ref offline_pages(unsigned long start_pfn,
|
||||
|
||||
init_per_zone_wmark_min();
|
||||
|
||||
if (!populated_zone(zone))
|
||||
zone_pcp_reset(zone);
|
||||
|
||||
if (!node_present_pages(node)) {
|
||||
node_clear_state(node, N_HIGH_MEMORY);
|
||||
kswapd_stop(node);
79
mm/migrate.c
@ -33,6 +33,7 @@
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/hugetlb_cgroup.h>
|
||||
#include <linux/gfp.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
@ -682,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
||||
{
|
||||
int rc = -EAGAIN;
|
||||
int remap_swapcache = 1;
|
||||
int charge = 0;
|
||||
struct mem_cgroup *mem;
|
||||
struct anon_vma *anon_vma = NULL;
|
||||
|
||||
@ -724,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
||||
}
|
||||
|
||||
/* charge against new page */
|
||||
charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
|
||||
if (charge == -ENOMEM) {
|
||||
rc = -ENOMEM;
|
||||
goto unlock;
|
||||
}
|
||||
BUG_ON(charge);
|
||||
mem_cgroup_prepare_migration(page, newpage, &mem);
|
||||
|
||||
if (PageWriteback(page)) {
|
||||
/*
|
||||
@ -819,8 +814,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
||||
put_anon_vma(anon_vma);
|
||||
|
||||
uncharge:
|
||||
if (!charge)
|
||||
mem_cgroup_end_migration(mem, page, newpage, rc == 0);
|
||||
mem_cgroup_end_migration(mem, page, newpage, rc == 0);
|
||||
unlock:
|
||||
unlock_page(page);
|
||||
out:
|
||||
@ -931,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
|
||||
|
||||
if (anon_vma)
|
||||
put_anon_vma(anon_vma);
|
||||
|
||||
if (!rc)
|
||||
hugetlb_cgroup_migrate(hpage, new_hpage);
|
||||
|
||||
unlock_page(hpage);
|
||||
|
||||
out:
|
||||
if (rc != -EAGAIN) {
|
||||
list_del(&hpage->lru);
|
||||
put_page(hpage);
|
||||
}
|
||||
|
||||
put_page(new_hpage);
|
||||
|
||||
if (result) {
|
||||
if (rc)
|
||||
*result = rc;
|
||||
@ -1016,48 +1007,32 @@ int migrate_pages(struct list_head *from,
|
||||
return nr_failed + retry;
|
||||
}
|
||||
|
||||
int migrate_huge_pages(struct list_head *from,
|
||||
new_page_t get_new_page, unsigned long private, bool offlining,
|
||||
enum migrate_mode mode)
|
||||
int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
|
||||
unsigned long private, bool offlining,
|
||||
enum migrate_mode mode)
|
||||
{
|
||||
int retry = 1;
|
||||
int nr_failed = 0;
|
||||
int pass = 0;
|
||||
struct page *page;
|
||||
struct page *page2;
|
||||
int rc;
|
||||
int pass, rc;
|
||||
|
||||
for (pass = 0; pass < 10 && retry; pass++) {
|
||||
retry = 0;
|
||||
|
||||
list_for_each_entry_safe(page, page2, from, lru) {
|
||||
for (pass = 0; pass < 10; pass++) {
|
||||
rc = unmap_and_move_huge_page(get_new_page,
|
||||
private, hpage, pass > 2, offlining,
|
||||
mode);
|
||||
switch (rc) {
|
||||
case -ENOMEM:
|
||||
goto out;
|
||||
case -EAGAIN:
|
||||
/* try again */
|
||||
cond_resched();
|
||||
|
||||
rc = unmap_and_move_huge_page(get_new_page,
|
||||
private, page, pass > 2, offlining,
|
||||
mode);
|
||||
|
||||
switch(rc) {
|
||||
case -ENOMEM:
|
||||
goto out;
|
||||
case -EAGAIN:
|
||||
retry++;
|
||||
break;
|
||||
case 0:
|
||||
break;
|
||||
default:
|
||||
/* Permanent failure */
|
||||
nr_failed++;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case 0:
|
||||
goto out;
|
||||
default:
|
||||
rc = -EIO;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
rc = 0;
|
||||
out:
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
return nr_failed + retry;
|
||||
return rc;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
|
@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
|
||||
const unsigned long stack_flags
|
||||
= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
|
||||
|
||||
mm->total_vm += pages;
|
||||
|
||||
if (file) {
|
||||
mm->shared_vm += pages;
|
||||
if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
|
||||
@ -1347,7 +1349,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
|
||||
out:
|
||||
perf_event_mmap(vma);
|
||||
|
||||
mm->total_vm += len >> PAGE_SHIFT;
|
||||
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
|
||||
if (vm_flags & VM_LOCKED) {
|
||||
if (!mlock_vma_pages_range(vma, addr, addr + len))
|
||||
@ -1707,7 +1708,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
|
||||
return -ENOMEM;
|
||||
|
||||
/* Ok, everything looks good - let it rip */
|
||||
mm->total_vm += grow;
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
mm->locked_vm += grow;
|
||||
vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
|
||||
@ -1889,7 +1889,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
|
||||
|
||||
if (vma->vm_flags & VM_ACCOUNT)
|
||||
nr_accounted += nrpages;
|
||||
mm->total_vm -= nrpages;
|
||||
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
|
||||
vma = remove_vma(vma);
|
||||
} while (vma);
|
||||
|
@ -33,6 +33,24 @@
|
||||
void __mmu_notifier_release(struct mm_struct *mm)
|
||||
{
|
||||
struct mmu_notifier *mn;
|
||||
struct hlist_node *n;
|
||||
|
||||
/*
|
||||
* RCU here will block mmu_notifier_unregister until
|
||||
* ->release returns.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
|
||||
/*
|
||||
* if ->release runs before mmu_notifier_unregister it
|
||||
* must be handled as it's the only way for the driver
|
||||
* to flush all existing sptes and stop the driver
|
||||
* from establishing any more sptes before all the
|
||||
* pages in the mm are freed.
|
||||
*/
|
||||
if (mn->ops->release)
|
||||
mn->ops->release(mn, mm);
|
||||
rcu_read_unlock();
|
||||
|
||||
spin_lock(&mm->mmu_notifier_mm->lock);
|
||||
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
|
||||
@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
|
||||
* mmu_notifier_unregister to return.
|
||||
*/
|
||||
hlist_del_init_rcu(&mn->hlist);
|
||||
/*
|
||||
* RCU here will block mmu_notifier_unregister until
|
||||
* ->release returns.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
spin_unlock(&mm->mmu_notifier_mm->lock);
|
||||
/*
|
||||
* if ->release runs before mmu_notifier_unregister it
|
||||
* must be handled as it's the only way for the driver
|
||||
* to flush all existing sptes and stop the driver
|
||||
* from establishing any more sptes before all the
|
||||
* pages in the mm are freed.
|
||||
*/
|
||||
if (mn->ops->release)
|
||||
mn->ops->release(mn, mm);
|
||||
rcu_read_unlock();
|
||||
spin_lock(&mm->mmu_notifier_mm->lock);
|
||||
}
|
||||
spin_unlock(&mm->mmu_notifier_mm->lock);
|
||||
|
||||
@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
|
||||
{
|
||||
BUG_ON(atomic_read(&mm->mm_count) <= 0);
|
||||
|
||||
spin_lock(&mm->mmu_notifier_mm->lock);
|
||||
if (!hlist_unhashed(&mn->hlist)) {
|
||||
hlist_del_rcu(&mn->hlist);
|
||||
|
||||
/*
|
||||
* RCU here will force exit_mmap to wait ->release to finish
|
||||
* before freeing the pages.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
spin_unlock(&mm->mmu_notifier_mm->lock);
|
||||
|
||||
/*
|
||||
* exit_mmap will block in mmu_notifier_release to
|
||||
* guarantee ->release is called before freeing the
|
||||
@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
|
||||
if (mn->ops->release)
|
||||
mn->ops->release(mn, mm);
|
||||
rcu_read_unlock();
|
||||
} else
|
||||
|
||||
spin_lock(&mm->mmu_notifier_mm->lock);
|
||||
hlist_del_rcu(&mn->hlist);
|
||||
spin_unlock(&mm->mmu_notifier_mm->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait any running method to finish, of course including
|
||||
|
@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
|
||||
for_each_lru(lru)
|
||||
INIT_LIST_HEAD(&lruvec->lists[lru]);
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
#ifdef CONFIG_MEMCG
|
||||
lruvec->zone = zone;
|
||||
#endif
|
||||
}
|
||||
|
@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
|
||||
* If this were a serious issue, we'd add a flag to do_munmap().
|
||||
*/
|
||||
hiwater_vm = mm->hiwater_vm;
|
||||
mm->total_vm += new_len >> PAGE_SHIFT;
|
||||
vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
|
||||
|
||||
if (do_munmap(mm, old_addr, old_len) < 0) {
|
||||
@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
||||
goto out;
|
||||
}
|
||||
|
||||
mm->total_vm += pages;
|
||||
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
mm->locked_vm += pages;
|
||||
|
223
mm/oom_kill.c
@ -288,76 +288,93 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
|
||||
}
|
||||
#endif
|
||||
|
||||
enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
||||
unsigned long totalpages, const nodemask_t *nodemask,
|
||||
bool force_kill)
|
||||
{
|
||||
if (task->exit_state)
|
||||
return OOM_SCAN_CONTINUE;
|
||||
if (oom_unkillable_task(task, NULL, nodemask))
|
||||
return OOM_SCAN_CONTINUE;
|
||||
|
||||
/*
|
||||
* This task already has access to memory reserves and is being killed.
|
||||
* Don't allow any other task to have access to the reserves.
|
||||
*/
|
||||
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
|
||||
if (unlikely(frozen(task)))
|
||||
__thaw_task(task);
|
||||
if (!force_kill)
|
||||
return OOM_SCAN_ABORT;
|
||||
}
|
||||
if (!task->mm)
|
||||
return OOM_SCAN_CONTINUE;
|
||||
|
||||
if (task->flags & PF_EXITING) {
|
||||
/*
|
||||
* If task is current and is in the process of releasing memory,
|
||||
* allow the "kill" to set TIF_MEMDIE, which will allow it to
|
||||
* access memory reserves. Otherwise, it may stall forever.
|
||||
*
|
||||
* The iteration isn't broken here, however, in case other
|
||||
* threads are found to have already been oom killed.
|
||||
*/
|
||||
if (task == current)
|
||||
return OOM_SCAN_SELECT;
|
||||
else if (!force_kill) {
|
||||
/*
|
||||
* If this task is not being ptraced on exit, then wait
|
||||
* for it to finish before killing some other task
|
||||
* unnecessarily.
|
||||
*/
|
||||
if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
|
||||
return OOM_SCAN_ABORT;
|
||||
}
|
||||
}
|
||||
return OOM_SCAN_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
* Simple selection loop. We chose the process with the highest
|
||||
* number of 'points'. We expect the caller will lock the tasklist.
|
||||
* number of 'points'.
|
||||
*
|
||||
* (not docbooked, we don't want this one cluttering up the manual)
|
||||
*/
|
||||
static struct task_struct *select_bad_process(unsigned int *ppoints,
|
||||
unsigned long totalpages, struct mem_cgroup *memcg,
|
||||
const nodemask_t *nodemask, bool force_kill)
|
||||
unsigned long totalpages, const nodemask_t *nodemask,
|
||||
bool force_kill)
|
||||
{
|
||||
struct task_struct *g, *p;
|
||||
struct task_struct *chosen = NULL;
|
||||
unsigned long chosen_points = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
do_each_thread(g, p) {
|
||||
unsigned int points;
|
||||
|
||||
if (p->exit_state)
|
||||
switch (oom_scan_process_thread(p, totalpages, nodemask,
|
||||
force_kill)) {
|
||||
case OOM_SCAN_SELECT:
|
||||
chosen = p;
|
||||
chosen_points = ULONG_MAX;
|
||||
/* fall through */
|
||||
case OOM_SCAN_CONTINUE:
|
||||
continue;
|
||||
if (oom_unkillable_task(p, memcg, nodemask))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* This task already has access to memory reserves and is
|
||||
* being killed. Don't allow any other task access to the
|
||||
* memory reserve.
|
||||
*
|
||||
* Note: this may have a chance of deadlock if it gets
|
||||
* blocked waiting for another task which itself is waiting
|
||||
* for memory. Is there a better alternative?
|
||||
*/
|
||||
if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
|
||||
if (unlikely(frozen(p)))
|
||||
__thaw_task(p);
|
||||
if (!force_kill)
|
||||
return ERR_PTR(-1UL);
|
||||
}
|
||||
if (!p->mm)
|
||||
continue;
|
||||
|
||||
if (p->flags & PF_EXITING) {
|
||||
/*
|
||||
* If p is the current task and is in the process of
|
||||
* releasing memory, we allow the "kill" to set
|
||||
* TIF_MEMDIE, which will allow it to gain access to
|
||||
* memory reserves. Otherwise, it may stall forever.
|
||||
*
|
||||
* The loop isn't broken here, however, in case other
|
||||
* threads are found to have already been oom killed.
|
||||
*/
|
||||
if (p == current) {
|
||||
chosen = p;
|
||||
chosen_points = ULONG_MAX;
|
||||
} else if (!force_kill) {
|
||||
/*
|
||||
* If this task is not being ptraced on exit,
|
||||
* then wait for it to finish before killing
|
||||
* some other task unnecessarily.
|
||||
*/
|
||||
if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
|
||||
return ERR_PTR(-1UL);
|
||||
}
|
||||
}
|
||||
|
||||
points = oom_badness(p, memcg, nodemask, totalpages);
|
||||
case OOM_SCAN_ABORT:
|
||||
rcu_read_unlock();
|
||||
return ERR_PTR(-1UL);
|
||||
case OOM_SCAN_OK:
|
||||
break;
|
||||
};
|
||||
points = oom_badness(p, NULL, nodemask, totalpages);
|
||||
if (points > chosen_points) {
|
||||
chosen = p;
|
||||
chosen_points = points;
|
||||
}
|
||||
} while_each_thread(g, p);
|
||||
if (chosen)
|
||||
get_task_struct(chosen);
|
||||
rcu_read_unlock();
|
||||
|
||||
*ppoints = chosen_points * 1000 / totalpages;
|
||||
return chosen;
|
||||
@ -371,17 +388,16 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
|
||||
* Dumps the current memory state of all eligible tasks. Tasks not in the same
|
||||
* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
|
||||
* are not shown.
|
||||
* State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
|
||||
* value, oom_score_adj value, and name.
|
||||
*
|
||||
* Call with tasklist_lock read-locked.
|
||||
* State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
|
||||
* swapents, oom_score_adj value, and name.
|
||||
*/
|
||||
static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
||||
{
|
||||
struct task_struct *p;
|
||||
struct task_struct *task;
|
||||
|
||||
pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
|
||||
pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
|
||||
rcu_read_lock();
|
||||
for_each_process(p) {
|
||||
if (oom_unkillable_task(p, memcg, nodemask))
|
||||
continue;
|
||||
@ -396,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
|
||||
continue;
|
||||
}
|
||||
|
||||
pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
|
||||
pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
|
||||
task->pid, from_kuid(&init_user_ns, task_uid(task)),
|
||||
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
|
||||
task_cpu(task), task->signal->oom_adj,
|
||||
task->mm->nr_ptes,
|
||||
get_mm_counter(task->mm, MM_SWAPENTS),
|
||||
task->signal->oom_score_adj, task->comm);
|
||||
task_unlock(task);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
@ -423,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
}
|
||||
|
||||
#define K(x) ((x) << (PAGE_SHIFT-10))
|
||||
static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
unsigned int points, unsigned long totalpages,
|
||||
struct mem_cgroup *memcg, nodemask_t *nodemask,
|
||||
const char *message)
|
||||
/*
|
||||
* Must be called while holding a reference to p, which will be released upon
|
||||
* returning.
|
||||
*/
|
||||
void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
unsigned int points, unsigned long totalpages,
|
||||
struct mem_cgroup *memcg, nodemask_t *nodemask,
|
||||
const char *message)
|
||||
{
|
||||
struct task_struct *victim = p;
|
||||
struct task_struct *child;
|
||||
@ -442,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
*/
|
||||
if (p->flags & PF_EXITING) {
|
||||
set_tsk_thread_flag(p, TIF_MEMDIE);
|
||||
put_task_struct(p);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -459,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
* parent. This attempts to lose the minimal amount of work done while
|
||||
* still freeing memory.
|
||||
*/
|
||||
read_lock(&tasklist_lock);
|
||||
do {
|
||||
list_for_each_entry(child, &t->children, sibling) {
|
||||
unsigned int child_points;
|
||||
@ -471,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
child_points = oom_badness(child, memcg, nodemask,
|
||||
totalpages);
|
||||
if (child_points > victim_points) {
|
||||
put_task_struct(victim);
|
||||
victim = child;
|
||||
victim_points = child_points;
|
||||
get_task_struct(victim);
|
||||
}
|
||||
}
|
||||
} while_each_thread(p, t);
|
||||
read_unlock(&tasklist_lock);
|
||||
|
||||
victim = find_lock_task_mm(victim);
|
||||
if (!victim)
|
||||
rcu_read_lock();
|
||||
p = find_lock_task_mm(victim);
|
||||
if (!p) {
|
||||
rcu_read_unlock();
|
||||
put_task_struct(victim);
|
||||
return;
|
||||
} else if (victim != p) {
|
||||
get_task_struct(p);
|
||||
put_task_struct(victim);
|
||||
victim = p;
|
||||
}
|
||||
|
||||
/* mm cannot safely be dereferenced after task_unlock(victim) */
|
||||
mm = victim->mm;
|
||||
@ -510,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||
task_unlock(p);
|
||||
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
set_tsk_thread_flag(victim, TIF_MEMDIE);
|
||||
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
|
||||
put_task_struct(victim);
|
||||
}
|
||||
#undef K
|
||||
|
||||
/*
|
||||
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
|
||||
*/
|
||||
static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
|
||||
int order, const nodemask_t *nodemask)
|
||||
void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
|
||||
int order, const nodemask_t *nodemask)
|
||||
{
|
||||
if (likely(!sysctl_panic_on_oom))
|
||||
return;
|
||||
@ -533,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
|
||||
if (constraint != CONSTRAINT_NONE)
|
||||
return;
|
||||
}
|
||||
read_lock(&tasklist_lock);
|
||||
dump_header(NULL, gfp_mask, order, NULL, nodemask);
|
||||
read_unlock(&tasklist_lock);
|
||||
panic("Out of memory: %s panic_on_oom is enabled\n",
|
||||
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||
int order)
|
||||
{
|
||||
unsigned long limit;
|
||||
unsigned int points = 0;
|
||||
struct task_struct *p;
|
||||
|
||||
/*
|
||||
* If current has a pending SIGKILL, then automatically select it. The
|
||||
* goal is to allow it to allocate so that it may quickly exit and free
|
||||
* its memory.
|
||||
*/
|
||||
if (fatal_signal_pending(current)) {
|
||||
set_thread_flag(TIF_MEMDIE);
|
||||
return;
|
||||
}
|
||||
|
||||
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
|
||||
limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
|
||||
read_lock(&tasklist_lock);
|
||||
p = select_bad_process(&points, limit, memcg, NULL, false);
|
||||
if (p && PTR_ERR(p) != -1UL)
|
||||
oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
|
||||
"Memory cgroup out of memory");
|
||||
read_unlock(&tasklist_lock);
|
||||
}
|
||||
#endif
|
||||
|
||||
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
|
||||
|
||||
int register_oom_notifier(struct notifier_block *nb)
|
||||
@ -690,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
||||
struct task_struct *p;
|
||||
unsigned long totalpages;
|
||||
unsigned long freed = 0;
|
||||
unsigned int points;
|
||||
unsigned int uninitialized_var(points);
|
||||
enum oom_constraint constraint = CONSTRAINT_NONE;
|
||||
int killed = 0;
|
||||
|
||||
@ -718,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
||||
mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
|
||||
check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
|
||||
|
||||
read_lock(&tasklist_lock);
|
||||
if (sysctl_oom_kill_allocating_task &&
|
||||
if (sysctl_oom_kill_allocating_task && current->mm &&
|
||||
!oom_unkillable_task(current, NULL, nodemask) &&
|
||||
current->mm) {
|
||||
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
|
||||
get_task_struct(current);
|
||||
oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
|
||||
nodemask,
|
||||
"Out of memory (oom_kill_allocating_task)");
|
||||
goto out;
|
||||
}
|
||||
|
||||
p = select_bad_process(&points, totalpages, NULL, mpol_mask,
|
||||
force_kill);
|
||||
p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
|
||||
/* Found nothing?!?! Either we hang forever, or we panic. */
|
||||
if (!p) {
|
||||
dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
|
||||
read_unlock(&tasklist_lock);
|
||||
panic("Out of memory and no killable processes...\n");
|
||||
}
|
||||
if (PTR_ERR(p) != -1UL) {
|
||||
@ -742,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
||||
killed = 1;
|
||||
}
|
||||
out:
|
||||
read_unlock(&tasklist_lock);
|
||||
|
||||
/*
|
||||
* Give "p" a good chance of killing itself before we
|
||||
* retry to allocate memory unless "p" is current
|
||||
* Give the killed threads a good chance of exiting before trying to
|
||||
* allocate memory again.
|
||||
*/
|
||||
if (killed && !test_thread_flag(TIF_MEMDIE))
|
||||
schedule_timeout_uninterruptible(1);
|
||||
if (killed)
|
||||
schedule_timeout_killable(1);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -764,6 +766,5 @@ void pagefault_out_of_memory(void)
|
||||
out_of_memory(NULL, 0, 0, NULL, false);
|
||||
clear_system_oom();
|
||||
}
|
||||
if (!test_thread_flag(TIF_MEMDIE))
|
||||
schedule_timeout_uninterruptible(1);
|
||||
schedule_timeout_killable(1);
|
||||
}
|
||||
|
318
mm/page_alloc.c
@ -51,7 +51,6 @@
|
||||
#include <linux/page_cgroup.h>
|
||||
#include <linux/debugobjects.h>
|
||||
#include <linux/kmemleak.h>
|
||||
#include <linux/memory.h>
|
||||
#include <linux/compaction.h>
|
||||
#include <trace/events/kmem.h>
|
||||
#include <linux/ftrace_event.h>
|
||||
@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
|
||||
|
||||
int page_group_by_mobility_disabled __read_mostly;
|
||||
|
||||
static void set_pageblock_migratetype(struct page *page, int migratetype)
|
||||
/*
|
||||
* NOTE:
|
||||
* Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
|
||||
* Instead, use {un}set_pageblock_isolate.
|
||||
*/
|
||||
void set_pageblock_migratetype(struct page *page, int migratetype)
|
||||
{
|
||||
|
||||
if (unlikely(page_group_by_mobility_disabled))
|
||||
@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
|
||||
return pages_moved;
|
||||
}
|
||||
|
||||
static int move_freepages_block(struct zone *zone, struct page *page,
|
||||
int move_freepages_block(struct zone *zone, struct page *page,
|
||||
int migratetype)
|
||||
{
|
||||
unsigned long start_pfn, end_pfn;
|
||||
@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
|
||||
to_drain = pcp->batch;
|
||||
else
|
||||
to_drain = pcp->count;
|
||||
free_pcppages_bulk(zone, to_drain, pcp);
|
||||
pcp->count -= to_drain;
|
||||
if (to_drain > 0) {
|
||||
free_pcppages_bulk(zone, to_drain, pcp);
|
||||
pcp->count -= to_drain;
|
||||
}
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
#endif
|
||||
@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
|
||||
}
|
||||
__setup("fail_page_alloc=", setup_fail_page_alloc);
|
||||
|
||||
static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
|
||||
static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
|
||||
{
|
||||
if (order < fail_page_alloc.min_order)
|
||||
return 0;
|
||||
return false;
|
||||
if (gfp_mask & __GFP_NOFAIL)
|
||||
return 0;
|
||||
return false;
|
||||
if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
|
||||
return 0;
|
||||
return false;
|
||||
if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
|
||||
return 0;
|
||||
return false;
|
||||
|
||||
return should_fail(&fail_page_alloc.attr, 1 << order);
|
||||
}
|
||||
@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
|
||||
|
||||
#else /* CONFIG_FAIL_PAGE_ALLOC */
|
||||
|
||||
static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
|
||||
static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
|
||||
{
|
||||
return 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_FAIL_PAGE_ALLOC */
|
||||
@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
|
||||
{
|
||||
/* free_pages my go negative - that's OK */
|
||||
long min = mark;
|
||||
long lowmem_reserve = z->lowmem_reserve[classzone_idx];
|
||||
int o;
|
||||
|
||||
free_pages -= (1 << order) - 1;
|
||||
@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
|
||||
if (alloc_flags & ALLOC_HARDER)
|
||||
min -= min / 4;
|
||||
|
||||
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
|
||||
if (free_pages <= min + lowmem_reserve)
|
||||
return false;
|
||||
for (o = 0; o < order; o++) {
|
||||
/* At the next order, this order's pages become unavailable */
|
||||
@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_ISOLATION
|
||||
static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
|
||||
{
|
||||
if (unlikely(zone->nr_pageblock_isolate))
|
||||
return zone->nr_pageblock_isolate * pageblock_nr_pages;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
|
||||
int classzone_idx, int alloc_flags)
|
||||
{
|
||||
@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
|
||||
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
|
||||
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
|
||||
|
||||
/*
|
||||
* If the zone has MIGRATE_ISOLATE type free pages, we should consider
|
||||
* it. nr_zone_isolate_freepages is never accurate so kswapd might not
|
||||
* sleep although it could do so. But this is more desirable for memory
|
||||
* hotplug than sleeping which can cause a livelock in the direct
|
||||
* reclaim path.
|
||||
*/
|
||||
free_pages -= nr_zone_isolate_freepages(z);
|
||||
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
|
||||
free_pages);
|
||||
}
|
||||
@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
|
||||
|
||||
page = get_page_from_freelist(gfp_mask, nodemask,
|
||||
order, zonelist, high_zoneidx,
|
||||
alloc_flags, preferred_zone,
|
||||
migratetype);
|
||||
alloc_flags & ~ALLOC_NO_WATERMARKS,
|
||||
preferred_zone, migratetype);
|
||||
if (page) {
|
||||
preferred_zone->compact_considered = 0;
|
||||
preferred_zone->compact_defer_shift = 0;
|
||||
@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
|
||||
retry:
|
||||
page = get_page_from_freelist(gfp_mask, nodemask, order,
|
||||
zonelist, high_zoneidx,
|
||||
alloc_flags, preferred_zone,
|
||||
migratetype);
|
||||
alloc_flags & ~ALLOC_NO_WATERMARKS,
|
||||
preferred_zone, migratetype);
|
||||
|
||||
/*
|
||||
* If an allocation failed after direct reclaim, it could be because
|
||||
@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
|
||||
alloc_flags |= ALLOC_HARDER;
|
||||
|
||||
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
|
||||
if (!in_interrupt() &&
|
||||
((current->flags & PF_MEMALLOC) ||
|
||||
unlikely(test_thread_flag(TIF_MEMDIE))))
|
||||
if (gfp_mask & __GFP_MEMALLOC)
|
||||
alloc_flags |= ALLOC_NO_WATERMARKS;
|
||||
else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
|
||||
alloc_flags |= ALLOC_NO_WATERMARKS;
|
||||
else if (!in_interrupt() &&
|
||||
((current->flags & PF_MEMALLOC) ||
|
||||
unlikely(test_thread_flag(TIF_MEMDIE))))
|
||||
alloc_flags |= ALLOC_NO_WATERMARKS;
|
||||
}
|
||||
|
||||
return alloc_flags;
|
||||
}
|
||||
|
||||
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
|
||||
{
|
||||
return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
|
||||
}
|
||||
|
||||
static inline struct page *
|
||||
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
|
||||
struct zonelist *zonelist, enum zone_type high_zoneidx,
|
||||
@ -2340,11 +2378,27 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
|
||||
|
||||
/* Allocate without watermarks if the context allows */
|
||||
if (alloc_flags & ALLOC_NO_WATERMARKS) {
|
||||
/*
|
||||
* Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
|
||||
* the allocation is high priority and these type of
|
||||
* allocations are system rather than user orientated
|
||||
*/
|
||||
zonelist = node_zonelist(numa_node_id(), gfp_mask);
|
||||
|
||||
page = __alloc_pages_high_priority(gfp_mask, order,
|
||||
zonelist, high_zoneidx, nodemask,
|
||||
preferred_zone, migratetype);
|
||||
if (page)
|
||||
if (page) {
|
||||
/*
|
||||
* page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
|
||||
* necessary to allocate the page. The expectation is
|
||||
* that the caller is taking steps that will free more
|
||||
* memory. The caller should avoid the page being used
|
||||
* for !PFMEMALLOC purposes.
|
||||
*/
|
||||
page->pfmemalloc = true;
|
||||
goto got_pg;
|
||||
}
|
||||
}
|
||||
|
||||
/* Atomic allocations - we can't balance anything */
|
||||
@ -2463,8 +2517,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
|
||||
got_pg:
|
||||
if (kmemcheck_enabled)
|
||||
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
|
||||
return page;
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2515,6 +2569,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
|
||||
page = __alloc_pages_slowpath(gfp_mask, order,
|
||||
zonelist, high_zoneidx, nodemask,
|
||||
preferred_zone, migratetype);
|
||||
else
|
||||
page->pfmemalloc = false;
|
||||
|
||||
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
|
||||
|
||||
@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
|
||||
user_zonelist_order = oldval;
|
||||
} else if (oldval != user_zonelist_order) {
|
||||
mutex_lock(&zonelists_mutex);
|
||||
build_all_zonelists(NULL);
|
||||
build_all_zonelists(NULL, NULL);
|
||||
mutex_unlock(&zonelists_mutex);
|
||||
}
|
||||
}
|
||||
@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone);
|
||||
DEFINE_MUTEX(zonelists_mutex);
|
||||
|
||||
/* return values int ....just for stop_machine() */
|
||||
static __init_refok int __build_all_zonelists(void *data)
|
||||
static int __build_all_zonelists(void *data)
|
||||
{
|
||||
int nid;
|
||||
int cpu;
|
||||
pg_data_t *self = data;
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
memset(node_load, 0, sizeof(node_load));
|
||||
#endif
|
||||
|
||||
if (self && !node_online(self->node_id)) {
|
||||
build_zonelists(self);
|
||||
build_zonelist_cache(self);
|
||||
}
|
||||
|
||||
for_each_online_node(nid) {
|
||||
pg_data_t *pgdat = NODE_DATA(nid);
|
||||
|
||||
@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data)
|
||||
* Called with zonelists_mutex held always
|
||||
* unless system_state == SYSTEM_BOOTING.
|
||||
*/
|
||||
void __ref build_all_zonelists(void *data)
|
||||
void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
|
||||
{
|
||||
set_zonelist_order();
|
||||
|
||||
@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data)
|
||||
/* we have to stop all cpus to guarantee there is no user
|
||||
of zonelist */
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
if (data)
|
||||
setup_zone_pageset((struct zone *)data);
|
||||
if (zone)
|
||||
setup_zone_pageset(zone);
|
||||
#endif
|
||||
stop_machine(__build_all_zonelists, NULL, NULL);
|
||||
stop_machine(__build_all_zonelists, pgdat, NULL);
|
||||
/* cpuset refresh routine should be here */
|
||||
}
|
||||
vm_total_pages = nr_free_pagecache_pages();
|
||||
@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
|
||||
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
|
||||
#endif
|
||||
|
||||
static int zone_batchsize(struct zone *zone)
|
||||
static int __meminit zone_batchsize(struct zone *zone)
|
||||
{
|
||||
#ifdef CONFIG_MMU
|
||||
int batch;
|
||||
@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
|
||||
pcp->batch = PAGE_SHIFT * 8;
|
||||
}
|
||||
|
||||
static void setup_zone_pageset(struct zone *zone)
|
||||
static void __meminit setup_zone_pageset(struct zone *zone)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __zone_pcp_update(void *data)
|
||||
{
|
||||
struct zone *zone = data;
|
||||
int cpu;
|
||||
unsigned long batch = zone_batchsize(zone), flags;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct per_cpu_pageset *pset;
|
||||
struct per_cpu_pages *pcp;
|
||||
|
||||
pset = per_cpu_ptr(zone->pageset, cpu);
|
||||
pcp = &pset->pcp;
|
||||
|
||||
local_irq_save(flags);
|
||||
free_pcppages_bulk(zone, pcp->count, pcp);
|
||||
setup_pageset(pset, batch);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void zone_pcp_update(struct zone *zone)
|
||||
{
|
||||
stop_machine(__zone_pcp_update, zone, NULL);
|
||||
}
|
||||
|
||||
static __meminit void zone_pcp_init(struct zone *zone)
|
||||
{
|
||||
/*
|
||||
@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
|
||||
zone_batchsize(zone));
|
||||
}
|
||||
|
||||
__meminit int init_currently_empty_zone(struct zone *zone,
|
||||
int __meminit init_currently_empty_zone(struct zone *zone,
|
||||
unsigned long zone_start_pfn,
|
||||
unsigned long size,
|
||||
enum memmap_context context)
|
||||
@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
|
||||
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
|
||||
|
||||
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
|
||||
static inline void __init set_pageblock_order(void)
|
||||
void __init set_pageblock_order(void)
|
||||
{
|
||||
unsigned int order;
|
||||
|
||||
@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void)
|
||||
* include/linux/pageblock-flags.h for the values of pageblock_order based on
|
||||
* the kernel config
|
||||
*/
|
||||
static inline void set_pageblock_order(void)
|
||||
void __init set_pageblock_order(void)
|
||||
{
|
||||
}
|
||||
|
||||
@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void)
|
||||
* - mark all pages reserved
|
||||
* - mark all memory queues empty
|
||||
* - clear the memory bitmaps
|
||||
*
|
||||
* NOTE: pgdat should get zeroed by caller.
|
||||
*/
|
||||
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
|
||||
unsigned long *zones_size, unsigned long *zholes_size)
|
||||
@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
|
||||
int ret;
|
||||
|
||||
pgdat_resize_init(pgdat);
|
||||
pgdat->nr_zones = 0;
|
||||
init_waitqueue_head(&pgdat->kswapd_wait);
|
||||
pgdat->kswapd_max_order = 0;
|
||||
init_waitqueue_head(&pgdat->pfmemalloc_wait);
|
||||
pgdat_page_cgroup_init(pgdat);
|
||||
|
||||
for (j = 0; j < MAX_NR_ZONES; j++) {
|
||||
@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
|
||||
|
||||
zone->spanned_pages = size;
|
||||
zone->present_pages = realsize;
|
||||
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
|
||||
zone->compact_cached_free_pfn = zone->zone_start_pfn +
|
||||
zone->spanned_pages;
|
||||
zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
|
||||
#endif
|
||||
#ifdef CONFIG_NUMA
|
||||
zone->node = nid;
|
||||
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
|
||||
@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
|
||||
|
||||
zone_pcp_init(zone);
|
||||
lruvec_init(&zone->lruvec, zone);
|
||||
zap_zone_vm_stats(zone);
|
||||
zone->flags = 0;
|
||||
if (!size)
|
||||
continue;
|
||||
|
||||
@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
|
||||
{
|
||||
pg_data_t *pgdat = NODE_DATA(nid);
|
||||
|
||||
/* pg_data_t should be reset to zero when it's allocated */
|
||||
WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx);
|
||||
|
||||
pgdat->node_id = nid;
|
||||
pgdat->node_start_pfn = node_start_pfn;
|
||||
calculate_node_totalpages(pgdat, zones_size, zholes_size);
|
||||
@ -4750,7 +4794,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
|
||||
}
|
||||
|
||||
/* Any regular memory on that node ? */
|
||||
static void check_for_regular_memory(pg_data_t *pgdat)
|
||||
static void __init check_for_regular_memory(pg_data_t *pgdat)
|
||||
{
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
enum zone_type zone_type;
|
||||
@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
|
||||
}
|
||||
|
||||
/*
|
||||
* This is designed as sub function...plz see page_isolation.c also.
|
||||
* set/clear page block's type to be ISOLATE.
|
||||
* page allocater never alloc memory from ISOLATE block.
|
||||
* This function checks whether pageblock includes unmovable pages or not.
|
||||
* If @count is not zero, it is okay to include less @count unmovable pages
|
||||
*
|
||||
* PageLRU check wihtout isolation or lru_lock could race so that
|
||||
* MIGRATE_MOVABLE block might include unmovable pages. It means you can't
|
||||
* expect this function should be exact.
|
||||
*/
|
||||
|
||||
static int
|
||||
__count_immobile_pages(struct zone *zone, struct page *page, int count)
|
||||
bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
|
||||
{
|
||||
unsigned long pfn, iter, found;
|
||||
int mt;
|
||||
|
||||
/*
|
||||
* For avoiding noise data, lru_add_drain_all() should be called
|
||||
* If ZONE_MOVABLE, the zone never contains immobile pages
|
||||
* If ZONE_MOVABLE, the zone never contains unmovable pages
|
||||
*/
|
||||
if (zone_idx(zone) == ZONE_MOVABLE)
|
||||
return true;
|
||||
return false;
|
||||
mt = get_pageblock_migratetype(page);
|
||||
if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
|
||||
return true;
|
||||
return false;
|
||||
|
||||
pfn = page_to_pfn(page);
|
||||
for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
|
||||
@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
|
||||
continue;
|
||||
|
||||
page = pfn_to_page(check);
|
||||
if (!page_count(page)) {
|
||||
/*
|
||||
* We can't use page_count without pin a page
|
||||
* because another CPU can free compound page.
|
||||
* This check already skips compound tails of THP
|
||||
* because their page->_count is zero at all time.
|
||||
*/
|
||||
if (!atomic_read(&page->_count)) {
|
||||
if (PageBuddy(page))
|
||||
iter += (1 << page_order(page)) - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!PageLRU(page))
|
||||
found++;
|
||||
/*
|
||||
@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
|
||||
* page at boot.
|
||||
*/
|
||||
if (found > count)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_pageblock_removable_nolock(struct page *page)
|
||||
@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page)
|
||||
zone->zone_start_pfn + zone->spanned_pages <= pfn)
|
||||
return false;
|
||||
|
||||
return __count_immobile_pages(zone, page, 0);
|
||||
}
|
||||
|
||||
int set_migratetype_isolate(struct page *page)
|
||||
{
|
||||
struct zone *zone;
|
||||
unsigned long flags, pfn;
|
||||
struct memory_isolate_notify arg;
|
||||
int notifier_ret;
|
||||
int ret = -EBUSY;
|
||||
|
||||
zone = page_zone(page);
|
||||
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
|
||||
pfn = page_to_pfn(page);
|
||||
arg.start_pfn = pfn;
|
||||
arg.nr_pages = pageblock_nr_pages;
|
||||
arg.pages_found = 0;
|
||||
|
||||
/*
|
||||
* It may be possible to isolate a pageblock even if the
|
||||
* migratetype is not MIGRATE_MOVABLE. The memory isolation
|
||||
* notifier chain is used by balloon drivers to return the
|
||||
* number of pages in a range that are held by the balloon
|
||||
* driver to shrink memory. If all the pages are accounted for
|
||||
* by balloons, are free, or on the LRU, isolation can continue.
|
||||
* Later, for example, when memory hotplug notifier runs, these
|
||||
* pages reported as "can be isolated" should be isolated(freed)
|
||||
* by the balloon driver through the memory notifier chain.
|
||||
*/
|
||||
notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
|
||||
notifier_ret = notifier_to_errno(notifier_ret);
|
||||
if (notifier_ret)
|
||||
goto out;
|
||||
/*
|
||||
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
|
||||
* We just check MOVABLE pages.
|
||||
*/
|
||||
if (__count_immobile_pages(zone, page, arg.pages_found))
|
||||
ret = 0;
|
||||
|
||||
/*
|
||||
* immobile means "not-on-lru" paes. If immobile is larger than
|
||||
* removable-by-driver pages reported by notifier, we'll fail.
|
||||
*/
|
||||
|
||||
out:
|
||||
if (!ret) {
|
||||
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
|
||||
move_freepages_block(zone, page, MIGRATE_ISOLATE);
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
if (!ret)
|
||||
drain_all_pages();
|
||||
return ret;
|
||||
}
|
||||
|
||||
void unset_migratetype_isolate(struct page *page, unsigned migratetype)
|
||||
{
|
||||
struct zone *zone;
|
||||
unsigned long flags;
|
||||
zone = page_zone(page);
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
|
||||
goto out;
|
||||
set_pageblock_migratetype(page, migratetype);
|
||||
move_freepages_block(zone, page, migratetype);
|
||||
out:
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
return !has_unmovable_pages(zone, page, 0);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CMA
|
||||
@ -5869,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
static int __meminit __zone_pcp_update(void *data)
|
||||
{
|
||||
struct zone *zone = data;
|
||||
int cpu;
|
||||
unsigned long batch = zone_batchsize(zone), flags;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct per_cpu_pageset *pset;
|
||||
struct per_cpu_pages *pcp;
|
||||
|
||||
pset = per_cpu_ptr(zone->pageset, cpu);
|
||||
pcp = &pset->pcp;
|
||||
|
||||
local_irq_save(flags);
|
||||
if (pcp->count > 0)
|
||||
free_pcppages_bulk(zone, pcp->count, pcp);
|
||||
setup_pageset(pset, batch);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void __meminit zone_pcp_update(struct zone *zone)
|
||||
{
|
||||
stop_machine(__zone_pcp_update, zone, NULL);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
void zone_pcp_reset(struct zone *zone)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
/* avoid races with drain_pages() */
|
||||
local_irq_save(flags);
|
||||
if (zone->pageset != &boot_pageset) {
|
||||
free_percpu(zone->pageset);
|
||||
zone->pageset = &boot_pageset;
|
||||
}
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* All pages in the range must be isolated before calling this.
|
||||
*/
|
||||
|
Some files were not shown because too many files have changed in this diff