linux_dsm_epyc7002/arch/sparc/mm/init_32.c
Shaohua Li ec8acf20af swap: add per-partition lock for swapfile
swap_lock is heavily contended when I test swap to 3 fast SSD (even
slightly slower than swap to 2 such SSD).  The main contention comes
from swap_info_get().  This patch tries to fix the gap with adding a new
per-partition lock.

Global data like nr_swapfiles, total_swap_pages, least_priority and
swap_list are still protected by swap_lock.

nr_swap_pages is an atomic now, it can be changed without swap_lock.  In
theory, it's possible get_swap_page() finds no swap pages but actually
there are free swap pages.  But sounds not a big problem.

Accessing partition specific data (like scan_swap_map and so on) is only
protected by swap_info_struct.lock.

Changing swap_info_struct.flags need hold swap_lock and
swap_info_struct.lock, because scan_scan_map() will check it.  read the
flags is ok with either the locks hold.

If both swap_lock and swap_info_struct.lock must be hold, we always hold
the former first to avoid deadlock.

swap_entry_free() can change swap_list.  To delete that code, we add a
new highest_priority_index.  Whenever get_swap_page() is called, we
check it.  If it's valid, we use it.

It's a pity get_swap_page() still holds swap_lock().  But in practice,
swap_lock() isn't heavily contended in my test with this patch (or I can
say there are other much more heavier bottlenecks like TLB flush).  And
BTW, looks get_swap_page() doesn't really need the lock.  We never free
swap_info[] and we check SWAP_WRITEOK flag.  The only risk without the
lock is we could swapout to some low priority swap, but we can quickly
recover after several rounds of swap, so sounds not a big deal to me.
But I'd prefer to fix this if it's a real problem.

"swap: make each swap partition have one address_space" improved the
swapout speed from 1.7G/s to 2G/s.  This patch further improves the
speed to 2.3G/s, so around 15% improvement.  It's a multi-process test,
so TLB flush isn't the biggest bottleneck before the patches.

[arnd@arndb.de: fix it for nommu]
[hughd@google.com: add missing unlock]
[minchan@kernel.org: get rid of lockdep whinge on sys_swapon]
Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Dan Magenheimer <dan.magenheimer@oracle.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-23 17:50:17 -08:00

427 lines
11 KiB
C

/*
* linux/arch/sparc/mm/init.c
*
* Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
* Copyright (C) 1995 Eddie C. Dost (ecd@skynet.be)
* Copyright (C) 1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
* Copyright (C) 2000 Anton Blanchard (anton@samba.org)
*/
#include <linux/module.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/initrd.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>
#include <linux/pagemap.h>
#include <linux/poison.h>
#include <linux/gfp.h>
#include <asm/sections.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/vaddrs.h>
#include <asm/pgalloc.h> /* bug in asm-generic/tlb.h: check_pgt_cache */
#include <asm/tlb.h>
#include <asm/prom.h>
#include <asm/leon.h>
unsigned long *sparc_valid_addr_bitmap;
EXPORT_SYMBOL(sparc_valid_addr_bitmap);
unsigned long phys_base;
EXPORT_SYMBOL(phys_base);
unsigned long pfn_base;
EXPORT_SYMBOL(pfn_base);
struct sparc_phys_banks sp_banks[SPARC_PHYS_BANKS+1];
/* Initial ramdisk setup */
extern unsigned int sparc_ramdisk_image;
extern unsigned int sparc_ramdisk_size;
unsigned long highstart_pfn, highend_pfn;
void show_mem(unsigned int filter)
{
printk("Mem-info:\n");
show_free_areas(filter);
printk("Free swap: %6ldkB\n",
get_nr_swap_pages() << (PAGE_SHIFT-10));
printk("%ld pages of RAM\n", totalram_pages);
printk("%ld free pages\n", nr_free_pages());
}
extern unsigned long cmdline_memory_size;
unsigned long last_valid_pfn;
unsigned long calc_highpages(void)
{
int i;
int nr = 0;
for (i = 0; sp_banks[i].num_bytes != 0; i++) {
unsigned long start_pfn = sp_banks[i].base_addr >> PAGE_SHIFT;
unsigned long end_pfn = (sp_banks[i].base_addr + sp_banks[i].num_bytes) >> PAGE_SHIFT;
if (end_pfn <= max_low_pfn)
continue;
if (start_pfn < max_low_pfn)
start_pfn = max_low_pfn;
nr += end_pfn - start_pfn;
}
return nr;
}
static unsigned long calc_max_low_pfn(void)
{
int i;
unsigned long tmp = pfn_base + (SRMMU_MAXMEM >> PAGE_SHIFT);
unsigned long curr_pfn, last_pfn;
last_pfn = (sp_banks[0].base_addr + sp_banks[0].num_bytes) >> PAGE_SHIFT;
for (i = 1; sp_banks[i].num_bytes != 0; i++) {
curr_pfn = sp_banks[i].base_addr >> PAGE_SHIFT;
if (curr_pfn >= tmp) {
if (last_pfn < tmp)
tmp = last_pfn;
break;
}
last_pfn = (sp_banks[i].base_addr + sp_banks[i].num_bytes) >> PAGE_SHIFT;
}
return tmp;
}
unsigned long __init bootmem_init(unsigned long *pages_avail)
{
unsigned long bootmap_size, start_pfn;
unsigned long end_of_phys_memory = 0UL;
unsigned long bootmap_pfn, bytes_avail, size;
int i;
bytes_avail = 0UL;
for (i = 0; sp_banks[i].num_bytes != 0; i++) {
end_of_phys_memory = sp_banks[i].base_addr +
sp_banks[i].num_bytes;
bytes_avail += sp_banks[i].num_bytes;
if (cmdline_memory_size) {
if (bytes_avail > cmdline_memory_size) {
unsigned long slack = bytes_avail - cmdline_memory_size;
bytes_avail -= slack;
end_of_phys_memory -= slack;
sp_banks[i].num_bytes -= slack;
if (sp_banks[i].num_bytes == 0) {
sp_banks[i].base_addr = 0xdeadbeef;
} else {
sp_banks[i+1].num_bytes = 0;
sp_banks[i+1].base_addr = 0xdeadbeef;
}
break;
}
}
}
/* Start with page aligned address of last symbol in kernel
* image.
*/
start_pfn = (unsigned long)__pa(PAGE_ALIGN((unsigned long) &_end));
/* Now shift down to get the real physical page frame number. */
start_pfn >>= PAGE_SHIFT;
bootmap_pfn = start_pfn;
max_pfn = end_of_phys_memory >> PAGE_SHIFT;
max_low_pfn = max_pfn;
highstart_pfn = highend_pfn = max_pfn;
if (max_low_pfn > pfn_base + (SRMMU_MAXMEM >> PAGE_SHIFT)) {
highstart_pfn = pfn_base + (SRMMU_MAXMEM >> PAGE_SHIFT);
max_low_pfn = calc_max_low_pfn();
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
calc_highpages() >> (20 - PAGE_SHIFT));
}
#ifdef CONFIG_BLK_DEV_INITRD
/* Now have to check initial ramdisk, so that bootmap does not overwrite it */
if (sparc_ramdisk_image) {
if (sparc_ramdisk_image >= (unsigned long)&_end - 2 * PAGE_SIZE)
sparc_ramdisk_image -= KERNBASE;
initrd_start = sparc_ramdisk_image + phys_base;
initrd_end = initrd_start + sparc_ramdisk_size;
if (initrd_end > end_of_phys_memory) {
printk(KERN_CRIT "initrd extends beyond end of memory "
"(0x%016lx > 0x%016lx)\ndisabling initrd\n",
initrd_end, end_of_phys_memory);
initrd_start = 0;
}
if (initrd_start) {
if (initrd_start >= (start_pfn << PAGE_SHIFT) &&
initrd_start < (start_pfn << PAGE_SHIFT) + 2 * PAGE_SIZE)
bootmap_pfn = PAGE_ALIGN (initrd_end) >> PAGE_SHIFT;
}
}
#endif
/* Initialize the boot-time allocator. */
bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap_pfn, pfn_base,
max_low_pfn);
/* Now register the available physical memory with the
* allocator.
*/
*pages_avail = 0;
for (i = 0; sp_banks[i].num_bytes != 0; i++) {
unsigned long curr_pfn, last_pfn;
curr_pfn = sp_banks[i].base_addr >> PAGE_SHIFT;
if (curr_pfn >= max_low_pfn)
break;
last_pfn = (sp_banks[i].base_addr + sp_banks[i].num_bytes) >> PAGE_SHIFT;
if (last_pfn > max_low_pfn)
last_pfn = max_low_pfn;
/*
* .. finally, did all the rounding and playing
* around just make the area go away?
*/
if (last_pfn <= curr_pfn)
continue;
size = (last_pfn - curr_pfn) << PAGE_SHIFT;
*pages_avail += last_pfn - curr_pfn;
free_bootmem(sp_banks[i].base_addr, size);
}
#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start) {
/* Reserve the initrd image area. */
size = initrd_end - initrd_start;
reserve_bootmem(initrd_start, size, BOOTMEM_DEFAULT);
*pages_avail -= PAGE_ALIGN(size) >> PAGE_SHIFT;
initrd_start = (initrd_start - phys_base) + PAGE_OFFSET;
initrd_end = (initrd_end - phys_base) + PAGE_OFFSET;
}
#endif
/* Reserve the kernel text/data/bss. */
size = (start_pfn << PAGE_SHIFT) - phys_base;
reserve_bootmem(phys_base, size, BOOTMEM_DEFAULT);
*pages_avail -= PAGE_ALIGN(size) >> PAGE_SHIFT;
/* Reserve the bootmem map. We do not account for it
* in pages_avail because we will release that memory
* in free_all_bootmem.
*/
size = bootmap_size;
reserve_bootmem((bootmap_pfn << PAGE_SHIFT), size, BOOTMEM_DEFAULT);
*pages_avail -= PAGE_ALIGN(size) >> PAGE_SHIFT;
return max_pfn;
}
/*
* paging_init() sets up the page tables: We call the MMU specific
* init routine based upon the Sun model type on the Sparc.
*
*/
extern void srmmu_paging_init(void);
extern void device_scan(void);
void __init paging_init(void)
{
srmmu_paging_init();
prom_build_devicetree();
of_fill_in_cpu_data();
device_scan();
}
static void __init taint_real_pages(void)
{
int i;
for (i = 0; sp_banks[i].num_bytes; i++) {
unsigned long start, end;
start = sp_banks[i].base_addr;
end = start + sp_banks[i].num_bytes;
while (start < end) {
set_bit(start >> 20, sparc_valid_addr_bitmap);
start += PAGE_SIZE;
}
}
}
static void map_high_region(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long tmp;
#ifdef CONFIG_DEBUG_HIGHMEM
printk("mapping high region %08lx - %08lx\n", start_pfn, end_pfn);
#endif
for (tmp = start_pfn; tmp < end_pfn; tmp++) {
struct page *page = pfn_to_page(tmp);
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
totalhigh_pages++;
}
}
void __init mem_init(void)
{
int codepages = 0;
int datapages = 0;
int initpages = 0;
int reservedpages = 0;
int i;
if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
prom_printf("BUG: fixmap and pkmap areas overlap\n");
prom_printf("pkbase: 0x%lx pkend: 0x%lx fixstart 0x%lx\n",
PKMAP_BASE,
(unsigned long)PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
FIXADDR_START);
prom_printf("Please mail sparclinux@vger.kernel.org.\n");
prom_halt();
}
/* Saves us work later. */
memset((void *)&empty_zero_page, 0, PAGE_SIZE);
i = last_valid_pfn >> ((20 - PAGE_SHIFT) + 5);
i += 1;
sparc_valid_addr_bitmap = (unsigned long *)
__alloc_bootmem(i << 2, SMP_CACHE_BYTES, 0UL);
if (sparc_valid_addr_bitmap == NULL) {
prom_printf("mem_init: Cannot alloc valid_addr_bitmap.\n");
prom_halt();
}
memset(sparc_valid_addr_bitmap, 0, i << 2);
taint_real_pages();
max_mapnr = last_valid_pfn - pfn_base;
high_memory = __va(max_low_pfn << PAGE_SHIFT);
totalram_pages = free_all_bootmem();
for (i = 0; sp_banks[i].num_bytes != 0; i++) {
unsigned long start_pfn = sp_banks[i].base_addr >> PAGE_SHIFT;
unsigned long end_pfn = (sp_banks[i].base_addr + sp_banks[i].num_bytes) >> PAGE_SHIFT;
num_physpages += sp_banks[i].num_bytes >> PAGE_SHIFT;
if (end_pfn <= highstart_pfn)
continue;
if (start_pfn < highstart_pfn)
start_pfn = highstart_pfn;
map_high_region(start_pfn, end_pfn);
}
totalram_pages += totalhigh_pages;
codepages = (((unsigned long) &_etext) - ((unsigned long)&_start));
codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT;
datapages = (((unsigned long) &_edata) - ((unsigned long)&_etext));
datapages = PAGE_ALIGN(datapages) >> PAGE_SHIFT;
initpages = (((unsigned long) &__init_end) - ((unsigned long) &__init_begin));
initpages = PAGE_ALIGN(initpages) >> PAGE_SHIFT;
/* Ignore memory holes for the purpose of counting reserved pages */
for (i=0; i < max_low_pfn; i++)
if (test_bit(i >> (20 - PAGE_SHIFT), sparc_valid_addr_bitmap)
&& PageReserved(pfn_to_page(i)))
reservedpages++;
printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
nr_free_pages() << (PAGE_SHIFT-10),
num_physpages << (PAGE_SHIFT - 10),
codepages << (PAGE_SHIFT-10),
reservedpages << (PAGE_SHIFT - 10),
datapages << (PAGE_SHIFT-10),
initpages << (PAGE_SHIFT-10),
totalhigh_pages << (PAGE_SHIFT-10));
}
void free_initmem (void)
{
unsigned long addr;
unsigned long freed;
addr = (unsigned long)(&__init_begin);
freed = (unsigned long)(&__init_end) - addr;
for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
struct page *p;
memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
p = virt_to_page(addr);
ClearPageReserved(p);
init_page_count(p);
__free_page(p);
totalram_pages++;
num_physpages++;
}
printk(KERN_INFO "Freeing unused kernel memory: %ldk freed\n",
freed >> 10);
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
if (start < end)
printk(KERN_INFO "Freeing initrd memory: %ldk freed\n",
(end - start) >> 10);
for (; start < end; start += PAGE_SIZE) {
struct page *p;
memset((void *)start, POISON_FREE_INITMEM, PAGE_SIZE);
p = virt_to_page(start);
ClearPageReserved(p);
init_page_count(p);
__free_page(p);
totalram_pages++;
num_physpages++;
}
}
#endif
void sparc_flush_page_to_ram(struct page *page)
{
unsigned long vaddr = (unsigned long)page_address(page);
if (vaddr)
__flush_page_to_ram(vaddr);
}
EXPORT_SYMBOL(sparc_flush_page_to_ram);