linux_dsm_epyc7002/arch/powerpc/mm/pgtable.c
Linus Torvalds 192f0f8e9d powerpc updates for 5.3
Notable changes:
 
  - Removal of the NPU DMA code, used by the out-of-tree Nvidia driver, as well
    as some other functions only used by drivers that haven't (yet?) made it
    upstream.
 
  - A fix for a bug in our handling of hardware watchpoints (eg. perf record -e
    mem: ...) which could lead to register corruption and kernel crashes.
 
  - Enable HAVE_ARCH_HUGE_VMAP, which allows us to use large pages for vmalloc
    when using the Radix MMU.
 
  - A large but incremental rewrite of our exception handling code to use gas
    macros rather than multiple levels of nested CPP macros.
 
 And the usual small fixes, cleanups and improvements.
 
 Thanks to:
   Alastair D'Silva, Alexey Kardashevskiy, Andreas Schwab, Aneesh Kumar K.V, Anju
   T Sudhakar, Anton Blanchard, Arnd Bergmann, Athira Rajeev, Cédric Le Goater,
   Christian Lamparter, Christophe Leroy, Christophe Lombard, Christoph Hellwig,
   Daniel Axtens, Denis Efremov, Enrico Weigelt, Frederic Barrat, Gautham R.
   Shenoy, Geert Uytterhoeven, Geliang Tang, Gen Zhang, Greg Kroah-Hartman, Greg
   Kurz, Gustavo Romero, Krzysztof Kozlowski, Madhavan Srinivasan, Masahiro
   Yamada, Mathieu Malaterre, Michael Neuling, Nathan Lynch, Naveen N. Rao,
   Nicholas Piggin, Nishad Kamdar, Oliver O'Halloran, Qian Cai, Ravi Bangoria,
   Sachin Sant, Sam Bobroff, Satheesh Rajendran, Segher Boessenkool, Shaokun
   Zhang, Shawn Anastasio, Stewart Smith, Suraj Jitindar Singh, Thiago Jung
   Bauermann, YueHaibing.
 -----BEGIN PGP SIGNATURE-----
 
 iQIcBAABAgAGBQJdKVoLAAoJEFHr6jzI4aWA0kIP/A6shIbbE7H5W2hFrqt/PPPK
 3+VrvPKbOFF+W6hcE/RgSZmEnUo0svdNjHUd/eMfFS1vb/uRt2QDdrsHUNNwURQL
 M2mcLXFwYpnjSjb/XMgDbHpAQxjeGfTdYLonUIejN7Rk8KQUeLyKQ3SBn6kfMc46
 DnUUcPcjuRGaETUmVuZZ4e40ZWbJp8PKDrSJOuUrTPXMaK5ciNbZk5mCWXGbYl6G
 BMQAyv4ld/417rNTjBEP/T2foMJtioAt4W6mtlgdkOTdIEZnFU67nNxDBthNSu2c
 95+I+/sML4KOp1R4yhqLSLIDDbc3bg3c99hLGij0d948z3bkSZ8bwnPaUuy70C4v
 U8rvl/+N6C6H3DgSsPE/Gnkd8DnudqWY8nULc+8p3fXljGwww6/Qgt+6yCUn8BdW
 WgixkSjKgjDmzTw8trIUNEqORrTVle7cM2hIyIK2Q5T4kWzNQxrLZ/x/3wgoYjUa
 1KwIzaRo5JKZ9D3pJnJ5U+knE2/90rJIyfcp0W6ygyJsWKi2GNmq1eN3sKOw0IxH
 Tg86RENIA/rEMErNOfP45sLteMuTR7of7peCG3yumIOZqsDVYAzerpvtSgip2cvK
 aG+9HcYlBFOOOF9Dabi8GXsTBLXLfwiyjjLSpA9eXPwW8KObgiNfTZa7ujjTPvis
 4mk9oukFTFUpfhsMmI3T
 =3dBZ
 -----END PGP SIGNATURE-----

Merge tag 'powerpc-5.3-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc updates from Michael Ellerman:
 "Notable changes:

   - Removal of the NPU DMA code, used by the out-of-tree Nvidia driver,
     as well as some other functions only used by drivers that haven't
     (yet?) made it upstream.

   - A fix for a bug in our handling of hardware watchpoints (eg. perf
     record -e mem: ...) which could lead to register corruption and
     kernel crashes.

   - Enable HAVE_ARCH_HUGE_VMAP, which allows us to use large pages for
     vmalloc when using the Radix MMU.

   - A large but incremental rewrite of our exception handling code to
     use gas macros rather than multiple levels of nested CPP macros.

  And the usual small fixes, cleanups and improvements.

  Thanks to: Alastair D'Silva, Alexey Kardashevskiy, Andreas Schwab,
  Aneesh Kumar K.V, Anju T Sudhakar, Anton Blanchard, Arnd Bergmann,
  Athira Rajeev, Cédric Le Goater, Christian Lamparter, Christophe
  Leroy, Christophe Lombard, Christoph Hellwig, Daniel Axtens, Denis
  Efremov, Enrico Weigelt, Frederic Barrat, Gautham R. Shenoy, Geert
  Uytterhoeven, Geliang Tang, Gen Zhang, Greg Kroah-Hartman, Greg Kurz,
  Gustavo Romero, Krzysztof Kozlowski, Madhavan Srinivasan, Masahiro
  Yamada, Mathieu Malaterre, Michael Neuling, Nathan Lynch, Naveen N.
  Rao, Nicholas Piggin, Nishad Kamdar, Oliver O'Halloran, Qian Cai, Ravi
  Bangoria, Sachin Sant, Sam Bobroff, Satheesh Rajendran, Segher
  Boessenkool, Shaokun Zhang, Shawn Anastasio, Stewart Smith, Suraj
  Jitindar Singh, Thiago Jung Bauermann, YueHaibing"

* tag 'powerpc-5.3-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (163 commits)
  powerpc/powernv/idle: Fix restore of SPRN_LDBAR for POWER9 stop state.
  powerpc/eeh: Handle hugepages in ioremap space
  ocxl: Update for AFU descriptor template version 1.1
  powerpc/boot: pass CONFIG options in a simpler and more robust way
  powerpc/boot: add {get, put}_unaligned_be32 to xz_config.h
  powerpc/irq: Don't WARN continuously in arch_local_irq_restore()
  powerpc/module64: Use symbolic instructions names.
  powerpc/module32: Use symbolic instructions names.
  powerpc: Move PPC_HA() PPC_HI() and PPC_LO() to ppc-opcode.h
  powerpc/module64: Fix comment in R_PPC64_ENTRY handling
  powerpc/boot: Add lzo support for uImage
  powerpc/boot: Add lzma support for uImage
  powerpc/boot: don't force gzipped uImage
  powerpc/8xx: Add microcode patch to move SMC parameter RAM.
  powerpc/8xx: Use IO accessors in microcode programming.
  powerpc/8xx: replace #ifdefs by IS_ENABLED() in microcode.c
  powerpc/8xx: refactor programming of microcode CPM params.
  powerpc/8xx: refactor printing of microcode patch name.
  powerpc/8xx: Refactor microcode write
  powerpc/8xx: refactor writing of CPM microcode arrays
  ...
2019-07-13 16:08:36 -07:00

424 lines
10 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains common routines for dealing with free of page tables
* Along with common page table handling code
*
* Derived from arch/powerpc/mm/tlb_64.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
*
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
* Copyright (C) 1996 Paul Mackerras
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
*/
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/hugetlb.h>
static inline int is_exec_fault(void)
{
return current->thread.regs && TRAP(current->thread.regs) == 0x400;
}
/* We only try to do i/d cache coherency on stuff that looks like
* reasonably "normal" PTEs. We currently require a PTE to be present
* and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that
* on userspace PTEs
*/
static inline int pte_looks_normal(pte_t pte)
{
if (pte_present(pte) && !pte_special(pte)) {
if (pte_ci(pte))
return 0;
if (pte_user(pte))
return 1;
}
return 0;
}
static struct page *maybe_pte_to_page(pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
struct page *page;
if (unlikely(!pfn_valid(pfn)))
return NULL;
page = pfn_to_page(pfn);
if (PageReserved(page))
return NULL;
return page;
}
#ifdef CONFIG_PPC_BOOK3S
/* Server-style MMU handles coherency when hashing if HW exec permission
* is supposed per page (currently 64-bit only). If not, then, we always
* flush the cache for valid PTEs in set_pte. Embedded CPU without HW exec
* support falls into the same category.
*/
static pte_t set_pte_filter_hash(pte_t pte)
{
if (radix_enabled())
return pte;
pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
cpu_has_feature(CPU_FTR_NOEXECUTE))) {
struct page *pg = maybe_pte_to_page(pte);
if (!pg)
return pte;
if (!test_bit(PG_arch_1, &pg->flags)) {
flush_dcache_icache_page(pg);
set_bit(PG_arch_1, &pg->flags);
}
}
return pte;
}
#else /* CONFIG_PPC_BOOK3S */
static pte_t set_pte_filter_hash(pte_t pte) { return pte; }
#endif /* CONFIG_PPC_BOOK3S */
/* Embedded type MMU with HW exec support. This is a bit more complicated
* as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
* instead we "filter out" the exec permission for non clean pages.
*/
static pte_t set_pte_filter(pte_t pte)
{
struct page *pg;
if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
return set_pte_filter_hash(pte);
/* No exec permission in the first place, move on */
if (!pte_exec(pte) || !pte_looks_normal(pte))
return pte;
/* If you set _PAGE_EXEC on weird pages you're on your own */
pg = maybe_pte_to_page(pte);
if (unlikely(!pg))
return pte;
/* If the page clean, we move on */
if (test_bit(PG_arch_1, &pg->flags))
return pte;
/* If it's an exec fault, we flush the cache and make it clean */
if (is_exec_fault()) {
flush_dcache_icache_page(pg);
set_bit(PG_arch_1, &pg->flags);
return pte;
}
/* Else, we filter out _PAGE_EXEC */
return pte_exprotect(pte);
}
static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
int dirty)
{
struct page *pg;
if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
return pte;
/* So here, we only care about exec faults, as we use them
* to recover lost _PAGE_EXEC and perform I$/D$ coherency
* if necessary. Also if _PAGE_EXEC is already set, same deal,
* we just bail out
*/
if (dirty || pte_exec(pte) || !is_exec_fault())
return pte;
#ifdef CONFIG_DEBUG_VM
/* So this is an exec fault, _PAGE_EXEC is not set. If it was
* an error we would have bailed out earlier in do_page_fault()
* but let's make sure of it
*/
if (WARN_ON(!(vma->vm_flags & VM_EXEC)))
return pte;
#endif /* CONFIG_DEBUG_VM */
/* If you set _PAGE_EXEC on weird pages you're on your own */
pg = maybe_pte_to_page(pte);
if (unlikely(!pg))
goto bail;
/* If the page is already clean, we move on */
if (test_bit(PG_arch_1, &pg->flags))
goto bail;
/* Clean the page and set PG_arch_1 */
flush_dcache_icache_page(pg);
set_bit(PG_arch_1, &pg->flags);
bail:
return pte_mkexec(pte);
}
/*
* set_pte stores a linux PTE into the linux page table.
*/
void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
pte_t pte)
{
/*
* Make sure hardware valid bit is not set. We don't do
* tlb flush for this update.
*/
VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
/* Add the pte bit when trying to set a pte */
pte = pte_mkpte(pte);
/* Note: mm->context.id might not yet have been assigned as
* this context might not have been activated yet when this
* is called.
*/
pte = set_pte_filter(pte);
/* Perform the setting of the PTE */
__set_pte_at(mm, addr, ptep, pte, 0);
}
/*
* This is called when relaxing access to a PTE. It's also called in the page
* fault path when we don't hit any of the major fault cases, ie, a minor
* update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
* handled those two for us, we additionally deal with missing execute
* permission here on some processors
*/
int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep, pte_t entry, int dirty)
{
int changed;
entry = set_access_flags_filter(entry, vma, dirty);
changed = !pte_same(*(ptep), entry);
if (changed) {
assert_pte_locked(vma->vm_mm, address);
__ptep_set_access_flags(vma, ptep, entry,
address, mmu_virtual_psize);
}
return changed;
}
#ifdef CONFIG_HUGETLB_PAGE
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t pte, int dirty)
{
#ifdef HUGETLB_NEED_PRELOAD
/*
* The "return 1" forces a call of update_mmu_cache, which will write a
* TLB entry. Without this, platforms that don't do a write of the TLB
* entry in the TLB miss handler asm will fault ad infinitum.
*/
ptep_set_access_flags(vma, addr, ptep, pte, dirty);
return 1;
#else
int changed, psize;
pte = set_access_flags_filter(pte, vma, dirty);
changed = !pte_same(*(ptep), pte);
if (changed) {
#ifdef CONFIG_PPC_BOOK3S_64
struct hstate *h = hstate_vma(vma);
psize = hstate_get_psize(h);
#ifdef CONFIG_DEBUG_VM
assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
#endif
#else
/*
* Not used on non book3s64 platforms. But 8xx
* can possibly use tsize derived from hstate.
*/
psize = 0;
#endif
__ptep_set_access_flags(vma, ptep, pte, addr, psize);
}
return changed;
#endif
}
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_DEBUG_VM
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
if (mm == &init_mm)
return;
pgd = mm->pgd + pgd_index(addr);
BUG_ON(pgd_none(*pgd));
pud = pud_offset(pgd, addr);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, addr);
/*
* khugepaged to collapse normal pages to hugepage, first set
* pmd to none to force page fault/gup to take mmap_sem. After
* pmd is set to none, we do a pte_clear which does this assertion
* so if we find pmd none, return.
*/
if (pmd_none(*pmd))
return;
BUG_ON(!pmd_present(*pmd));
assert_spin_locked(pte_lockptr(mm, pmd));
}
#endif /* CONFIG_DEBUG_VM */
unsigned long vmalloc_to_phys(void *va)
{
unsigned long pfn = vmalloc_to_pfn(va);
BUG_ON(!pfn);
return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va);
}
EXPORT_SYMBOL_GPL(vmalloc_to_phys);
/*
* We have 4 cases for pgds and pmds:
* (1) invalid (all zeroes)
* (2) pointer to next table, as normal; bottom 6 bits == 0
* (3) leaf pte for huge page _PAGE_PTE set
* (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
*
* So long as we atomically load page table pointers we are safe against teardown,
* we can follow the address down to the the page and take a ref on it.
* This function need to be called with interrupts disabled. We use this variant
* when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED
*/
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
bool *is_thp, unsigned *hpage_shift)
{
pgd_t pgd, *pgdp;
pud_t pud, *pudp;
pmd_t pmd, *pmdp;
pte_t *ret_pte;
hugepd_t *hpdp = NULL;
unsigned pdshift = PGDIR_SHIFT;
if (hpage_shift)
*hpage_shift = 0;
if (is_thp)
*is_thp = false;
pgdp = pgdir + pgd_index(ea);
pgd = READ_ONCE(*pgdp);
/*
* Always operate on the local stack value. This make sure the
* value don't get updated by a parallel THP split/collapse,
* page fault or a page unmap. The return pte_t * is still not
* stable. So should be checked there for above conditions.
*/
if (pgd_none(pgd))
return NULL;
if (pgd_is_leaf(pgd)) {
ret_pte = (pte_t *)pgdp;
goto out;
}
if (is_hugepd(__hugepd(pgd_val(pgd)))) {
hpdp = (hugepd_t *)&pgd;
goto out_huge;
}
/*
* Even if we end up with an unmap, the pgtable will not
* be freed, because we do an rcu free and here we are
* irq disabled
*/
pdshift = PUD_SHIFT;
pudp = pud_offset(&pgd, ea);
pud = READ_ONCE(*pudp);
if (pud_none(pud))
return NULL;
if (pud_is_leaf(pud)) {
ret_pte = (pte_t *)pudp;
goto out;
}
if (is_hugepd(__hugepd(pud_val(pud)))) {
hpdp = (hugepd_t *)&pud;
goto out_huge;
}
pdshift = PMD_SHIFT;
pmdp = pmd_offset(&pud, ea);
pmd = READ_ONCE(*pmdp);
/*
* A hugepage collapse is captured by this condition, see
* pmdp_collapse_flush.
*/
if (pmd_none(pmd))
return NULL;
#ifdef CONFIG_PPC_BOOK3S_64
/*
* A hugepage split is captured by this condition, see
* pmdp_invalidate.
*
* Huge page modification can be caught here too.
*/
if (pmd_is_serializing(pmd))
return NULL;
#endif
if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
if (is_thp)
*is_thp = true;
ret_pte = (pte_t *)pmdp;
goto out;
}
if (pmd_is_leaf(pmd)) {
ret_pte = (pte_t *)pmdp;
goto out;
}
if (is_hugepd(__hugepd(pmd_val(pmd)))) {
hpdp = (hugepd_t *)&pmd;
goto out_huge;
}
return pte_offset_kernel(&pmd, ea);
out_huge:
if (!hpdp)
return NULL;
ret_pte = hugepte_offset(*hpdp, ea, pdshift);
pdshift = hugepd_shift(*hpdp);
out:
if (hpage_shift)
*hpage_shift = pdshift;
return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);