2008-10-23 12:26:29 +07:00
|
|
|
#ifndef _ASM_X86_PGTABLE_3LEVEL_H
|
|
|
|
#define _ASM_X86_PGTABLE_3LEVEL_H
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Intel Physical Address Extension (PAE) Mode - three-level page
|
|
|
|
* tables on PPro+ CPUs.
|
|
|
|
*
|
|
|
|
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
|
|
|
|
*/
|
|
|
|
|
2008-03-23 15:03:10 +07:00
|
|
|
/*
 * Diagnostics for corrupt page-table entries.
 *
 * Each macro logs the source location, the address of the offending
 * entry (so @e must be an lvalue) and the entry's raw contents.  The
 * messages carry an explicit KERN_ERR level instead of falling back to
 * the default message level of a bare printk().
 */

/* pte: printed as the high word followed by the low word, 8 hex digits each. */
#define pte_ERROR(e)							\
	printk(KERN_ERR "%s:%d: bad pte %p(%08lx%08lx).\n",		\
	       __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)

/* pmd: printed as one 16-digit hex value taken from pmd_val(). */
#define pmd_ERROR(e)							\
	printk(KERN_ERR "%s:%d: bad pmd %p(%016Lx).\n",			\
	       __FILE__, __LINE__, &(e), pmd_val(e))

/* pgd: printed as one 16-digit hex value taken from pgd_val(). */
#define pgd_ERROR(e)							\
	printk(KERN_ERR "%s:%d: bad pgd %p(%016Lx).\n",			\
	       __FILE__, __LINE__, &(e), pgd_val(e))
|
2008-01-30 19:34:11 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/* Rules for using set_pte: the pte being assigned *must* be
|
|
|
|
* either not present or in a state where the hardware will
|
|
|
|
* not attempt to update the pte. In places where this is
|
|
|
|
* not possible, use pte_get_and_clear to obtain the old pte
|
|
|
|
* value and then use set_pte to update it. -ben
|
|
|
|
*/
|
2007-05-03 00:27:13 +07:00
|
|
|
/*
 * Install @pte at @ptep as two separate word stores (not atomic as a
 * whole).  The high word goes in first and the low word last, with a
 * write barrier between them; elsewhere in this file the low word is
 * the half that carries the P-bit.
 */
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
	/* Upper half first ... */
	ptep->pte_high = pte.pte_high;

	/* ... ordered before the lower half becomes visible. */
	smp_wmb();

	ptep->pte_low = pte.pte_low;
}
|
|
|
|
|
mm: pmd_read_atomic: fix 32bit PAE pmd walk vs pmd_populate SMP race condition
When holding the mmap_sem for reading, pmd_offset_map_lock should only
run on a pmd_t that has been read atomically from the pmdp pointer,
otherwise we may read only half of it leading to this crash.
PID: 11679 TASK: f06e8000 CPU: 3 COMMAND: "do_race_2_panic"
#0 [f06a9dd8] crash_kexec at c049b5ec
#1 [f06a9e2c] oops_end at c083d1c2
#2 [f06a9e40] no_context at c0433ded
#3 [f06a9e64] bad_area_nosemaphore at c043401a
#4 [f06a9e6c] __do_page_fault at c0434493
#5 [f06a9eec] do_page_fault at c083eb45
#6 [f06a9f04] error_code (via page_fault) at c083c5d5
EAX: 01fb470c EBX: fff35000 ECX: 00000003 EDX: 00000100 EBP:
00000000
DS: 007b ESI: 9e201000 ES: 007b EDI: 01fb4700 GS: 00e0
CS: 0060 EIP: c083bc14 ERR: ffffffff EFLAGS: 00010246
#7 [f06a9f38] _spin_lock at c083bc14
#8 [f06a9f44] sys_mincore at c0507b7d
#9 [f06a9fb0] system_call at c083becd
start len
EAX: ffffffda EBX: 9e200000 ECX: 00001000 EDX: 6228537f
DS: 007b ESI: 00000000 ES: 007b EDI: 003d0f00
SS: 007b ESP: 62285354 EBP: 62285388 GS: 0033
CS: 0073 EIP: 00291416 ERR: 000000da EFLAGS: 00000286
This should be a longstanding bug affecting x86 32bit PAE without THP.
Only archs with 64bit large pmd_t and 32bit unsigned long should be
affected.
With THP enabled the barrier() in pmd_none_or_trans_huge_or_clear_bad()
would partly hide the bug when the pmd transition from none to stable,
by forcing a re-read of the *pmd in pmd_offset_map_lock, but when THP is
enabled a new set of problems arises from the fact that the pmd could then transition
freely in any of the none, pmd_trans_huge or pmd_trans_stable states.
So making the barrier in pmd_none_or_trans_huge_or_clear_bad()
unconditional isn't a good idea and it would be a flaky solution.
This should be fully fixed by introducing a pmd_read_atomic that reads
the pmd in order with THP disabled, or by reading the pmd atomically
with cmpxchg8b with THP enabled.
Luckily this new race condition only triggers in the places that must
already be covered by pmd_none_or_trans_huge_or_clear_bad() so the fix
is localized there but this bug is not related to THP.
NOTE: this can trigger on x86 32bit systems with PAE enabled with more
than 4G of ram, otherwise the high part of the pmd will never risk to be
truncated because it would be zero at all times, in turn so hiding the
SMP race.
This bug was discovered and fully debugged by Ulrich, quote:
----
[..]
pmd_none_or_trans_huge_or_clear_bad() loads the content of edx and
eax.
496 static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t
*pmd)
497 {
498 /* depend on compiler for an atomic pmd read */
499 pmd_t pmdval = *pmd;
// edi = pmd pointer
0xc0507a74 <sys_mincore+548>: mov 0x8(%esp),%edi
...
// edx = PTE page table high address
0xc0507a84 <sys_mincore+564>: mov 0x4(%edi),%edx
...
// eax = PTE page table low address
0xc0507a8e <sys_mincore+574>: mov (%edi),%eax
[..]
Please note that the PMD is not read atomically. These are two "mov"
instructions where the high order bits of the PMD entry are fetched
first. Hence, the above machine code is prone to the following race.
- The PMD entry {high|low} is 0x0000000000000000.
The "mov" at 0xc0507a84 loads 0x00000000 into edx.
- A page fault (on another CPU) sneaks in between the two "mov"
instructions and instantiates the PMD.
- The PMD entry {high|low} is now 0x00000003fda38067.
The "mov" at 0xc0507a8e loads 0xfda38067 into eax.
----
Reported-by: Ulrich Obergfell <uobergfe@redhat.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Petr Matousek <pmatouse@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-30 05:06:49 +07:00
|
|
|
/*
 * Self-referential define: makes pmd_read_atomic visible to #ifdef /
 * #ifndef tests in other headers (presumably so a generic fallback is
 * skipped when this header is in use -- TODO confirm against the
 * generic pgtable header).
 */
#define pmd_read_atomic pmd_read_atomic
|
|
|
|
/*
|
|
|
|
* pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
|
|
|
|
* a "*pmdp" dereference done by gcc. Problem is, in certain places
|
|
|
|
* where pte_offset_map_lock is called, concurrent page faults are
|
|
|
|
* allowed, if the mmap_sem is held for reading. An example is mincore
|
|
|
|
* vs page faults vs MADV_DONTNEED. On the page fault side
|
|
|
|
* pmd_populate rightfully does a set_64bit, but if we're reading the
|
|
|
|
* pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
|
|
|
|
* because gcc will not read the 64bit of the pmd atomically. To fix
|
|
|
|
* this all places running pmd_offset_map_lock() while holding the
|
|
|
|
* mmap_sem in read mode, shall read the pmdp pointer using this
|
|
|
|
* function to know if the pmd is null or not, and in turn to know if
|
|
|
|
* they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
|
|
|
|
* operations.
|
|
|
|
*
|
|
|
|
* Without THP if the mmap_sem is held for reading, the
|
|
|
|
* pmd can only transition from null to not null while pmd_read_atomic runs.
|
|
|
|
* So there's no need of literally reading it atomically.
|
|
|
|
*
|
|
|
|
* With THP if the mmap_sem is held for reading, the pmd can become
|
|
|
|
* THP or null or point to a pte (and in turn become "stable") at any
|
|
|
|
* time under pmd_read_atomic, so it's mandatory to read it atomically
|
|
|
|
* with cmpxchg8b.
|
|
|
|
*/
|
|
|
|
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
|
|
|
|
{
|
|
|
|
pmdval_t ret;
|
|
|
|
u32 *tmp = (u32 *)pmdp;
|
|
|
|
|
|
|
|
ret = (pmdval_t) (*tmp);
|
|
|
|
if (ret) {
|
|
|
|
/*
|
|
|
|
* If the low part is null, we must not read the high part
|
|
|
|
* or we can end up with a partial pmd.
|
|
|
|
*/
|
|
|
|
smp_rmb();
|
|
|
|
ret |= ((pmdval_t)*(tmp + 1)) << 32;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (pmd_t) { ret };
|
|
|
|
}
|
|
|
|
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
|
|
|
|
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
|
|
|
|
{
|
|
|
|
return (pmd_t) { atomic64_read((atomic64_t *)pmdp) };
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
|
|
|
|
2007-05-03 00:27:13 +07:00
|
|
|
/*
 * Store the full value of @pte into *@ptep through set_64bit(), i.e. as
 * one 64-bit write instead of two separate word stores.
 */
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	unsigned long long *slot = (unsigned long long *)ptep;

	set_64bit(slot, native_pte_val(pte));
}
|
2008-03-23 15:03:10 +07:00
|
|
|
|
2007-05-03 00:27:13 +07:00
|
|
|
/* Store the full value of @pmd into *@pmdp with a single set_64bit(). */
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
	unsigned long long *slot = (unsigned long long *)pmdp;

	set_64bit(slot, native_pmd_val(pmd));
}
|
2008-03-23 15:03:10 +07:00
|
|
|
|
2007-05-03 00:27:13 +07:00
|
|
|
/* Store the full value of @pud into *@pudp with a single set_64bit(). */
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
	unsigned long long *slot = (unsigned long long *)pudp;

	set_64bit(slot, native_pud_val(pud));
}
|
2005-04-17 05:20:36 +07:00
|
|
|
|
[PATCH] x86/PAE: Fix pte_clear for the >4GB RAM case
Proposed fix for ptep_get_and_clear_full PAE bug. Pte_clear had the same bug,
so use the same fix for both. Turns out pmd_clear had it as well, but pgds
are not affected.
The problem is rather intricate. Page table entries in PAE mode are 64-bits
wide, but the only atomic 8-byte write operation available in 32-bit mode is
cmpxchg8b, which is expensive (at least on P4), and thus avoided. But it can
happen that the processor may prefetch entries into the TLB in the middle of an
operation which clears a page table entry. So one must always clear the P-bit
in the low word of the page table entry first when clearing it.
Since the sequence *ptep = __pte(0) leaves the order of the write dependent on
the compiler, it must be coded explicitly as a clear of the low word followed
by a clear of the high word. Further, there must be a write memory barrier
here to enforce proper ordering by the compiler (and, in the future, by the
processor as well).
On > 4GB memory machines, the implementation of pte_clear for PAE was clearly
deficient, as it could leave virtual mappings of physical memory above 4GB
aliased to memory below 4GB in the TLB. The implementation of
ptep_get_and_clear_full has a similar bug, although not nearly as likely to
occur, since the mappings being cleared are in the process of being destroyed,
and should never be dereferenced again.
But, as luck would have it, it is possible to trigger bugs even without ever
dereferencing these bogus TLB mappings, even if the clear is followed fairly
soon after with a TLB flush or invalidation. The problem is that memory above
4GB may now be aliased into the first 4GB of memory, and in fact, may hit a
region of memory with non-memory semantics. These regions include AGP and PCI
space. As such, these memory regions are not cached by the processor. This
introduces the bug.
The processor can speculate memory operations, including memory writes, as long
as they are committed with the proper ordering. Speculating a memory write to
a linear address that has a bogus TLB mapping is possible. Normally, the
speculation is harmless. But for cached memory, it does leave the falsely
speculated cacheline unmodified, but in a dirty state. This cache line will be
eventually written back. If this cacheline happens to intersect a region of
memory that is not protected by the cache coherency protocol, it can corrupt
data in I/O memory, which is generally a very bad thing to do, and can cause
total system failure or just plain undefined behavior.
These bugs are extremely unlikely, but the severity is of such magnitude, and
the fix so simple that I think fixing them immediately is justified. Also,
they are nearly impossible to debug.
Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-04-28 01:32:29 +07:00
|
|
|
/*
|
|
|
|
* For PTEs and PDEs, we must clear the P-bit first when clearing a page table
|
|
|
|
* entry, so clear the bottom half first and enforce ordering with a compiler
|
|
|
|
* barrier.
|
|
|
|
*/
|
2008-03-23 15:03:10 +07:00
|
|
|
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
|
|
|
|
pte_t *ptep)
|
[PATCH] x86/PAE: Fix pte_clear for the >4GB RAM case
Proposed fix for ptep_get_and_clear_full PAE bug. Pte_clear had the same bug,
so use the same fix for both. Turns out pmd_clear had it as well, but pgds
are not affected.
The problem is rather intricate. Page table entries in PAE mode are 64-bits
wide, but the only atomic 8-byte write operation available in 32-bit mode is
cmpxchg8b, which is expensive (at least on P4), and thus avoided. But it can
happen that the processor may prefetch entries into the TLB in the middle of an
operation which clears a page table entry. So one must always clear the P-bit
in the low word of the page table entry first when clearing it.
Since the sequence *ptep = __pte(0) leaves the order of the write dependent on
the compiler, it must be coded explicitly as a clear of the low word followed
by a clear of the high word. Further, there must be a write memory barrier
here to enforce proper ordering by the compiler (and, in the future, by the
processor as well).
On > 4GB memory machines, the implementation of pte_clear for PAE was clearly
deficient, as it could leave virtual mappings of physical memory above 4GB
aliased to memory below 4GB in the TLB. The implementation of
ptep_get_and_clear_full has a similar bug, although not nearly as likely to
occur, since the mappings being cleared are in the process of being destroyed,
and should never be dereferenced again.
But, as luck would have it, it is possible to trigger bugs even without ever
dereferencing these bogus TLB mappings, even if the clear is followed fairly
soon after with a TLB flush or invalidation. The problem is that memory above
4GB may now be aliased into the first 4GB of memory, and in fact, may hit a
region of memory with non-memory semantics. These regions include AGP and PCI
space. As such, these memory regions are not cached by the processor. This
introduces the bug.
The processor can speculate memory operations, including memory writes, as long
as they are committed with the proper ordering. Speculating a memory write to
a linear address that has a bogus TLB mapping is possible. Normally, the
speculation is harmless. But for cached memory, it does leave the falsely
speculated cacheline unmodified, but in a dirty state. This cache line will be
eventually written back. If this cacheline happens to intersect a region of
memory that is not protected by the cache coherency protocol, it can corrupt
data in I/O memory, which is generally a very bad thing to do, and can cause
total system failure or just plain undefined behavior.
These bugs are extremely unlikely, but the severity is of such magnitude, and
the fix so simple that I think fixing them immediately is justified. Also,
they are nearly impossible to debug.
Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-04-28 01:32:29 +07:00
|
|
|
{
|
|
|
|
ptep->pte_low = 0;
|
|
|
|
smp_wmb();
|
|
|
|
ptep->pte_high = 0;
|
|
|
|
}
|
|
|
|
|
2007-05-03 00:27:13 +07:00
|
|
|
/*
 * Zero the pmd entry at @pmd as two 32-bit stores: low word first, a
 * write barrier, then the high word.
 */
static inline void native_pmd_clear(pmd_t *pmd)
{
	u32 *words = (u32 *)pmd;

	words[0] = 0;
	smp_wmb();
	words[1] = 0;
}
|
2007-05-03 00:27:13 +07:00
|
|
|
|
2008-01-30 19:34:11 +07:00
|
|
|
/* Reset the pud entry at @pudp to an all-zero value via set_pud(). */
static inline void pud_clear(pud_t *pudp)
{
	pud_t none = __pud(0);

	set_pud(pudp, none);

	/*
	 * According to Intel App note "TLBs, Paging-Structure Caches,
	 * and Their Invalidation", April 2007, document 317080-001,
	 * section 8.1: in PAE mode we explicitly have to flush the
	 * TLB via cr3 if the top-level pgd is changed...
	 *
	 * No flush is issued here: currently every caller of
	 * pud_clear() either follows it with flush_tlb_mm() or does
	 * not need a TLB flush at all (x86_64 code or pud_clear_bad()).
	 */
}
|
2006-12-07 08:14:08 +07:00
|
|
|
|
2007-05-03 00:27:19 +07:00
|
|
|
#ifdef CONFIG_SMP
/*
 * Fetch the PTE at @ptep and leave the slot zeroed.  The low word is
 * swapped out with xchg() first; the high word is then read and cleared
 * with plain accesses.  Returns the value that was stored before.
 */
static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
{
	pte_t old;

	/* xchg acts as a barrier before the setting of the high bits */
	old.pte_low = xchg(&ptep->pte_low, 0);

	old.pte_high = ptep->pte_high;
	ptep->pte_high = 0;

	return old;
}
#else
/* Without CONFIG_SMP this forwards to native_local_ptep_get_and_clear(). */
#define native_ptep_get_and_clear(ptep) native_local_ptep_get_and_clear(ptep)
#endif
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2011-01-14 06:47:01 +07:00
|
|
|
#ifdef CONFIG_SMP
/* View of a pmd_t as two 32-bit words, so each half can be accessed alone. */
union split_pmd {
	struct {
		u32 pmd_low;
		u32 pmd_high;
	};
	pmd_t pmd;
};

/*
 * Fetch the pmd at @pmdp and leave the slot zeroed.  The low word is
 * swapped out with xchg() first; the high word is then read and cleared
 * with plain accesses.  Returns the value that was stored before.
 */
static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
{
	union split_pmd *slot = (union split_pmd *)pmdp;
	union split_pmd old;

	/* xchg acts as a barrier before setting of the high bits */
	old.pmd_low = xchg(&slot->pmd_low, 0);

	old.pmd_high = slot->pmd_high;
	slot->pmd_high = 0;

	return old.pmd;
}
#else
/* Without CONFIG_SMP this forwards to native_local_pmdp_get_and_clear(). */
#define native_pmdp_get_and_clear(pmdp) native_local_pmdp_get_and_clear(pmdp)
#endif
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
 * File-offset encoding for non-linear mappings.
 *
 * Bits 0, 6 and 7 are taken in the low part of the pte, so the whole
 * 32-bit offset lives in the high part: the low word carries only
 * _PAGE_FILE.
 */
#define pte_to_pgoff(entry)	((entry).pte_high)
#define pgoff_to_pte(pgoff)						\
	((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (pgoff) } })

/* Number of offset bits pgoff_to_pte() can store (all of pte_high). */
#define PTE_FILE_MAX_BITS	32
|
|
|
|
|
|
|
|
/*
 * Encode and de-code a swap entry.
 *
 * A swp_entry_t packs the swap type into its low 5 bits (mask 0x1f) and
 * the offset into the bits above.  In a pte the packed value is kept in
 * pte_high; the low word is left zero.
 */

/* Build-time guard: the swap type must fit in the 5 bits reserved for it. */
#define MAX_SWAPFILES_CHECK()	BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)

#define __swp_type(entry)	(((entry).val) & 0x1f)
#define __swp_offset(entry)	((entry).val >> 5)
#define __swp_entry(swp_type, swp_off) \
	((swp_entry_t){ (swp_type) | (swp_off) << 5 })

/* Conversions between the packed swap entry and its pte representation. */
#define __pte_to_swp_entry(entry)	((swp_entry_t){ (entry).pte_high })
#define __swp_entry_to_pte(entry)	((pte_t){ { .pte_high = (entry).val } })
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2008-10-23 12:26:29 +07:00
|
|
|
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
|