A set of x86 and membarrier fixes:

- Correct a few problems in the x86 and the generic membarrier
     implementation. Small corrections for assumptions about visibility
     which have turned out not to be true.
 
   - Make the PAT bits for memory encryption correct vs. 4K and 2M/1G page
     table entries as they are at a different location.
 
   - Fix a concurrency issue in the the local bandwidth readout of resource
     control leading to incorrect values
 
   - Fix the ordering of allocating a vector for an interrupt. The order
     missed to respect the provided cpumask when the first attempt of
     allocating node local in the mask fails. It then tries the node instead
     of trying the full provided mask first. This leads to erroneous error
     messages and breaking the (user) supplied affinity request. Reorder it.
 
   - Make the INT3 padding detection in optprobe work correctly.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAl/WOw0THHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYodUhEACW02Al+mlpZjOhzIGwNUALGX7YGEOi
 wJoiIDTV2vOtKhcJ/AOVn05CznoWXa6PN1+7pm0nR2M1XaiXZ3B6HMjy1+0rpSqK
 5dCCEqr+QowwMRsKd4hnuJowR/7Og66FiYIcknDJg/1egg+RXy7yKds577W6/KW0
 2VV/x35xDywERiQ28qIftQ4L7NREyiioomN/GFpeoQf8tQF06Rb4t12T5pdA6D8S
 /C1wj0IKVRMJo/7HbB/X6skxPOK0PWAEBpaT0+Q66VKSNnFVN5ap+rGTKdUoTmuM
 NdxHOukpHyFCQtbRPOjRUeeSZJLKSX5oXsBO1GUvyyxcT2XNlTOdNamYLWyO+RfV
 uP97qhzYDKL+cgDkwypJ0WOzOb9EIXOh4P9BTnJFBGhc4EQwen3cpb3CyWWftjnv
 /obXiRDnAOE6P6H2AGiwrK3Pny+SvgrFYKMe+iy+ntToz1yrDh7ZrZ0DQKVoFWEr
 n3qUnlPZmVvRzHIRVYoK69nS/UgmNN0LssavzRBzab3BcK93f23QkW86P42kNCLa
 9kL+ZgGwlpqUZZR/p3pHq9Mv2ZXGEdUYY99h2vy1fgMwOw/RQZnPDAj/131UXlsU
 4DL0mAlTK1/0/81H6/V1GDBkMkO+hN4x6Y3asi7bPHEKEzlLe+P1tUt3YELd3CEC
 dc8ebICG1InXsw==
 =t31y
 -----END PGP SIGNATURE-----

Merge tag 'x86-urgent-2020-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Thomas Gleixner:
 "A set of x86 and membarrier fixes:

   - Correct a few problems in the x86 and the generic membarrier
     implementation. Small corrections for assumptions about visibility
     which have turned out not to be true.

   - Make the PAT bits for memory encryption correct vs 4K and 2M/1G
     page table entries as they are at a different location.

   - Fix a concurrency issue in the the local bandwidth readout of
     resource control leading to incorrect values

   - Fix the ordering of allocating a vector for an interrupt. The order
     missed to respect the provided cpumask when the first attempt of
     allocating node local in the mask fails. It then tries the node
     instead of trying the full provided mask first. This leads to
     erroneous error messages and breaking the (user) supplied affinity
     request. Reorder it.

   - Make the INT3 padding detection in optprobe work correctly"

* tag 'x86-urgent-2020-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/kprobes: Fix optprobe to detect INT3 padding correctly
  x86/apic/vector: Fix ordering in vector assignment
  x86/resctrl: Fix incorrect local bandwidth when mba_sc is enabled
  x86/mm/mem_encrypt: Fix definition of PMD_FLAGS_DEC_WP
  membarrier: Execute SYNC_CORE on the calling thread
  membarrier: Explicitly sync remote cores when SYNC_CORE is requested
  membarrier: Add an actual barrier before rseq_preempt()
  x86/membarrier: Get rid of a dubious optimization
This commit is contained in:
Linus Torvalds 2020-12-13 11:31:19 -08:00
commit ec6f5e0e5c
8 changed files with 111 additions and 42 deletions

View File

@ -155,6 +155,7 @@ enum page_cache_mode {
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
#define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
#define _PAGE_LARGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)
#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP))

View File

@ -98,12 +98,13 @@ static inline void sync_core_before_usermode(void)
/* With PTI, we unconditionally serialize before running user code. */
if (static_cpu_has(X86_FEATURE_PTI))
return;
/*
* Return from interrupt and NMI is done through iret, which is core
* serializing.
* Even if we're in an interrupt, we might reschedule before returning,
* in which case we could switch to a different thread in the same mm
* and return using SYSRET or SYSEXIT. Instead of trying to keep
* track of our need to sync the core, just sync right away.
*/
if (in_irq() || in_nmi())
return;
sync_core();
}

View File

@ -273,20 +273,24 @@ static int assign_irq_vector_any_locked(struct irq_data *irqd)
const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
int node = irq_data_get_node(irqd);
if (node == NUMA_NO_NODE)
goto all;
/* Try the intersection of @affmsk and node mask */
cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
if (!assign_vector_locked(irqd, vector_searchmask))
return 0;
/* Try the node mask */
if (!assign_vector_locked(irqd, cpumask_of_node(node)))
return 0;
all:
if (node != NUMA_NO_NODE) {
/* Try the intersection of @affmsk and node mask */
cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
if (!assign_vector_locked(irqd, vector_searchmask))
return 0;
}
/* Try the full affinity mask */
cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
if (!assign_vector_locked(irqd, vector_searchmask))
return 0;
if (node != NUMA_NO_NODE) {
/* Try the node mask */
if (!assign_vector_locked(irqd, cpumask_of_node(node)))
return 0;
}
/* Try the full online mask */
return assign_vector_locked(irqd, cpu_online_mask);
}

View File

@ -279,7 +279,6 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
return;
chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width);
m->chunks += chunks;
cur_bw = (chunks * r->mon_scale) >> 20;
if (m->delta_comp)
@ -450,15 +449,14 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
}
if (is_mbm_local_enabled()) {
rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
__mon_event_count(rmid, &rr);
/*
* Call the MBA software controller only for the
* control groups and when user has enabled
* the software controller explicitly.
*/
if (!is_mba_sc(NULL))
__mon_event_count(rmid, &rr);
else
if (is_mba_sc(NULL))
mbm_bw_count(rmid, &rr);
}
}

View File

@ -272,6 +272,19 @@ static int insn_is_indirect_jump(struct insn *insn)
return ret;
}
static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
{
unsigned char ops;
for (; addr < eaddr; addr++) {
if (get_kernel_nofault(ops, (void *)addr) < 0 ||
ops != INT3_INSN_OPCODE)
return false;
}
return true;
}
/* Decode whole function to ensure any instructions don't jump into target */
static int can_optimize(unsigned long paddr)
{
@ -310,9 +323,14 @@ static int can_optimize(unsigned long paddr)
return 0;
kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
insn_get_length(&insn);
/* Another subsystem puts a breakpoint */
/*
* In the case of detecting unknown breakpoint, this could be
* a padding INT3 between functions. Let's check that all the
* rest of the bytes are also INT3.
*/
if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
return 0;
return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;
/* Recover address */
insn.kaddr = (void *)addr;
insn.next_byte = (void *)(addr + insn.length);

View File

@ -45,8 +45,8 @@
#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
(_PAGE_PAT | _PAGE_PWT))
#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \
(_PAGE_PAT_LARGE | _PAGE_PWT))
#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)

View File

@ -474,8 +474,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/*
* The membarrier system call requires a full memory barrier and
* core serialization before returning to user-space, after
* storing to rq->curr. Writing to CR3 provides that full
* memory barrier and core serializing instruction.
* storing to rq->curr, when changing mm. This is because
* membarrier() sends IPIs to all CPUs that are in the target mm
* to make them issue memory barriers. However, if another CPU
* switches to/from the target mm concurrently with
* membarrier(), it can cause that CPU not to receive an IPI
* when it really should issue a memory barrier. Writing to CR3
* provides that full memory barrier and core serializing
* instruction.
*/
if (real_prev == next) {
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=

View File

@ -38,8 +38,33 @@ static void ipi_mb(void *info)
smp_mb(); /* IPIs should be serializing but paranoid. */
}
static void ipi_sync_core(void *info)
{
/*
* The smp_mb() in membarrier after all the IPIs is supposed to
* ensure that memory on remote CPUs that occur before the IPI
* become visible to membarrier()'s caller -- see scenario B in
* the big comment at the top of this file.
*
* A sync_core() would provide this guarantee, but
* sync_core_before_usermode() might end up being deferred until
* after membarrier()'s smp_mb().
*/
smp_mb(); /* IPIs should be serializing but paranoid. */
sync_core_before_usermode();
}
static void ipi_rseq(void *info)
{
/*
* Ensure that all stores done by the calling thread are visible
* to the current task before the current task resumes. We could
* probably optimize this away on most architectures, but by the
* time we've already sent an IPI, the cost of the extra smp_mb()
* is negligible.
*/
smp_mb();
rseq_preempt(current);
}
@ -154,6 +179,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
ipi_func = ipi_sync_core;
} else if (flags == MEMBARRIER_FLAG_RSEQ) {
if (!IS_ENABLED(CONFIG_RSEQ))
return -EINVAL;
@ -168,7 +194,8 @@ static int membarrier_private_expedited(int flags, int cpu_id)
return -EPERM;
}
if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
(atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
return 0;
/*
@ -187,8 +214,6 @@ static int membarrier_private_expedited(int flags, int cpu_id)
if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
goto out;
if (cpu_id == raw_smp_processor_id())
goto out;
rcu_read_lock();
p = rcu_dereference(cpu_rq(cpu_id)->curr);
if (!p || p->mm != mm) {
@ -203,16 +228,6 @@ static int membarrier_private_expedited(int flags, int cpu_id)
for_each_online_cpu(cpu) {
struct task_struct *p;
/*
* Skipping the current CPU is OK even through we can be
* migrated at any point. The current CPU, at the point
* where we read raw_smp_processor_id(), is ensured to
* be in program order with respect to the caller
* thread. Therefore, we can skip this CPU from the
* iteration.
*/
if (cpu == raw_smp_processor_id())
continue;
p = rcu_dereference(cpu_rq(cpu)->curr);
if (p && p->mm == mm)
__cpumask_set_cpu(cpu, tmpmask);
@ -220,12 +235,38 @@ static int membarrier_private_expedited(int flags, int cpu_id)
rcu_read_unlock();
}
preempt_disable();
if (cpu_id >= 0)
if (cpu_id >= 0) {
/*
* smp_call_function_single() will call ipi_func() if cpu_id
* is the calling CPU.
*/
smp_call_function_single(cpu_id, ipi_func, NULL, 1);
else
smp_call_function_many(tmpmask, ipi_func, NULL, 1);
preempt_enable();
} else {
/*
* For regular membarrier, we can save a few cycles by
* skipping the current cpu -- we're about to do smp_mb()
* below, and if we migrate to a different cpu, this cpu
* and the new cpu will execute a full barrier in the
* scheduler.
*
* For SYNC_CORE, we do need a barrier on the current cpu --
* otherwise, if we are migrated and replaced by a different
* task in the same mm just before, during, or after
* membarrier, we will end up with some thread in the mm
* running without a core sync.
*
* For RSEQ, don't rseq_preempt() the caller. User code
* is not supposed to issue syscalls at all from inside an
* rseq critical section.
*/
if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
preempt_disable();
smp_call_function_many(tmpmask, ipi_func, NULL, true);
preempt_enable();
} else {
on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
}
}
out:
if (cpu_id < 0)