Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net

Lots of overlapping changes.  Also on the net-next side
the XDP state management is handled more in the generic
layers so undo the 'net' nfp fix which isn't applicable
in net-next.

Include a necessary change by Jakub Kicinski, with log message:

====================
cls_bpf no longer takes care of offload tracking.  Make sure
netdevsim performs necessary checks.  This fixes a warning
caused by TC trying to remove a filter it has not added.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2017-12-22 11:16:31 -05:00
commit fba961ab29
202 changed files with 2851 additions and 1137 deletions

View File

@ -13,7 +13,6 @@ Required properties:
at25df321a
at25df641
at26df081a
en25s64
mr25h128
mr25h256
mr25h10
@ -33,7 +32,6 @@ Required properties:
s25fl008k
s25fl064k
sst25vf040b
sst25wf040b
m25p40
m25p80
m25p16

View File

@ -12,24 +12,30 @@ Required properties:
- "fsl,imx53-ecspi" for SPI compatible with the one integrated on i.MX53 and later Soc
- reg : Offset and length of the register set for the device
- interrupts : Should contain CSPI/eCSPI interrupt
- cs-gpios : Specifies the gpio pins to be used for chipselects.
- clocks : Clock specifiers for both ipg and per clocks.
- clock-names : Clock names should include both "ipg" and "per"
See the clock consumer binding,
Documentation/devicetree/bindings/clock/clock-bindings.txt
- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
Documentation/devicetree/bindings/dma/dma.txt
- dma-names: DMA request names should include "tx" and "rx" if present.
Obsolete properties:
- fsl,spi-num-chipselects : Contains the number of the chipselect
Recommended properties:
- cs-gpios : GPIOs to use as chip selects, see spi-bus.txt. While the native chip
select lines can be used, they appear to always generate a pulse between each
word of a transfer. Most use cases will require GPIO based chip selects to
generate a valid transaction.
Optional properties:
- num-cs : Number of total chip selects, see spi-bus.txt.
- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
Documentation/devicetree/bindings/dma/dma.txt.
- dma-names: DMA request names, if present, should include "tx" and "rx".
- fsl,spi-rdy-drctl: Integer, representing the value of DRCTL, the register
controlling the SPI_READY handling. Note that to enable the DRCTL consideration,
the SPI_READY mode-flag needs to be set too.
Valid values are: 0 (disabled), 1 (edge-triggered burst) and 2 (level-triggered burst).
Obsolete properties:
- fsl,spi-num-chipselects : Contains the number of the chipselect
Example:
ecspi@70010000 {

View File

@ -2,7 +2,7 @@
VERSION = 4
PATCHLEVEL = 15
SUBLEVEL = 0
EXTRAVERSION = -rc3
EXTRAVERSION = -rc4
NAME = Fearless Coyote
# *DOCUMENTATION*

View File

@ -85,7 +85,11 @@
.pushsection .text.fixup,"ax"
.align 4
9001: mov r4, #-EFAULT
#ifdef CONFIG_CPU_SW_DOMAIN_PAN
ldr r5, [sp, #9*4] @ *err_ptr
#else
ldr r5, [sp, #8*4] @ *err_ptr
#endif
str r4, [r5]
ldmia sp, {r1, r2} @ retrieve dst, len
add r2, r2, r1

View File

@ -74,6 +74,9 @@ static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1)
{
u64 reg;
/* Clear pmscr in case of early return */
*pmscr_el1 = 0;
/* SPE present on this CPU? */
if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
ID_AA64DFR0_PMSVER_SHIFT))

View File

@ -35,7 +35,12 @@ struct thread_info {
/* thread information allocation */
#ifdef CONFIG_IRQSTACKS
#define THREAD_SIZE_ORDER 2 /* PA-RISC requires at least 16k stack */
#else
#define THREAD_SIZE_ORDER 3 /* PA-RISC requires at least 32k stack */
#endif
/* Be sure to hunt all references to this down when you change the size of
* the kernel stack */
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)

View File

@ -878,9 +878,6 @@ ENTRY_CFI(syscall_exit_rfi)
STREG %r19,PT_SR7(%r16)
intr_return:
/* NOTE: Need to enable interrupts incase we schedule. */
ssm PSW_SM_I, %r0
/* check for reschedule */
mfctl %cr30,%r1
LDREG TI_FLAGS(%r1),%r19 /* sched.h: TIF_NEED_RESCHED */
@ -907,6 +904,11 @@ intr_check_sig:
LDREG PT_IASQ1(%r16), %r20
cmpib,COND(=),n 0,%r20,intr_restore /* backward */
/* NOTE: We need to enable interrupts if we have to deliver
* signals. We used to do this earlier but it caused kernel
* stack overflows. */
ssm PSW_SM_I, %r0
copy %r0, %r25 /* long in_syscall = 0 */
#ifdef CONFIG_64BIT
ldo -16(%r30),%r29 /* Reference param save area */
@ -958,6 +960,10 @@ intr_do_resched:
cmpib,COND(=) 0, %r20, intr_do_preempt
nop
/* NOTE: We need to enable interrupts if we schedule. We used
* to do this earlier but it caused kernel stack overflows. */
ssm PSW_SM_I, %r0
#ifdef CONFIG_64BIT
ldo -16(%r30),%r29 /* Reference param save area */
#endif

View File

@ -305,6 +305,7 @@ ENDPROC_CFI(os_hpmc)
__INITRODATA
.align 4
.export os_hpmc_size
os_hpmc_size:
.word .os_hpmc_end-.os_hpmc

View File

@ -15,7 +15,6 @@
#include <linux/slab.h>
#include <linux/kallsyms.h>
#include <linux/sort.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <asm/assembly.h>

View File

@ -16,9 +16,7 @@
#include <linux/preempt.h>
#include <linux/init.h>
#include <asm/processor.h>
#include <asm/delay.h>
#include <asm/special_insns.h> /* for mfctl() */
#include <asm/processor.h> /* for boot_cpu_data */

View File

@ -763,7 +763,8 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
func = (u8 *) __bpf_call_base + imm;
/* Save skb pointer if we need to re-cache skb data */
if (bpf_helper_changes_pkt_data(func))
if ((ctx->seen & SEEN_SKB) &&
bpf_helper_changes_pkt_data(func))
PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
bpf_jit_emit_func_call(image, ctx, (u64)func);
@ -772,7 +773,8 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
PPC_MR(b2p[BPF_REG_0], 3);
/* refresh skb cache */
if (bpf_helper_changes_pkt_data(func)) {
if ((ctx->seen & SEEN_SKB) &&
bpf_helper_changes_pkt_data(func)) {
/* reload skb pointer to r3 */
PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
bpf_jit_emit_skb_loads(image, ctx);

View File

@ -55,8 +55,7 @@ struct bpf_jit {
#define SEEN_LITERAL 8 /* code uses literals */
#define SEEN_FUNC 16 /* calls C functions */
#define SEEN_TAIL_CALL 32 /* code uses tail calls */
#define SEEN_SKB_CHANGE 64 /* code changes skb data */
#define SEEN_REG_AX 128 /* code uses constant blinding */
#define SEEN_REG_AX 64 /* code uses constant blinding */
#define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB)
/*
@ -448,12 +447,12 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
REG_15, 152);
}
if (jit->seen & SEEN_SKB)
if (jit->seen & SEEN_SKB) {
emit_load_skb_data_hlen(jit);
if (jit->seen & SEEN_SKB_CHANGE)
/* stg %b1,ST_OFF_SKBP(%r0,%r15) */
EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15,
STK_OFF_SKBP);
}
}
/*
@ -983,8 +982,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
EMIT2(0x0d00, REG_14, REG_W1);
/* lgr %b0,%r2: load return value into %b0 */
EMIT4(0xb9040000, BPF_REG_0, REG_2);
if (bpf_helper_changes_pkt_data((void *)func)) {
jit->seen |= SEEN_SKB_CHANGE;
if ((jit->seen & SEEN_SKB) &&
bpf_helper_changes_pkt_data((void *)func)) {
/* lg %b1,ST_OFF_SKBP(%r15) */
EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
REG_15, STK_OFF_SKBP);

View File

@ -113,7 +113,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
if (!printk_ratelimit())
return;
printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x",
printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), address,
(void *)regs->pc, (void *)regs->u_regs[UREG_I7],

View File

@ -154,7 +154,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
if (!printk_ratelimit())
return;
printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x",
printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), address,
(void *)regs->tpc, (void *)regs->u_regs[UREG_I7],

View File

@ -1245,14 +1245,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
u8 *func = ((u8 *)__bpf_call_base) + imm;
ctx->saw_call = true;
if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx);
emit_call((u32 *)func, ctx);
emit_nop(ctx);
emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);
if (bpf_helper_changes_pkt_data(func) && ctx->saw_ld_abs_ind)
load_skb_regs(ctx, bpf2sparc[BPF_REG_6]);
if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
load_skb_regs(ctx, L7);
break;
}

View File

@ -150,7 +150,7 @@ static void show_segv_info(struct uml_pt_regs *regs)
if (!printk_ratelimit())
return;
printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x",
printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
(void *)UPT_IP(regs), (void *)UPT_SP(regs),

View File

@ -941,7 +941,8 @@ ENTRY(debug)
movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */
PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx
jb .Ldebug_from_sysenter_stack
@ -984,7 +985,8 @@ ENTRY(nmi)
movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */
PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx
jb .Lnmi_from_sysenter_stack

View File

@ -140,6 +140,64 @@ END(native_usergs_sysret64)
* with them due to bugs in both AMD and Intel CPUs.
*/
.pushsection .entry_trampoline, "ax"
/*
* The code in here gets remapped into cpu_entry_area's trampoline. This means
* that the assembler and linker have the wrong idea as to where this code
* lives (and, in fact, it's mapped more than once, so it's not even at a
* fixed address). So we can't reference any symbols outside the entry
* trampoline and expect it to work.
*
* Instead, we carefully abuse %rip-relative addressing.
* _entry_trampoline(%rip) refers to the start of the remapped) entry
* trampoline. We can thus find cpu_entry_area with this macro:
*/
#define CPU_ENTRY_AREA \
_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \
SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
ENTRY(entry_SYSCALL_64_trampoline)
UNWIND_HINT_EMPTY
swapgs
/* Stash the user RSP. */
movq %rsp, RSP_SCRATCH
/* Load the top of the task stack into RSP */
movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
/* Start building the simulated IRET frame. */
pushq $__USER_DS /* pt_regs->ss */
pushq RSP_SCRATCH /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
/*
* x86 lacks a near absolute jump, and we can't jump to the real
* entry text with a relative jump. We could push the target
* address and then use retq, but this destroys the pipeline on
* many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
* spill RDI and restore it in a second-stage trampoline.
*/
pushq %rdi
movq $entry_SYSCALL_64_stage2, %rdi
jmp *%rdi
END(entry_SYSCALL_64_trampoline)
.popsection
ENTRY(entry_SYSCALL_64_stage2)
UNWIND_HINT_EMPTY
popq %rdi
jmp entry_SYSCALL_64_after_hwframe
END(entry_SYSCALL_64_stage2)
ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
/*
@ -330,8 +388,24 @@ syscall_return_via_sysret:
popq %rsi /* skip rcx */
popq %rdx
popq %rsi
/*
* Now all regs are restored except RSP and RDI.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
popq %rdi
movq RSP-ORIG_RAX(%rsp), %rsp
popq %rsp
USERGS_SYSRET64
END(entry_SYSCALL_64)
@ -466,12 +540,13 @@ END(irq_entries_start)
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
pushfq
testl $X86_EFLAGS_IF, (%rsp)
pushq %rax
SAVE_FLAGS(CLBR_RAX)
testl $X86_EFLAGS_IF, %eax
jz .Lokay_\@
ud2
.Lokay_\@:
addq $8, %rsp
popq %rax
#endif
.endm
@ -563,6 +638,13 @@ END(irq_entries_start)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
cld
testb $3, CS-ORIG_RAX(%rsp)
jz 1f
SWAPGS
call switch_to_thread_stack
1:
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
@ -572,12 +654,8 @@ END(irq_entries_start)
jz 1f
/*
* IRQ from user mode. Switch to kernel gsbase and inform context
* tracking that we're in kernel mode.
*/
SWAPGS
/*
* IRQ from user mode.
*
* We need to tell lockdep that IRQs are off. We can't do this until
* we fix gsbase, and we should do it before enter_from_user_mode
* (which can take locks). Since TRACE_IRQS_OFF idempotent,
@ -630,10 +708,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
ud2
1:
#endif
SWAPGS
POP_EXTRA_REGS
POP_C_REGS
addq $8, %rsp /* skip regs->orig_ax */
popq %r11
popq %r10
popq %r9
popq %r8
popq %rax
popq %rcx
popq %rdx
popq %rsi
/*
* The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
pushq 5*8(%rdi) /* RSP */
pushq 4*8(%rdi) /* EFLAGS */
pushq 3*8(%rdi) /* CS */
pushq 2*8(%rdi) /* RIP */
/* Push user RDI on the trampoline stack. */
pushq (%rdi)
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
/* Restore RDI. */
popq %rdi
SWAPGS
INTERRUPT_RETURN
@ -829,7 +938,33 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
/*
* Exception entry points.
*/
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
/*
* Switch to the thread stack. This is called with the IRET frame and
* orig_ax on the stack. (That is, RDI..R12 are not on the stack and
* space has not been allocated for them.)
*/
ENTRY(switch_to_thread_stack)
UNWIND_HINT_FUNC
pushq %rdi
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
pushq 7*8(%rdi) /* regs->ss */
pushq 6*8(%rdi) /* regs->rsp */
pushq 5*8(%rdi) /* regs->eflags */
pushq 4*8(%rdi) /* regs->cs */
pushq 3*8(%rdi) /* regs->ip */
pushq 2*8(%rdi) /* regs->orig_ax */
pushq 8(%rdi) /* return address */
UNWIND_HINT_FUNC
movq (%rdi), %rdi
ret
END(switch_to_thread_stack)
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
@ -848,11 +983,12 @@ ENTRY(\sym)
ALLOC_PT_GPREGS_ON_STACK
.if \paranoid
.if \paranoid == 1
.if \paranoid < 2
testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
jnz 1f
jnz .Lfrom_usermode_switch_stack_\@
.endif
.if \paranoid
call paranoid_entry
.else
call error_entry
@ -894,20 +1030,15 @@ ENTRY(\sym)
jmp error_exit
.endif
.if \paranoid == 1
.if \paranoid < 2
/*
* Paranoid entry from userspace. Switch stacks and treat it
* Entry from userspace. Switch stacks and treat it
* as a normal entry. This means that paranoid handlers
* run in real process context if user_mode(regs).
*/
1:
.Lfrom_usermode_switch_stack_\@:
call error_entry
movq %rsp, %rdi /* pt_regs pointer */
call sync_regs
movq %rax, %rsp /* switch stack */
movq %rsp, %rdi /* pt_regs pointer */
.if \has_error_code
@ -1170,6 +1301,14 @@ ENTRY(error_entry)
SWAPGS
.Lerror_entry_from_usermode_after_swapgs:
/* Put us onto the real thread stack. */
popq %r12 /* save return addr in %12 */
movq %rsp, %rdi /* arg0 = pt_regs pointer */
call sync_regs
movq %rax, %rsp /* switch stack */
ENCODE_FRAME_POINTER
pushq %r12
/*
* We need to tell lockdep that IRQs are off. We can't do this until
* we fix gsbase, and we should do it before enter_from_user_mode

View File

@ -48,7 +48,7 @@
*/
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
SWAPGS
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat)
*/
movl %eax, %eax
/* Construct struct pt_regs on stack (iret frame is already on stack) */
pushq %rax /* pt_regs->orig_ax */
/* switch to thread stack expects orig_ax to be pushed */
call switch_to_thread_stack
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */

View File

@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0)
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
/*
* Static testing of CPU features. Used the same as boot_cpu_has().

View File

@ -60,17 +60,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
return this_cpu_ptr(&gdt_page)->gdt;
}
/* Get the fixmap index for a specific processor */
static inline unsigned int get_cpu_gdt_ro_index(int cpu)
{
return FIX_GDT_REMAP_BEGIN + cpu;
}
/* Provide the fixmap address of the remapped GDT */
static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
{
unsigned int idx = get_cpu_gdt_ro_index(cpu);
return (struct desc_struct *)__fix_to_virt(idx);
return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
}
/* Provide the current read-only GDT */
@ -185,7 +178,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,
#endif
}
static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
{
struct desc_struct *d = get_cpu_gdt_rw(cpu);
tss_desc tss;

View File

@ -44,6 +44,45 @@ extern unsigned long __FIXADDR_TOP;
PAGE_SIZE)
#endif
/*
* cpu_entry_area is a percpu region in the fixmap that contains things
* needed by the CPU and early entry/exit code. Real types aren't used
* for all fields here to avoid circular header dependencies.
*
* Every field is a virtual alias of some other allocated backing store.
* There is no direct allocation of a struct cpu_entry_area.
*/
struct cpu_entry_area {
char gdt[PAGE_SIZE];
/*
* The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
* a a read-only guard page.
*/
struct SYSENTER_stack_page SYSENTER_stack_page;
/*
* On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
* we need task switches to work, and task switches write to the TSS.
*/
struct tss_struct tss;
char entry_trampoline[PAGE_SIZE];
#ifdef CONFIG_X86_64
/*
* Exception stacks used for IST entries.
*
* In the future, this should have a separate slot for each stack
* with guard pages between them.
*/
char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
#endif
};
#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
extern void setup_cpu_entry_areas(void);
/*
* Here we define all the compile-time 'special' virtual
@ -101,8 +140,8 @@ enum fixed_addresses {
FIX_LNW_VRTC,
#endif
/* Fixmap entries to remap the GDTs, one per processor. */
FIX_GDT_REMAP_BEGIN,
FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
FIX_CPU_ENTRY_AREA_TOP,
FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
#ifdef CONFIG_ACPI_APEI_GHES
/* Used for GHES mapping from assorted contexts */
@ -191,5 +230,30 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
void __early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags);
static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
{
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
}
#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \
BUILD_BUG_ON(offset % PAGE_SIZE != 0); \
__get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \
})
#define get_cpu_entry_area_index(cpu, field) \
__get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
}
static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
{
return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
}
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */

View File

@ -20,16 +20,7 @@
#ifndef _ASM_X86_HYPERVISOR_H
#define _ASM_X86_HYPERVISOR_H
#ifdef CONFIG_HYPERVISOR_GUEST
#include <asm/kvm_para.h>
#include <asm/x86_init.h>
#include <asm/xen/hypervisor.h>
/*
* x86 hypervisor information
*/
/* x86 hypervisor types */
enum x86_hypervisor_type {
X86_HYPER_NATIVE = 0,
X86_HYPER_VMWARE,
@ -39,6 +30,12 @@ enum x86_hypervisor_type {
X86_HYPER_KVM,
};
#ifdef CONFIG_HYPERVISOR_GUEST
#include <asm/kvm_para.h>
#include <asm/x86_init.h>
#include <asm/xen/hypervisor.h>
struct hypervisor_x86 {
/* Hypervisor name */
const char *name;
@ -58,7 +55,15 @@ struct hypervisor_x86 {
extern enum x86_hypervisor_type x86_hyper_type;
extern void init_hypervisor_platform(void);
static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
{
return x86_hyper_type == type;
}
#else
static inline void init_hypervisor_platform(void) { }
static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
{
return type == X86_HYPER_NATIVE;
}
#endif /* CONFIG_HYPERVISOR_GUEST */
#endif /* _ASM_X86_HYPERVISOR_H */

View File

@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
swapgs; \
sysretl
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(x) pushfq; popq %rax
#endif
#else
#define INTERRUPT_RETURN iret
#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit

View File

@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_stack_regs(struct pt_regs *regs);
extern void __show_regs(struct pt_regs *regs, int all);
extern void show_iret_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);

View File

@ -927,6 +927,15 @@ extern void default_banner(void);
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(clobbers) \
PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#endif
#endif /* CONFIG_X86_32 */
#endif /* __ASSEMBLY__ */

View File

@ -163,9 +163,9 @@ enum cpuid_regs_idx {
extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;
extern struct tss_struct doublefault_tss;
extern __u32 cpu_caps_cleared[NCAPINTS];
extern __u32 cpu_caps_set[NCAPINTS];
extern struct x86_hw_tss doublefault_tss;
extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
#ifdef CONFIG_SMP
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
@ -253,6 +253,11 @@ static inline void load_cr3(pgd_t *pgdir)
write_cr3(__sme_pa(pgdir));
}
/*
* Note that while the legacy 'TSS' name comes from 'Task State Segment',
* on modern x86 CPUs the TSS also holds information important to 64-bit mode,
* unrelated to the task-switch mechanism:
*/
#ifdef CONFIG_X86_32
/* This is the TSS defined by the hardware. */
struct x86_hw_tss {
@ -305,7 +310,13 @@ struct x86_hw_tss {
struct x86_hw_tss {
u32 reserved1;
u64 sp0;
/*
* We store cpu_current_top_of_stack in sp1 so it's always accessible.
* Linux does not use ring 1, so sp1 is not otherwise needed.
*/
u64 sp1;
u64 sp2;
u64 reserved2;
u64 ist[7];
@ -323,12 +334,22 @@ struct x86_hw_tss {
#define IO_BITMAP_BITS 65536
#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
#define INVALID_IO_BITMAP_OFFSET 0x8000
struct SYSENTER_stack {
unsigned long words[64];
};
struct SYSENTER_stack_page {
struct SYSENTER_stack stack;
} __aligned(PAGE_SIZE);
struct tss_struct {
/*
* The hardware state:
* The fixed hardware portion. This must not cross a page boundary
* at risk of violating the SDM's advice and potentially triggering
* errata.
*/
struct x86_hw_tss x86_tss;
@ -339,18 +360,9 @@ struct tss_struct {
* be within the limit.
*/
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
} __aligned(PAGE_SIZE);
#ifdef CONFIG_X86_32
/*
* Space for the temporary SYSENTER stack.
*/
unsigned long SYSENTER_stack_canary;
unsigned long SYSENTER_stack[64];
#endif
} ____cacheline_aligned;
DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
/*
* sizeof(unsigned long) coming from an extra "long" at the end
@ -364,6 +376,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#else
/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
#endif
/*
@ -523,7 +538,7 @@ static inline void native_set_iopl_mask(unsigned mask)
static inline void
native_load_sp0(unsigned long sp0)
{
this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
static inline void native_swapgs(void)
@ -535,12 +550,12 @@ static inline void native_swapgs(void)
static inline unsigned long current_top_of_stack(void)
{
#ifdef CONFIG_X86_64
return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
#else
/* sp0 on x86_32 is special in and around vm86 mode. */
/*
* We can't read directly from tss.sp0: sp0 on x86_32 is special in
* and around vm86 mode and sp0 on x86_64 is special because of the
* entry trampoline.
*/
return this_cpu_read_stable(cpu_current_top_of_stack);
#endif
}
static inline bool on_thread_stack(void)

View File

@ -16,6 +16,7 @@ enum stack_type {
STACK_TYPE_TASK,
STACK_TYPE_IRQ,
STACK_TYPE_SOFTIRQ,
STACK_TYPE_SYSENTER,
STACK_TYPE_EXCEPTION,
STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
};
@ -28,6 +29,8 @@ struct stack_info {
bool in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info);
bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask);

View File

@ -79,10 +79,10 @@ do { \
static inline void refresh_sysenter_cs(struct thread_struct *thread)
{
/* Only happens when SEP is enabled, no need to test "SEP"arately: */
if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
return;
this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}
#endif
@ -90,9 +90,11 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
/* This is used when switching tasks or entering/exiting vm86 mode. */
static inline void update_sp0(struct task_struct *task)
{
/* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
#ifdef CONFIG_X86_32
load_sp0(task->thread.sp0);
#else
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif
}

View File

@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,
#else /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_64
# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
#endif
#endif

View File

@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
dotraplinkage void do_stack_segment(struct pt_regs *, long);
#ifdef CONFIG_X86_64
dotraplinkage void do_double_fault(struct pt_regs *, long);
asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);

View File

@ -7,6 +7,9 @@
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
struct unwind_state {
struct stack_info stack_info;
unsigned long stack_mask;
@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
}
#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
/*
* WARNING: The entire pt_regs may not be safe to dereference. In some cases,
* only the iret frame registers are accessible. Use with caution!
*/
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
{
if (unwind_done(state))

View File

@ -93,4 +93,10 @@ void common(void) {
BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
/* Layout info for cpu_entry_area */
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
}

View File

@ -47,13 +47,8 @@ void foo(void)
BLANK();
/* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
offsetofend(struct tss_struct, SYSENTER_stack));
/* Offset from cpu_tss to SYSENTER_stack */
OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
/* Size of SYSENTER_stack */
DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
#ifdef CONFIG_CC_STACKPROTECTOR
BLANK();

View File

@ -23,6 +23,9 @@ int main(void)
#ifdef CONFIG_PARAVIRT
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
#ifdef CONFIG_DEBUG_ENTRY
OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
#endif
BLANK();
#endif
@ -63,6 +66,7 @@ int main(void)
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
BLANK();
#ifdef CONFIG_CC_STACKPROTECTOR

View File

@ -476,8 +476,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
__u32 cpu_caps_cleared[NCAPINTS];
__u32 cpu_caps_set[NCAPINTS];
__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
void load_percpu_segment(int cpu)
{
@ -490,27 +490,116 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}
/* Setup the fixmap mapping only once per-processor */
static inline void setup_fixmap_gdt(int cpu)
#ifdef CONFIG_X86_32
/* The 32-bit entry code needs to find cpu_entry_area. */
DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
#endif
#ifdef CONFIG_X86_64
/*
* Special IST stacks which the CPU switches to when it calls
* an IST-marked descriptor entry. Up to 7 stacks (hardware
* limit), all of them are 4K, except the debug stack which
* is 8K.
*/
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
#endif
static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
SYSENTER_stack_storage);
static void __init
set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
{
for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
}
/* Setup the fixmap mappings only once per-processor */
static void __init setup_cpu_entry_area(int cpu)
{
#ifdef CONFIG_X86_64
/* On 64-bit systems, we use a read-only fixmap GDT. */
pgprot_t prot = PAGE_KERNEL_RO;
extern char _entry_trampoline[];
/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
pgprot_t gdt_prot = PAGE_KERNEL_RO;
pgprot_t tss_prot = PAGE_KERNEL_RO;
#else
/*
* On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
* a task gate needs to change an available TSS to busy. If the GDT
* is read-only, that will triple fault.
* a task gate needs to change an available TSS to busy. If the
* GDT is read-only, that will triple fault. The TSS cannot be
* read-only because the CPU writes to it on task switches.
*
* On Xen PV, the GDT must be read-only because the hypervisor requires
* it.
* On Xen PV, the GDT must be read-only because the hypervisor
* requires it.
*/
pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL;
pgprot_t tss_prot = PAGE_KERNEL;
#endif
__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
PAGE_KERNEL);
/*
* The Intel SDM says (Volume 3, 7.2.1):
*
* Avoid placing a page boundary in the part of the TSS that the
* processor reads during a task switch (the first 104 bytes). The
* processor may not correctly perform address translations if a
* boundary occurs in this area. During a task switch, the processor
* reads and writes into the first 104 bytes of each TSS (using
* contiguous physical addresses beginning with the physical address
* of the first byte of the TSS). So, after TSS access begins, if
* part of the 104 bytes is not physically contiguous, the processor
* will access incorrect information without generating a page-fault
* exception.
*
* There are also a lot of errata involving the TSS spanning a page
* boundary. Assert that we're not doing that.
*/
BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
&per_cpu(cpu_tss_rw, cpu),
sizeof(struct tss_struct) / PAGE_SIZE,
tss_prot);
#ifdef CONFIG_X86_32
per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
#endif
#ifdef CONFIG_X86_64
BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
BUILD_BUG_ON(sizeof(exception_stacks) !=
sizeof(((struct cpu_entry_area *)0)->exception_stacks));
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
&per_cpu(exception_stacks, cpu),
sizeof(exception_stacks) / PAGE_SIZE,
PAGE_KERNEL);
__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
#endif
}
void __init setup_cpu_entry_areas(void)
{
unsigned int cpu;
for_each_possible_cpu(cpu)
setup_cpu_entry_area(cpu);
}
/* Load the original GDT from the per-cpu structure */
@ -747,7 +836,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
{
int i;
for (i = 0; i < NCAPINTS; i++) {
for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
c->x86_capability[i] &= ~cpu_caps_cleared[i];
c->x86_capability[i] |= cpu_caps_set[i];
}
@ -1250,7 +1339,7 @@ void enable_sep_cpu(void)
return;
cpu = get_cpu();
tss = &per_cpu(cpu_tss, cpu);
tss = &per_cpu(cpu_tss_rw, cpu);
/*
* We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@ -1259,11 +1348,7 @@ void enable_sep_cpu(void)
tss->x86_tss.ss1 = __KERNEL_CS;
wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
wrmsr(MSR_IA32_SYSENTER_ESP,
(unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
0);
wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
put_cpu();
@ -1357,25 +1442,19 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
/*
* Special IST stacks which the CPU switches to when it calls
* an IST-marked descriptor entry. Up to 7 stacks (hardware
* limit), all of them are 4K, except the debug stack which
* is 8K.
*/
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
extern char _entry_trampoline[];
extern char entry_SYSCALL_64_trampoline[];
int cpu = smp_processor_id();
unsigned long SYSCALL64_entry_trampoline =
(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
(entry_SYSCALL_64_trampoline - _entry_trampoline);
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@ -1386,7 +1465,7 @@ void syscall_init(void)
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@ -1530,7 +1609,7 @@ void cpu_init(void)
if (cpu)
load_ucode_ap();
t = &per_cpu(cpu_tss, cpu);
t = &per_cpu(cpu_tss_rw, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
@ -1569,7 +1648,7 @@ void cpu_init(void)
* set up and load the per-CPU TSS
*/
if (!oist->ist[0]) {
char *estacks = per_cpu(exception_stacks, cpu);
char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
estacks += exception_stack_sizes[v];
@ -1580,7 +1659,7 @@ void cpu_init(void)
}
}
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
/*
* <= is required because the CPU will access up to
@ -1596,11 +1675,12 @@ void cpu_init(void)
enter_lazy_tlb(&init_mm, me);
/*
* Initialize the TSS. Don't bother initializing sp0, as the initial
* task never enters user mode.
* Initialize the TSS. sp0 points to the entry trampoline stack
* regardless of what task is running.
*/
set_tss_desc(cpu, t);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
load_mm_ldt(&init_mm);
@ -1612,7 +1692,6 @@ void cpu_init(void)
if (is_uv_system())
uv_cpu_init();
setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
@ -1622,7 +1701,7 @@ void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
wait_for_master_cpu(cpu);
@ -1657,12 +1736,12 @@ void cpu_init(void)
* Initialize the TSS. Don't bother initializing sp0, as the initial
* task never enters user mode.
*/
set_tss_desc(cpu, t);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
load_mm_ldt(&init_mm);
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
#ifdef CONFIG_DOUBLEFAULT
/* Set up doublefault TSS pointer in the GDT */
@ -1674,7 +1753,6 @@ void cpu_init(void)
fpu__init_cpu();
setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
#endif

View File

@ -50,8 +50,7 @@ static void doublefault_fn(void)
cpu_relax();
}
struct tss_struct doublefault_tss __cacheline_aligned = {
.x86_tss = {
struct x86_hw_tss doublefault_tss __cacheline_aligned = {
.sp0 = STACK_START,
.ss0 = __KERNEL_DS,
.ldt = 0,
@ -68,7 +67,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
.fs = __KERNEL_PERCPU,
.__cr3 = __pa_nodebug(swapper_pg_dir),
}
};
/* dummy for do_double_fault() call */

View File

@ -43,6 +43,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true;
}
bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
{
struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
void *begin = ss;
void *end = ss + 1;
if ((void *)stack < begin || (void *)stack >= end)
return false;
info->type = STACK_TYPE_SYSENTER;
info->begin = begin;
info->end = end;
info->next_sp = NULL;
return true;
}
static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl)
{
@ -50,6 +68,28 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
void show_iret_regs(struct pt_regs *regs)
{
printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
regs->sp, regs->flags);
}
static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
{
if (on_stack(info, regs, sizeof(*regs)))
__show_regs(regs, 0);
else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
IRET_FRAME_SIZE)) {
/*
* When an interrupt or exception occurs in entry code, the
* full pt_regs might not have been saved yet. In that case
* just print the iret frame.
*/
show_iret_regs(regs);
}
}
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl)
{
@ -71,31 +111,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - task stack
* - interrupt stack
* - HW exception stacks (double fault, nmi, debug, mce)
* - SYSENTER stack
*
* x86-32 can have up to three stacks:
* x86-32 can have up to four stacks:
* - task stack
* - softirq stack
* - hardirq stack
* - SYSENTER stack
*/
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name;
if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
/*
* If we overflowed the task stack into a guard page, jump back
* to the bottom of the usable stack.
* We weren't on a valid stack. It's possible that
* we overflowed a valid stack into a guard page.
* See if the next page up is valid so that we can
* generate some kind of backtrace if this happens.
*/
if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
stack = task_stack_page(task);
stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
}
stack_name = stack_type_name(stack_info.type);
if (stack_name)
printk("%s <%s>\n", log_lvl, stack_name);
if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
__show_regs(regs, 0);
if (regs)
show_regs_safe(&stack_info, regs);
/*
* Scan the stack, printing any text addresses we find. At the
@ -119,7 +163,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/*
* Don't print regs->ip again if it was already printed
* by __show_regs() below.
* by show_regs_safe() below.
*/
if (regs && stack == &regs->ip)
goto next;
@ -155,8 +199,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state);
if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
__show_regs(regs, 0);
if (regs)
show_regs_safe(&stack_info, regs);
}
if (stack_name)

View File

@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_SOFTIRQ)
return "SOFTIRQ";
if (type == STACK_TYPE_SYSENTER)
return "SYSENTER";
return NULL;
}
@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (task != current)
goto unknown;
if (in_sysenter_stack(stack, info))
goto recursion_check;
if (in_hardirq_stack(stack, info))
goto recursion_check;

View File

@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_IRQ)
return "IRQ";
if (type == STACK_TYPE_SYSENTER)
return "SYSENTER";
if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
return exception_stack_names[type - STACK_TYPE_EXCEPTION];
@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_irq_stack(stack, info))
goto recursion_check;
if (in_sysenter_stack(stack, info))
goto recursion_check;
goto unknown;
recursion_check:

View File

@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
tss = &per_cpu(cpu_tss, get_cpu());
tss = &per_cpu(cpu_tss_rw, get_cpu());
if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num);

View File

@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
/* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax;
/*
* NB: Unlike exception entries, IRQ entries do not reliably
* handle context tracking in the low-level entry code. This is
* because syscall entries execute briefly with IRQs on before
* updating context tracking state, so we can take an IRQ from
* kernel mode with CONTEXT_USER. The low-level entry code only
* updates the context if we came from user mode, so we won't
* switch to CONTEXT_KERNEL. We'll fix that once the syscall
* code is cleaned up enough that we can cleanly defer enabling
* IRQs.
*/
entering_irq();
/* entering_irq() tells RCU that we're not quiescent. Check it. */

View File

@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (regs->sp >= estack_top && regs->sp <= estack_bottom)
return;
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
current->comm, curbase, regs->sp,
irq_stack_top, irq_stack_bottom,
estack_top, estack_bottom);
estack_top, estack_bottom, (void *)regs->ip);
if (sysctl_panic_on_stackoverflow)
panic("low stack detected by irq handler - check messages\n");

View File

@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):

View File

@ -47,7 +47,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
.x86_tss = {
/*
* .sp0 is only used when entering ring 0 from a lower
@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
* Poison it.
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
#ifdef CONFIG_X86_64
/*
* .sp1 is cpu_current_top_of_stack. The init task never
* runs user code, but cpu_current_top_of_stack should still
* be well defined before the first context switch.
*/
.sp1 = TOP_OF_INIT_STACK,
#endif
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
@ -71,11 +81,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
*/
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
#ifdef CONFIG_X86_32
.SYSENTER_stack_canary = STACK_END_MAGIC,
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);
EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@ -104,7 +111,7 @@ void exit_thread(struct task_struct *tsk)
struct fpu *fpu = &t->fpu;
if (bp) {
struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);

View File

@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

View File

@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
regs->sp, regs->flags);
show_iret_regs(regs);
if (regs->orig_ax != -1)
pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
else
@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
regs->r13, regs->r14, regs->r15);
if (!all)
return;
asm("movl %%ds,%0" : "=r" (ds));
asm("movl %%cs,%0" : "=r" (cs));
asm("movl %%es,%0" : "=r" (es));
@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
rdmsrl(MSR_GS_BASE, gs);
rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
if (!all)
return;
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = __read_cr3();
@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Switch the PDA and FPU contexts.
*/
this_cpu_write(current_task, next_p);
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
/* Reload sp0. */
update_sp0(next_p);

View File

@ -348,9 +348,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
/*
* If IRET takes a non-IST fault on the espfix64 stack, then we
* end up promoting it to a doublefault. In that case, modify
* the stack to make it look like we just entered the #GP
* handler from user space, similar to bad_iret.
* end up promoting it to a doublefault. In that case, take
* advantage of the fact that we're not using the normal (TSS.sp0)
* stack right now. We can write a fake #GP(0) frame at TSS.sp0
* and then modify our own IRET frame so that, when we return,
* we land directly at the #GP(0) vector with the stack already
* set up according to its expectations.
*
* The net result is that our #GP handler will think that we
* entered from usermode with the bad user context.
*
* No need for ist_enter here because we don't use RCU.
*/
@ -358,13 +364,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
regs->cs == __KERNEL_CS &&
regs->ip == (unsigned long)native_irq_return_iret)
{
struct pt_regs *normal_regs = task_pt_regs(current);
struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Fake a #GP(0) from userspace. */
memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
/*
* regs->sp points to the failing IRET frame on the
* ESPFIX64 stack. Copy it to the entry stack. This fills
* in gpregs->ss through gpregs->ip.
*
*/
memmove(&gpregs->ip, (void *)regs->sp, 5*8);
gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
/*
* Adjust our frame so that we return straight to the #GP
* vector with the expected RSP value. This is safe because
* we won't enable interupts or schedule before we invoke
* general_protection, so nothing will clobber the stack
* frame we just set up.
*/
regs->ip = (unsigned long)general_protection;
regs->sp = (unsigned long)&normal_regs->orig_ax;
regs->sp = (unsigned long)&gpregs->orig_ax;
return;
}
@ -389,7 +408,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
*
* Processors update CR2 whenever a page fault is detected. If a
* second page fault occurs while an earlier page fault is being
* deliv- ered, the faulting linear address of the second fault will
* delivered, the faulting linear address of the second fault will
* overwrite the contents of CR2 (replacing the previous
* address). These updates to CR2 occur even if the page fault
* results in a double fault or occurs during the delivery of a
@ -605,13 +624,14 @@ NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
* Help handler running on IST stack to switch off the IST stack if the
* interrupted code was in user mode. The actual stack switch is done in
* entry_64.S
* Help handler running on a per-cpu (IST or entry trampoline) stack
* to switch to the normal thread stack if the interrupted code was in
* user mode. The actual stack switch is done in entry_64.S
*/
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = task_pt_regs(current);
struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
if (regs != eregs)
*regs = *eregs;
return regs;
}
@ -628,13 +648,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
/*
* This is called from entry_64.S early in handling a fault
* caused by a bad iret to user mode. To handle the fault
* correctly, we want move our stack frame to task_pt_regs
* and we want to pretend that the exception came from the
* iret target.
* correctly, we want to move our stack frame to where it would
* be had we entered directly on the entry stack (rather than
* just below the IRET frame) and we want to pretend that the
* exception came from the IRET target.
*/
struct bad_iret_stack *new_stack =
container_of(task_pt_regs(current),
struct bad_iret_stack, regs);
(struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Copy the IRET target to the new stack. */
memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
@ -795,14 +815,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec();
exit:
#if defined(CONFIG_X86_32)
/*
* This is the most likely code path that involves non-trivial use
* of the SYSENTER stack. Check that we haven't overrun it.
*/
WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
"Overran or corrupted SYSENTER stack\n");
#endif
ist_exit(regs);
}
NOKPROBE_SYMBOL(do_debug);
@ -929,6 +941,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
void __init trap_init(void)
{
/* Init cpu_entry_area before IST entries are set up */
setup_cpu_entry_areas();
idt_setup_traps();
/*

View File

@ -253,21 +253,14 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
return NULL;
}
static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
size_t len)
{
struct stack_info *info = &state->stack_info;
void *addr = (void *)_addr;
/*
* If the address isn't on the current stack, switch to the next one.
*
* We may have to traverse multiple stacks to deal with the possibility
* that info->next_sp could point to an empty stack and the address
* could be on a subsequent stack.
*/
while (!on_stack(info, (void *)addr, len))
if (get_stack_info(info->next_sp, state->task, info,
&state->stack_mask))
if (!on_stack(info, addr, len) &&
(get_stack_info(addr, state->task, info, &state->stack_mask)))
return false;
return true;
@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
return true;
}
#define REGS_SIZE (sizeof(struct pt_regs))
#define SP_OFFSET (offsetof(struct pt_regs, sp))
#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
unsigned long *ip, unsigned long *sp, bool full)
unsigned long *ip, unsigned long *sp)
{
size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
struct pt_regs *regs = (struct pt_regs *)addr;
if (IS_ENABLED(CONFIG_X86_64)) {
if (!stack_access_ok(state, addr, regs_size))
/* x86-32 support will be more complicated due to the &regs->sp hack */
BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
return false;
*ip = regs->ip;
*sp = regs->sp;
return true;
}
}
if (!stack_access_ok(state, addr, sp_offset))
static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
unsigned long *ip, unsigned long *sp)
{
struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
return false;
*ip = regs->ip;
if (user_mode(regs)) {
if (!stack_access_ok(state, addr + sp_offset,
REGS_SIZE - SP_OFFSET))
return false;
*sp = regs->sp;
} else
*sp = (unsigned long)&regs->sp;
return true;
}
@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state)
unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
enum stack_type prev_type = state->stack_info.type;
struct orc_entry *orc;
struct pt_regs *ptregs;
bool indirect = false;
if (unwind_done(state))
@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_TYPE_REGS:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_TYPE_REGS_IRET:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference iret registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
}
ptregs = container_of((void *)sp, struct pt_regs, ip);
if ((unsigned long)ptregs >= prev_sp &&
on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
state->regs = ptregs;
state->regs = (void *)sp - IRET_FRAME_OFFSET;
state->full_regs = false;
} else
state->regs = NULL;
state->signal = true;
break;
@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
}
if (get_stack_info((unsigned long *)state->sp, state->task,
&state->stack_info, &state->stack_mask))
&state->stack_info, &state->stack_mask)) {
/*
* We weren't on a valid stack. It's possible that
* we overflowed a valid stack into a guard page.
* See if the next page up is valid so that we can
* generate some kind of backtrace if this happens.
*/
void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
if (get_stack_info(next_page, state->task, &state->stack_info,
&state->stack_mask))
return;
}
/*
* The caller can provide the address of the first frame directly

View File

@ -107,6 +107,15 @@ SECTIONS
SOFTIRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)
#ifdef CONFIG_X86_64
. = ALIGN(PAGE_SIZE);
_entry_trampoline = .;
*(.entry_trampoline)
. = ALIGN(PAGE_SIZE);
ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
#endif
/* End of text section */
_etext = .;
} :text = 0x9090

View File

@ -2390,9 +2390,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
}
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
u64 cr0, u64 cr4)
u64 cr0, u64 cr3, u64 cr4)
{
int bad;
u64 pcid;
/* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
pcid = 0;
if (cr4 & X86_CR4_PCIDE) {
pcid = cr3 & 0xfff;
cr3 &= ~0xfff;
}
bad = ctxt->ops->set_cr(ctxt, 3, cr3);
if (bad)
return X86EMUL_UNHANDLEABLE;
/*
* First enable PAE, long mode needs it before CR0.PG = 1 is set.
@ -2411,6 +2423,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
bad = ctxt->ops->set_cr(ctxt, 4, cr4);
if (bad)
return X86EMUL_UNHANDLEABLE;
if (pcid) {
bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
if (bad)
return X86EMUL_UNHANDLEABLE;
}
}
return X86EMUL_CONTINUE;
@ -2421,11 +2439,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
struct desc_struct desc;
struct desc_ptr dt;
u16 selector;
u32 val, cr0, cr4;
u32 val, cr0, cr3, cr4;
int i;
cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);
ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
cr3 = GET_SMSTATE(u32, smbase, 0x7ff8);
ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0);
@ -2467,14 +2485,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
return rsm_enter_protected_mode(ctxt, cr0, cr4);
return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
}
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
{
struct desc_struct desc;
struct desc_ptr dt;
u64 val, cr0, cr4;
u64 val, cr0, cr3, cr4;
u32 base3;
u16 selector;
int i, r;
@ -2491,7 +2509,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
cr0 = GET_SMSTATE(u64, smbase, 0x7f58);
ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50));
cr3 = GET_SMSTATE(u64, smbase, 0x7f50);
cr4 = GET_SMSTATE(u64, smbase, 0x7f48);
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
val = GET_SMSTATE(u64, smbase, 0x7ed0);
@ -2519,7 +2537,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
ctxt->ops->set_gdt(ctxt, &dt);
r = rsm_enter_protected_mode(ctxt, cr0, cr4);
r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
if (r != X86EMUL_CONTINUE)
return r;

View File

@ -3395,7 +3395,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
if(make_mmu_pages_available(vcpu) < 0) {
spin_unlock(&vcpu->kvm->mmu_lock);
return 1;
return -ENOSPC;
}
sp = kvm_mmu_get_page(vcpu, 0, 0,
vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
@ -3410,7 +3410,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
if (make_mmu_pages_available(vcpu) < 0) {
spin_unlock(&vcpu->kvm->mmu_lock);
return 1;
return -ENOSPC;
}
sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
@ -3450,7 +3450,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
if (make_mmu_pages_available(vcpu) < 0) {
spin_unlock(&vcpu->kvm->mmu_lock);
return 1;
return -ENOSPC;
}
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
@ -3487,7 +3487,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
if (make_mmu_pages_available(vcpu) < 0) {
spin_unlock(&vcpu->kvm->mmu_lock);
return 1;
return -ENOSPC;
}
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
0, ACC_ALL);

View File

@ -2302,7 +2302,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* processors. See 22.2.4.
*/
vmcs_writel(HOST_TR_BASE,
(unsigned long)this_cpu_ptr(&cpu_tss));
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
/*

View File

@ -4384,7 +4384,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
addr, n, v))
&& kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
break;
trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
handled += n;
addr += n;
len -= n;
@ -4643,7 +4643,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
{
if (vcpu->mmio_read_completed) {
trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
vcpu->mmio_fragments[0].gpa, *(u64 *)val);
vcpu->mmio_fragments[0].gpa, val);
vcpu->mmio_read_completed = 0;
return 1;
}
@ -4665,14 +4665,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
{
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
return vcpu_mmio_write(vcpu, gpa, bytes, val);
}
static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes)
{
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
return X86EMUL_IO_NEEDED;
}
@ -7264,13 +7264,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct fpu *fpu = &current->thread.fpu;
int r;
fpu__initialize(fpu);
kvm_sigset_activate(vcpu);
kvm_load_guest_fpu(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
if (kvm_run->immediate_exit) {
r = -EINTR;
@ -7296,14 +7295,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
}
kvm_load_guest_fpu(vcpu);
if (unlikely(vcpu->arch.complete_userspace_io)) {
int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
vcpu->arch.complete_userspace_io = NULL;
r = cui(vcpu);
if (r <= 0)
goto out_fpu;
goto out;
} else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
@ -7312,9 +7309,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
else
r = vcpu_run(vcpu);
out_fpu:
kvm_put_guest_fpu(vcpu);
out:
kvm_put_guest_fpu(vcpu);
post_kvm_run_save(vcpu);
kvm_sigset_deactivate(vcpu);
@ -7384,7 +7380,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
#endif
kvm_rip_write(vcpu, regs->rip);
kvm_set_rflags(vcpu, regs->rflags);
kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
vcpu->arch.exception.pending = false;
@ -7498,6 +7494,29 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG_BIT)) {
/*
* When EFER.LME and CR0.PG are set, the processor is in
* 64-bit mode (though maybe in a 32-bit code segment).
* CR4.PAE and EFER.LMA must be set.
*/
if (!(sregs->cr4 & X86_CR4_PAE_BIT)
|| !(sregs->efer & EFER_LMA))
return -EINVAL;
} else {
/*
* Not in 64-bit mode: EFER.LMA is clear and the code
* segment cannot be 64-bit.
*/
if (sregs->efer & EFER_LMA || sregs->cs.l)
return -EINVAL;
}
return 0;
}
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
@ -7510,6 +7529,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
(sregs->cr4 & X86_CR4_OSXSAVE))
return -EINVAL;
if (kvm_valid_sregs(vcpu, sregs))
return -EINVAL;
apic_base_msr.data = sregs->apic_base;
apic_base_msr.host_initiated = true;
if (kvm_set_apic_base(vcpu, &apic_base_msr))

View File

@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops)
delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
/*
* Use cpu_tss as a cacheline-aligned, seldomly
* Use cpu_tss_rw as a cacheline-aligned, seldomly
* accessed per-cpu variable as the monitor target.
*/
__monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
/*
* AMD, like Intel, supports the EAX hint and EAX=0xf

View File

@ -860,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
if (!printk_ratelimit())
return;
printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);

View File

@ -277,6 +277,7 @@ void __init kasan_early_init(void)
void __init kasan_init(void)
{
int i;
void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
#ifdef CONFIG_KASAN_INLINE
register_die_notifier(&kasan_die_notifier);
@ -329,8 +330,23 @@ void __init kasan_init(void)
(unsigned long)kasan_mem_to_shadow(_end),
early_pfn_to_nid(__pa(_stext)));
shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
PAGE_SIZE);
shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
PAGE_SIZE);
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
shadow_cpu_entry_begin);
kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
(unsigned long)shadow_cpu_entry_end, 0);
kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END);
load_cr3(init_top_pgt);
__flush_tlb_all();

View File

@ -152,17 +152,19 @@ static void do_fpu_end(void)
static void fix_processor_context(void)
{
int cpu = smp_processor_id();
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
#ifdef CONFIG_X86_64
struct desc_struct *desc = get_cpu_gdt_rw(cpu);
tss_desc tss;
#endif
set_tss_desc(cpu, t); /*
* This just modifies memory; should not be
* necessary. But... This is necessary, because
* 386 hardware has concept of busy TSS or some
* similar stupidity.
/*
* We need to reload TR, which requires that we change the
* GDT entry to indicate "available" first.
*
* XXX: This could probably all be replaced by a call to
* force_reload_TR().
*/
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
#ifdef CONFIG_X86_64
memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));

View File

@ -826,7 +826,7 @@ static void xen_load_sp0(unsigned long sp0)
mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
void xen_set_iopl_mask(unsigned mask)

View File

@ -2272,7 +2272,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;

View File

@ -599,6 +599,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
bio->bi_disk = bio_src->bi_disk;
bio->bi_partno = bio_src->bi_partno;
bio_set_flag(bio, BIO_CLONED);
if (bio_flagged(bio_src, BIO_THROTTLED))
bio_set_flag(bio, BIO_THROTTLED);
bio->bi_opf = bio_src->bi_opf;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter = bio_src->bi_iter;

View File

@ -12,22 +12,29 @@
#include "blk.h"
/*
* Append a bio to a passthrough request. Only works can be merged into
* the request based on the driver constraints.
* Append a bio to a passthrough request. Only works if the bio can be merged
* into the request based on the driver constraints.
*/
int blk_rq_append_bio(struct request *rq, struct bio *bio)
int blk_rq_append_bio(struct request *rq, struct bio **bio)
{
blk_queue_bounce(rq->q, &bio);
struct bio *orig_bio = *bio;
blk_queue_bounce(rq->q, bio);
if (!rq->bio) {
blk_rq_bio_prep(rq->q, rq, bio);
blk_rq_bio_prep(rq->q, rq, *bio);
} else {
if (!ll_back_merge_fn(rq->q, rq, bio))
if (!ll_back_merge_fn(rq->q, rq, *bio)) {
if (orig_bio != *bio) {
bio_put(*bio);
*bio = orig_bio;
}
return -EINVAL;
}
rq->biotail->bi_next = bio;
rq->biotail = bio;
rq->__data_len += bio->bi_iter.bi_size;
rq->biotail->bi_next = *bio;
rq->biotail = *bio;
rq->__data_len += (*bio)->bi_iter.bi_size;
}
return 0;
@ -73,14 +80,12 @@ static int __blk_rq_map_user_iov(struct request *rq,
* We link the bounce buffer in and could have to traverse it
* later so we have to get a ref to prevent it from being freed
*/
ret = blk_rq_append_bio(rq, bio);
bio_get(bio);
ret = blk_rq_append_bio(rq, &bio);
if (ret) {
bio_endio(bio);
__blk_rq_unmap_user(orig_bio);
bio_put(bio);
return ret;
}
bio_get(bio);
return 0;
}
@ -213,7 +218,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
int do_copy = 0;
struct bio *bio;
struct bio *bio, *orig_bio;
int ret;
if (len > (queue_max_hw_sectors(q) << 9))
@ -236,10 +241,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
if (do_copy)
rq->rq_flags |= RQF_COPY_USER;
ret = blk_rq_append_bio(rq, bio);
orig_bio = bio;
ret = blk_rq_append_bio(rq, &bio);
if (unlikely(ret)) {
/* request is too big */
bio_put(bio);
bio_put(orig_bio);
return ret;
}

View File

@ -2226,13 +2226,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
out_unlock:
spin_unlock_irq(q->queue_lock);
out:
/*
* As multiple blk-throtls may stack in the same issue path, we
* don't want bios to leave with the flag set. Clear the flag if
* being issued.
*/
if (!throttled)
bio_clear_flag(bio, BIO_THROTTLED);
bio_set_flag(bio, BIO_THROTTLED);
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
if (throttled || !td->track_bio_latency)

View File

@ -200,6 +200,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
unsigned i = 0;
bool bounce = false;
int sectors = 0;
bool passthrough = bio_is_passthrough(*bio_orig);
bio_for_each_segment(from, *bio_orig, iter) {
if (i++ < BIO_MAX_PAGES)
@ -210,13 +211,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
if (!bounce)
return;
if (sectors < bio_sectors(*bio_orig)) {
if (!passthrough && sectors < bio_sectors(*bio_orig)) {
bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
bio_chain(bio, *bio_orig);
generic_make_request(*bio_orig);
*bio_orig = bio;
}
bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
bounce_bio_set);
bio_for_each_segment_all(to, bio, i) {
struct page *page = to->bv_page;

View File

@ -100,9 +100,13 @@ struct kyber_hctx_data {
unsigned int cur_domain;
unsigned int batching;
wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
atomic_t wait_index[KYBER_NUM_DOMAINS];
};
static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
void *key);
static int rq_sched_domain(const struct request *rq)
{
unsigned int op = rq->cmd_flags;
@ -385,6 +389,9 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
INIT_LIST_HEAD(&khd->rqs[i]);
init_waitqueue_func_entry(&khd->domain_wait[i],
kyber_domain_wake);
khd->domain_wait[i].private = hctx;
INIT_LIST_HEAD(&khd->domain_wait[i].entry);
atomic_set(&khd->wait_index[i], 0);
}
@ -524,35 +531,39 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
int nr;
nr = __sbitmap_queue_get(domain_tokens);
if (nr >= 0)
return nr;
/*
* If we failed to get a domain token, make sure the hardware queue is
* run when one becomes available. Note that this is serialized on
* khd->lock, but we still need to be careful about the waker.
*/
if (list_empty_careful(&wait->entry)) {
init_waitqueue_func_entry(wait, kyber_domain_wake);
wait->private = hctx;
if (nr < 0 && list_empty_careful(&wait->entry)) {
ws = sbq_wait_ptr(domain_tokens,
&khd->wait_index[sched_domain]);
khd->domain_ws[sched_domain] = ws;
add_wait_queue(&ws->wait, wait);
/*
* Try again in case a token was freed before we got on the wait
* queue. The waker may have already removed the entry from the
* wait queue, but list_del_init() is okay with that.
* queue.
*/
nr = __sbitmap_queue_get(domain_tokens);
if (nr >= 0) {
unsigned long flags;
}
spin_lock_irqsave(&ws->wait.lock, flags);
/*
* If we got a token while we were on the wait queue, remove ourselves
* from the wait queue to ensure that all wake ups make forward
* progress. It's possible that the waker already deleted the entry
* between the !list_empty_careful() check and us grabbing the lock, but
* list_del_init() is okay with that.
*/
if (nr >= 0 && !list_empty_careful(&wait->entry)) {
ws = khd->domain_ws[sched_domain];
spin_lock_irq(&ws->wait.lock);
list_del_init(&wait->entry);
spin_unlock_irqrestore(&ws->wait.lock, flags);
}
spin_unlock_irq(&ws->wait.lock);
}
return nr;
}

View File

@ -1007,7 +1007,7 @@ static ssize_t erst_reader(struct pstore_record *record)
/* The record may be cleared by others, try read next record */
if (len == -ENOENT)
goto skip;
else if (len < sizeof(*rcd)) {
else if (len < 0 || len < sizeof(*rcd)) {
rc = -EIO;
goto out;
}

View File

@ -1171,7 +1171,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu);
struct cpc_register_resource *desired_reg;
int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu);
struct cppc_pcc_data *pcc_ss_data = pcc_data[pcc_ss_id];
struct cppc_pcc_data *pcc_ss_data;
int ret = 0;
if (!cpc_desc || pcc_ss_id < 0) {

View File

@ -35,13 +35,13 @@ static inline u64 mb_per_tick(int mbps)
struct nullb_cmd {
struct list_head list;
struct llist_node ll_list;
call_single_data_t csd;
struct __call_single_data csd;
struct request *rq;
struct bio *bio;
unsigned int tag;
blk_status_t error;
struct nullb_queue *nq;
struct hrtimer timer;
blk_status_t error;
};
struct nullb_queue {

View File

@ -22,6 +22,8 @@
#include "cpufreq_governor.h"
#define CPUFREQ_DBS_MIN_SAMPLING_INTERVAL (2 * TICK_NSEC / NSEC_PER_USEC)
static DEFINE_PER_CPU(struct cpu_dbs_info, cpu_dbs);
static DEFINE_MUTEX(gov_dbs_data_mutex);
@ -47,11 +49,15 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
struct policy_dbs_info *policy_dbs;
unsigned int sampling_interval;
int ret;
ret = sscanf(buf, "%u", &dbs_data->sampling_rate);
if (ret != 1)
ret = sscanf(buf, "%u", &sampling_interval);
if (ret != 1 || sampling_interval < CPUFREQ_DBS_MIN_SAMPLING_INTERVAL)
return -EINVAL;
dbs_data->sampling_rate = sampling_interval;
/*
* We are operating under dbs_data->mutex and so the list and its
* entries can't be freed concurrently.
@ -430,7 +436,14 @@ int cpufreq_dbs_governor_init(struct cpufreq_policy *policy)
if (ret)
goto free_policy_dbs_info;
dbs_data->sampling_rate = cpufreq_policy_transition_delay_us(policy);
/*
* The sampling interval should not be less than the transition latency
* of the CPU and it also cannot be too small for dbs_update() to work
* correctly.
*/
dbs_data->sampling_rate = max_t(unsigned int,
CPUFREQ_DBS_MIN_SAMPLING_INTERVAL,
cpufreq_policy_transition_delay_us(policy));
if (!have_governor_per_policy())
gov->gdbs_data = dbs_data;

View File

@ -226,17 +226,18 @@ static void imx6q_opp_check_speed_grading(struct device *dev)
val >>= OCOTP_CFG3_SPEED_SHIFT;
val &= 0x3;
if ((val != OCOTP_CFG3_SPEED_1P2GHZ) &&
of_machine_is_compatible("fsl,imx6q"))
if (dev_pm_opp_disable(dev, 1200000000))
dev_warn(dev, "failed to disable 1.2GHz OPP\n");
if (val < OCOTP_CFG3_SPEED_996MHZ)
if (dev_pm_opp_disable(dev, 996000000))
dev_warn(dev, "failed to disable 996MHz OPP\n");
if (of_machine_is_compatible("fsl,imx6q")) {
if (of_machine_is_compatible("fsl,imx6q") ||
of_machine_is_compatible("fsl,imx6qp")) {
if (val != OCOTP_CFG3_SPEED_852MHZ)
if (dev_pm_opp_disable(dev, 852000000))
dev_warn(dev, "failed to disable 852MHz OPP\n");
if (val != OCOTP_CFG3_SPEED_1P2GHZ)
if (dev_pm_opp_disable(dev, 1200000000))
dev_warn(dev, "failed to disable 1.2GHz OPP\n");
}
iounmap(base);
put_node:

View File

@ -708,7 +708,7 @@ atc_prep_dma_interleaved(struct dma_chan *chan,
unsigned long flags)
{
struct at_dma_chan *atchan = to_at_dma_chan(chan);
struct data_chunk *first = xt->sgl;
struct data_chunk *first;
struct at_desc *desc = NULL;
size_t xfer_count;
unsigned int dwidth;
@ -720,6 +720,8 @@ atc_prep_dma_interleaved(struct dma_chan *chan,
if (unlikely(!xt || xt->numf != 1 || !xt->frame_size))
return NULL;
first = xt->sgl;
dev_info(chan2dev(chan),
"%s: src=%pad, dest=%pad, numf=%d, frame_size=%d, flags=0x%lx\n",
__func__, &xt->src_start, &xt->dst_start, xt->numf,

View File

@ -555,7 +555,7 @@ static int jz4740_dma_probe(struct platform_device *pdev)
ret = dma_async_device_register(dd);
if (ret)
return ret;
goto err_clk;
irq = platform_get_irq(pdev, 0);
ret = request_irq(irq, jz4740_dma_irq, 0, dev_name(&pdev->dev), dmadev);
@ -568,6 +568,8 @@ static int jz4740_dma_probe(struct platform_device *pdev)
err_unregister:
dma_async_device_unregister(dd);
err_clk:
clk_disable_unprepare(dmadev->clk);
return ret;
}

View File

@ -155,6 +155,12 @@ MODULE_PARM_DESC(run, "Run the test (default: false)");
#define PATTERN_COUNT_MASK 0x1f
#define PATTERN_MEMSET_IDX 0x01
/* poor man's completion - we want to use wait_event_freezable() on it */
struct dmatest_done {
bool done;
wait_queue_head_t *wait;
};
struct dmatest_thread {
struct list_head node;
struct dmatest_info *info;
@ -165,6 +171,8 @@ struct dmatest_thread {
u8 **dsts;
u8 **udsts;
enum dma_transaction_type type;
wait_queue_head_t done_wait;
struct dmatest_done test_done;
bool done;
};
@ -342,18 +350,25 @@ static unsigned int dmatest_verify(u8 **bufs, unsigned int start,
return error_count;
}
/* poor man's completion - we want to use wait_event_freezable() on it */
struct dmatest_done {
bool done;
wait_queue_head_t *wait;
};
static void dmatest_callback(void *arg)
{
struct dmatest_done *done = arg;
struct dmatest_thread *thread =
container_of(arg, struct dmatest_thread, done_wait);
if (!thread->done) {
done->done = true;
wake_up_all(done->wait);
} else {
/*
* If thread->done, it means that this callback occurred
* after the parent thread has cleaned up. This can
* happen in the case that driver doesn't implement
* the terminate_all() functionality and a dma operation
* did not occur within the timeout period
*/
WARN(1, "dmatest: Kernel memory may be corrupted!!\n");
}
}
static unsigned int min_odd(unsigned int x, unsigned int y)
@ -424,9 +439,8 @@ static unsigned long long dmatest_KBs(s64 runtime, unsigned long long len)
*/
static int dmatest_func(void *data)
{
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(done_wait);
struct dmatest_thread *thread = data;
struct dmatest_done done = { .wait = &done_wait };
struct dmatest_done *done = &thread->test_done;
struct dmatest_info *info;
struct dmatest_params *params;
struct dma_chan *chan;
@ -673,9 +687,9 @@ static int dmatest_func(void *data)
continue;
}
done.done = false;
done->done = false;
tx->callback = dmatest_callback;
tx->callback_param = &done;
tx->callback_param = done;
cookie = tx->tx_submit(tx);
if (dma_submit_error(cookie)) {
@ -688,21 +702,12 @@ static int dmatest_func(void *data)
}
dma_async_issue_pending(chan);
wait_event_freezable_timeout(done_wait, done.done,
wait_event_freezable_timeout(thread->done_wait, done->done,
msecs_to_jiffies(params->timeout));
status = dma_async_is_tx_complete(chan, cookie, NULL, NULL);
if (!done.done) {
/*
* We're leaving the timed out dma operation with
* dangling pointer to done_wait. To make this
* correct, we'll need to allocate wait_done for
* each test iteration and perform "who's gonna
* free it this time?" dancing. For now, just
* leave it dangling.
*/
WARN(1, "dmatest: Kernel stack may be corrupted!!\n");
if (!done->done) {
dmaengine_unmap_put(um);
result("test timed out", total_tests, src_off, dst_off,
len, 0);
@ -789,7 +794,7 @@ static int dmatest_func(void *data)
dmatest_KBs(runtime, total_len), ret);
/* terminate all transfers on specified channels */
if (ret)
if (ret || failed_tests)
dmaengine_terminate_all(chan);
thread->done = true;
@ -849,6 +854,8 @@ static int dmatest_add_threads(struct dmatest_info *info,
thread->info = info;
thread->chan = dtc->chan;
thread->type = type;
thread->test_done.wait = &thread->done_wait;
init_waitqueue_head(&thread->done_wait);
smp_wmb();
thread->task = kthread_create(dmatest_func, thread, "%s-%s%u",
dma_chan_name(chan), op, i);

View File

@ -863,11 +863,11 @@ static void fsl_edma_irq_exit(
}
}
static void fsl_disable_clocks(struct fsl_edma_engine *fsl_edma)
static void fsl_disable_clocks(struct fsl_edma_engine *fsl_edma, int nr_clocks)
{
int i;
for (i = 0; i < DMAMUX_NR; i++)
for (i = 0; i < nr_clocks; i++)
clk_disable_unprepare(fsl_edma->muxclk[i]);
}
@ -904,25 +904,25 @@ static int fsl_edma_probe(struct platform_device *pdev)
res = platform_get_resource(pdev, IORESOURCE_MEM, 1 + i);
fsl_edma->muxbase[i] = devm_ioremap_resource(&pdev->dev, res);
if (IS_ERR(fsl_edma->muxbase[i]))
if (IS_ERR(fsl_edma->muxbase[i])) {
/* on error: disable all previously enabled clks */
fsl_disable_clocks(fsl_edma, i);
return PTR_ERR(fsl_edma->muxbase[i]);
}
sprintf(clkname, "dmamux%d", i);
fsl_edma->muxclk[i] = devm_clk_get(&pdev->dev, clkname);
if (IS_ERR(fsl_edma->muxclk[i])) {
dev_err(&pdev->dev, "Missing DMAMUX block clock.\n");
/* on error: disable all previously enabled clks */
fsl_disable_clocks(fsl_edma, i);
return PTR_ERR(fsl_edma->muxclk[i]);
}
ret = clk_prepare_enable(fsl_edma->muxclk[i]);
if (ret) {
/* disable only clks which were enabled on error */
for (; i >= 0; i--)
clk_disable_unprepare(fsl_edma->muxclk[i]);
dev_err(&pdev->dev, "DMAMUX clk block failed.\n");
return ret;
}
if (ret)
/* on error: disable all previously enabled clks */
fsl_disable_clocks(fsl_edma, i);
}
@ -976,7 +976,7 @@ static int fsl_edma_probe(struct platform_device *pdev)
if (ret) {
dev_err(&pdev->dev,
"Can't register Freescale eDMA engine. (%d)\n", ret);
fsl_disable_clocks(fsl_edma);
fsl_disable_clocks(fsl_edma, DMAMUX_NR);
return ret;
}
@ -985,7 +985,7 @@ static int fsl_edma_probe(struct platform_device *pdev)
dev_err(&pdev->dev,
"Can't register Freescale eDMA of_dma. (%d)\n", ret);
dma_async_device_unregister(&fsl_edma->dma_dev);
fsl_disable_clocks(fsl_edma);
fsl_disable_clocks(fsl_edma, DMAMUX_NR);
return ret;
}
@ -1015,7 +1015,7 @@ static int fsl_edma_remove(struct platform_device *pdev)
fsl_edma_cleanup_vchan(&fsl_edma->dma_dev);
of_dma_controller_free(np);
dma_async_device_unregister(&fsl_edma->dma_dev);
fsl_disable_clocks(fsl_edma);
fsl_disable_clocks(fsl_edma, DMAMUX_NR);
return 0;
}

View File

@ -390,7 +390,7 @@ static int ioat_dma_self_test(struct ioatdma_device *ioat_dma)
if (memcmp(src, dest, IOAT_TEST_SIZE)) {
dev_err(dev, "Self-test copy failed compare, disabling\n");
err = -ENODEV;
goto free_resources;
goto unmap_dma;
}
unmap_dma:

View File

@ -377,6 +377,7 @@ static int cros_ec_pkt_xfer_spi(struct cros_ec_device *ec_dev,
u8 *ptr;
u8 *rx_buf;
u8 sum;
u8 rx_byte;
int ret = 0, final_ret;
len = cros_ec_prepare_tx(ec_dev, ec_msg);
@ -421,24 +422,21 @@ static int cros_ec_pkt_xfer_spi(struct cros_ec_device *ec_dev,
if (!ret) {
/* Verify that EC can process command */
for (i = 0; i < len; i++) {
switch (rx_buf[i]) {
case EC_SPI_PAST_END:
case EC_SPI_RX_BAD_DATA:
case EC_SPI_NOT_READY:
ret = -EAGAIN;
ec_msg->result = EC_RES_IN_PROGRESS;
default:
rx_byte = rx_buf[i];
if (rx_byte == EC_SPI_PAST_END ||
rx_byte == EC_SPI_RX_BAD_DATA ||
rx_byte == EC_SPI_NOT_READY) {
ret = -EREMOTEIO;
break;
}
if (ret)
break;
}
}
if (!ret)
ret = cros_ec_spi_receive_packet(ec_dev,
ec_msg->insize + sizeof(*response));
} else {
else
dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
}
final_ret = terminate_request(ec_dev);
@ -508,6 +506,7 @@ static int cros_ec_cmd_xfer_spi(struct cros_ec_device *ec_dev,
int i, len;
u8 *ptr;
u8 *rx_buf;
u8 rx_byte;
int sum;
int ret = 0, final_ret;
@ -544,24 +543,21 @@ static int cros_ec_cmd_xfer_spi(struct cros_ec_device *ec_dev,
if (!ret) {
/* Verify that EC can process command */
for (i = 0; i < len; i++) {
switch (rx_buf[i]) {
case EC_SPI_PAST_END:
case EC_SPI_RX_BAD_DATA:
case EC_SPI_NOT_READY:
ret = -EAGAIN;
ec_msg->result = EC_RES_IN_PROGRESS;
default:
rx_byte = rx_buf[i];
if (rx_byte == EC_SPI_PAST_END ||
rx_byte == EC_SPI_RX_BAD_DATA ||
rx_byte == EC_SPI_NOT_READY) {
ret = -EREMOTEIO;
break;
}
if (ret)
break;
}
}
if (!ret)
ret = cros_ec_spi_receive_response(ec_dev,
ec_msg->insize + EC_MSG_TX_PROTO_BYTES);
} else {
else
dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
}
final_ret = terminate_request(ec_dev);
@ -667,6 +663,7 @@ static int cros_ec_spi_probe(struct spi_device *spi)
sizeof(struct ec_response_get_protocol_info);
ec_dev->dout_size = sizeof(struct ec_host_request);
ec_spi->last_transfer_ns = ktime_get_ns();
err = cros_ec_register(ec_dev);
if (err) {

View File

@ -159,13 +159,18 @@ unsigned int twl4030_audio_get_mclk(void)
EXPORT_SYMBOL_GPL(twl4030_audio_get_mclk);
static bool twl4030_audio_has_codec(struct twl4030_audio_data *pdata,
struct device_node *node)
struct device_node *parent)
{
struct device_node *node;
if (pdata && pdata->codec)
return true;
if (of_find_node_by_name(node, "codec"))
node = of_get_child_by_name(parent, "codec");
if (node) {
of_node_put(node);
return true;
}
return false;
}

View File

@ -97,12 +97,16 @@ static struct reg_sequence twl6040_patch[] = {
};
static bool twl6040_has_vibra(struct device_node *node)
static bool twl6040_has_vibra(struct device_node *parent)
{
#ifdef CONFIG_OF
if (of_find_node_by_name(node, "vibra"))
struct device_node *node;
node = of_get_child_by_name(parent, "vibra");
if (node) {
of_node_put(node);
return true;
#endif
}
return false;
}

View File

@ -32,7 +32,7 @@
#include <linux/pci.h>
#include <linux/mutex.h>
#include <linux/miscdevice.h>
#include <linux/pti.h>
#include <linux/intel-pti.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

View File

@ -1114,7 +1114,7 @@ static int mtd_check_oob_ops(struct mtd_info *mtd, loff_t offs,
if (!ops->oobbuf)
ops->ooblen = 0;
if (offs < 0 || offs + ops->len >= mtd->size)
if (offs < 0 || offs + ops->len > mtd->size)
return -EINVAL;
if (ops->ooblen) {

View File

@ -1763,7 +1763,7 @@ static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip,
err = brcmstb_nand_verify_erased_page(mtd, chip, buf,
addr);
/* erased page bitflips corrected */
if (err > 0)
if (err >= 0)
return err;
}

View File

@ -253,9 +253,9 @@ static int gpio_nand_probe(struct platform_device *pdev)
goto out_ce;
}
gpiomtd->nwp = devm_gpiod_get(dev, "ale", GPIOD_OUT_LOW);
if (IS_ERR(gpiomtd->nwp)) {
ret = PTR_ERR(gpiomtd->nwp);
gpiomtd->ale = devm_gpiod_get(dev, "ale", GPIOD_OUT_LOW);
if (IS_ERR(gpiomtd->ale)) {
ret = PTR_ERR(gpiomtd->ale);
goto out_ce;
}

View File

@ -1067,9 +1067,6 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
return ret;
}
/* handle the block mark swapping */
block_mark_swapping(this, payload_virt, auxiliary_virt);
/* Loop over status bytes, accumulating ECC status. */
status = auxiliary_virt + nfc_geo->auxiliary_status_offset;
@ -1158,6 +1155,9 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
max_bitflips = max_t(unsigned int, max_bitflips, *status);
}
/* handle the block mark swapping */
block_mark_swapping(this, buf, auxiliary_virt);
if (oob_required) {
/*
* It's time to deliver the OOB bytes. See gpmi_ecc_read_oob()

View File

@ -159,6 +159,8 @@ struct arc_emac_priv {
unsigned int link;
unsigned int duplex;
unsigned int speed;
unsigned int rx_missed_errors;
};
/**

View File

@ -26,6 +26,8 @@
#include "emac.h"
static void arc_emac_restart(struct net_device *ndev);
/**
* arc_emac_tx_avail - Return the number of available slots in the tx ring.
* @priv: Pointer to ARC EMAC private data structure.
@ -210,39 +212,48 @@ static int arc_emac_rx(struct net_device *ndev, int budget)
continue;
}
pktlen = info & LEN_MASK;
stats->rx_packets++;
stats->rx_bytes += pktlen;
skb = rx_buff->skb;
skb_put(skb, pktlen);
skb->dev = ndev;
skb->protocol = eth_type_trans(skb, ndev);
dma_unmap_single(&ndev->dev, dma_unmap_addr(rx_buff, addr),
dma_unmap_len(rx_buff, len), DMA_FROM_DEVICE);
/* Prepare the BD for next cycle */
rx_buff->skb = netdev_alloc_skb_ip_align(ndev,
EMAC_BUFFER_SIZE);
if (unlikely(!rx_buff->skb)) {
/* Prepare the BD for next cycle. netif_receive_skb()
* only if new skb was allocated and mapped to avoid holes
* in the RX fifo.
*/
skb = netdev_alloc_skb_ip_align(ndev, EMAC_BUFFER_SIZE);
if (unlikely(!skb)) {
if (net_ratelimit())
netdev_err(ndev, "cannot allocate skb\n");
/* Return ownership to EMAC */
rxbd->info = cpu_to_le32(FOR_EMAC | EMAC_BUFFER_SIZE);
stats->rx_errors++;
/* Because receive_skb is below, increment rx_dropped */
stats->rx_dropped++;
continue;
}
/* receive_skb only if new skb was allocated to avoid holes */
netif_receive_skb(skb);
addr = dma_map_single(&ndev->dev, (void *)rx_buff->skb->data,
addr = dma_map_single(&ndev->dev, (void *)skb->data,
EMAC_BUFFER_SIZE, DMA_FROM_DEVICE);
if (dma_mapping_error(&ndev->dev, addr)) {
if (net_ratelimit())
netdev_err(ndev, "cannot dma map\n");
dev_kfree_skb(rx_buff->skb);
netdev_err(ndev, "cannot map dma buffer\n");
dev_kfree_skb(skb);
/* Return ownership to EMAC */
rxbd->info = cpu_to_le32(FOR_EMAC | EMAC_BUFFER_SIZE);
stats->rx_errors++;
stats->rx_dropped++;
continue;
}
/* unmap previosly mapped skb */
dma_unmap_single(&ndev->dev, dma_unmap_addr(rx_buff, addr),
dma_unmap_len(rx_buff, len), DMA_FROM_DEVICE);
pktlen = info & LEN_MASK;
stats->rx_packets++;
stats->rx_bytes += pktlen;
skb_put(rx_buff->skb, pktlen);
rx_buff->skb->dev = ndev;
rx_buff->skb->protocol = eth_type_trans(rx_buff->skb, ndev);
netif_receive_skb(rx_buff->skb);
rx_buff->skb = skb;
dma_unmap_addr_set(rx_buff, addr, addr);
dma_unmap_len_set(rx_buff, len, EMAC_BUFFER_SIZE);
@ -258,6 +269,53 @@ static int arc_emac_rx(struct net_device *ndev, int budget)
return work_done;
}
/**
* arc_emac_rx_miss_handle - handle R_MISS register
* @ndev: Pointer to the net_device structure.
*/
static void arc_emac_rx_miss_handle(struct net_device *ndev)
{
struct arc_emac_priv *priv = netdev_priv(ndev);
struct net_device_stats *stats = &ndev->stats;
unsigned int miss;
miss = arc_reg_get(priv, R_MISS);
if (miss) {
stats->rx_errors += miss;
stats->rx_missed_errors += miss;
priv->rx_missed_errors += miss;
}
}
/**
* arc_emac_rx_stall_check - check RX stall
* @ndev: Pointer to the net_device structure.
* @budget: How many BDs requested to process on 1 call.
* @work_done: How many BDs processed
*
* Under certain conditions EMAC stop reception of incoming packets and
* continuously increment R_MISS register instead of saving data into
* provided buffer. This function detect that condition and restart
* EMAC.
*/
static void arc_emac_rx_stall_check(struct net_device *ndev,
int budget, unsigned int work_done)
{
struct arc_emac_priv *priv = netdev_priv(ndev);
struct arc_emac_bd *rxbd;
if (work_done)
priv->rx_missed_errors = 0;
if (priv->rx_missed_errors && budget) {
rxbd = &priv->rxbd[priv->last_rx_bd];
if (le32_to_cpu(rxbd->info) & FOR_EMAC) {
arc_emac_restart(ndev);
priv->rx_missed_errors = 0;
}
}
}
/**
* arc_emac_poll - NAPI poll handler.
* @napi: Pointer to napi_struct structure.
@ -272,6 +330,7 @@ static int arc_emac_poll(struct napi_struct *napi, int budget)
unsigned int work_done;
arc_emac_tx_clean(ndev);
arc_emac_rx_miss_handle(ndev);
work_done = arc_emac_rx(ndev, budget);
if (work_done < budget) {
@ -279,6 +338,8 @@ static int arc_emac_poll(struct napi_struct *napi, int budget)
arc_reg_or(priv, R_ENABLE, RXINT_MASK | TXINT_MASK);
}
arc_emac_rx_stall_check(ndev, budget, work_done);
return work_done;
}
@ -320,6 +381,8 @@ static irqreturn_t arc_emac_intr(int irq, void *dev_instance)
if (status & MSER_MASK) {
stats->rx_missed_errors += 0x100;
stats->rx_errors += 0x100;
priv->rx_missed_errors += 0x100;
napi_schedule(&priv->napi);
}
if (status & RXCR_MASK) {
@ -732,6 +795,63 @@ static int arc_emac_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
}
/**
* arc_emac_restart - Restart EMAC
* @ndev: Pointer to net_device structure.
*
* This function do hardware reset of EMAC in order to restore
* network packets reception.
*/
static void arc_emac_restart(struct net_device *ndev)
{
struct arc_emac_priv *priv = netdev_priv(ndev);
struct net_device_stats *stats = &ndev->stats;
int i;
if (net_ratelimit())
netdev_warn(ndev, "restarting stalled EMAC\n");
netif_stop_queue(ndev);
/* Disable interrupts */
arc_reg_clr(priv, R_ENABLE, RXINT_MASK | TXINT_MASK | ERR_MASK);
/* Disable EMAC */
arc_reg_clr(priv, R_CTRL, EN_MASK);
/* Return the sk_buff to system */
arc_free_tx_queue(ndev);
/* Clean Tx BD's */
priv->txbd_curr = 0;
priv->txbd_dirty = 0;
memset(priv->txbd, 0, TX_RING_SZ);
for (i = 0; i < RX_BD_NUM; i++) {
struct arc_emac_bd *rxbd = &priv->rxbd[i];
unsigned int info = le32_to_cpu(rxbd->info);
if (!(info & FOR_EMAC)) {
stats->rx_errors++;
stats->rx_dropped++;
}
/* Return ownership to EMAC */
rxbd->info = cpu_to_le32(FOR_EMAC | EMAC_BUFFER_SIZE);
}
priv->last_rx_bd = 0;
/* Make sure info is visible to EMAC before enable */
wmb();
/* Enable interrupts */
arc_reg_set(priv, R_ENABLE, RXINT_MASK | TXINT_MASK | ERR_MASK);
/* Enable EMAC */
arc_reg_or(priv, R_CTRL, EN_MASK);
netif_start_queue(ndev);
}
static const struct net_device_ops arc_emac_netdev_ops = {
.ndo_open = arc_emac_open,
.ndo_stop = arc_emac_stop,

View File

@ -14225,7 +14225,9 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu)
/* Reset PHY, otherwise the read DMA engine will be in a mode that
* breaks all requests to 256 bytes.
*/
if (tg3_asic_rev(tp) == ASIC_REV_57766)
if (tg3_asic_rev(tp) == ASIC_REV_57766 ||
tg3_asic_rev(tp) == ASIC_REV_5717 ||
tg3_asic_rev(tp) == ASIC_REV_5719)
reset_phy = true;
err = tg3_restart_hw(tp, reset_phy);

View File

@ -1214,6 +1214,10 @@ static void mvneta_port_disable(struct mvneta_port *pp)
val &= ~MVNETA_GMAC0_PORT_ENABLE;
mvreg_write(pp, MVNETA_GMAC_CTRL_0, val);
pp->link = 0;
pp->duplex = -1;
pp->speed = 0;
udelay(200);
}
@ -1958,9 +1962,9 @@ static int mvneta_rx_swbm(struct mvneta_port *pp, int rx_todo,
if (!mvneta_rxq_desc_is_first_last(rx_status) ||
(rx_status & MVNETA_RXD_ERR_SUMMARY)) {
mvneta_rx_error(pp, rx_desc);
err_drop_frame:
dev->stats.rx_errors++;
mvneta_rx_error(pp, rx_desc);
/* leave the descriptor untouched */
continue;
}
@ -3011,7 +3015,7 @@ static void mvneta_cleanup_rxqs(struct mvneta_port *pp)
{
int queue;
for (queue = 0; queue < txq_number; queue++)
for (queue = 0; queue < rxq_number; queue++)
mvneta_rxq_deinit(pp, &pp->rxqs[queue]);
}

View File

@ -1961,11 +1961,12 @@ static int mtk_hw_init(struct mtk_eth *eth)
/* set GE2 TUNE */
regmap_write(eth->pctl, GPIO_BIAS_CTRL, 0x0);
/* GE1, Force 1000M/FD, FC ON */
mtk_w32(eth, MAC_MCR_FIXED_LINK, MTK_MAC_MCR(0));
/* GE2, Force 1000M/FD, FC ON */
mtk_w32(eth, MAC_MCR_FIXED_LINK, MTK_MAC_MCR(1));
/* Set linkdown as the default for each GMAC. Its own MCR would be set
* up with the more appropriate value when mtk_phy_link_adjust call is
* being invoked.
*/
for (i = 0; i < MTK_MAC_COUNT; i++)
mtk_w32(eth, 0, MTK_MAC_MCR(i));
/* Indicates CDM to parse the MTK special tag from CPU
* which also is working out for untag packets.

View File

@ -362,7 +362,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
case MLX5_CMD_OP_QUERY_VPORT_COUNTER:
case MLX5_CMD_OP_ALLOC_Q_COUNTER:
case MLX5_CMD_OP_QUERY_Q_COUNTER:
case MLX5_CMD_OP_SET_RATE_LIMIT:
case MLX5_CMD_OP_SET_PP_RATE_LIMIT:
case MLX5_CMD_OP_QUERY_RATE_LIMIT:
case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT:
@ -505,7 +505,7 @@ const char *mlx5_command_str(int command)
MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER);
MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER);
MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER);
MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT);
MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT);
MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT);
MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT);
MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT);

View File

@ -82,6 +82,9 @@
max_t(u32, MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev), req)
#define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev) MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 6)
#define MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 8)
#define MLX5E_MPWQE_STRIDE_SZ(mdev, cqe_cmprs) \
(cqe_cmprs ? MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) : \
MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev))
#define MLX5_MPWRQ_LOG_WQE_SZ 18
#define MLX5_MPWRQ_WQE_PAGE_ORDER (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \
@ -590,6 +593,7 @@ struct mlx5e_channel {
struct mlx5_core_dev *mdev;
struct hwtstamp_config *tstamp;
int ix;
int cpu;
};
struct mlx5e_channels {
@ -935,8 +939,9 @@ void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params,
u8 cq_period_mode);
void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params,
u8 cq_period_mode);
void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
struct mlx5e_params *params, u8 rq_type);
void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
struct mlx5e_params *params,
u8 rq_type);
static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
{

View File

@ -274,6 +274,7 @@ int mlx5e_dcbnl_ieee_setets_core(struct mlx5e_priv *priv, struct ieee_ets *ets)
static int mlx5e_dbcnl_validate_ets(struct net_device *netdev,
struct ieee_ets *ets)
{
bool have_ets_tc = false;
int bw_sum = 0;
int i;
@ -288,11 +289,14 @@ static int mlx5e_dbcnl_validate_ets(struct net_device *netdev,
}
/* Validate Bandwidth Sum */
for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS)
for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) {
have_ets_tc = true;
bw_sum += ets->tc_tx_bw[i];
}
}
if (bw_sum != 0 && bw_sum != 100) {
if (have_ets_tc && bw_sum != 100) {
netdev_err(netdev,
"Failed to validate ETS: BW sum is illegal\n");
return -EINVAL;

View File

@ -1523,8 +1523,10 @@ int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool new_val
new_channels.params = priv->channels.params;
MLX5E_SET_PFLAG(&new_channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS, new_val);
mlx5e_set_rq_type_params(priv->mdev, &new_channels.params,
new_channels.params.rq_wq_type);
new_channels.params.mpwqe_log_stride_sz =
MLX5E_MPWQE_STRIDE_SZ(priv->mdev, new_val);
new_channels.params.mpwqe_log_num_strides =
MLX5_MPWRQ_LOG_WQE_SZ - new_channels.params.mpwqe_log_stride_sz;
if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
priv->channels.params = new_channels.params;
@ -1536,6 +1538,10 @@ int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool new_val
return err;
mlx5e_switch_priv_channels(priv, &new_channels, NULL);
mlx5e_dbg(DRV, priv, "MLX5E: RxCqeCmprss was turned %s\n",
MLX5E_GET_PFLAG(&priv->channels.params,
MLX5E_PFLAG_RX_CQE_COMPRESS) ? "ON" : "OFF");
return 0;
}

View File

@ -71,11 +71,6 @@ struct mlx5e_channel_param {
struct mlx5e_cq_param icosq_cq;
};
static int mlx5e_get_node(struct mlx5e_priv *priv, int ix)
{
return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix);
}
static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
{
return MLX5_CAP_GEN(mdev, striding_rq) &&
@ -83,7 +78,7 @@ static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
MLX5_CAP_ETH(mdev, reg_umr_sq);
}
void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
struct mlx5e_params *params, u8 rq_type)
{
params->rq_wq_type = rq_type;
@ -93,10 +88,8 @@ void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
params->log_rq_size = is_kdump_kernel() ?
MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW :
MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW;
params->mpwqe_log_stride_sz =
MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS) ?
MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) :
MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev);
params->mpwqe_log_stride_sz = MLX5E_MPWQE_STRIDE_SZ(mdev,
MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
params->mpwqe_log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ -
params->mpwqe_log_stride_sz;
break;
@ -120,13 +113,14 @@ void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
}
static void mlx5e_set_rq_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params)
static void mlx5e_set_rq_params(struct mlx5_core_dev *mdev,
struct mlx5e_params *params)
{
u8 rq_type = mlx5e_check_fragmented_striding_rq_cap(mdev) &&
!params->xdp_prog && !MLX5_IPSEC_DEV(mdev) ?
MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ :
MLX5_WQ_TYPE_LINKED_LIST;
mlx5e_set_rq_type_params(mdev, params, rq_type);
mlx5e_init_rq_type_params(mdev, params, rq_type);
}
static void mlx5e_update_carrier(struct mlx5e_priv *priv)
@ -444,17 +438,16 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
int mtt_sz = mlx5e_get_wqe_mtt_sz();
int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1;
int node = mlx5e_get_node(c->priv, c->ix);
int i;
rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
GFP_KERNEL, node);
GFP_KERNEL, cpu_to_node(c->cpu));
if (!rq->mpwqe.info)
goto err_out;
/* We allocate more than mtt_sz as we will align the pointer */
rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz,
GFP_KERNEL, node);
rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL,
cpu_to_node(c->cpu));
if (unlikely(!rq->mpwqe.mtt_no_align))
goto err_free_wqe_info;
@ -562,7 +555,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
int err;
int i;
rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
rqp->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
&rq->wq_ctrl);
@ -629,8 +622,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
default: /* MLX5_WQ_TYPE_LINKED_LIST */
rq->wqe.frag_info =
kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
GFP_KERNEL,
mlx5e_get_node(c->priv, c->ix));
GFP_KERNEL, cpu_to_node(c->cpu));
if (!rq->wqe.frag_info) {
err = -ENOMEM;
goto err_rq_wq_destroy;
@ -1000,13 +992,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
sq->uar_map = mdev->mlx5e_res.bfreg.map;
sq->min_inline_mode = params->tx_min_inline_mode;
param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
param->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
if (err)
return err;
sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix));
err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu));
if (err)
goto err_sq_wq_destroy;
@ -1053,13 +1045,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
sq->channel = c;
sq->uar_map = mdev->mlx5e_res.bfreg.map;
param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
param->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
if (err)
return err;
sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix));
err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu));
if (err)
goto err_sq_wq_destroy;
@ -1126,13 +1118,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
if (MLX5_IPSEC_DEV(c->priv->mdev))
set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
param->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
if (err)
return err;
sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix));
err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu));
if (err)
goto err_sq_wq_destroy;
@ -1504,8 +1496,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c,
struct mlx5_core_dev *mdev = c->priv->mdev;
int err;
param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix);
param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
param->wq.buf_numa_node = cpu_to_node(c->cpu);
param->wq.db_numa_node = cpu_to_node(c->cpu);
param->eq_ix = c->ix;
err = mlx5e_alloc_cq_common(mdev, param, cq);
@ -1604,6 +1596,11 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq)
mlx5e_free_cq(cq);
}
static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
{
return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
}
static int mlx5e_open_tx_cqs(struct mlx5e_channel *c,
struct mlx5e_params *params,
struct mlx5e_channel_param *cparam)
@ -1752,12 +1749,13 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
{
struct mlx5e_cq_moder icocq_moder = {0, 0};
struct net_device *netdev = priv->netdev;
int cpu = mlx5e_get_cpu(priv, ix);
struct mlx5e_channel *c;
unsigned int irq;
int err;
int eqn;
c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix));
c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
if (!c)
return -ENOMEM;
@ -1765,6 +1763,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
c->mdev = priv->mdev;
c->tstamp = &priv->tstamp;
c->ix = ix;
c->cpu = cpu;
c->pdev = &priv->mdev->pdev->dev;
c->netdev = priv->netdev;
c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
@ -1853,8 +1852,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c)
for (tc = 0; tc < c->num_tc; tc++)
mlx5e_activate_txqsq(&c->sq[tc]);
mlx5e_activate_rq(&c->rq);
netif_set_xps_queue(c->netdev,
mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix);
netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix);
}
static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
@ -3679,6 +3677,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
struct sk_buff *skb,
netdev_features_t features)
{
unsigned int offset = 0;
struct udphdr *udph;
u8 proto;
u16 port;
@ -3688,7 +3687,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
proto = ip_hdr(skb)->protocol;
break;
case htons(ETH_P_IPV6):
proto = ipv6_hdr(skb)->nexthdr;
proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL);
break;
default:
goto out;

View File

@ -466,7 +466,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
break;
case MLX5_EVENT_TYPE_CQ_ERROR:
cqn = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
mlx5_core_warn(dev, "CQ error on CQN 0x%x, syndrom 0x%x\n",
mlx5_core_warn(dev, "CQ error on CQN 0x%x, syndrome 0x%x\n",
cqn, eqe->data.cq_err.syndrome);
mlx5_cq_event(dev, cqn, eqe->type);
break;
@ -775,7 +775,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
return err;
}
int mlx5_stop_eqs(struct mlx5_core_dev *dev)
void mlx5_stop_eqs(struct mlx5_core_dev *dev)
{
struct mlx5_eq_table *table = &dev->priv.eq_table;
int err;
@ -784,22 +784,26 @@ int mlx5_stop_eqs(struct mlx5_core_dev *dev)
if (MLX5_CAP_GEN(dev, pg)) {
err = mlx5_destroy_unmap_eq(dev, &table->pfault_eq);
if (err)
return err;
mlx5_core_err(dev, "failed to destroy page fault eq, err(%d)\n",
err);
}
#endif
err = mlx5_destroy_unmap_eq(dev, &table->pages_eq);
if (err)
return err;
mlx5_core_err(dev, "failed to destroy pages eq, err(%d)\n",
err);
mlx5_destroy_unmap_eq(dev, &table->async_eq);
err = mlx5_destroy_unmap_eq(dev, &table->async_eq);
if (err)
mlx5_core_err(dev, "failed to destroy async eq, err(%d)\n",
err);
mlx5_cmd_use_polling(dev);
err = mlx5_destroy_unmap_eq(dev, &table->cmd_eq);
if (err)
mlx5_cmd_use_events(dev);
return err;
mlx5_core_err(dev, "failed to destroy command eq, err(%d)\n",
err);
}
int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,

View File

@ -66,6 +66,9 @@ static int mlx5_fpga_mem_read_i2c(struct mlx5_fpga_device *fdev, size_t size,
u8 actual_size;
int err;
if (!size)
return -EINVAL;
if (!fdev->mdev)
return -ENOTCONN;
@ -95,6 +98,9 @@ static int mlx5_fpga_mem_write_i2c(struct mlx5_fpga_device *fdev, size_t size,
u8 actual_size;
int err;
if (!size)
return -EINVAL;
if (!fdev->mdev)
return -ENOTCONN;

View File

@ -174,6 +174,8 @@ static void del_hw_fte(struct fs_node *node);
static void del_sw_flow_table(struct fs_node *node);
static void del_sw_flow_group(struct fs_node *node);
static void del_sw_fte(struct fs_node *node);
static void del_sw_prio(struct fs_node *node);
static void del_sw_ns(struct fs_node *node);
/* Delete rule (destination) is special case that
* requires to lock the FTE for all the deletion process.
*/
@ -408,6 +410,16 @@ static inline struct mlx5_core_dev *get_dev(struct fs_node *node)
return NULL;
}
static void del_sw_ns(struct fs_node *node)
{
kfree(node);
}
static void del_sw_prio(struct fs_node *node)
{
kfree(node);
}
static void del_hw_flow_table(struct fs_node *node)
{
struct mlx5_flow_table *ft;
@ -2064,7 +2076,7 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
return ERR_PTR(-ENOMEM);
fs_prio->node.type = FS_TYPE_PRIO;
tree_init_node(&fs_prio->node, NULL, NULL);
tree_init_node(&fs_prio->node, NULL, del_sw_prio);
tree_add_node(&fs_prio->node, &ns->node);
fs_prio->num_levels = num_levels;
fs_prio->prio = prio;
@ -2090,7 +2102,7 @@ static struct mlx5_flow_namespace *fs_create_namespace(struct fs_prio *prio)
return ERR_PTR(-ENOMEM);
fs_init_namespace(ns);
tree_init_node(&ns->node, NULL, NULL);
tree_init_node(&ns->node, NULL, del_sw_ns);
tree_add_node(&ns->node, &prio->node);
list_add_tail(&ns->node.list, &prio->node.children);

View File

@ -241,7 +241,7 @@ static void print_health_info(struct mlx5_core_dev *dev)
u32 fw;
int i;
/* If the syndrom is 0, the device is OK and no need to print buffer */
/* If the syndrome is 0, the device is OK and no need to print buffer */
if (!ioread8(&h->synd))
return;

View File

@ -57,7 +57,7 @@ static void mlx5i_build_nic_params(struct mlx5_core_dev *mdev,
struct mlx5e_params *params)
{
/* Override RQ params as IPoIB supports only LINKED LIST RQ for now */
mlx5e_set_rq_type_params(mdev, params, MLX5_WQ_TYPE_LINKED_LIST);
mlx5e_init_rq_type_params(mdev, params, MLX5_WQ_TYPE_LINKED_LIST);
/* RQ size in ipoib by default is 512 */
params->log_rq_size = is_kdump_kernel() ?

View File

@ -317,9 +317,6 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
{
struct mlx5_priv *priv = &dev->priv;
struct mlx5_eq_table *table = &priv->eq_table;
struct irq_affinity irqdesc = {
.pre_vectors = MLX5_EQ_VEC_COMP_BASE,
};
int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq);
int nvec;
@ -333,10 +330,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
if (!priv->irq_info)
goto err_free_msix;
nvec = pci_alloc_irq_vectors_affinity(dev->pdev,
nvec = pci_alloc_irq_vectors(dev->pdev,
MLX5_EQ_VEC_COMP_BASE + 1, nvec,
PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
&irqdesc);
PCI_IRQ_MSIX);
if (nvec < 0)
return nvec;
@ -622,6 +618,63 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev)
return (u64)timer_l | (u64)timer_h1 << 32;
}
static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
{
struct mlx5_priv *priv = &mdev->priv;
int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) {
mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
return -ENOMEM;
}
cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node),
priv->irq_info[i].mask);
if (IS_ENABLED(CONFIG_SMP) &&
irq_set_affinity_hint(irq, priv->irq_info[i].mask))
mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq);
return 0;
}
static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i)
{
struct mlx5_priv *priv = &mdev->priv;
int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
irq_set_affinity_hint(irq, NULL);
free_cpumask_var(priv->irq_info[i].mask);
}
static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev)
{
int err;
int i;
for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) {
err = mlx5_irq_set_affinity_hint(mdev, i);
if (err)
goto err_out;
}
return 0;
err_out:
for (i--; i >= 0; i--)
mlx5_irq_clear_affinity_hint(mdev, i);
return err;
}
static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev)
{
int i;
for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++)
mlx5_irq_clear_affinity_hint(mdev, i);
}
int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
unsigned int *irqn)
{
@ -1097,6 +1150,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
goto err_stop_eqs;
}
err = mlx5_irq_set_affinity_hints(dev);
if (err) {
dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
goto err_affinity_hints;
}
err = mlx5_init_fs(dev);
if (err) {
dev_err(&pdev->dev, "Failed to init flow steering\n");
@ -1154,6 +1213,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
mlx5_cleanup_fs(dev);
err_fs:
mlx5_irq_clear_affinity_hints(dev);
err_affinity_hints:
free_comp_eqs(dev);
err_stop_eqs:
@ -1222,6 +1284,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
mlx5_sriov_detach(dev);
mlx5_cleanup_fs(dev);
mlx5_irq_clear_affinity_hints(dev);
free_comp_eqs(dev);
mlx5_stop_eqs(dev);
mlx5_put_uars_page(dev, priv->uar);

View File

@ -213,8 +213,8 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
err_cmd:
memset(din, 0, sizeof(din));
memset(dout, 0, sizeof(dout));
MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP);
MLX5_SET(destroy_qp_in, din, qpn, qp->qpn);
mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout));
return err;
}

Some files were not shown because too many files have changed in this diff Show More