xhci: Rework how we handle unresponsive or hoptlug removed hosts

Introduce a new xhci_hc_died() function that takes care of handling
pending commands and URBs if a host controller becomes unresponsive.

This addresses issues on hotpluggable xhci controllers that disappear
from the bus suddenly, often while the bus (PCI) remove function is
still being processed.

xhci_hc_died() sets a XHCI_STATUS_DYING flag to prevent new URBs and
commands or to be queued. The flag also ensures xhci_hc_died() will
give back pending commands and URBs once.

Host is considered dead if register read returns 0xffffffff, or host
fails to abort the command ring, or fails stopping an endpoint after
trying for 5 seconds.

Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
Mathias Nyman 2017-04-07 17:57:01 +03:00 committed by Greg Kroah-Hartman
parent fe190ed0d6
commit d9f11ba9f1
4 changed files with 80 additions and 67 deletions

View File

@ -1050,7 +1050,8 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
goto error; goto error;
wIndex--; wIndex--;
temp = readl(port_array[wIndex]); temp = readl(port_array[wIndex]);
if (temp == 0xffffffff) { if (temp == ~(u32)0) {
xhci_hc_died(xhci);
retval = -ENODEV; retval = -ENODEV;
break; break;
} }
@ -1092,7 +1093,8 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
goto error; goto error;
wIndex--; wIndex--;
temp = readl(port_array[wIndex]); temp = readl(port_array[wIndex]);
if (temp == 0xffffffff) { if (temp == ~(u32)0) {
xhci_hc_died(xhci);
retval = -ENODEV; retval = -ENODEV;
break; break;
} }
@ -1267,7 +1269,8 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
goto error; goto error;
wIndex--; wIndex--;
temp = readl(port_array[wIndex]); temp = readl(port_array[wIndex]);
if (temp == 0xffffffff) { if (temp == ~(u32)0) {
xhci_hc_died(xhci);
retval = -ENODEV; retval = -ENODEV;
break; break;
} }
@ -1378,7 +1381,8 @@ int xhci_hub_status_data(struct usb_hcd *hcd, char *buf)
/* For each port, did anything change? If so, set that bit in buf. */ /* For each port, did anything change? If so, set that bit in buf. */
for (i = 0; i < max_ports; i++) { for (i = 0; i < max_ports; i++) {
temp = readl(port_array[i]); temp = readl(port_array[i]);
if (temp == 0xffffffff) { if (temp == ~(u32)0) {
xhci_hc_died(xhci);
retval = -ENODEV; retval = -ENODEV;
break; break;
} }

View File

@ -359,21 +359,19 @@ static int xhci_abort_cmd_ring(struct xhci_hcd *xhci, unsigned long flags)
xhci_write_64(xhci, temp_64 | CMD_RING_ABORT, xhci_write_64(xhci, temp_64 | CMD_RING_ABORT,
&xhci->op_regs->cmd_ring); &xhci->op_regs->cmd_ring);
/* Section 4.6.1.2 of xHCI 1.0 spec says software should /* Section 4.6.1.2 of xHCI 1.0 spec says software should also time the
* time the completion od all xHCI commands, including * completion of the Command Abort operation. If CRR is not negated in 5
* the Command Abort operation. If software doesn't see * seconds then driver handles it as if host died (-ENODEV).
* CRR negated in a timely manner (e.g. longer than 5 * In the future we should distinguish between -ENODEV and -ETIMEDOUT
* seconds), then it should assume that the there are * and try to recover a -ETIMEDOUT with a host controller reset.
* larger problems with the xHC and assert HCRST.
*/ */
ret = xhci_handshake(&xhci->op_regs->cmd_ring, ret = xhci_handshake(&xhci->op_regs->cmd_ring,
CMD_RING_RUNNING, 0, 5 * 1000 * 1000); CMD_RING_RUNNING, 0, 5 * 1000 * 1000);
if (ret < 0) { if (ret < 0) {
xhci_err(xhci, xhci_err(xhci, "Abort failed to stop command ring: %d\n", ret);
"Stop command ring failed, maybe the host is dead\n");
xhci->xhc_state |= XHCI_STATE_DYING;
xhci_halt(xhci); xhci_halt(xhci);
return -ESHUTDOWN; xhci_hc_died(xhci);
return ret;
} }
/* /*
* Writing the CMD_RING_ABORT bit should cause a cmd completion event, * Writing the CMD_RING_ABORT bit should cause a cmd completion event,
@ -873,6 +871,40 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci,
} }
} }
/*
* host controller died, register read returns 0xffffffff
* Complete pending commands, mark them ABORTED.
* URBs need to be given back as usb core might be waiting with device locks
* held for the URBs to finish during device disconnect, blocking host remove.
*
* Call with xhci->lock held.
* lock is relased and re-acquired while giving back urb.
*/
void xhci_hc_died(struct xhci_hcd *xhci)
{
int i, j;
if (xhci->xhc_state & XHCI_STATE_DYING)
return;
xhci_err(xhci, "xHCI host controller not responding, assume dead\n");
xhci->xhc_state |= XHCI_STATE_DYING;
xhci_cleanup_command_queue(xhci);
/* return any pending urbs, remove may be waiting for them */
for (i = 0; i <= HCS_MAX_SLOTS(xhci->hcs_params1); i++) {
if (!xhci->devs[i])
continue;
for (j = 0; j < 31; j++)
xhci_kill_endpoint_urbs(xhci, i, j);
}
/* inform usb core hc died if PCI remove isn't already handling it */
if (!(xhci->xhc_state & XHCI_STATE_REMOVING))
usb_hc_died(xhci_to_hcd(xhci));
}
/* Watchdog timer function for when a stop endpoint command fails to complete. /* Watchdog timer function for when a stop endpoint command fails to complete.
* In this case, we assume the host controller is broken or dying or dead. The * In this case, we assume the host controller is broken or dying or dead. The
* host may still be completing some other events, so we have to be careful to * host may still be completing some other events, so we have to be careful to
@ -894,7 +926,6 @@ void xhci_stop_endpoint_command_watchdog(unsigned long arg)
{ {
struct xhci_hcd *xhci; struct xhci_hcd *xhci;
struct xhci_virt_ep *ep; struct xhci_virt_ep *ep;
int ret, i, j;
unsigned long flags; unsigned long flags;
ep = (struct xhci_virt_ep *) arg; ep = (struct xhci_virt_ep *) arg;
@ -911,52 +942,22 @@ void xhci_stop_endpoint_command_watchdog(unsigned long arg)
} }
xhci_warn(xhci, "xHCI host not responding to stop endpoint command.\n"); xhci_warn(xhci, "xHCI host not responding to stop endpoint command.\n");
xhci_warn(xhci, "Assuming host is dying, halting host.\n");
/* Oops, HC is dead or dying or at least not responding to the stop
* endpoint command.
*/
xhci->xhc_state |= XHCI_STATE_DYING;
ep->ep_state &= ~EP_STOP_CMD_PENDING; ep->ep_state &= ~EP_STOP_CMD_PENDING;
/* Disable interrupts from the host controller and start halting it */ xhci_halt(xhci);
xhci_quiesce(xhci);
spin_unlock_irqrestore(&xhci->lock, flags);
ret = xhci_halt(xhci); /*
* handle a stop endpoint cmd timeout as if host died (-ENODEV).
* In the future we could distinguish between -ENODEV and -ETIMEDOUT
* and try to recover a -ETIMEDOUT with a host controller reset
*/
xhci_hc_died(xhci);
spin_lock_irqsave(&xhci->lock, flags);
if (ret < 0) {
/* This is bad; the host is not responding to commands and it's
* not allowing itself to be halted. At least interrupts are
* disabled. If we call usb_hc_died(), it will attempt to
* disconnect all device drivers under this host. Those
* disconnect() methods will wait for all URBs to be unlinked,
* so we must complete them.
*/
xhci_warn(xhci, "Non-responsive xHCI host is not halting.\n");
xhci_warn(xhci, "Completing active URBs anyway.\n");
/* We could turn all TDs on the rings to no-ops. This won't
* help if the host has cached part of the ring, and is slow if
* we want to preserve the cycle bit. Skip it and hope the host
* doesn't touch the memory.
*/
}
for (i = 0; i < MAX_HC_SLOTS; i++) {
if (!xhci->devs[i])
continue;
for (j = 0; j < 31; j++)
xhci_kill_endpoint_urbs(xhci, i, j);
}
spin_unlock_irqrestore(&xhci->lock, flags); spin_unlock_irqrestore(&xhci->lock, flags);
xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
"Calling usb_hc_died()");
usb_hc_died(xhci_to_hcd(xhci));
xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb, xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
"xHCI host controller is dead."); "xHCI host controller is dead.");
} }
static void update_ring_for_set_deq_completion(struct xhci_hcd *xhci, static void update_ring_for_set_deq_completion(struct xhci_hcd *xhci,
struct xhci_virt_device *dev, struct xhci_virt_device *dev,
struct xhci_ring *ep_ring, struct xhci_ring *ep_ring,
@ -1291,7 +1292,6 @@ void xhci_cleanup_command_queue(struct xhci_hcd *xhci)
void xhci_handle_command_timeout(struct work_struct *work) void xhci_handle_command_timeout(struct work_struct *work)
{ {
struct xhci_hcd *xhci; struct xhci_hcd *xhci;
int ret;
unsigned long flags; unsigned long flags;
u64 hw_ring_state; u64 hw_ring_state;
@ -1312,22 +1312,17 @@ void xhci_handle_command_timeout(struct work_struct *work)
/* Make sure command ring is running before aborting it */ /* Make sure command ring is running before aborting it */
hw_ring_state = xhci_read_64(xhci, &xhci->op_regs->cmd_ring); hw_ring_state = xhci_read_64(xhci, &xhci->op_regs->cmd_ring);
if (hw_ring_state == ~(u64)0) {
xhci_hc_died(xhci);
goto time_out_completed;
}
if ((xhci->cmd_ring_state & CMD_RING_STATE_RUNNING) && if ((xhci->cmd_ring_state & CMD_RING_STATE_RUNNING) &&
(hw_ring_state & CMD_RING_RUNNING)) { (hw_ring_state & CMD_RING_RUNNING)) {
/* Prevent new doorbell, and start command abort */ /* Prevent new doorbell, and start command abort */
xhci->cmd_ring_state = CMD_RING_STATE_ABORTED; xhci->cmd_ring_state = CMD_RING_STATE_ABORTED;
xhci_dbg(xhci, "Command timeout\n"); xhci_dbg(xhci, "Command timeout\n");
ret = xhci_abort_cmd_ring(xhci, flags); xhci_abort_cmd_ring(xhci, flags);
if (unlikely(ret == -ESHUTDOWN)) {
xhci_err(xhci, "Abort command ring failed\n");
xhci_cleanup_command_queue(xhci);
spin_unlock_irqrestore(&xhci->lock, flags);
usb_hc_died(xhci_to_hcd(xhci)->primary_hcd);
xhci_dbg(xhci, "xHCI host controller is dead.\n");
return;
}
goto time_out_completed; goto time_out_completed;
} }
@ -2695,7 +2690,8 @@ irqreturn_t xhci_irq(struct usb_hcd *hcd)
spin_lock(&xhci->lock); spin_lock(&xhci->lock);
/* Check if the xHC generated the interrupt, or the irq is shared */ /* Check if the xHC generated the interrupt, or the irq is shared */
status = readl(&xhci->op_regs->status); status = readl(&xhci->op_regs->status);
if (status == 0xffffffff) { if (status == ~(u32)0) {
xhci_hc_died(xhci);
ret = IRQ_HANDLED; ret = IRQ_HANDLED;
goto out; goto out;
} }

View File

@ -1504,10 +1504,16 @@ int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
if (!ep || !ep_ring) if (!ep || !ep_ring)
goto err_giveback; goto err_giveback;
/* If xHC is dead take it down and return ALL URBs in xhci_hc_died() */
temp = readl(&xhci->op_regs->status); temp = readl(&xhci->op_regs->status);
if (temp == 0xffffffff || (xhci->xhc_state & XHCI_STATE_HALTED)) { if (temp == ~(u32)0 || xhci->xhc_state & XHCI_STATE_DYING) {
xhci_hc_died(xhci);
goto done;
}
if (xhci->xhc_state & XHCI_STATE_HALTED) {
xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb, xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
"HW died, freeing TD."); "HC halted, freeing TD manually.");
for (i = urb_priv->num_tds_done; for (i = urb_priv->num_tds_done;
i < urb_priv->num_tds; i < urb_priv->num_tds;
i++) { i++) {
@ -2598,6 +2604,12 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci,
return -EINVAL; return -EINVAL;
spin_lock_irqsave(&xhci->lock, flags); spin_lock_irqsave(&xhci->lock, flags);
if (xhci->xhc_state & XHCI_STATE_DYING) {
spin_unlock_irqrestore(&xhci->lock, flags);
return -ESHUTDOWN;
}
virt_dev = xhci->devs[udev->slot_id]; virt_dev = xhci->devs[udev->slot_id];
ctrl_ctx = xhci_get_input_control_ctx(command->in_ctx); ctrl_ctx = xhci_get_input_control_ctx(command->in_ctx);

View File

@ -2131,6 +2131,7 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex,
char *buf, u16 wLength); char *buf, u16 wLength);
int xhci_hub_status_data(struct usb_hcd *hcd, char *buf); int xhci_hub_status_data(struct usb_hcd *hcd, char *buf);
int xhci_find_raw_port_number(struct usb_hcd *hcd, int port1); int xhci_find_raw_port_number(struct usb_hcd *hcd, int port1);
void xhci_hc_died(struct xhci_hcd *xhci);
#ifdef CONFIG_PM #ifdef CONFIG_PM
int xhci_bus_suspend(struct usb_hcd *hcd); int xhci_bus_suspend(struct usb_hcd *hcd);