mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-16 00:56:49 +07:00
net/mlx4_core: Enhance the catas flow to support device reset
This includes: - resetting the chip when a fatal error is detected (the current code does not do this). - exposing the ability to enter error state from outside the catas code by calling its functionality. (E.g. FW Command timeout, AER error). - managing a persistent device state. This is needed to sync between reset flow cases. Signed-off-by: Yishai Hadas <yishaih@mellanox.com> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
ad9a0bf08f
commit
f6bc11e426
@ -48,6 +48,83 @@ MODULE_PARM_DESC(internal_err_reset,
|
|||||||
"Reset device on internal errors if non-zero"
|
"Reset device on internal errors if non-zero"
|
||||||
" (default 1, in SRIOV mode default is 0)");
|
" (default 1, in SRIOV mode default is 0)");
|
||||||
|
|
||||||
|
static int read_vendor_id(struct mlx4_dev *dev)
|
||||||
|
{
|
||||||
|
u16 vendor_id = 0;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id);
|
||||||
|
if (ret) {
|
||||||
|
mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vendor_id == 0xffff) {
|
||||||
|
mlx4_err(dev, "PCI can't be accessed to read vendor id\n");
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int mlx4_reset_master(struct mlx4_dev *dev)
|
||||||
|
{
|
||||||
|
int err = 0;
|
||||||
|
|
||||||
|
if (!pci_channel_offline(dev->persist->pdev)) {
|
||||||
|
err = read_vendor_id(dev);
|
||||||
|
/* If PCI can't be accessed to read vendor ID we assume that its
|
||||||
|
* link was disabled and chip was already reset.
|
||||||
|
*/
|
||||||
|
if (err)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
err = mlx4_reset(dev);
|
||||||
|
if (err)
|
||||||
|
mlx4_err(dev, "Fail to reset HCA\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
|
||||||
|
{
|
||||||
|
int err;
|
||||||
|
struct mlx4_dev *dev;
|
||||||
|
|
||||||
|
if (!internal_err_reset)
|
||||||
|
return;
|
||||||
|
|
||||||
|
mutex_lock(&persist->device_state_mutex);
|
||||||
|
if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
dev = persist->dev;
|
||||||
|
mlx4_err(dev, "device is going to be reset\n");
|
||||||
|
err = mlx4_reset_master(dev);
|
||||||
|
BUG_ON(err != 0);
|
||||||
|
|
||||||
|
dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR;
|
||||||
|
mlx4_err(dev, "device was reset successfully\n");
|
||||||
|
mutex_unlock(&persist->device_state_mutex);
|
||||||
|
|
||||||
|
/* At that step HW was already reset, now notify clients */
|
||||||
|
mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
|
||||||
|
return;
|
||||||
|
|
||||||
|
out:
|
||||||
|
mutex_unlock(&persist->device_state_mutex);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist)
|
||||||
|
{
|
||||||
|
int err = 0;
|
||||||
|
|
||||||
|
mlx4_enter_error_state(persist);
|
||||||
|
err = mlx4_restart_one(persist->pdev);
|
||||||
|
mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", err);
|
||||||
|
}
|
||||||
|
|
||||||
static void dump_err_buf(struct mlx4_dev *dev)
|
static void dump_err_buf(struct mlx4_dev *dev)
|
||||||
{
|
{
|
||||||
struct mlx4_priv *priv = mlx4_priv(dev);
|
struct mlx4_priv *priv = mlx4_priv(dev);
|
||||||
@ -66,21 +143,22 @@ static void poll_catas(unsigned long dev_ptr)
|
|||||||
struct mlx4_priv *priv = mlx4_priv(dev);
|
struct mlx4_priv *priv = mlx4_priv(dev);
|
||||||
|
|
||||||
if (readl(priv->catas_err.map)) {
|
if (readl(priv->catas_err.map)) {
|
||||||
/* If the device is off-line, we cannot try to recover it */
|
dump_err_buf(dev);
|
||||||
if (pci_channel_offline(dev->persist->pdev))
|
goto internal_err;
|
||||||
mod_timer(&priv->catas_err.timer,
|
}
|
||||||
round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
|
|
||||||
else {
|
|
||||||
dump_err_buf(dev);
|
|
||||||
mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
|
|
||||||
|
|
||||||
if (internal_err_reset)
|
if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
|
||||||
queue_work(dev->persist->catas_wq,
|
mlx4_warn(dev, "Internal error mark was detected on device\n");
|
||||||
&dev->persist->catas_work);
|
goto internal_err;
|
||||||
}
|
}
|
||||||
} else
|
|
||||||
mod_timer(&priv->catas_err.timer,
|
mod_timer(&priv->catas_err.timer,
|
||||||
round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
|
round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
|
||||||
|
return;
|
||||||
|
|
||||||
|
internal_err:
|
||||||
|
if (internal_err_reset)
|
||||||
|
queue_work(dev->persist->catas_wq, &dev->persist->catas_work);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void catas_reset(struct work_struct *work)
|
static void catas_reset(struct work_struct *work)
|
||||||
@ -88,20 +166,8 @@ static void catas_reset(struct work_struct *work)
|
|||||||
struct mlx4_dev_persistent *persist =
|
struct mlx4_dev_persistent *persist =
|
||||||
container_of(work, struct mlx4_dev_persistent,
|
container_of(work, struct mlx4_dev_persistent,
|
||||||
catas_work);
|
catas_work);
|
||||||
struct pci_dev *pdev = persist->pdev;
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
/* If the device is off-line, we cannot reset it */
|
mlx4_handle_error_state(persist);
|
||||||
if (pci_channel_offline(pdev))
|
|
||||||
return;
|
|
||||||
|
|
||||||
ret = mlx4_restart_one(pdev);
|
|
||||||
/* 'priv' now is not valid */
|
|
||||||
if (ret)
|
|
||||||
pr_err("mlx4 %s: Reset failed (%d)\n",
|
|
||||||
pci_name(pdev), ret);
|
|
||||||
else
|
|
||||||
mlx4_dbg(persist->dev, "Reset succeeded\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void mlx4_start_catas_poll(struct mlx4_dev *dev)
|
void mlx4_start_catas_poll(struct mlx4_dev *dev)
|
||||||
|
@ -2624,6 +2624,11 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* on load remove any previous indication of internal error,
|
||||||
|
* device is up.
|
||||||
|
*/
|
||||||
|
dev->persist->state = MLX4_DEVICE_STATE_UP;
|
||||||
|
|
||||||
slave_start:
|
slave_start:
|
||||||
err = mlx4_cmd_init(dev);
|
err = mlx4_cmd_init(dev);
|
||||||
if (err) {
|
if (err) {
|
||||||
@ -3108,6 +3113,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
|
|||||||
dev->persist->dev = dev;
|
dev->persist->dev = dev;
|
||||||
pci_set_drvdata(pdev, dev->persist);
|
pci_set_drvdata(pdev, dev->persist);
|
||||||
priv->pci_dev_data = id->driver_data;
|
priv->pci_dev_data = id->driver_data;
|
||||||
|
mutex_init(&dev->persist->device_state_mutex);
|
||||||
|
|
||||||
ret = __mlx4_init_one(pdev, id->driver_data, priv);
|
ret = __mlx4_init_one(pdev, id->driver_data, priv);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
|
@ -1178,7 +1178,7 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type);
|
|||||||
|
|
||||||
void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);
|
void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);
|
||||||
|
|
||||||
void mlx4_handle_catas_err(struct mlx4_dev *dev);
|
void mlx4_enter_error_state(struct mlx4_dev_persistent *persist);
|
||||||
|
|
||||||
int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
|
int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
|
||||||
enum mlx4_port_type *type);
|
enum mlx4_port_type *type);
|
||||||
|
@ -411,6 +411,11 @@ enum {
|
|||||||
MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK = 1 << 4,
|
MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK = 1 << 4,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum {
|
||||||
|
MLX4_DEVICE_STATE_UP = 1 << 0,
|
||||||
|
MLX4_DEVICE_STATE_INTERNAL_ERROR = 1 << 1,
|
||||||
|
};
|
||||||
|
|
||||||
#define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \
|
#define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \
|
||||||
MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK)
|
MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK)
|
||||||
|
|
||||||
@ -753,6 +758,8 @@ struct mlx4_dev_persistent {
|
|||||||
enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1];
|
enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1];
|
||||||
struct work_struct catas_work;
|
struct work_struct catas_work;
|
||||||
struct workqueue_struct *catas_wq;
|
struct workqueue_struct *catas_wq;
|
||||||
|
struct mutex device_state_mutex; /* protect HW state */
|
||||||
|
u8 state;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct mlx4_dev {
|
struct mlx4_dev {
|
||||||
|
Loading…
Reference in New Issue
Block a user