net/mlx5: Add Crdump support

Crdump allows the driver to retrieve a dump of the FW PCI crspace.
This is useful in case of catastrophic issues that may require a FW
reset. The crspace dump can then be used for later debugging.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
This commit is contained in:
Alex Vesker 2018-07-17 11:18:26 +03:00 committed by Saeed Mahameed
parent b25bbc2f24
commit 8b9d8baae1
5 changed files with 116 additions and 1 deletions

View File

@ -16,7 +16,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
lib/devcom.o lib/pci_vsc.o diag/fs_tracepoint.o \
diag/fw_tracer.o devlink.o
diag/fw_tracer.o diag/crdump.o devlink.o
#
# Netdev basic

View File

@ -0,0 +1,106 @@
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies */
#include <linux/mlx5/driver.h>
#include "mlx5_core.h"
#include "lib/pci_vsc.h"
#include "lib/mlx5.h"
/* Pattern written to every dword of the dump buffer before it is read */
#define BAD_ACCESS 0xBADACCE5
/* NOTE(review): not referenced anywhere in this file; the code below passes
 * MLX5_VSC_SPACE_SCAN_CRSPACE instead - confirm whether this is still needed.
 */
#define MLX5_PROTECTED_CR_SCAN_CRSPACE 0x7
/* Crdump counts as enabled once a non-zero crspace size has been recorded. */
static bool mlx5_crdump_enabled(struct mlx5_core_dev *dev)
{
	return dev->priv.health.crdump_size != 0;
}
/*
 * Fill @cr_data with the crspace dump.
 *
 * Every dword of the buffer (crdump_size / 4 entries) is first set to
 * BAD_ACCESS, then mlx5_vsc_gw_read_block_fast() is asked to read
 * crdump_size into it.
 *
 * Returns 0 when the read helper reports exactly crdump_size, -EIO when it
 * reports 0, the helper's own negative value when it fails, and -EINVAL
 * (after logging a warning) when it reports any other amount.
 */
static int mlx5_crdump_fill(struct mlx5_core_dev *dev, u32 *cr_data)
{
	u32 size = dev->priv.health.crdump_size;
	int idx, nread;

	/* Poison the buffer so entries that were never written stand out */
	for (idx = 0; idx < (size / 4); idx++)
		cr_data[idx] = BAD_ACCESS;

	nread = mlx5_vsc_gw_read_block_fast(dev, cr_data, size);
	if (nread < 0)
		return nread;
	if (nread == 0)
		return -EIO;
	if (size == nread)
		return 0;

	mlx5_core_warn(dev, "failed to read full dump, read %d out of %u\n",
		       nread, size);
	return -EINVAL;
}
/*
 * Collect a crspace dump into @cr_data.
 *
 * Returns -ENODEV when crdump is not enabled, the error from
 * mlx5_vsc_gw_lock() (after logging a warning) when the lock cannot be
 * taken, and otherwise the result of selecting the scan crspace and filling
 * the buffer. Once the lock has been taken it is always released before
 * returning.
 */
int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data)
{
	int err;

	if (!mlx5_crdump_enabled(dev))
		return -ENODEV;

	err = mlx5_vsc_gw_lock(dev);
	if (err) {
		mlx5_core_warn(dev, "crdump: failed to lock vsc gw err %d\n",
			       err);
		return err;
	}

	/* Only attempt the read if the space could be selected */
	err = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL);
	if (!err)
		err = mlx5_crdump_fill(dev, cr_data);

	mlx5_vsc_gw_unlock(dev);
	return err;
}
/*
 * Enable crdump by querying the scan crspace size and recording it in
 * dev->priv.health.crdump_size.
 *
 * Returns 0 without doing anything when the device is not a PF, the VSC is
 * not accessible, or crdump is already enabled. Also returns 0, leaving
 * crdump disabled, when selecting the space fails. Returns -EINVAL when the
 * reported size is zero, or the error from locking/unlocking the gateway.
 */
int mlx5_crdump_enable(struct mlx5_core_dev *dev)
{
	u32 crspace_size;
	int err;

	if (!mlx5_core_is_pf(dev))
		return 0;
	if (!mlx5_vsc_accessible(dev))
		return 0;
	if (mlx5_crdump_enabled(dev))
		return 0;

	err = mlx5_vsc_gw_lock(dev);
	if (err)
		return err;

	/* Check if space is supported and get space size */
	err = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE,
				    &crspace_size);
	if (err) {
		/* Unlock and mask error since space is not supported */
		mlx5_vsc_gw_unlock(dev);
		return 0;
	}

	if (crspace_size == 0) {
		mlx5_core_warn(dev, "Invalid Crspace size, zero\n");
		mlx5_vsc_gw_unlock(dev);
		return -EINVAL;
	}

	err = mlx5_vsc_gw_unlock(dev);
	if (err)
		return err;

	/* Recording a non-zero size is what marks crdump as enabled */
	dev->priv.health.crdump_size = crspace_size;
	return 0;
}
void mlx5_crdump_disable(struct mlx5_core_dev *dev)
{
dev->priv.health.crdump_size = 0;
}

View File

@ -41,6 +41,9 @@ int mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count);
void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count);
int mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index);
void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index);
/* Query the crspace size and record it; returns 0 or a negative errno */
int mlx5_crdump_enable(struct mlx5_core_dev *dev);
/* Reset the recorded crspace size to zero, marking crdump as disabled */
void mlx5_crdump_disable(struct mlx5_core_dev *dev);
/* Read the crspace dump into cr_data; returns -ENODEV when crdump is not
 * enabled, otherwise 0 or a negative errno
 */
int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data);
/* TODO move to lib/events.h */

View File

@ -1313,6 +1313,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id)
if (err)
goto clean_load;
err = mlx5_crdump_enable(dev);
if (err)
dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err);
pci_save_state(pdev);
return 0;
@ -1334,6 +1338,7 @@ static void remove_one(struct pci_dev *pdev)
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
struct devlink *devlink = priv_to_devlink(dev);
mlx5_crdump_disable(dev);
mlx5_devlink_unregister(devlink);
mlx5_unregister_device(dev);

View File

@ -435,6 +435,7 @@ struct mlx5_core_health {
u32 prev;
int miss_counter;
bool sick;
u32 crdump_size;
/* wq spinlock to synchronize draining */
spinlock_t wq_lock;
struct workqueue_struct *wq;