mlx5-updates-2019-06-13
Mlx5 devlink health fw reporters and sw reset support. This series provides mlx5 firmware reset support and firmware devlink health reporters. 1) Add initial mlx5 kernel documentation and include devlink health reporters 2) Add CR-Space access and FW Crdump snapshot support via devlink region_snapshot 3) Issue software reset upon FW asserts 4) Add fw and fw_fatal devlink health reporters to follow fw error indications with dump and recover procedures, and enable the user to trigger this functionality. 4.1) fw reporter: The fw reporter implements diagnose and dump callbacks. It follows symptoms of fw error such as fw syndrome by triggering fw core dump and storing it and any other fw trace into the dump buffer. The fw reporter diagnose command can be triggered any time by the user to check current fw status. 4.2) fw_fatal reporter: The fw_fatal reporter implements dump and recover callbacks. It follows fatal error indications with a CR-space dump and recover flow. The CR-space dump uses the vsc interface, which is valid even if the FW command interface is not functional, which is the case in most FW fatal errors. The CR-space dump is stored as a memory region snapshot to ease read by address. The recover function runs the recover flow, which reloads the driver and triggers fw reset if needed.
-----BEGIN PGP SIGNATURE----- iQEzBAABCAAdFiEEGhZs6bAKwk/OTgTpSD+KveBX+j4FAl0CsLgACgkQSD+KveBX +j7mFwf+MYvIbUO4mXyoZIezci1UCzt1vNAkUYPceE94O9fK68ItrwtwrstgIqqS 58Tgx//MXxPpe9k9NIWjeS3i8sjcb8fDoqkjOCj7KAchv0IhSUvYFRpBrUK+yTOW NIIXZzuCgIoR9a/hVlT/lhG+dm4MX2L5dWFtORLxMoO+ff3yiy4nNf9+Zdt0H7LT YCELWnKeIQCvdzJAxX7OyTh3eOfc/h7o1nOsU4VugBHxKxx4T+9A26d+cZeZH5Ox 3ikTCc01ivVHqcLydAy96HQu0MENSNYNpmyDxWum3oJGFFu6hBQTM2ueRmVWZfwH DRu+hhxONZROxxtpmP/ULmwYcLnBHg== =VhXt -----END PGP SIGNATURE----- Merge tag 'mlx5-updates-2019-06-13' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux Saeed Mahameed says: ==================== mlx5-updates-2019-06-13 Mlx5 devlink health fw reporters and sw reset support This series provides mlx5 firmware reset support and firmware devlink health reporters. 1) Add initial mlx5 kernel documentation and include devlink health reporters 2) Add CR-Space access and FW Crdump snapshot support via devlink region_snapshot 3) Issue software reset upon FW asserts 4) Add fw and fw_fatal devlink heath reporters to follow fw errors indication by dump and recover procedures and enable trigger these functionality by user. 4.1) fw reporter: The fw reporter implements diagnose and dump callbacks. It follows symptoms of fw error such as fw syndrome by triggering fw core dump and storing it and any other fw trace into the dump buffer. The fw reporter diagnose command can be triggered any time by the user to check current fw status. 4.2) fw_fatal repoter: The fw_fatal reporter implements dump and recover callbacks. It follows fatal errors indications by CR-space dump and recover flow. The CR-space dump uses vsc interface which is valid even if the FW command interface is not functional, which is the case in most FW fatal errors. The CR-space dump is stored as a memory region snapshot to ease read by address. The recover function runs recover flow which reloads the driver and triggers fw reset if needed. ==================== Signed-off-by: David S. 
Miller <davem@davemloft.net>
This commit is contained in:
commit
b4a6d9abeb
19 changed files with 1517 additions and 145 deletions
|
|
@ -510,6 +510,10 @@ struct mlx5_cmd_layout {
|
|||
u8 status_own;
|
||||
};
|
||||
|
||||
/*
 * Bit offsets related to fatal-assert handling.
 * NOTE(review): MLX5_RFR_OFFSET (bit 31) is presumably the position of a
 * flag inside the health buffer's `rfr` word -- confirm against the code
 * that reads that field.
 */
enum mlx5_fatal_assert_bit_offsets {
|
||||
MLX5_RFR_OFFSET = 31,
|
||||
};
|
||||
|
||||
struct health_buffer {
|
||||
__be32 assert_var[5];
|
||||
__be32 rsvd0[3];
|
||||
|
|
@ -518,12 +522,16 @@ struct health_buffer {
|
|||
__be32 rsvd1[2];
|
||||
__be32 fw_ver;
|
||||
__be32 hw_id;
|
||||
__be32 rsvd2;
|
||||
__be32 rfr;
|
||||
u8 irisc_index;
|
||||
u8 synd;
|
||||
__be16 ext_synd;
|
||||
};
|
||||
|
||||
/*
 * Bit offsets within an "initializing" word.
 * NOTE(review): MLX5_FW_RESET_SUPPORTED_OFFSET (bit 30) presumably marks
 * whether firmware reset is supported -- which register/word it indexes is
 * not visible here; confirm against the users of this constant.
 */
enum mlx5_initializing_bit_offsets {
|
||||
MLX5_FW_RESET_SUPPORTED_OFFSET = 30,
|
||||
};
|
||||
|
||||
/*
 * Offsets within a "cmd_addr_l_sz" word.
 * NOTE(review): MLX5_NIC_IFC_OFFSET (8) is presumably the shift of the NIC
 * interface field inside that word -- confirm against the users of this
 * constant.
 */
enum mlx5_cmd_addr_l_sz_offset {
|
||||
MLX5_NIC_IFC_OFFSET = 8,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@
|
|||
#include <linux/mlx5/eq.h>
|
||||
#include <linux/timecounter.h>
|
||||
#include <linux/ptp_clock_kernel.h>
|
||||
#include <net/devlink.h>
|
||||
|
||||
enum {
|
||||
MLX5_BOARD_ID_LEN = 64,
|
||||
|
|
@ -434,13 +435,18 @@ struct mlx5_core_health {
|
|||
struct timer_list timer;
|
||||
u32 prev;
|
||||
int miss_counter;
|
||||
bool sick;
|
||||
u8 synd;
|
||||
u32 fatal_error;
|
||||
u32 crdump_size;
|
||||
/* wq spinlock to synchronize draining */
|
||||
spinlock_t wq_lock;
|
||||
struct workqueue_struct *wq;
|
||||
unsigned long flags;
|
||||
struct work_struct work;
|
||||
struct work_struct fatal_report_work;
|
||||
struct work_struct report_work;
|
||||
struct delayed_work recover_work;
|
||||
struct devlink_health_reporter *fw_reporter;
|
||||
struct devlink_health_reporter *fw_fatal_reporter;
|
||||
};
|
||||
|
||||
struct mlx5_qp_table {
|
||||
|
|
@ -581,6 +587,7 @@ struct mlx5_priv {
|
|||
};
|
||||
|
||||
/*
 * Device state values.
 * No explicit initializers are given, so the enumerators are numbered in
 * order: UNINITIALIZED = 0, UP = 1, INTERNAL_ERROR = 2. A zero-initialized
 * state field therefore reads as MLX5_DEVICE_STATE_UNINITIALIZED.
 */
enum mlx5_device_state {
|
||||
MLX5_DEVICE_STATE_UNINITIALIZED,
|
||||
MLX5_DEVICE_STATE_UP,
|
||||
MLX5_DEVICE_STATE_INTERNAL_ERROR,
|
||||
};
|
||||
|
|
@ -693,6 +700,7 @@ struct mlx5_core_dev {
|
|||
struct mlx5_clock clock;
|
||||
struct mlx5_ib_clock_info *clock_info;
|
||||
struct mlx5_fw_tracer *tracer;
|
||||
u32 vsc_addr;
|
||||
};
|
||||
|
||||
struct mlx5_db {
|
||||
|
|
@ -904,7 +912,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev);
|
|||
void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health);
|
||||
void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
|
||||
void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
|
||||
void mlx5_drain_health_recovery(struct mlx5_core_dev *dev);
|
||||
int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
|
||||
struct mlx5_frag_buf *buf, int node);
|
||||
int mlx5_buf_alloc(struct mlx5_core_dev *dev,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue