diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 842adf3719ff1aa7fbbaf04ba33cf8b23843f5ec..7316571a4df5fac04c858330f9ac3e4cf8cb87fe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -300,6 +300,7 @@ struct mlx5e_dcbx_dp { enum { MLX5E_RQ_STATE_ENABLED, + MLX5E_RQ_STATE_RECOVERING, MLX5E_RQ_STATE_AM, MLX5E_RQ_STATE_NO_CSUM_COMPLETE, MLX5E_RQ_STATE_CSUM_FULL, /* cqe_csum_full hw bit is set */ @@ -672,6 +673,8 @@ struct mlx5e_rq { struct zero_copy_allocator zca; struct xdp_umem *umem; + struct work_struct recover_work; + /* control */ struct mlx5_wq_ctrl wq_ctrl; __be32 mkey_be; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h index 52e9ca37cf46137418d4970012c5013ca7b74662..d3693fa547ac0cbd814efe753c6d734fe6ccd713 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h @@ -8,6 +8,14 @@ #define MLX5E_RX_ERR_CQE(cqe) (get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND) +static inline bool cqe_syndrome_needs_recover(u8 syndrome) +{ + return syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR || + syndrome == MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR || + syndrome == MLX5_CQE_SYNDROME_LOCAL_PROT_ERR || + syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR; +} + int mlx5e_reporter_tx_create(struct mlx5e_priv *priv); void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv); void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq); @@ -21,6 +29,7 @@ int mlx5e_reporter_named_obj_nest_end(struct devlink_fmsg *fmsg); int mlx5e_reporter_rx_create(struct mlx5e_priv *priv); void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv); void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq); +void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq); void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq); #define MLX5E_REPORTER_PER_Q_MAX_LEN 256 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index 05450df875543caf3105d451b86b2f7a885460bd..b860569d4247284d77983aae3c87f6fbcfb339bd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -115,6 +115,75 @@ void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq) mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); } +static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state) +{ + struct net_device *dev = rq->netdev; + int err; + + err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST); + if (err) { + netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn); + return err; + } + err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); + if (err) { + netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn); + return err; + } + + return 0; +} + +static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx) +{ + struct mlx5_core_dev *mdev; + struct net_device *dev; + struct mlx5e_rq *rq; + u8 state; + int err; + + rq = ctx; + mdev = rq->mdev; + dev = rq->netdev; + err = mlx5e_query_rq_state(mdev, rq->rqn, &state); + if (err) { + netdev_err(dev, "Failed to query RQ 0x%x state. err = %d\n", + rq->rqn, err); + goto out; + } + + if (state != MLX5_RQC_STATE_ERR) + goto out; + + mlx5e_deactivate_rq(rq); + mlx5e_free_rx_descs(rq); + + err = mlx5e_rq_to_ready(rq, MLX5_RQC_STATE_ERR); + if (err) + goto out; + + clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state); + mlx5e_activate_rq(rq); + rq->stats->recover++; + return 0; +out: + clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state); + return err; +} + +void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq) +{ + struct mlx5e_priv *priv = rq->channel->priv; + char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; + struct mlx5e_err_ctx err_ctx = {}; + + err_ctx.ctx = rq; + err_ctx.recover = mlx5e_rx_reporter_err_rq_cqe_recover; + sprintf(err_str, "ERR CQE on RQ: 0x%x", rq->rqn); + + mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); +} + static int mlx5e_rx_reporter_timeout_recover(void *ctx) { struct mlx5e_icosq *icosq; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 54f320391f635091f865df0a92027890af4a9f48..7fdea6479ff6876d7179d49d4c719c6ad81cffca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -362,6 +362,13 @@ static void mlx5e_free_di_list(struct mlx5e_rq *rq) kvfree(rq->wqe.di); } +static void mlx5e_rq_err_cqe_work(struct work_struct *recover_work) +{ + struct mlx5e_rq *rq = container_of(recover_work, struct mlx5e_rq, recover_work); + + mlx5e_reporter_rq_cqe_err(rq); +} + static int mlx5e_alloc_rq(struct mlx5e_channel *c, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk, @@ -398,6 +405,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->stats = &c->priv->channel_stats[c->ix].xskrq; else rq->stats = &c->priv->channel_stats[c->ix].rq; + INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work); rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL; if (IS_ERR(rq->xdp_prog)) { @@ -907,6 +915,7 @@ void mlx5e_close_rq(struct mlx5e_rq *rq) { cancel_work_sync(&rq->dim.work); cancel_work_sync(&rq->channel->icosq.recover_work); + cancel_work_sync(&rq->recover_work); mlx5e_destroy_rq(rq); mlx5e_free_rx_descs(rq); mlx5e_free_rq(rq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 43d790b7d4ec15e038197ccab8678c6fa8eaee35..2fd2760d0bb7c0b1000b8687f4b0af1c0c056ee7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1130,6 +1130,15 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, return skb; } +static void trigger_report(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct mlx5_err_cqe *err_cqe = (struct mlx5_err_cqe *)cqe; + + if (cqe_syndrome_needs_recover(err_cqe->syndrome) && + !test_and_set_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state)) + queue_work(rq->channel->priv->wq, &rq->recover_work); +} + void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) { struct mlx5_wq_cyc *wq = &rq->wqe.wq; @@ -1143,6 +1152,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) cqe_bcnt = be32_to_cpu(cqe->byte_cnt); if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + trigger_report(rq, cqe); rq->stats->wqe_err++; goto free_wqe; } @@ -1328,6 +1338,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) wi->consumed_strides += cstrides; if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + trigger_report(rq, cqe); rq->stats->wqe_err++; goto mpwrq_cqe_out; }