asedeno.scripts.mit.edu Git - linux.git/commitdiff
net/mlx5e: Report and recover from CQE with error on RQ
author: Aya Levin <ayal@mellanox.com>
Wed, 26 Jun 2019 20:21:40 +0000 (23:21 +0300)
committer: Saeed Mahameed <saeedm@mellanox.com>
Tue, 20 Aug 2019 20:08:18 +0000 (13:08 -0700)
Add support for reporting and recovering from an error completion (a CQE
with error) on an RQ by setting the queue back to the ready state. Only
errors whose syndrome indicates that the RQ might have entered the error
state and can be recovered are handled.

Signed-off-by: Aya Levin <ayal@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en/health.h
drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c

index 842adf3719ff1aa7fbbaf04ba33cf8b23843f5ec..7316571a4df5fac04c858330f9ac3e4cf8cb87fe 100644 (file)
@@ -300,6 +300,7 @@ struct mlx5e_dcbx_dp {
 
 enum {
        MLX5E_RQ_STATE_ENABLED,
+       MLX5E_RQ_STATE_RECOVERING,
        MLX5E_RQ_STATE_AM,
        MLX5E_RQ_STATE_NO_CSUM_COMPLETE,
        MLX5E_RQ_STATE_CSUM_FULL, /* cqe_csum_full hw bit is set */
@@ -672,6 +673,8 @@ struct mlx5e_rq {
        struct zero_copy_allocator zca;
        struct xdp_umem       *umem;
 
+       struct work_struct     recover_work;
+
        /* control */
        struct mlx5_wq_ctrl    wq_ctrl;
        __be32                 mkey_be;
index 52e9ca37cf46137418d4970012c5013ca7b74662..d3693fa547ac0cbd814efe753c6d734fe6ccd713 100644 (file)
@@ -8,6 +8,14 @@
 
 #define MLX5E_RX_ERR_CQE(cqe) (get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)
 
+static inline bool cqe_syndrome_needs_recover(u8 syndrome)
+{      /*
+        * Return true when @syndrome is one of the four CQE error syndromes
+        * listed below (local length, local QP operation, local protection,
+        * WR flush); false for any other value.
+        */
+       return syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR ||
+              syndrome == MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR ||
+              syndrome == MLX5_CQE_SYNDROME_LOCAL_PROT_ERR ||
+              syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+}
+
 int mlx5e_reporter_tx_create(struct mlx5e_priv *priv);
 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv);
 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq);
@@ -21,6 +29,7 @@ int mlx5e_reporter_named_obj_nest_end(struct devlink_fmsg *fmsg);
 int mlx5e_reporter_rx_create(struct mlx5e_priv *priv);
 void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv);
 void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq);
+void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq);
 void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq);
 
 #define MLX5E_REPORTER_PER_Q_MAX_LEN 256
index 05450df875543caf3105d451b86b2f7a885460bd..b860569d4247284d77983aae3c87f6fbcfb339bd 100644 (file)
@@ -115,6 +115,75 @@ void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq)
        mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
 }
 
+static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state)
+{      /*
+        * Move @rq from @curr_state to MLX5_RQC_STATE_RST and then from
+        * RST to MLX5_RQC_STATE_RDY. Returns 0 when both transitions
+        * succeed; otherwise logs which transition failed and returns the
+        * error from mlx5e_modify_rq_state().
+        */
+       struct net_device *dev = rq->netdev;
+       int err;
+
+       err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST);
+       if (err) {
+               netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn);
+               return err;
+       }
+       err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
+       if (err) {
+               netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn);
+               return err;
+       }
+
+       return 0;
+}
+
+static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx)
+{      /*
+        * Recovery callback for an error CQE on an RQ; @ctx is the
+        * struct mlx5e_rq. Queries the RQ state and, only if it is
+        * MLX5_RQC_STATE_ERR, deactivates the RQ, frees its RX descriptors,
+        * moves it back to ready and reactivates it. Every exit path clears
+        * MLX5E_RQ_STATE_RECOVERING. Returns 0 on success or when the RQ is
+        * not in the error state, otherwise the error from the failing step.
+        */
+       struct mlx5_core_dev *mdev;
+       struct net_device *dev;
+       struct mlx5e_rq *rq;
+       u8 state;
+       int err;
+
+       rq = ctx;
+       mdev = rq->mdev;
+       dev = rq->netdev;
+       err = mlx5e_query_rq_state(mdev, rq->rqn, &state);
+       if (err) {
+               netdev_err(dev, "Failed to query RQ 0x%x state. err = %d\n",
+                          rq->rqn, err);
+               goto out;
+       }
+
+       if (state != MLX5_RQC_STATE_ERR)
+               goto out; /* err is 0 here: nothing to recover */
+
+       mlx5e_deactivate_rq(rq);
+       mlx5e_free_rx_descs(rq);
+
+       err = mlx5e_rq_to_ready(rq, MLX5_RQC_STATE_ERR);
+       if (err)
+               goto out; /* NOTE(review): RQ is not reactivated on this path */
+
+       clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state);
+       mlx5e_activate_rq(rq);
+       rq->stats->recover++;
+       return 0;
+out:
+       clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state);
+       return err;
+}
+
+/* Report an error CQE on @rq to the RX health reporter, supplying
+ * mlx5e_rx_reporter_err_rq_cqe_recover() as the recovery callback.
+ */
+void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq)
+{
+       struct mlx5e_priv *priv = rq->channel->priv;
+       char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
+       struct mlx5e_err_ctx err_ctx = {};
+
+       err_ctx.ctx = rq;
+       err_ctx.recover = mlx5e_rx_reporter_err_rq_cqe_recover;
+       /* Bound the write to the buffer size instead of relying on the
+        * message staying shorter than MLX5E_REPORTER_PER_Q_MAX_LEN.
+        */
+       snprintf(err_str, sizeof(err_str), "ERR CQE on RQ: 0x%x", rq->rqn);
+
+       mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
+}
+
 static int mlx5e_rx_reporter_timeout_recover(void *ctx)
 {
        struct mlx5e_icosq *icosq;
index 54f320391f635091f865df0a92027890af4a9f48..7fdea6479ff6876d7179d49d4c719c6ad81cffca 100644 (file)
@@ -362,6 +362,13 @@ static void mlx5e_free_di_list(struct mlx5e_rq *rq)
        kvfree(rq->wqe.di);
 }
 
+static void mlx5e_rq_err_cqe_work(struct work_struct *recover_work)
+{      /*
+        * Work handler for rq->recover_work: resolves the owning mlx5e_rq
+        * from the embedded work_struct and reports the error CQE on it.
+        */
+       struct mlx5e_rq *rq = container_of(recover_work, struct mlx5e_rq, recover_work);
+
+       mlx5e_reporter_rq_cqe_err(rq);
+}
+
 static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                          struct mlx5e_params *params,
                          struct mlx5e_xsk_param *xsk,
@@ -398,6 +405,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                rq->stats = &c->priv->channel_stats[c->ix].xskrq;
        else
                rq->stats = &c->priv->channel_stats[c->ix].rq;
+       INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work);
 
        rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL;
        if (IS_ERR(rq->xdp_prog)) {
@@ -907,6 +915,7 @@ void mlx5e_close_rq(struct mlx5e_rq *rq)
 {
        cancel_work_sync(&rq->dim.work);
        cancel_work_sync(&rq->channel->icosq.recover_work);
+       cancel_work_sync(&rq->recover_work);
        mlx5e_destroy_rq(rq);
        mlx5e_free_rx_descs(rq);
        mlx5e_free_rq(rq);
index 43d790b7d4ec15e038197ccab8678c6fa8eaee35..2fd2760d0bb7c0b1000b8687f4b0af1c0c056ee7 100644 (file)
@@ -1130,6 +1130,15 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
        return skb;
 }
 
+static void trigger_report(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{      /*
+        * Queue rq->recover_work when the error CQE carries a syndrome
+        * accepted by cqe_syndrome_needs_recover(). test_and_set_bit() on
+        * MLX5E_RQ_STATE_RECOVERING means the work is queued only by the
+        * caller that flips the bit from clear to set.
+        */
+       struct mlx5_err_cqe *err_cqe = (struct mlx5_err_cqe *)cqe;
+
+       if (cqe_syndrome_needs_recover(err_cqe->syndrome) &&
+           !test_and_set_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state))
+               queue_work(rq->channel->priv->wq, &rq->recover_work);
+}
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
        struct mlx5_wq_cyc *wq = &rq->wqe.wq;
@@ -1143,6 +1152,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
        cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
        if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
+               trigger_report(rq, cqe);
                rq->stats->wqe_err++;
                goto free_wqe;
        }
@@ -1328,6 +1338,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
        wi->consumed_strides += cstrides;
 
        if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
+               trigger_report(rq, cqe);
                rq->stats->wqe_err++;
                goto mpwrq_cqe_out;
        }