]> asedeno.scripts.mit.edu Git - linux.git/commitdiff
net/mlx5: Report devlink health on FW issues
authorMoshe Shemesh <moshe@mellanox.com>
Tue, 11 Dec 2018 14:09:56 +0000 (16:09 +0200)
committerSaeed Mahameed <saeedm@mellanox.com>
Thu, 13 Jun 2019 20:23:19 +0000 (13:23 -0700)
Use devlink_health_report() to report any symptom of FW issue as FW
counter miss or new health syndrome.
The FW issues detected in mlx5 during poll_health which is called in
timer atomic context and so health work queue is used to schedule the
reports.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
drivers/net/ethernet/mellanox/mlx5/core/health.c
include/linux/mlx5/driver.h

index 1c20d3f1d238fcbec4bbf4304dbeb40fdc073270..5e876f1de114cb2fcbc1f8dc081b63c9fc70ee4b 100644 (file)
@@ -515,6 +515,29 @@ mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter,
        return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg);
 }
 
+static void mlx5_fw_reporter_err_work(struct work_struct *work)
+{
+       struct mlx5_fw_reporter_ctx fw_reporter_ctx;
+       struct mlx5_core_health *health;
+
+       health = container_of(work, struct mlx5_core_health, report_work);
+
+       if (IS_ERR_OR_NULL(health->fw_reporter))
+               return;
+
+       fw_reporter_ctx.err_synd = health->synd;
+       fw_reporter_ctx.miss_counter = health->miss_counter;
+       if (fw_reporter_ctx.err_synd) {
+               devlink_health_report(health->fw_reporter,
+                                     "FW syndrom reported", &fw_reporter_ctx);
+               return;
+       }
+       if (fw_reporter_ctx.miss_counter)
+               devlink_health_report(health->fw_reporter,
+                                     "FW miss counter reported",
+                                     &fw_reporter_ctx);
+}
+
 static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
                .name = "fw",
                .diagnose = mlx5_fw_reporter_diagnose,
@@ -572,7 +595,9 @@ static void poll_health(struct timer_list *t)
 {
        struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
        struct mlx5_core_health *health = &dev->priv.health;
+       struct health_buffer __iomem *h = health->health;
        u32 fatal_error;
+       u8 prev_synd;
        u32 count;
 
        if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
@@ -588,8 +613,14 @@ static void poll_health(struct timer_list *t)
        if (health->miss_counter == MAX_MISSES) {
                mlx5_core_err(dev, "device's health compromised - reached miss count\n");
                print_health_info(dev);
+               queue_work(health->wq, &health->report_work);
        }
 
+       prev_synd = health->synd;
+       health->synd = ioread8(&h->synd);
+       if (health->synd && health->synd != prev_synd)
+               queue_work(health->wq, &health->report_work);
+
        fatal_error = check_fatal_sensors(dev);
 
        if (fatal_error && !health->fatal_error) {
@@ -639,6 +670,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
        spin_lock_irqsave(&health->wq_lock, flags);
        set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
        spin_unlock_irqrestore(&health->wq_lock, flags);
+       cancel_work_sync(&health->report_work);
        cancel_work_sync(&health->work);
 }
 
@@ -675,6 +707,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
                return -ENOMEM;
        spin_lock_init(&health->wq_lock);
        INIT_WORK(&health->work, health_care);
+       INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
 
        mlx5_fw_reporter_create(dev);
 
index 8d5d065d1aa6aeb30a1796375182a84f05abed60..1931a4080d78a3194c01e4a84ffe913b9599dc51 100644 (file)
@@ -435,7 +435,7 @@ struct mlx5_core_health {
        struct timer_list               timer;
        u32                             prev;
        int                             miss_counter;
-       bool                            sick;
+       u8                              synd;
        u32                             fatal_error;
        u32                             crdump_size;
        /* wq spinlock to synchronize draining */
@@ -443,6 +443,7 @@ struct mlx5_core_health {
        struct workqueue_struct        *wq;
        unsigned long                   flags;
        struct work_struct              work;
+       struct work_struct              report_work;
        struct delayed_work             recover_work;
        struct devlink_health_reporter *fw_reporter;
 };