]> asedeno.scripts.mit.edu Git - linux.git/commitdiff
drm/amdgpu: add concurrent baco reset support for XGMI
authorLe Ma <le.ma@amd.com>
Tue, 26 Nov 2019 14:12:31 +0000 (22:12 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Thu, 5 Dec 2019 21:25:53 +0000 (16:25 -0500)
Currently the XGMI node reset work items do not run in parallel if they are
bound to the same cpu. Change this by binding each xgmi_reset_work item to a
different cpu.

XGMI requires all nodes to enter baco within very close proximity of one
another before any node exits baco. So schedule the xgmi_reset_work item
twice: once to enter baco and once to exit baco.

To use baco for XGMI, a PMFW that supports baco on XGMI is required.

The case where PSP reset and baco reset coexist within an XGMI hive never
occurs and is not taken into consideration.

v2: define use_baco flag to simplify the code for xgmi baco sequence

Signed-off-by: Le Ma <le.ma@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 2e5b2f46791172e9250948463efd042159e998e3..6003f94543547e67204314e7e9b7fe85364b2113 100644 (file)
@@ -992,6 +992,8 @@ struct amdgpu_device {
 
        bool                            pm_sysfs_en;
        bool                            ucode_sysfs_en;
+
+       bool                            in_baco;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
index fc53faac41479a62fec2a153970bfdbfa0ee8ec4..114f5bca581ab958e2e0e8bad6366e924e69a5ba 100644 (file)
@@ -2661,7 +2661,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
        struct amdgpu_device *adev =
                container_of(__work, struct amdgpu_device, xgmi_reset_work);
 
-       adev->asic_reset_res =  amdgpu_asic_reset(adev);
+       if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
+               adev->asic_reset_res = (adev->in_baco == false) ?
+                               amdgpu_device_baco_enter(adev->ddev) :
+                               amdgpu_device_baco_exit(adev->ddev);
+       else
+               adev->asic_reset_res = amdgpu_asic_reset(adev);
+
        if (adev->asic_reset_res)
                DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
                         adev->asic_reset_res, adev->ddev->unique);
@@ -3787,13 +3793,18 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
        return r;
 }
 
-static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
+                              struct amdgpu_hive_info *hive,
                               struct list_head *device_list_handle,
                               bool *need_full_reset_arg)
 {
        struct amdgpu_device *tmp_adev = NULL;
        bool need_full_reset = *need_full_reset_arg, vram_lost = false;
        int r = 0;
+       int cpu = smp_processor_id();
+       bool use_baco =
+               (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
+               true : false;
 
        /*
         * ASIC reset has to be done on all HGMI hive nodes ASAP
@@ -3801,21 +3812,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
         */
        if (need_full_reset) {
                list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-                       /* For XGMI run all resets in parallel to speed up the process */
+                       /*
+                        * For XGMI run all resets in parallel to speed up the
+                        * process by scheduling the highpri wq on different
+                        * cpus. For XGMI with baco reset, all nodes must enter
+                        * baco within close proximity before anyone exit.
+                        */
                        if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-                               if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+                               if (!queue_work_on(cpu, system_highpri_wq,
+                                                  &tmp_adev->xgmi_reset_work))
                                        r = -EALREADY;
+                               cpu = cpumask_next(cpu, cpu_online_mask);
                        } else
                                r = amdgpu_asic_reset(tmp_adev);
-
-                       if (r) {
-                               DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
-                                        r, tmp_adev->ddev->unique);
+                       if (r)
                                break;
-                       }
                }
 
-               /* For XGMI wait for all PSP resets to complete before proceed */
+               /* For XGMI wait for all work to complete before proceed */
                if (!r) {
                        list_for_each_entry(tmp_adev, device_list_handle,
                                            gmc.xgmi.head) {
@@ -3824,11 +3838,54 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                                        r = tmp_adev->asic_reset_res;
                                        if (r)
                                                break;
+                                       if (use_baco)
+                                               tmp_adev->in_baco = true;
                                }
                        }
                }
-       }
 
+               /*
+                * For XGMI with baco reset, need exit baco phase by scheduling
+                * xgmi_reset_work one more time. PSP reset and sGPU skips this
+                * phase. Not assume the situation that PSP reset and baco reset
+                * coexist within an XGMI hive.
+                */
+
+               if (!r && use_baco) {
+                       cpu = smp_processor_id();
+                       list_for_each_entry(tmp_adev, device_list_handle,
+                                           gmc.xgmi.head) {
+                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+                                       if (!queue_work_on(cpu,
+                                               system_highpri_wq,
+                                               &tmp_adev->xgmi_reset_work))
+                                               r = -EALREADY;
+                                       if (r)
+                                               break;
+                                       cpu = cpumask_next(cpu, cpu_online_mask);
+                               }
+                       }
+               }
+
+               if (!r && use_baco) {
+                       list_for_each_entry(tmp_adev, device_list_handle,
+                                           gmc.xgmi.head) {
+                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+                                       flush_work(&tmp_adev->xgmi_reset_work);
+                                       r = tmp_adev->asic_reset_res;
+                                       if (r)
+                                               break;
+                                       tmp_adev->in_baco = false;
+                               }
+                       }
+               }
+
+               if (r) {
+                       DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+                                r, tmp_adev->ddev->unique);
+                       goto end;
+               }
+       }
 
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
                if (need_full_reset) {
@@ -4113,7 +4170,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                if (r)
                        adev->asic_reset_res = r;
        } else {
-               r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
+               r  = amdgpu_do_asic_reset(adev, hive, device_list_handle,
+                                         &need_full_reset);
                if (r && r == -EAGAIN)
                        goto retry;
        }