drm/amdgpu: add concurrent baco reset support for XGMI

author Le Ma <le.ma@amd.com>

Tue, 26 Nov 2019 14:12:31 +0000 (22:12 +0800)

committer Alex Deucher <alexander.deucher@amd.com>

Thu, 5 Dec 2019 21:25:53 +0000 (16:25 -0500)
author Le Ma <le.ma@amd.com>
Tue, 26 Nov 2019 14:12:31 +0000 (22:12 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 5 Dec 2019 21:25:53 +0000 (16:25 -0500)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 2e5b2f46791172e9250948463efd042159e998e3..6003f94543547e67204314e7e9b7fe85364b2113 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -992,6 +992,8 @@ struct amdgpu_device {
  
         bool                            pm_sysfs_en;
         bool                            ucode_sysfs_en;
+
+       bool                            in_baco;
  };
  
  static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index fc53faac41479a62fec2a153970bfdbfa0ee8ec4..114f5bca581ab958e2e0e8bad6366e924e69a5ba 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2661,7 +2661,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
         struct amdgpu_device *adev =
                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
  
-       adev->asic_reset_res =  amdgpu_asic_reset(adev);
+       if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
+               adev->asic_reset_res = (adev->in_baco == false) ?
+                               amdgpu_device_baco_enter(adev->ddev) :
+                               amdgpu_device_baco_exit(adev->ddev);
+       else
+               adev->asic_reset_res = amdgpu_asic_reset(adev);
+
         if (adev->asic_reset_res)
                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
                          adev->asic_reset_res, adev->ddev->unique);
@@ -3787,13 +3793,18 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
         return r;
  }
  
-static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
+                              struct amdgpu_hive_info *hive,
                                struct list_head *device_list_handle,
                                bool *need_full_reset_arg)
  {
         struct amdgpu_device *tmp_adev = NULL;
         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
         int r = 0;
+       int cpu = smp_processor_id();
+       bool use_baco =
+               (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
+               true : false;
  
         /*
          * ASIC reset has to be done on all HGMI hive nodes ASAP
@@ -3801,21 +3812,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
          */
         if (need_full_reset) {
                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-                       /* For XGMI run all resets in parallel to speed up the process */
+                       /*
+                        * For XGMI run all resets in parallel to speed up the
+                        * process by scheduling the highpri wq on different
+                        * cpus. For XGMI with baco reset, all nodes must enter
+                        * baco within close proximity before anyone exit.
+                        */
                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-                               if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+                               if (!queue_work_on(cpu, system_highpri_wq,
+                                                  &tmp_adev->xgmi_reset_work))
                                         r = -EALREADY;
+                               cpu = cpumask_next(cpu, cpu_online_mask);
                         } else
                                 r = amdgpu_asic_reset(tmp_adev);
-
-                       if (r) {
-                               DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
-                                        r, tmp_adev->ddev->unique);
+                       if (r)
                                 break;
-                       }
                 }
  
-               /* For XGMI wait for all PSP resets to complete before proceed */
+               /* For XGMI wait for all work to complete before proceed */
                 if (!r) {
                         list_for_each_entry(tmp_adev, device_list_handle,
                                             gmc.xgmi.head) {
@@ -3824,11 +3838,54 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                                         r = tmp_adev->asic_reset_res;
                                         if (r)
                                                 break;
+                                       if (use_baco)
+                                               tmp_adev->in_baco = true;
                                 }
                         }
                 }
-       }
  
+               /*
+                * For XGMI with baco reset, need exit baco phase by scheduling
+                * xgmi_reset_work one more time. PSP reset and sGPU skips this
+                * phase. Not assume the situation that PSP reset and baco reset
+                * coexist within an XGMI hive.
+                */
+
+               if (!r && use_baco) {
+                       cpu = smp_processor_id();
+                       list_for_each_entry(tmp_adev, device_list_handle,
+                                           gmc.xgmi.head) {
+                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+                                       if (!queue_work_on(cpu,
+                                               system_highpri_wq,
+                                               &tmp_adev->xgmi_reset_work))
+                                               r = -EALREADY;
+                                       if (r)
+                                               break;
+                                       cpu = cpumask_next(cpu, cpu_online_mask);
+                               }
+                       }
+               }
+
+               if (!r && use_baco) {
+                       list_for_each_entry(tmp_adev, device_list_handle,
+                                           gmc.xgmi.head) {
+                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+                                       flush_work(&tmp_adev->xgmi_reset_work);
+                                       r = tmp_adev->asic_reset_res;
+                                       if (r)
+                                               break;
+                                       tmp_adev->in_baco = false;
+                               }
+                       }
+               }
+
+               if (r) {
+                       DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+                                r, tmp_adev->ddev->unique);
+                       goto end;
+               }
+       }
  
         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
                 if (need_full_reset) {
@@ -4113,7 +4170,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                 if (r)
                         adev->asic_reset_res = r;
         } else {
-               r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
+               r  = amdgpu_do_asic_reset(adev, hive, device_list_handle,
+                                         &need_full_reset);
                 if (r && r == -EAGAIN)
                         goto retry;
         }
author	Le Ma <le.ma@amd.com>
	Tue, 26 Nov 2019 14:12:31 +0000 (22:12 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Thu, 5 Dec 2019 21:25:53 +0000 (16:25 -0500)
drivers/gpu/drm/amd/amdgpu/amdgpu.h		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c		patch \| blob \| history