From: Andrey Grodzovsky Date: Wed, 11 Dec 2019 19:18:31 +0000 (-0500) Subject: drm/amdgpu: Redo XGMI reset synchronization. X-Git-Tag: v5.6-rc1~114^2~12^2~116 X-Git-Url: https://asedeno.scripts.mit.edu/gitweb/?a=commitdiff_plain;h=c6a6e2db994528a3eaf1ed938a0b7a35b87b7fa4;p=linux.git drm/amdgpu: Redo XGMI reset synchronization. Use task barrier in XGMI hive to synchronize ASIC resets across devices in XGMI hive. v2: Return right away with a warning if no xgmi hive, update doc. Signed-off-by: Andrey Grodzovsky Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6d52168454b4..277caaf1ea26 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -66,6 +66,7 @@ #include "amdgpu_pmu.h" #include +#include MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); @@ -2664,14 +2665,38 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) { struct amdgpu_device *adev = container_of(__work, struct amdgpu_device, xgmi_reset_work); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); - if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) - adev->asic_reset_res = (adev->in_baco == false) ? - amdgpu_device_baco_enter(adev->ddev) : - amdgpu_device_baco_exit(adev->ddev); - else - adev->asic_reset_res = amdgpu_asic_reset(adev); + /* It's a bug to not have a hive within this function */ + if (WARN_ON(!hive)) + return; + + /* + * Use task barrier to synchronize all xgmi reset works across the + * hive. task_barrier_enter and task_barrier_exit will block + * until all the threads running the xgmi reset works reach + * those points. task_barrier_full will do both blocks. + */ + if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { + + task_barrier_enter(&hive->tb); + adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev); + + if (adev->asic_reset_res) + goto fail; + + task_barrier_exit(&hive->tb); + adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev); + + if (adev->asic_reset_res) + goto fail; + } else { + + task_barrier_full(&hive->tb); + adev->asic_reset_res = amdgpu_asic_reset(adev); + } +fail: if (adev->asic_reset_res) DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", adev->asic_reset_res, adev->ddev->unique);