From 0e9a860c72ec387140a0feb4b8d9a6d0004e9316 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 11 Jul 2018 22:33:05 -0400 Subject: [PATCH] drm/amdkfd: Introduce KFD module parameter halt_if_hws_hang MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This avoids triggering a GPU reset or otherwise changing the HW state. Instead KFD will hang, which allows HW debugging tools to analyze the problem. Signed-off-by: Yong Zhao Reviewed-by: Felix Kuehling Signed-off-by: Felix Kuehling Acked-by: Christian König Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 7 +++++++ drivers/gpu/drm/amd/amdkfd/kfd_module.c | 4 ++++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +++++ 3 files changed, 16 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 32e93b53e7e8..5d05d125c3c1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1217,6 +1217,13 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, while (*fence_addr != fence_value) { if (time_after(jiffies, end_jiffies)) { pr_err("qcm fence wait loop timeout expired\n"); + /* In HWS case, this is used to halt the driver thread + * in order not to mess up CP states before doing + * scandumps for FW debugging. + */ + while (halt_if_hws_hang) + schedule(); + return -ETIME; } schedule(); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index ee7bf07db472..3a8c15ad0c64 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -92,6 +92,10 @@ MODULE_PARM_DESC(noretry, static int amdkfd_init_completed; +int halt_if_hws_hang; +module_param(halt_if_hws_hang, int, 0644); +MODULE_PARM_DESC(halt_if_hws_hang, "Halt if HWS hang is detected (0 = off (default), 1 = on)"); + int kgd2kfd_init(unsigned int interface_version, const struct kgd2kfd_calls **g2f) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index d9bf70b52857..8473e7b3dcc2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -144,6 +144,11 @@ extern int ignore_crat; */ extern int vega10_noretry; +/* + * Halt if HWS hang is detected + */ +extern int halt_if_hws_hang; + /** * enum kfd_sched_policy * -- 2.45.2