From c688a06bc661749a944a2980f0ff0cee8659ac81 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Mon, 21 Oct 2019 16:56:00 +0800 Subject: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case (v3) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Ras reboot debugfs node allows user one easy control to avoid gpu recovery hang problem and directly reboot system per card basis, after ras uncorrectable error happens. However, it is one common entry, which should get rid of ras_ctrl node and remove ip dependence when inputting by user. So add one new auto_reboot node in ras debugfs dir to achieve this. v2: in commit mssage, add justification why ras reboot debugfs node is needed. v3: use debugfs_create_bool to create debugfs file for boolean value Signed-off-by: Guchun Chen Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 6220394521e4..2d9e13d2a71a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 1; else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) op = 2; - else if (sscanf(str, "reboot %32s", block_name) == 1) - op = 3; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; @@ -218,12 +216,11 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, * value to the address. * * Second member: struct ras_debug_if::op. - * It has four kinds of operations. + * It has three kinds of operations. * * - 0: disable RAS on the block. Take ::head as its data. * - 1: enable RAS on the block. Take ::head as its data. * - 2: inject errors on the block. Take ::inject as its data. - * - 3: reboot on unrecoverable error * * How to use the interface? * programs: @@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * /* data.inject.address is offset instead of absolute gpu address */ ret = amdgpu_ras_error_inject(adev, &data.inject); break; - case 3: - amdgpu_ras_get_context(adev)->reboot = true; - break; default: ret = -EINVAL; break; @@ -1037,6 +1031,17 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) adev, &amdgpu_ras_debugfs_ctrl_ops); debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, adev, &amdgpu_ras_debugfs_eeprom_ops); + + /* + * After one uncorrectable error happens, usually GPU recovery will + * be scheduled. But due to the known problem in GPU recovery failing + * to bring GPU back, below interface provides one direct way to + * user to reboot system automatically in such case within + * ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery routine + * will never be called. + */ + debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir, + &con->reboot); } void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, -- 2.45.2