asedeno.scripts.mit.edu Git - linux.git/blobdiff - drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drm/amdgpu: fix build error without CONFIG_HSA_AMD
[linux.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_device.c
index 682833f90fddfc1fad1d9658ce4e61dad99ae123..b893ec935b841c290ba73a6a82ed5b4123cb92bc 100644 (file)
 #include "amdgpu_ras.h"
 #include "amdgpu_pmu.h"
 
+#include <linux/suspend.h>
+
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
 
 #define AMDGPU_RESUME_MS               2000
 
-static const char *amdgpu_asic_name[] = {
+const char *amdgpu_asic_name[] = {
        "TAHITI",
        "PITCAIRN",
        "VERDE",
@@ -102,6 +105,7 @@ static const char *amdgpu_asic_name[] = {
        "VEGA20",
        "RAVEN",
        "ARCTURUS",
+       "RENOIR",
        "NAVI10",
        "NAVI14",
        "NAVI12",
@@ -1427,6 +1431,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
        case CHIP_ARCTURUS:
                chip_name = "arcturus";
                break;
+       case CHIP_RENOIR:
+               chip_name = "renoir";
+               break;
        case CHIP_NAVI10:
                chip_name = "navi10";
                break;
@@ -1579,7 +1586,9 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
        case CHIP_VEGA20:
        case CHIP_RAVEN:
        case CHIP_ARCTURUS:
-               if (adev->asic_type == CHIP_RAVEN)
+       case CHIP_RENOIR:
+               if (adev->asic_type == CHIP_RAVEN ||
+                   adev->asic_type == CHIP_RENOIR)
                        adev->family = AMDGPU_FAMILY_RV;
                else
                        adev->family = AMDGPU_FAMILY_AI;
@@ -1615,7 +1624,11 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
        }
 
        adev->pm.pp_feature = amdgpu_pp_feature_mask;
-       if (amdgpu_sriov_vf(adev))
+       if (amdgpu_sriov_vf(adev)
+           #ifdef CONFIG_HSA_AMD
+           || sched_policy == KFD_SCHED_POLICY_NO_HWS
+           #endif
+           )
                adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
 
        for (i = 0; i < adev->num_ip_blocks; i++) {
@@ -2511,6 +2524,9 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
        case CHIP_NAVI10:
        case CHIP_NAVI14:
        case CHIP_NAVI12:
+#endif
+#if defined(CONFIG_DRM_AMD_DC_DCN2_1)
+       case CHIP_RENOIR:
 #endif
                return amdgpu_dc != 0;
 #endif
@@ -2573,7 +2589,12 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        adev->ddev = ddev;
        adev->pdev = pdev;
        adev->flags = flags;
-       adev->asic_type = flags & AMD_ASIC_MASK;
+
+       if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
+               adev->asic_type = amdgpu_force_asic_type;
+       else
+               adev->asic_type = flags & AMD_ASIC_MASK;
+
        adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
        if (amdgpu_emu_mode == 1)
                adev->usec_timeout *= 2;
@@ -3476,7 +3497,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
        amdgpu_virt_init_data_exchange(adev);
        amdgpu_virt_release_full_gpu(adev, true);
        if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
-               atomic_inc(&adev->vram_lost_counter);
+               amdgpu_inc_vram_lost(adev);
                r = amdgpu_device_recover_vram(adev);
        }
 
@@ -3518,6 +3539,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
                case CHIP_VEGA20:
                case CHIP_VEGA10:
                case CHIP_VEGA12:
+               case CHIP_RAVEN:
                        break;
                default:
                        goto disabled;
@@ -3617,11 +3639,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                                                break;
                                }
                        }
-
-                       list_for_each_entry(tmp_adev, device_list_handle,
-                                       gmc.xgmi.head) {
-                               amdgpu_ras_reserve_bad_pages(tmp_adev);
-                       }
                }
        }
 
@@ -3641,7 +3658,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                                vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
                                if (vram_lost) {
                                        DRM_INFO("VRAM is lost due to GPU reset!\n");
-                                       atomic_inc(&tmp_adev->vram_lost_counter);
+                                       amdgpu_inc_vram_lost(tmp_adev);
                                }
 
                                r = amdgpu_gtt_mgr_recover(
@@ -3725,25 +3742,18 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
                adev->mp1_state = PP_MP1_STATE_NONE;
                break;
        }
-       /* Block kfd: SRIOV would do it separately */
-       if (!amdgpu_sriov_vf(adev))
-                amdgpu_amdkfd_pre_reset(adev);
 
        return true;
 }
 
 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 {
-       /*unlock kfd: SRIOV would do it separately */
-       if (!amdgpu_sriov_vf(adev))
-                amdgpu_amdkfd_post_reset(adev);
        amdgpu_vf_error_trans_all(adev);
        adev->mp1_state = PP_MP1_STATE_NONE;
        adev->in_gpu_reset = 0;
        mutex_unlock(&adev->lock_reset);
 }
 
-
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -3763,11 +3773,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
        struct amdgpu_hive_info *hive = NULL;
        struct amdgpu_device *tmp_adev = NULL;
        int i, r = 0;
+       bool in_ras_intr = amdgpu_ras_intr_triggered();
+
+       /*
+        * Flush RAM to disk so that after reboot
+        * the user can read log and see why the system rebooted.
+        */
+       if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
+
+               DRM_WARN("Emergency reboot.");
+
+               ksys_sync_helper();
+               emergency_restart();
+       }
 
        need_full_reset = job_signaled = false;
        INIT_LIST_HEAD(&device_list);
 
-       dev_info(adev->dev, "GPU reset begin!\n");
+       dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");
 
        cancel_delayed_work_sync(&adev->delayed_init_work);
 
@@ -3783,20 +3806,27 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
        if (hive && !mutex_trylock(&hive->reset_lock)) {
                DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
-                        job->base.id, hive->hive_id);
+                         job ? job->base.id : -1, hive->hive_id);
                return 0;
        }
 
        /* Start with adev pre asic reset first for soft reset check.*/
        if (!amdgpu_device_lock_adev(adev, !hive)) {
                DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
-                                        job->base.id);
+                         job ? job->base.id : -1);
                return 0;
        }
 
+       /* Block kfd: SRIOV would do it separately */
+       if (!amdgpu_sriov_vf(adev))
+                amdgpu_amdkfd_pre_reset(adev);
+
        /* Build list of devices to reset */
        if  (adev->gmc.xgmi.num_physical_nodes > 1) {
                if (!hive) {
+                       /*unlock kfd: SRIOV would do it separately */
+                       if (!amdgpu_sriov_vf(adev))
+                               amdgpu_amdkfd_post_reset(adev);
                        amdgpu_device_unlock_adev(adev);
                        return -ENODEV;
                }
@@ -3812,17 +3842,22 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                device_list_handle = &device_list;
        }
 
-       /*
-        * Mark these ASICs to be reseted as untracked first
-        * And add them back after reset completed
-        */
-       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head)
-               amdgpu_unregister_gpu_instance(tmp_adev);
-
        /* block all schedulers and reset given job's ring */
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+               if (tmp_adev != adev) {
+                       amdgpu_device_lock_adev(tmp_adev, false);
+                       if (!amdgpu_sriov_vf(tmp_adev))
+                                       amdgpu_amdkfd_pre_reset(tmp_adev);
+               }
+
+               /*
+                * Mark these ASICs to be reseted as untracked first
+                * And add them back after reset completed
+                */
+               amdgpu_unregister_gpu_instance(tmp_adev);
+
                /* disable ras on ALL IPs */
-               if (amdgpu_device_ip_need_full_reset(tmp_adev))
+               if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))
                        amdgpu_ras_suspend(tmp_adev);
 
                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
@@ -3831,11 +3866,17 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                        if (!ring || !ring->sched.thread)
                                continue;
 
-                       drm_sched_stop(&ring->sched, &job->base);
+                       drm_sched_stop(&ring->sched, job ? &job->base : NULL);
+
+                       if (in_ras_intr)
+                               amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
                }
        }
 
 
+       if (in_ras_intr)
+               goto skip_sched_resume;
+
        /*
         * Must check guilty signal here since after this point all old
         * HW fences are force signaled.
@@ -3846,9 +3887,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
            dma_fence_is_signaled(job->base.s_fence->parent))
                job_signaled = true;
 
-       if (!amdgpu_device_ip_need_full_reset(adev))
-               device_list_handle = &device_list;
-
        if (job_signaled) {
                dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
                goto skip_hw_reset;
@@ -3856,9 +3894,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 
        /* Guilty job will be freed after this*/
-       r = amdgpu_device_pre_asic_reset(adev,
-                                        job,
-                                        &need_full_reset);
+       r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset);
        if (r) {
                /*TODO Should we stop ?*/
                DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
@@ -3872,7 +3908,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                if (tmp_adev == adev)
                        continue;
 
-               amdgpu_device_lock_adev(tmp_adev, false);
                r = amdgpu_device_pre_asic_reset(tmp_adev,
                                                 NULL,
                                                 &need_full_reset);
@@ -3900,6 +3935,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
        /* Post ASIC reset for all devs .*/
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+
                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                        struct amdgpu_ring *ring = tmp_adev->rings[i];
 
@@ -3921,12 +3957,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
                if (r) {
                        /* bad news, how to tell it to userspace ? */
-                       dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
+                       dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
                        amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
                } else {
-                       dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter));
+                       dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
                }
+       }
 
+skip_sched_resume:
+       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+               /*unlock kfd: SRIOV would do it separately */
+               if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev))
+                       amdgpu_amdkfd_post_reset(tmp_adev);
                amdgpu_device_unlock_adev(tmp_adev);
        }