drm/amdgpu: add graceful VM fault handling v3
author    Christian König <christian.koenig@amd.com>
          Fri, 7 Dec 2018 14:18:43 +0000 (15:18 +0100)
committer Alex Deucher <alexander.deucher@amd.com>
          Mon, 16 Sep 2019 15:42:55 +0000 (10:42 -0500)
Next step towards HMM support. For now just silence the retry fault and
optionally redirect the request to the dummy page.
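
In short, the new helper looks up the VM for the faulting PASID and writes a
single PTE covering the faulting address. A minimal sketch of that flow
(hypothetical handle_fault_sketch(); the locking, refcounting and error
handling of the real implementation in amdgpu_vm.c below are left out):

	static bool handle_fault_sketch(struct amdgpu_device *adev,
					unsigned int pasid, uint64_t addr)
	{
		/* Sketch only: the real amdgpu_vm_handle_fault() also takes the
		 * PASID lock, references and reserves the root PD BO, and
		 * re-checks that the VM still exists before touching the
		 * page tables.
		 */
		struct amdgpu_vm *vm;
		uint64_t value, flags;

		vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
		if (!vm)
			return false;

		addr /= AMDGPU_GPU_PAGE_SIZE;
		flags = AMDGPU_PTE_VALID | AMDGPU_PTE_SNOOPED | AMDGPU_PTE_SYSTEM;

		if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_NEVER) {
			/* Redirect the access to the dummy page */
			value = adev->dummy_page_addr;
			flags |= AMDGPU_PTE_EXECUTABLE | AMDGPU_PTE_READABLE |
				 AMDGPU_PTE_WRITEABLE;
		} else {
			/* Leave the PTE empty so the hw retries silently */
			value = 0;
		}

		/* Write the PTE for the faulting page and update the PDEs */
		amdgpu_vm_bo_update_mapping(adev, vm, true, NULL, addr, addr + 1,
					    flags, value, NULL, NULL);
		amdgpu_vm_update_pdes(adev, vm, true);
		return false;
	}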

v2: make sure the VM is not destroyed while we handle the fault.
v3: fix VM destroy check, cleanup comments

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 696253ebdf981596b20970c3f42074229497a157..8327469f5722116dce821e463b5a08a9f4ee2ff4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -3126,3 +3126,76 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
                }
        }
 }
+
+/**
+ * amdgpu_vm_handle_fault - graceful handling of VM faults.
+ * @adev: amdgpu device pointer
+ * @pasid: PASID of the VM
+ * @addr: Address of the fault
+ *
+ * Try to gracefully handle a VM fault. Return true if the fault was handled and
+ * shouldn't be reported any more.
+ */
+bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, unsigned int pasid,
+                           uint64_t addr)
+{
+       struct amdgpu_bo *root;
+       uint64_t value, flags;
+       struct amdgpu_vm *vm;
+       long r;
+
+       spin_lock(&adev->vm_manager.pasid_lock);
+       vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
+       if (vm)
+               root = amdgpu_bo_ref(vm->root.base.bo);
+       else
+               root = NULL;
+       spin_unlock(&adev->vm_manager.pasid_lock);
+
+       if (!root)
+               return false;
+
+       r = amdgpu_bo_reserve(root, true);
+       if (r)
+               goto error_unref;
+
+       /* Double check that the VM still exists */
+       spin_lock(&adev->vm_manager.pasid_lock);
+       vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
+       if (vm && vm->root.base.bo != root)
+               vm = NULL;
+       spin_unlock(&adev->vm_manager.pasid_lock);
+       if (!vm)
+               goto error_unlock;
+
+       addr /= AMDGPU_GPU_PAGE_SIZE;
+       flags = AMDGPU_PTE_VALID | AMDGPU_PTE_SNOOPED |
+               AMDGPU_PTE_SYSTEM;
+
+       if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_NEVER) {
+               /* Redirect the access to the dummy page */
+               value = adev->dummy_page_addr;
+               flags |= AMDGPU_PTE_EXECUTABLE | AMDGPU_PTE_READABLE |
+                       AMDGPU_PTE_WRITEABLE;
+       } else {
+               /* Let the hw retry silently on the PTE */
+               value = 0;
+       }
+
+       r = amdgpu_vm_bo_update_mapping(adev, vm, true, NULL, addr, addr + 1,
+                                       flags, value, NULL, NULL);
+       if (r)
+               goto error_unlock;
+
+       r = amdgpu_vm_update_pdes(adev, vm, true);
+
+error_unlock:
+       amdgpu_bo_unreserve(root);
+       if (r < 0)
+               DRM_ERROR("Can't handle page fault (%ld)\n", r);
+
+error_unref:
+       amdgpu_bo_unref(&root);
+
+       return false;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 0a97dc839f3b1ecea19103d0f51b4ebb740d47a2..4dbbe1b6b413604e33ae7be5abfffc36dcdd45f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -413,6 +413,8 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
 
 void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
                             struct amdgpu_task_info *task_info);
+bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, unsigned int pasid,
+                           uint64_t addr);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index a991a6c4f468d770a325fa3f04867d3eb09a5166..6102deaa03cafef346a5dada69ec5ae29e9885cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -380,6 +380,10 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
        }
 
        /* If it's the first fault for this address, process it normally */
+       if (retry_fault && !in_interrupt() &&
+           amdgpu_vm_handle_fault(adev, entry->pasid, addr))
+               return 1; /* This also prevents sending it to KFD */
+
        if (!amdgpu_sriov_vf(adev)) {
                /*
                 * Issue a dummy read to wait for the status register to