From 58e698861255129a00765b69c0499bc0d044feb4 Mon Sep 17 00:00:00 2001 From: Lan Xiao Date: Wed, 11 Jul 2018 22:32:51 -0400 Subject: [PATCH] drm/amdkfd: fix zero reading of VMID and PASID for Hawaii MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Upon VM Fault, the VMID and PASID written by HW are zeros in Hawaii. Instead of reading from ih_ring_entry, read directly from the registers. This workaround fix the soft hang issues caused by mishandled VM Fault in Hawaii. Signed-off-by: Lan Xiao Signed-off-by: Felix Kuehling Acked-by: Christian König Signed-off-by: Oded Gabbay --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 20 ++++++++++++- .../gpu/drm/amd/amdkfd/cik_event_interrupt.c | 29 ++++++++++++++++++- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 14 +++++++-- .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 4 ++- drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 6 ++-- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 9 ++++-- .../gpu/drm/amd/include/kgd_kfd_interface.h | 5 ++++ 7 files changed, 77 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index befc7c48b1cf..b4a05c510c75 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -145,6 +145,7 @@ static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, uint32_t page_table_base); static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid); +static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd); /* Because of REG_GET_FIELD() being used, we put this function in the * asic specific file. @@ -216,7 +217,8 @@ static const struct kfd2kgd_calls kfd2kgd = { .invalidate_tlbs = invalidate_tlbs, .invalidate_tlbs_vmid = invalidate_tlbs_vmid, .submit_ib = amdgpu_amdkfd_submit_ib, - .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info + .get_vm_fault_info = amdgpu_amdkfd_gpuvm_get_vm_fault_info, + .read_vmid_from_vmfault_reg = read_vmid_from_vmfault_reg }; struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void) @@ -912,3 +914,19 @@ static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid) RREG32(mmVM_INVALIDATE_RESPONSE); return 0; } + + /** + * read_vmid_from_vmfault_reg - read vmid from register + * + * adev: amdgpu_device pointer + * @vmid: vmid pointer + * read vmid from register (CIK). + */ +static uint32_t read_vmid_from_vmfault_reg(struct kgd_dev *kgd) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + + uint32_t status = RREG32(mmVM_CONTEXT1_PROTECTION_FAULT_STATUS); + + return REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS, VMID); +} diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index cc33870e7edb..5d2475d5392c 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -25,12 +25,39 @@ #include "cik_int.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, - const uint32_t *ih_ring_entry) + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, + bool *patched_flag) { const struct cik_ih_ring_entry *ihre = (const struct cik_ih_ring_entry *)ih_ring_entry; + const struct kfd2kgd_calls *f2g = dev->kfd2kgd; unsigned int vmid, pasid; + /* This workaround is due to HW/FW limitation on Hawaii that + * VMID and PASID are not written into ih_ring_entry + */ + if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || + ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && + dev->device_info->asic_family == CHIP_HAWAII) { + struct cik_ih_ring_entry *tmp_ihre = + (struct cik_ih_ring_entry *)patched_ihre; + + *patched_flag = true; + *tmp_ihre = *ihre; + + vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd); + pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid); + + tmp_ihre->ring_id &= 0x000000ff; + tmp_ihre->ring_id |= vmid << 8; + tmp_ihre->ring_id |= pasid << 16; + + return (pasid != 0) && + vmid >= dev->vm_info.first_vmid_kfd && + vmid <= dev->vm_info.last_vmid_kfd; + } + /* Only handle interrupts from KFD VMIDs */ vmid = (ihre->ring_id & 0x0000ff00) >> 8; if (vmid < dev->vm_info.first_vmid_kfd || diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 48c505e83217..600751175760 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -577,14 +577,24 @@ static int kfd_resume(struct kfd_dev *kfd) /* This is called directly from KGD at ISR. */ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) { + uint32_t patched_ihre[KFD_MAX_RING_ENTRY_SIZE]; + bool is_patched = false; + if (!kfd->init_complete) return; + if (kfd->device_info->ih_ring_entry_size > sizeof(patched_ihre)) { + dev_err_once(kfd_device, "Ring entry too small\n"); + return; + } + spin_lock(&kfd->interrupt_lock); if (kfd->interrupts_active - && interrupt_is_wanted(kfd, ih_ring_entry) - && enqueue_ih_ring_entry(kfd, ih_ring_entry)) + && interrupt_is_wanted(kfd, ih_ring_entry, + patched_ihre, &is_patched) + && enqueue_ih_ring_entry(kfd, + is_patched ? patched_ihre : ih_ring_entry)) queue_work(kfd->ih_wq, &kfd->interrupt_work); spin_unlock(&kfd->interrupt_lock); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index d6b64e692760..f836897bbf58 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -26,7 +26,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev, - const uint32_t *ih_ring_entry) + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, + bool *patched_flag) { uint16_t source_id, client_id, pasid, vmid; const uint32_t *data = ih_ring_entry; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index db6d9336b80d..c56ac47cd318 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -151,13 +151,15 @@ static void interrupt_wq(struct work_struct *work) ih_ring_entry); } -bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) +bool interrupt_is_wanted(struct kfd_dev *dev, + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, bool *flag) { /* integer and bitwise OR so there is no boolean short-circuiting */ unsigned int wanted = 0; wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, - ih_ring_entry); + ih_ring_entry, patched_ihre, flag); return wanted != 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 91a3368421b1..cd5121d925e0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -180,9 +180,10 @@ enum cache_policy { struct kfd_event_interrupt_class { bool (*interrupt_isr)(struct kfd_dev *dev, - const uint32_t *ih_ring_entry); + const uint32_t *ih_ring_entry, uint32_t *patched_ihre, + bool *patched_flag); void (*interrupt_wq)(struct kfd_dev *dev, - const uint32_t *ih_ring_entry); + const uint32_t *ih_ring_entry); }; struct kfd_device_info { @@ -806,7 +807,9 @@ int kfd_interrupt_init(struct kfd_dev *dev); void kfd_interrupt_exit(struct kfd_dev *dev); void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry); -bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry); +bool interrupt_is_wanted(struct kfd_dev *dev, + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, bool *flag); /* Power Management */ void kgd2kfd_suspend(struct kfd_dev *kfd); diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index 28b11d105288..76a30cbeee19 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -276,6 +276,10 @@ struct tile_config { * faults. On GFXv9 VM fault information is fully contained in the IH * packet and this function is not needed. * + * @read_vmid_from_vmfault_reg: On Hawaii the VMID is not set in the + * IH ring entry. This function allows the KFD ISR to get the VMID + * from the fault status register as early as possible. + * * This structure contains function pointers to services that the kgd driver * provides to amdkfd driver. * @@ -394,6 +398,7 @@ struct kfd2kgd_calls { int (*get_vm_fault_info)(struct kgd_dev *kgd, struct kfd_vm_fault_info *info); + uint32_t (*read_vmid_from_vmfault_reg)(struct kgd_dev *kgd); }; /** -- 2.45.2