]> asedeno.scripts.mit.edu Git - linux.git/blobdiff - arch/powerpc/kernel/eeh_driver.c
Merge tag 'powerpc-4.17-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
[linux.git] / arch / powerpc / kernel / eeh_driver.c
index 0c0b66fc5bfb32f6c61e8fa454fbfe1a467db89b..b8a329f048141c28940c49018ba67332c01096b6 100644 (file)
@@ -207,18 +207,18 @@ static void *eeh_report_error(void *data, void *userdata)
 
        if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
                return NULL;
+
+       device_lock(&dev->dev);
        dev->error_state = pci_channel_io_frozen;
 
        driver = eeh_pcid_get(dev);
-       if (!driver) return NULL;
+       if (!driver) goto out_no_dev;
 
        eeh_disable_irq(dev);
 
        if (!driver->err_handler ||
-           !driver->err_handler->error_detected) {
-               eeh_pcid_put(dev);
-               return NULL;
-       }
+           !driver->err_handler->error_detected)
+               goto out;
 
        rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
 
@@ -227,8 +227,12 @@ static void *eeh_report_error(void *data, void *userdata)
        if (*res == PCI_ERS_RESULT_NONE) *res = rc;
 
        edev->in_error = true;
-       eeh_pcid_put(dev);
        pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
+
+out:
+       eeh_pcid_put(dev);
+out_no_dev:
+       device_unlock(&dev->dev);
        return NULL;
 }
 
@@ -251,15 +255,14 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
        if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
                return NULL;
 
+       device_lock(&dev->dev);
        driver = eeh_pcid_get(dev);
-       if (!driver) return NULL;
+       if (!driver) goto out_no_dev;
 
        if (!driver->err_handler ||
            !driver->err_handler->mmio_enabled ||
-           (edev->mode & EEH_DEV_NO_HANDLER)) {
-               eeh_pcid_put(dev);
-               return NULL;
-       }
+           (edev->mode & EEH_DEV_NO_HANDLER))
+               goto out;
 
        rc = driver->err_handler->mmio_enabled(dev);
 
@@ -267,7 +270,10 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
        if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
        if (*res == PCI_ERS_RESULT_NONE) *res = rc;
 
+out:
        eeh_pcid_put(dev);
+out_no_dev:
+       device_unlock(&dev->dev);
        return NULL;
 }
 
@@ -290,20 +296,20 @@ static void *eeh_report_reset(void *data, void *userdata)
 
        if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
                return NULL;
+
+       device_lock(&dev->dev);
        dev->error_state = pci_channel_io_normal;
 
        driver = eeh_pcid_get(dev);
-       if (!driver) return NULL;
+       if (!driver) goto out_no_dev;
 
        eeh_enable_irq(dev);
 
        if (!driver->err_handler ||
            !driver->err_handler->slot_reset ||
            (edev->mode & EEH_DEV_NO_HANDLER) ||
-           (!edev->in_error)) {
-               eeh_pcid_put(dev);
-               return NULL;
-       }
+           (!edev->in_error))
+               goto out;
 
        rc = driver->err_handler->slot_reset(dev);
        if ((*res == PCI_ERS_RESULT_NONE) ||
@@ -311,7 +317,10 @@ static void *eeh_report_reset(void *data, void *userdata)
        if (*res == PCI_ERS_RESULT_DISCONNECT &&
             rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 
+out:
        eeh_pcid_put(dev);
+out_no_dev:
+       device_unlock(&dev->dev);
        return NULL;
 }
 
@@ -362,10 +371,12 @@ static void *eeh_report_resume(void *data, void *userdata)
 
        if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
                return NULL;
+
+       device_lock(&dev->dev);
        dev->error_state = pci_channel_io_normal;
 
        driver = eeh_pcid_get(dev);
-       if (!driver) return NULL;
+       if (!driver) goto out_no_dev;
 
        was_in_error = edev->in_error;
        edev->in_error = false;
@@ -375,18 +386,20 @@ static void *eeh_report_resume(void *data, void *userdata)
            !driver->err_handler->resume ||
            (edev->mode & EEH_DEV_NO_HANDLER) || !was_in_error) {
                edev->mode &= ~EEH_DEV_NO_HANDLER;
-               eeh_pcid_put(dev);
-               return NULL;
+               goto out;
        }
 
        driver->err_handler->resume(dev);
 
-       eeh_pcid_put(dev);
        pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
+out:
+       eeh_pcid_put(dev);
 #ifdef CONFIG_PCI_IOV
        if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev))
                eeh_ops->notify_resume(eeh_dev_to_pdn(edev));
 #endif
+out_no_dev:
+       device_unlock(&dev->dev);
        return NULL;
 }
 
@@ -406,23 +419,26 @@ static void *eeh_report_failure(void *data, void *userdata)
 
        if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
                return NULL;
+
+       device_lock(&dev->dev);
        dev->error_state = pci_channel_io_perm_failure;
 
        driver = eeh_pcid_get(dev);
-       if (!driver) return NULL;
+       if (!driver) goto out_no_dev;
 
        eeh_disable_irq(dev);
 
        if (!driver->err_handler ||
-           !driver->err_handler->error_detected) {
-               eeh_pcid_put(dev);
-               return NULL;
-       }
+           !driver->err_handler->error_detected)
+               goto out;
 
        driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
 
-       eeh_pcid_put(dev);
        pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
+out:
+       eeh_pcid_put(dev);
+out_no_dev:
+       device_unlock(&dev->dev);
        return NULL;
 }
 
@@ -619,17 +635,19 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
 
 /**
  * eeh_reset_device - Perform actual reset of a pci slot
+ * @driver_eeh_aware: Does the device's driver provide EEH support?
  * @pe: EEH PE
  * @bus: PCI bus corresponding to the isolcated slot
+ * @rmv_data: Optional, list to record removed devices
  *
  * This routine must be called to do reset on the indicated PE.
  * During the reset, udev might be invoked because those affected
  * PCI devices will be removed and then added.
  */
 static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
-                               struct eeh_rmv_data *rmv_data)
+                           struct eeh_rmv_data *rmv_data,
+                           bool driver_eeh_aware)
 {
-       struct pci_bus *frozen_bus = eeh_pe_bus_get(pe);
        time64_t tstamp;
        int cnt, rc;
        struct eeh_dev *edev;
@@ -645,16 +663,12 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
         * into pci_hp_add_devices().
         */
        eeh_pe_state_mark(pe, EEH_PE_KEEP);
-       if (bus) {
-               if (pe->type & EEH_PE_VF) {
-                       eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
-               } else {
-                       pci_lock_rescan_remove();
-                       pci_hp_remove_devices(bus);
-                       pci_unlock_rescan_remove();
-               }
-       } else if (frozen_bus) {
+       if (driver_eeh_aware || (pe->type & EEH_PE_VF)) {
                eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
+       } else {
+               pci_lock_rescan_remove();
+               pci_hp_remove_devices(bus);
+               pci_unlock_rescan_remove();
        }
 
        /*
@@ -689,8 +703,9 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
         * the device up before the scripts have taken it down,
         * potentially weird things happen.
         */
-       if (bus) {
-               pr_info("EEH: Sleep 5s ahead of complete hotplug\n");
+       if (!driver_eeh_aware || rmv_data->removed) {
+               pr_info("EEH: Sleep 5s ahead of %s hotplug\n",
+                       (driver_eeh_aware ? "partial" : "complete"));
                ssleep(5);
 
                /*
@@ -703,19 +718,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
                if (pe->type & EEH_PE_VF) {
                        eeh_add_virt_device(edev, NULL);
                } else {
-                       eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
+                       if (!driver_eeh_aware)
+                               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
                        pci_hp_add_devices(bus);
                }
-       } else if (frozen_bus && rmv_data->removed) {
-               pr_info("EEH: Sleep 5s ahead of partial hotplug\n");
-               ssleep(5);
-
-               edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
-               eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
-               if (pe->type & EEH_PE_VF)
-                       eeh_add_virt_device(edev, NULL);
-               else
-                       pci_hp_add_devices(frozen_bus);
        }
        eeh_pe_state_clear(pe, EEH_PE_KEEP);
 
@@ -733,28 +739,42 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
 
 /**
  * eeh_handle_normal_event - Handle EEH events on a specific PE
- * @pe: EEH PE
+ * @pe: EEH PE - which should not be used after we return, as it may
+ * have been invalidated.
  *
  * Attempts to recover the given PE.  If recovery fails or the PE has failed
  * too many times, remove the PE.
  *
- * Returns true if @pe should no longer be used, else false.
+ * When the PHB detects address or data parity errors on a particular
+ * PCI slot, the associated PE will be frozen. In addition, DMAs to
+ * wild addresses (which usually happen due to bugs in device drivers
+ * or in PCI adapter firmware) can cause an EEH error. #SERR, #PERR or
+ * other misc PCI-related errors can also trigger EEH errors.
+ *
+ * The recovery process consists of unplugging the device driver
+ * (which generates hotplug events to userspace), then issuing a PCI
+ * #RST to the device, then reconfiguring the PCI config space for all
+ * bridges & devices under this slot, and then finally restarting the
+ * device drivers (which causes a second set of hotplug events to go
+ * out to userspace).
  */
-static bool eeh_handle_normal_event(struct eeh_pe *pe)
+void eeh_handle_normal_event(struct eeh_pe *pe)
 {
-       struct pci_bus *frozen_bus;
+       struct pci_bus *bus;
        struct eeh_dev *edev, *tmp;
        int rc = 0;
        enum pci_ers_result result = PCI_ERS_RESULT_NONE;
        struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0};
 
-       frozen_bus = eeh_pe_bus_get(pe);
-       if (!frozen_bus) {
+       bus = eeh_pe_bus_get(pe);
+       if (!bus) {
                pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
                        __func__, pe->phb->global_number, pe->addr);
-               return false;
+               return;
        }
 
+       eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+
        eeh_pe_update_time_stamp(pe);
        pe->freeze_count++;
        if (pe->freeze_count > eeh_max_freezes) {
@@ -806,7 +826,7 @@ static bool eeh_handle_normal_event(struct eeh_pe *pe)
         */
        if (result == PCI_ERS_RESULT_NONE) {
                pr_info("EEH: Reset with hotplug activity\n");
-               rc = eeh_reset_device(pe, frozen_bus, NULL);
+               rc = eeh_reset_device(pe, bus, NULL, false);
                if (rc) {
                        pr_warn("%s: Unable to reset, err=%d\n",
                                __func__, rc);
@@ -858,7 +878,7 @@ static bool eeh_handle_normal_event(struct eeh_pe *pe)
        /* If any device called out for a reset, then reset the slot */
        if (result == PCI_ERS_RESULT_NEED_RESET) {
                pr_info("EEH: Reset without hotplug activity\n");
-               rc = eeh_reset_device(pe, NULL, &rmv_data);
+               rc = eeh_reset_device(pe, bus, &rmv_data, true);
                if (rc) {
                        pr_warn("%s: Cannot reset, err=%d\n",
                                __func__, rc);
@@ -891,7 +911,7 @@ static bool eeh_handle_normal_event(struct eeh_pe *pe)
        pr_info("EEH: Notify device driver to resume\n");
        eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
 
-       return false;
+       goto final;
 
 hard_fail:
        /*
@@ -916,23 +936,21 @@ static bool eeh_handle_normal_event(struct eeh_pe *pe)
         * all removed devices correctly to avoid access
         * the their PCI config any more.
         */
-       if (frozen_bus) {
-               if (pe->type & EEH_PE_VF) {
-                       eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
-                       eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
-               } else {
-                       eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
-                       eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
-
-                       pci_lock_rescan_remove();
-                       pci_hp_remove_devices(frozen_bus);
-                       pci_unlock_rescan_remove();
+       if (pe->type & EEH_PE_VF) {
+               eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
+               eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
+       } else {
+               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
+               eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
 
-                       /* The passed PE should no longer be used */
-                       return true;
-               }
+               pci_lock_rescan_remove();
+               pci_hp_remove_devices(bus);
+               pci_unlock_rescan_remove();
+               /* The passed PE should no longer be used */
+               return;
        }
-       return false;
+final:
+       eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
 }
 
 /**
@@ -942,7 +960,7 @@ static bool eeh_handle_normal_event(struct eeh_pe *pe)
  * specific PE.  Iterates through possible failures and handles them as
  * necessary.
  */
-static void eeh_handle_special_event(void)
+void eeh_handle_special_event(void)
 {
        struct eeh_pe *pe, *phb_pe;
        struct pci_bus *bus;
@@ -1005,15 +1023,7 @@ static void eeh_handle_special_event(void)
                 */
                if (rc == EEH_NEXT_ERR_FROZEN_PE ||
                    rc == EEH_NEXT_ERR_FENCED_PHB) {
-                       /*
-                        * eeh_handle_normal_event() can make the PE stale if it
-                        * determines that the PE cannot possibly be recovered.
-                        * Don't modify the PE state if that's the case.
-                        */
-                       if (eeh_handle_normal_event(pe))
-                               continue;
-
-                       eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+                       eeh_handle_normal_event(pe);
                } else {
                        pci_lock_rescan_remove();
                        list_for_each_entry(hose, &hose_list, list_node) {
@@ -1049,28 +1059,3 @@ static void eeh_handle_special_event(void)
                        break;
        } while (rc != EEH_NEXT_ERR_NONE);
 }
-
-/**
- * eeh_handle_event - Reset a PCI device after hard lockup.
- * @pe: EEH PE
- *
- * While PHB detects address or data parity errors on particular PCI
- * slot, the associated PE will be frozen. Besides, DMA's occurring
- * to wild addresses (which usually happen due to bugs in device
- * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
- * #PERR or other misc PCI-related errors also can trigger EEH errors.
- *
- * Recovery process consists of unplugging the device driver (which
- * generated hotplug events to userspace), then issuing a PCI #RST to
- * the device, then reconfiguring the PCI config space for all bridges
- * & devices under this slot, and then finally restarting the device
- * drivers (which cause a second set of hotplug events to go out to
- * userspace).
- */
-void eeh_handle_event(struct eeh_pe *pe)
-{
-       if (pe)
-               eeh_handle_normal_event(pe);
-       else
-               eeh_handle_special_event();
-}