x86/mce: Handle broadcasted MCE gracefully with kexec

author Xunlei Pang <xlpang@redhat.com>

Mon, 13 Mar 2017 09:50:19 +0000 (10:50 +0100)

committer Thomas Gleixner <tglx@linutronix.de>

Mon, 13 Mar 2017 19:18:07 +0000 (20:18 +0100)
author Xunlei Pang <xlpang@redhat.com>
Mon, 13 Mar 2017 09:50:19 +0000 (10:50 +0100)
committer Thomas Gleixner <tglx@linutronix.de>
Mon, 13 Mar 2017 19:18:07 +0000 (20:18 +0100)
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h

index 2cb1cc253d51ebb5828e89f63b63462396050970..fc62ba8dce933d3bb1ec0262b7d4851bcc5fee26 100644 (file)
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -15,6 +15,7 @@ struct machine_ops {
  };
  
  extern struct machine_ops machine_ops;
+extern int crashing_cpu;
  
  void native_machine_crash_shutdown(struct pt_regs *regs);
  void native_machine_shutdown(void);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c

index 8e9725c607ea6acb7a91deed9b72b2c9a873803e..177472ace83854c314810e4e32d9944cf5ebd227 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -49,6 +49,7 @@
  #include <asm/tlbflush.h>
  #include <asm/mce.h>
  #include <asm/msr.h>
+#include <asm/reboot.h>
  
  #include "mce-internal.h"
  
@@ -1127,9 +1128,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
          * on Intel.
          */
         int lmce = 1;
+       int cpu = smp_processor_id();
  
-       /* If this CPU is offline, just bail out. */
-       if (cpu_is_offline(smp_processor_id())) {
+       /*
+        * Cases where we avoid rendezvous handler timeout:
+        * 1) If this CPU is offline.
+        *
+        * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+        *  skip those CPUs which remain looping in the 1st kernel - see
+        *  crash_nmi_callback().
+        *
+        * Note: there still is a small window between kexec-ing and the new,
+        * kdump kernel establishing a new #MC handler where a broadcasted MCE
+        * might not get handled properly.
+        */
+       if (cpu_is_offline(cpu) ||
+           (crashing_cpu != -1 && crashing_cpu != cpu)) {
                 u64 mcgstatus;
  
                 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c

index 067f9813fd2cf7c15d5a1d297b537eedf6ca7959..2544700a2a87566437e3aea0d8b3696e7e0f439e 100644 (file)
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -765,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs)
  #endif
  
  
+/* This is the CPU performing the emergency shutdown work. */
+int crashing_cpu = -1;
+
  #if defined(CONFIG_SMP)
  
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
  static nmi_shootdown_cb shootdown_callback;
  
  static atomic_t waiting_for_crash_ipi;
author	Xunlei Pang <xlpang@redhat.com>
	Mon, 13 Mar 2017 09:50:19 +0000 (10:50 +0100)
committer	Thomas Gleixner <tglx@linutronix.de>
	Mon, 13 Mar 2017 19:18:07 +0000 (20:18 +0100)
arch/x86/include/asm/reboot.h		patch | blob | history
arch/x86/kernel/cpu/mcheck/mce.c		patch | blob | history
arch/x86/kernel/reboot.c		patch | blob | history