@@ -319,3 +319,44 @@ messages. To help with this, a tool has been created at
`amd-debug-tools <https://git.kernel.org/pub/scm/linux/kernel/git/superm1/amd-debug-tools.git/about/>`_
to help parse the messages.
+Random reboot issues
+====================
+When a random reboot occurs, the high-level reason for the reboot is stored
+in a register that will persist onto the next boot.
+
+There are 6 classes of reasons for the reboot:
+ * Software induced
+ * Power state transition
+ * Pin induced
+ * Hardware induced
+ * Remote reset
+ * Internal CPU event
+
+.. csv-table::
+ :header: "Bit", "Type", "Reason"
+ :align: left
+
+ "0", "Pin", "thermal pin BP_THERMTRIP_L was tripped"
+ "1", "Pin", "power button was pressed for 4 seconds"
+ "2", "Pin", "shutdown pin was shorted"
+ "4", "Remote", "remote ASF power off command was received"
+ "9", "Internal", "internal CPU thermal limit was tripped"
+ "16", "Pin", "system reset pin BP_SYS_RST_L was tripped"
+ "17", "Software", "software issued PCI reset"
+ "18", "Software", "software wrote 0x4 to reset control register 0xCF9"
+ "19", "Software", "software wrote 0x6 to reset control register 0xCF9"
+ "20", "Software", "software wrote 0xE to reset control register 0xCF9"
+ "21", "Sleep", "ACPI power state transition occurred"
+ "22", "Pin", "keyboard reset pin KB_RST_L was asserted"
+ "23", "Internal", "internal CPU shutdown event occurred"
+ "24", "Hardware", "system failed to boot before failed boot timer expired"
+ "25", "Hardware", "hardware watchdog timer expired"
+ "26", "Remote", "remote ASF reset command was received"
+ "27", "Internal", "an uncorrected error caused a data fabric sync flood event"
+ "29", "Internal", "FCH and MP1 failed warm reset handshake"
+ "30", "Internal", "a parity error occurred"
+ "31", "Internal", "a software sync flood event occurred"
+
+This information is read by the kernel at bootup and is saved into the
+kernel ring buffer. When a random reboot occurs this message can be helpful
+to determine the next component to debug such an issue.
@@ -9,5 +9,6 @@
#define FCH_PM_DECODEEN 0x00
#define FCH_PM_DECODEEN_SMBUS0SEL GENMASK(20, 19)
#define FCH_PM_SCRATCH 0x80
+#define FCH_PM_S5_RESET_STATUS 0xC0
#endif
@@ -9,6 +9,7 @@
#include <linux/sched/clock.h>
#include <linux/random.h>
#include <linux/topology.h>
+#include <asm/amd/fch.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cacheinfo.h>
@@ -1237,3 +1238,66 @@ void amd_check_microcode(void)
if (cpu_feature_enabled(X86_FEATURE_ZEN2))
on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
+
+static const char * const s5_reset_reason_txt[] = {
+ [0] = "thermal pin BP_THERMTRIP_L was tripped",
+ [1] = "power button was pressed for 4 seconds",
+ [2] = "shutdown pin was shorted",
+ [4] = "remote ASF power off command was received",
+ [9] = "internal CPU thermal limit was tripped",
+ [16] = "system reset pin BP_SYS_RST_L was tripped",
+ [17] = "software issued PCI reset",
+ [18] = "software wrote 0x4 to reset control register 0xCF9",
+ [19] = "software wrote 0x6 to reset control register 0xCF9",
+ [20] = "software wrote 0xE to reset control register 0xCF9",
+ [21] = "ACPI power state transition occurred",
+ [22] = "keyboard reset pin KB_RST_L was asserted",
+ [23] = "internal CPU shutdown event occurred",
+ [24] = "system failed to boot before failed boot timer expired",
+ [25] = "hardware watchdog timer expired",
+ [26] = "remote ASF reset command was received",
+ [27] = "an uncorrected error caused a data fabric sync flood event",
+ [29] = "FCH and MP1 failed warm reset handshake",
+ [30] = "a parity error occurred",
+ [31] = "a software sync flood event occurred",
+};
+
+static __init int print_s5_reset_status_mmio(void)
+{
+ void __iomem *addr;
+ unsigned long value;
+ int nr_reasons = 0;
+ int bit = -1;
+
+ if (!cpu_feature_enabled(X86_FEATURE_ZEN))
+ return 0;
+
+ addr = ioremap(FCH_PM_BASE + FCH_PM_S5_RESET_STATUS, sizeof(value));
+ if (!addr)
+ return 0;
+
+ value = ioread32(addr);
+ iounmap(addr);
+
+ /* Iterate on each bit in the 'value' mask: */
+ while (true) {
+ bit = find_next_bit(&value, BITS_PER_LONG, bit + 1);
+
+ /* Reached the end of the word, no more bits: */
+ if (bit >= BITS_PER_LONG) {
+ if (!nr_reasons)
+ pr_info("x86/amd: Previous system reset reason [0x%08lx]: Unknown\n", value);
+ break;
+ }
+
+ if (!s5_reset_reason_txt[bit])
+ continue;
+
+ nr_reasons++;
+ pr_info("x86/amd: Previous system reset reason [0x%08lx]: %s\n",
+ value, s5_reset_reason_txt[bit]);
+ }
+
+ return 0;
+}
+late_initcall(print_s5_reset_status_mmio);