diff mbox series

[V2] ACPI: APEI: Use ERST timeout for slow devices

Message ID 20230712223448.145079-1-jeshuas@nvidia.com
State Accepted
Commit fac475aab70b2b6d44f54edea85e8bd276a1fe60
Headers show
Series [V2] ACPI: APEI: Use ERST timeout for slow devices | expand

Commit Message

Jeshua Smith July 12, 2023, 10:34 p.m. UTC
Slow devices such as flash may not meet the default 1ms timeout value,
so use the ERST max execution time value that they provide as the
timeout if it is larger.

Signed-off-by: Jeshua Smith <jeshuas@nvidia.com>
---
v2:
* no longer add copyright.
* no longer add unused ERST_EXEC_TIMING_TYPICAL defines.
* set timings to 0 if the ACPI_ERST_EXECUTE_TIMINGS operation isn't supported,
  which will result in the default timeout being used.

 drivers/acpi/apei/erst.c | 41 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

Comments

Jeshua Smith Aug. 4, 2023, 3:55 p.m. UTC | #1
Can the maintainers please respond to my patch?

-----Original Message-----
From: Jeshua Smith <jeshuas@nvidia.com> 
Sent: Wednesday, July 12, 2023 4:35 PM
To: keescook@chromium.org; tony.luck@intel.com; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>; Jeshua Smith <jeshuas@nvidia.com>
Subject: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices

Slow devices such as flash may not meet the default 1ms timeout value, so use the ERST max execution time value that they provide as the timeout if it is larger.

Signed-off-by: Jeshua Smith <jeshuas@nvidia.com>
---
v2:
* no longer add copyright.
* no longer add unused ERST_EXEC_TIMING_TYPICAL defines.
* set timings to 0 if the ACPI_ERST_EXECUTE_TIMINGS operation isn't supported,
  which will result in the default timeout being used.

 drivers/acpi/apei/erst.c | 41 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 247989060e29..bf65e3461531 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -59,6 +59,10 @@ static struct acpi_table_erst *erst_tab;
 #define ERST_RANGE_NVRAM	0x0002
 #define ERST_RANGE_SLOW		0x0004
 
+/* ERST Exec max timings */
+#define ERST_EXEC_TIMING_MAX_MASK      0xFFFFFFFF00000000
+#define ERST_EXEC_TIMING_MAX_SHIFT     32
+
 /*
  * ERST Error Log Address Range, used as buffer for reading/writing
  * error records.
@@ -68,6 +72,7 @@ static struct erst_erange {
 	u64 size;
 	void __iomem *vaddr;
 	u32 attr;
+	u64 timings;
 } erst_erange;
 
 /*
@@ -97,6 +102,19 @@ static inline int erst_errno(int command_status)
 	}
 }
 
+static inline u64 erst_get_timeout(void) {
+	u64 timeout = FIRMWARE_TIMEOUT;
+
+	if (erst_erange.attr & ERST_RANGE_SLOW) {
+		timeout = ((erst_erange.timings & ERST_EXEC_TIMING_MAX_MASK) >>
+			ERST_EXEC_TIMING_MAX_SHIFT) * NSEC_PER_MSEC;
+		if (timeout < FIRMWARE_TIMEOUT)
+			timeout = FIRMWARE_TIMEOUT;
+	}
+	return timeout;
+}
+
 static int erst_timedout(u64 *t, u64 spin_unit)  {
 	if ((s64)*t < spin_unit) {
@@ -191,9 +209,11 @@ static int erst_exec_stall_while_true(struct apei_exec_context *ctx,  {
 	int rc;
 	u64 val;
-	u64 timeout = FIRMWARE_TIMEOUT;
+	u64 timeout;
 	u64 stall_time;
 
+	timeout = erst_get_timeout();
+
 	if (ctx->var1 > FIRMWARE_MAX_STALL) {
 		if (!in_nmi())
 			pr_warn(FW_WARN
@@ -389,6 +409,13 @@ static int erst_get_erange(struct erst_erange *range)
 	if (rc)
 		return rc;
 	range->attr = apei_exec_ctx_get_output(&ctx);
+	rc = apei_exec_run(&ctx, ACPI_ERST_EXECUTE_TIMINGS);
+	if (rc == 0)
+		range->timings = apei_exec_ctx_get_output(&ctx);
+	else if (rc == -ENOENT)
+		range->timings = 0;
+	else
+		return rc;
 
 	return 0;
 }
@@ -621,10 +648,12 @@ EXPORT_SYMBOL_GPL(erst_get_record_id_end);
 static int __erst_write_to_storage(u64 offset)  {
 	struct apei_exec_context ctx;
-	u64 timeout = FIRMWARE_TIMEOUT;
+	u64 timeout;
 	u64 val;
 	int rc;
 
+	timeout = erst_get_timeout();
+
 	erst_exec_ctx_init(&ctx);
 	rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_WRITE);
 	if (rc)
@@ -660,10 +689,12 @@ static int __erst_write_to_storage(u64 offset)  static int __erst_read_from_storage(u64 record_id, u64 offset)  {
 	struct apei_exec_context ctx;
-	u64 timeout = FIRMWARE_TIMEOUT;
+	u64 timeout;
 	u64 val;
 	int rc;
 
+	timeout = erst_get_timeout();
+
 	erst_exec_ctx_init(&ctx);
 	rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_READ);
 	if (rc)
@@ -703,10 +734,12 @@ static int __erst_read_from_storage(u64 record_id, u64 offset)  static int __erst_clear_from_storage(u64 record_id)  {
 	struct apei_exec_context ctx;
-	u64 timeout = FIRMWARE_TIMEOUT;
+	u64 timeout;
 	u64 val;
 	int rc;
 
+	timeout = erst_get_timeout();
+
 	erst_exec_ctx_init(&ctx);
 	rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_CLEAR);
 	if (rc)
--
2.25.1
Luck, Tony Aug. 4, 2023, 4:31 p.m. UTC | #2
> Can the maintainers please respond to my patch?

Can you give a reference to the ACPI spec where this timing information is documented? I'm looking at ACPI 6.5
and don't see anything about this.

https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#error-serialization

-Tony
Jeshua Smith Aug. 5, 2023, 1:04 a.m. UTC | #3
Thanks for the reply.

It's not very easy to see. It's just a bit down from the link you sent. It's the last possible action in the Serialization Actions table:
https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#serialization-actions

18.5.1.1. Serialization Actions

GET_EXECUTE-_OPERATION_TIMINGS

Returns an encoded QWORD:
[63:32] value in microseconds that the platform expects would be the maximum amount of time it will take to process and complete an EXECUTE_OPERATION.
[31:0] value in microseconds that the platform expects would be the nominal amount of time it will take to process and complete an EXECUTE_OPERATION.

-----Original Message-----
From: Luck, Tony <tony.luck@intel.com> 
Sent: Friday, August 4, 2023 10:31 AM
To: Jeshua Smith <jeshuas@nvidia.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices

External email: Use caution opening links or attachments


> Can the maintainers please respond to my patch?

Can you give a reference to the ACPI spec where this timing information is documented? I'm looking at ACPI 6.5 and don't see anything about this.

https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#error-serialization

-Tony
Jeshua Smith Sept. 11, 2023, 4:15 p.m. UTC | #4
Any further questions? Anything else holding up this patch?

-----Original Message-----
From: Jeshua Smith <jeshuas@nvidia.com> 
Sent: Friday, August 4, 2023 7:05 PM
To: Luck, Tony <tony.luck@intel.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices

Thanks for the reply.

It's not very easy to see. It's just a bit down from the link you sent. It's the last possible action in the Serialization Actions table:
https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#serialization-actions

18.5.1.1. Serialization Actions

GET_EXECUTE-_OPERATION_TIMINGS

Returns an encoded QWORD:
[63:32] value in microseconds that the platform expects would be the maximum amount of time it will take to process and complete an EXECUTE_OPERATION.
[31:0] value in microseconds that the platform expects would be the nominal amount of time it will take to process and complete an EXECUTE_OPERATION.

-----Original Message-----
From: Luck, Tony <tony.luck@intel.com> 
Sent: Friday, August 4, 2023 10:31 AM
To: Jeshua Smith <jeshuas@nvidia.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices

External email: Use caution opening links or attachments


> Can the maintainers please respond to my patch?

Can you give a reference to the ACPI spec where this timing information is documented? I'm looking at ACPI 6.5 and don't see anything about this.

https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#error-serialization

-Tony
Jeshua Smith Oct. 2, 2023, 4:10 p.m. UTC | #5
Resending due to lack of responses.

-----Original Message-----
From: Jeshua Smith 
Sent: Monday, September 11, 2023 10:16 AM
To: Luck, Tony <tony.luck@intel.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices

Any further questions? Anything else holding up this patch?

-----Original Message-----
From: Jeshua Smith <jeshuas@nvidia.com> 
Sent: Friday, August 4, 2023 7:05 PM
To: Luck, Tony <tony.luck@intel.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices

Thanks for the reply.

It's not very easy to see. It's just a bit down from the link you sent. It's the last possible action in the Serialization Actions table:
https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#serialization-actions

18.5.1.1. Serialization Actions

GET_EXECUTE-_OPERATION_TIMINGS

Returns an encoded QWORD:
[63:32] value in microseconds that the platform expects would be the maximum amount of time it will take to process and complete an EXECUTE_OPERATION.
[31:0] value in microseconds that the platform expects would be the nominal amount of time it will take to process and complete an EXECUTE_OPERATION.

-----Original Message-----
From: Luck, Tony <tony.luck@intel.com> 
Sent: Friday, August 4, 2023 10:31 AM
To: Jeshua Smith <jeshuas@nvidia.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices

External email: Use caution opening links or attachments


> Can the maintainers please respond to my patch?

Can you give a reference to the ACPI spec where this timing information is documented? I'm looking at ACPI 6.5 and don't see anything about this.

https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#error-serialization

-Tony
Jeshua Smith Oct. 23, 2023, 3:45 p.m. UTC | #6
Can we get this merged please, or at least instructions for what needs to happen to get it merged?

Thanks.

-----Original Message-----
From: Jeshua Smith 
Sent: Monday, October 2, 2023 10:10 AM
To: Luck, Tony <tony.luck@intel.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices

Resending due to lack of responses.

-----Original Message-----
From: Jeshua Smith 
Sent: Monday, September 11, 2023 10:16 AM
To: Luck, Tony <tony.luck@intel.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices

Any further questions? Anything else holding up this patch?

-----Original Message-----
From: Jeshua Smith <jeshuas@nvidia.com> 
Sent: Friday, August 4, 2023 7:05 PM
To: Luck, Tony <tony.luck@intel.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices

Thanks for the reply.

It's not very easy to see. It's just a bit down from the link you sent. It's the last possible action in the Serialization Actions table:
https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#serialization-actions

18.5.1.1. Serialization Actions

GET_EXECUTE-_OPERATION_TIMINGS

Returns an encoded QWORD:
[63:32] value in microseconds that the platform expects would be the maximum amount of time it will take to process and complete an EXECUTE_OPERATION.
[31:0] value in microseconds that the platform expects would be the nominal amount of time it will take to process and complete an EXECUTE_OPERATION.

-----Original Message-----
From: Luck, Tony <tony.luck@intel.com> 
Sent: Friday, August 4, 2023 10:31 AM
To: Jeshua Smith <jeshuas@nvidia.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices

External email: Use caution opening links or attachments


> Can the maintainers please respond to my patch?

Can you give a reference to the ACPI spec where this timing information is documented? I'm looking at ACPI 6.5 and don't see anything about this.

https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#error-serialization

-Tony
Rafael J. Wysocki Oct. 24, 2023, 2:32 p.m. UTC | #7
On Mon, Oct 23, 2023 at 5:45 PM Jeshua Smith <jeshuas@nvidia.com> wrote:
>
> Can we get this merged please, or at least instructions for what needs to happen to get it merged?

So there are 3 designated reviewers for APEI: Tony Luck, Borislav
Petkov and James Morse.  I need an ACK or Reviewed-by from one of
them, so I can proceed with an APEI patch.

Thanks!

> -----Original Message-----
> From: Jeshua Smith
> Sent: Monday, October 2, 2023 10:10 AM
> To: Luck, Tony <tony.luck@intel.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
> Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
> Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices
>
> Resending due to lack of responses.
>
> -----Original Message-----
> From: Jeshua Smith
> Sent: Monday, September 11, 2023 10:16 AM
> To: Luck, Tony <tony.luck@intel.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
> Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
> Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices
>
> Any further questions? Anything else holding up this patch?
>
> -----Original Message-----
> From: Jeshua Smith <jeshuas@nvidia.com>
> Sent: Friday, August 4, 2023 7:05 PM
> To: Luck, Tony <tony.luck@intel.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
> Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
> Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices
>
> Thanks for the reply.
>
> It's not very easy to see. It's just a bit down from the link you sent. It's the last possible action in the Serialization Actions table:
> https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#serialization-actions
>
> 18.5.1.1. Serialization Actions
>
> GET_EXECUTE-_OPERATION_TIMINGS
>
> Returns an encoded QWORD:
> [63:32] value in microseconds that the platform expects would be the maximum amount of time it will take to process and complete an EXECUTE_OPERATION.
> [31:0] value in microseconds that the platform expects would be the nominal amount of time it will take to process and complete an EXECUTE_OPERATION.
>
> -----Original Message-----
> From: Luck, Tony <tony.luck@intel.com>
> Sent: Friday, August 4, 2023 10:31 AM
> To: Jeshua Smith <jeshuas@nvidia.com>; keescook@chromium.org; gpiccoli@igalia.com; rafael@kernel.org; lenb@kernel.org; james.morse@arm.com; bp@alien8.de
> Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-hardening@vger.kernel.org; linux-tegra@vger.kernel.org; Thierry Reding <treding@nvidia.com>; Jonathan Hunter <jonathanh@nvidia.com>
> Subject: RE: [PATCH V2] ACPI: APEI: Use ERST timeout for slow devices
>
> External email: Use caution opening links or attachments
>
>
> > Can the maintainers please respond to my patch?
>
> Can you give a reference to the ACPI spec where this timing information is documented? I'm looking at ACPI 6.5 and don't see anything about this.
>
> https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#error-serialization
>
> -Tony
Luck, Tony Oct. 24, 2023, 3:24 p.m. UTC | #8
On Wed, Jul 12, 2023 at 10:34:48PM +0000, Jeshua Smith wrote:
> Slow devices such as flash may not meet the default 1ms timeout value,
> so use the ERST max execution time value that they provide as the
> timeout if it is larger.
> 
> Signed-off-by: Jeshua Smith <jeshuas@nvidia.com>

> +/* ERST Exec max timings */
> +#define ERST_EXEC_TIMING_MAX_MASK      0xFFFFFFFF00000000
> +#define ERST_EXEC_TIMING_MAX_SHIFT     32

I've recently become a fan of <linux/bitfield.h> I think this would
be easier on the eyes as:

#define ERST_EXEC_TIMING_MAX	GENMASK_ULL(63, 32)

> +static inline u64 erst_get_timeout(void)
> +{
> +	u64 timeout = FIRMWARE_TIMEOUT;
> +
> +	if (erst_erange.attr & ERST_RANGE_SLOW) {
> +		timeout = ((erst_erange.timings & ERST_EXEC_TIMING_MAX_MASK) >>
> +			ERST_EXEC_TIMING_MAX_SHIFT) * NSEC_PER_MSEC;

then this becomes:

		timeout = FIELD_GET(ERST_EXEC_TIMING_MAX, erst_erange.timings) *
			  NSEC_PER_MSEC;

> +		if (timeout < FIRMWARE_TIMEOUT)
> +			timeout = FIRMWARE_TIMEOUT;

But that's just a matter of style.  Otherwise the patch looks fine.

Reviewed-by: Tony Luck <tony.luck@intel.com>

-Tony
Borislav Petkov Oct. 24, 2023, 3:27 p.m. UTC | #9
On Tue, Oct 24, 2023 at 04:32:48PM +0200, Rafael J. Wysocki wrote:
> So there are 3 designated reviewers for APEI: Tony Luck, Borislav
> Petkov and James Morse.  I need an ACK or Reviewed-by from one of
> them, so I can proceed with an APEI patch.

Here's what I see:

cat /tmp/patch | ./scripts/get_maintainer.pl
Kees Cook <keescook@chromium.org> (supporter:PSTORE FILESYSTEM)
Tony Luck <tony.luck@intel.com> (reviewer:PSTORE FILESYSTEM)
"Guilherme G. Piccoli" <gpiccoli@igalia.com> (reviewer:PSTORE FILESYSTEM)
"Rafael J. Wysocki" <rafael@kernel.org> (unknown:ACPI APEI)
Len Brown <lenb@kernel.org> (reviewer:ACPI APEI)
James Morse <james.morse@arm.com> (reviewer:ACPI APEI)
Borislav Petkov <bp@alien8.de> (reviewer:ACPI APEI)

so I'm guessing Kees, Tony, Guilherme ...
Rafael J. Wysocki Oct. 24, 2023, 6:51 p.m. UTC | #10
On Tue, Oct 24, 2023 at 5:27 PM Tony Luck <tony.luck@intel.com> wrote:
>
> On Wed, Jul 12, 2023 at 10:34:48PM +0000, Jeshua Smith wrote:
> > Slow devices such as flash may not meet the default 1ms timeout value,
> > so use the ERST max execution time value that they provide as the
> > timeout if it is larger.
> >
> > Signed-off-by: Jeshua Smith <jeshuas@nvidia.com>
>
> > +/* ERST Exec max timings */
> > +#define ERST_EXEC_TIMING_MAX_MASK      0xFFFFFFFF00000000
> > +#define ERST_EXEC_TIMING_MAX_SHIFT     32
>
> I've recently become a fan of <linux/bitfield.h> I think this would
> be easier on the eyes as:
>
> #define ERST_EXEC_TIMING_MAX    GENMASK_ULL(63, 32)
>
> > +static inline u64 erst_get_timeout(void)
> > +{
> > +     u64 timeout = FIRMWARE_TIMEOUT;
> > +
> > +     if (erst_erange.attr & ERST_RANGE_SLOW) {
> > +             timeout = ((erst_erange.timings & ERST_EXEC_TIMING_MAX_MASK) >>
> > +                     ERST_EXEC_TIMING_MAX_SHIFT) * NSEC_PER_MSEC;
>
> then this becomes:
>
>                 timeout = FIELD_GET(ERST_EXEC_TIMING_MAX, erst_erange.timings) *
>                           NSEC_PER_MSEC;
>
> > +             if (timeout < FIRMWARE_TIMEOUT)
> > +                     timeout = FIRMWARE_TIMEOUT;
>
> But that's just a matter of style.  Otherwise the patch looks fine.
>
> Reviewed-by: Tony Luck <tony.luck@intel.com>

Applied as 6.7 material, thanks!
Jeshua Smith Oct. 25, 2023, 2:09 p.m. UTC | #11
Hi Boris,

You asked several questions, and it's not clear to me if you are suggesting the answers be sent as email reply, or if you're asking for the answers to be added to the commit message. Below are my email replies to those questions.

Borislav Petkov wrote:
> When I see "may" in commit messages "Slow devices such as flash may not meet the default 1ms timeout value" then I wanna know what devices are those?

The ERST table specifies an interface for how the OS can serialize error records to a "persistent store". The details of the persistent storage device are implementation defined. The ERST table provides microsecond values for the "nominal" and "maximum" amount of time it takes for the implemented device to process and complete an EXECUTE_OPERATION (a record write, read, or clear request). The current APEI ERST code hardcodes the timeout to 1ms, and ignores the actual timing information that the platform has provided in the ERST table for the platform's implementation. This is a problem for any device that can or will take more than 1ms worst case to process and complete requested operations. On a platform that uses NOR flash as the "persistent store", for example, it can easily take longer than 1ms.

Detailed NOR flash example:
A Micron nor-flash spec sheet: https://media-www.micron.com/-/media/client/global/documents/products/data-sheet/nor-flash/serial-nor/mt25q/die-rev-b/mt25q_qlkt_u_02g_cbb_0.pdf?rev=9b167fbf2b3645efba6385949a72e453

Page 82 lists "Page program time (256 bytes)" as 120us typical, 1800us max.

A 32KB error log would be (32K/256) = 128 nor-flash pages.

Writing 128 nor-flash pages would then take 120us * 128 = 15ms typical, or 1800us * 128 = 230.4ms max.

> What is the actual use case here?

Actual use case:
Kernel panic -> Pstore calls APEI's ERST code to write the ~32KB error log to persistent store -> ERST code writes the error log to nor-flash, which takes more than 1ms to complete. This is expected, as communicated by the platform to the OS via the maximum time field in the ERST table.

Currently APIE's ERST code will flag a timeout of the write operation after 1ms and return an error to Pstore. My patch will let the write (or read or clear) operation take as long as the maximum time ERST says it could take before flagging a timeout and returning an error. The ERST table has a "attributes" field that includes a "slow" bit to allow the platform to indicate that the address range for the error log "has slow access times", but the spec doesn't define what is considered a slow access time. My patch assumes that since the current timeout is 1ms, any access times greater than 1ms would reasonably be considered "slow", and therefore the extended (ERST-defined) timeout is only applied for implementations that indicate that they are "slow". I assume that platforms which bother to set the "slow" bit will also specify actual timings, and platforms which don't are OK with the current 1ms timeout.

> Upthread there's a question about the ACPI spec. That should be explained too. Because I have no clue what "the ERST max execution time value" is.

As I replied to Tony upthread, the Serialization Actions table entry for OPERATION_TIMINGS (https://uefi.org/specs/ACPI/6.5/18_Platform_Error_Interfaces.html#serialization-actions) says that it is the "value in microseconds that the platform expects would be the maximum amount of time it will take to process and complete an EXECUTE_OPERATION." Based on the spec, I take that to be the worst case time between when the OS does "EXECUTE_OPERATION" and when the OS sees "CHECK_BUSY_STATUS" return FALSE rather than TRUE, for the worst case time of a read/write/clear operation for the maximum size supported by the platform's ERST implementation.

Does that answer your questions?
Borislav Petkov Oct. 25, 2023, 2:22 p.m. UTC | #12
Hi,

On Wed, Oct 25, 2023 at 02:09:37PM +0000, Jeshua Smith wrote:
<... snip a very detailed and good explanation... >

> Writing 128 nor-flash pages would then take 120us * 128 = 15ms
> typical, or 1800us * 128 = 230.4ms max.

This is perfectly suitable to be in the commit message - it explains in
exact detail why the change is needed.

> Actual use case:
>
> Kernel panic -> Pstore calls APEI's ERST code to write the ~32KB error
> log to persistent store -> ERST code writes the error log to
> nor-flash, which takes more than 1ms to complete. This is expected, as
> communicated by the platform to the OS via the maximum time field in
> the ERST table.

This is actually very important and it justifies the need for that
change even more - you want to flush out the complete panic message to
pstore and not only the first couple of lines.

> ... and therefore the extended (ERST-defined) timeout is only applied
> for implementations that indicate that they are "slow". I assume that
> platforms which bother to set the "slow" bit will also specify actual
> timings, and platforms which don't are OK with the current 1ms
> timeout.

Yap, makes perfect sense to me.

> Does that answer your questions?

Yes, thanks for taking the time to explain this in such a detail and
precisely. I think you should use the main bits of what you wrote here
and add them to the commit message - after this there are no more
questions why this patch is needed, IMO.

Thx.
diff mbox series

Patch

diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 247989060e29..bf65e3461531 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -59,6 +59,10 @@  static struct acpi_table_erst *erst_tab;
 #define ERST_RANGE_NVRAM	0x0002
 #define ERST_RANGE_SLOW		0x0004
 
+/* ERST Exec max timings */
+#define ERST_EXEC_TIMING_MAX_MASK      0xFFFFFFFF00000000
+#define ERST_EXEC_TIMING_MAX_SHIFT     32
+
 /*
  * ERST Error Log Address Range, used as buffer for reading/writing
  * error records.
@@ -68,6 +72,7 @@  static struct erst_erange {
 	u64 size;
 	void __iomem *vaddr;
 	u32 attr;
+	u64 timings;
 } erst_erange;
 
 /*
@@ -97,6 +102,19 @@  static inline int erst_errno(int command_status)
 	}
 }
 
+static inline u64 erst_get_timeout(void)
+{
+	u64 timeout = FIRMWARE_TIMEOUT;
+
+	if (erst_erange.attr & ERST_RANGE_SLOW) {
+		timeout = ((erst_erange.timings & ERST_EXEC_TIMING_MAX_MASK) >>
+			ERST_EXEC_TIMING_MAX_SHIFT) * NSEC_PER_MSEC;
+		if (timeout < FIRMWARE_TIMEOUT)
+			timeout = FIRMWARE_TIMEOUT;
+	}
+	return timeout;
+}
+
 static int erst_timedout(u64 *t, u64 spin_unit)
 {
 	if ((s64)*t < spin_unit) {
@@ -191,9 +209,11 @@  static int erst_exec_stall_while_true(struct apei_exec_context *ctx,
 {
 	int rc;
 	u64 val;
-	u64 timeout = FIRMWARE_TIMEOUT;
+	u64 timeout;
 	u64 stall_time;
 
+	timeout = erst_get_timeout();
+
 	if (ctx->var1 > FIRMWARE_MAX_STALL) {
 		if (!in_nmi())
 			pr_warn(FW_WARN
@@ -389,6 +409,13 @@  static int erst_get_erange(struct erst_erange *range)
 	if (rc)
 		return rc;
 	range->attr = apei_exec_ctx_get_output(&ctx);
+	rc = apei_exec_run(&ctx, ACPI_ERST_EXECUTE_TIMINGS);
+	if (rc == 0)
+		range->timings = apei_exec_ctx_get_output(&ctx);
+	else if (rc == -ENOENT)
+		range->timings = 0;
+	else
+		return rc;
 
 	return 0;
 }
@@ -621,10 +648,12 @@  EXPORT_SYMBOL_GPL(erst_get_record_id_end);
 static int __erst_write_to_storage(u64 offset)
 {
 	struct apei_exec_context ctx;
-	u64 timeout = FIRMWARE_TIMEOUT;
+	u64 timeout;
 	u64 val;
 	int rc;
 
+	timeout = erst_get_timeout();
+
 	erst_exec_ctx_init(&ctx);
 	rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_WRITE);
 	if (rc)
@@ -660,10 +689,12 @@  static int __erst_write_to_storage(u64 offset)
 static int __erst_read_from_storage(u64 record_id, u64 offset)
 {
 	struct apei_exec_context ctx;
-	u64 timeout = FIRMWARE_TIMEOUT;
+	u64 timeout;
 	u64 val;
 	int rc;
 
+	timeout = erst_get_timeout();
+
 	erst_exec_ctx_init(&ctx);
 	rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_READ);
 	if (rc)
@@ -703,10 +734,12 @@  static int __erst_read_from_storage(u64 record_id, u64 offset)
 static int __erst_clear_from_storage(u64 record_id)
 {
 	struct apei_exec_context ctx;
-	u64 timeout = FIRMWARE_TIMEOUT;
+	u64 timeout;
 	u64 val;
 	int rc;
 
+	timeout = erst_get_timeout();
+
 	erst_exec_ctx_init(&ctx);
 	rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_CLEAR);
 	if (rc)