diff mbox series

[RFC,1/2] suspend: extend S2Idle ops by new notify handler

Message ID 20220707125329.378277-2-jaz@semihalf.com
State New
Headers show
Series x86: allow to notify host about guest entering s2idle | expand

Commit Message

Grzegorz Jaszczyk July 7, 2022, 12:53 p.m. UTC
Currently the LPS0 prepare_late callback is aimed to run as the very
last thing before entering the S2Idle state from LPS0 perspective,
nevertheless between this call and the system actually entering the
S2Idle state there are several places where the suspension process could
be canceled.

In order to notify VMM about guest entering suspend, extend the S2Idle
ops by new notify callback, which will be really invoked as a very last
thing before guest actually enters S2Idle state.

Additionally extend the acpi_s2idle_dev_ops by notify() callback so
any driver can hook into it and allow to implement its own notification.

Taking advantage of e.g. existing acpi_s2idle_dev_ops's prepare/restore
hooks is not an option since it will not allow to prevent race
conditions:
- VM0 enters s2idle
- host notes about VM0 is in s2idle
- host continues with system suspension but in the meantime VM0 exits
s2idle and sends notification but it is already too late (VM could not
even send notification on time).

Introducing notify() as a very last step before the system enters S2Idle
together with an assumption that the VMM has control over guest
resumption allows preventing mentioned races.

Signed-off-by: Grzegorz Jaszczyk <jaz@semihalf.com>
---
 drivers/acpi/x86/s2idle.c | 11 +++++++++++
 include/linux/acpi.h      |  1 +
 include/linux/suspend.h   |  1 +
 kernel/power/suspend.c    |  4 ++++
 4 files changed, 17 insertions(+)

Comments

Rafael J. Wysocki July 19, 2022, 6:08 p.m. UTC | #1
On Thu, Jul 7, 2022 at 2:56 PM Grzegorz Jaszczyk <jaz@semihalf.com> wrote:
>
> Currently the LPS0 prepare_late callback is aimed to run as the very
> last thing before entering the S2Idle state from LPS0 perspective,
> nevertheless between this call and the system actually entering the
> S2Idle state there are several places where the suspension process could
> be canceled.

And why is this a problem?

The cancellation will occur only if there is a wakeup signal that
would otherwise cause one of the CPUs to exit the idle state.  Such a
wakeup signal can appear after calling the new notifier as well, so
why does it make a difference?

> In order to notify VMM about guest entering suspend, extend the S2Idle
> ops by new notify callback, which will be really invoked as a very last
> thing before guest actually enters S2Idle state.

It is not guaranteed that "suspend" (defined as all CPUs entering idle
states) will be actually entered even after this "last step".

> Additionally extend the acpi_s2idle_dev_ops by notify() callback so
> any driver can hook into it and allow to implement its own notification.
>
> Taking advantage of e.g. existing acpi_s2idle_dev_ops's prepare/restore
> hooks is not an option since it will not allow to prevent race
> conditions:
> - VM0 enters s2idle
> - host notes about VM0 is in s2idle
> - host continues with system suspension but in the meantime VM0 exits
> s2idle and sends notification but it is already too late (VM could not
> even send notification on time).

Too late for what?

> Introducing notify() as a very last step before the system enters S2Idle
> together with an assumption that the VMM has control over guest
> resumption allows preventing mentioned races.

How does it do that?

It looks like you want suspend-to-idle to behave like S3 and it won't.

> Signed-off-by: Grzegorz Jaszczyk <jaz@semihalf.com>
> ---
>  drivers/acpi/x86/s2idle.c | 11 +++++++++++
>  include/linux/acpi.h      |  1 +
>  include/linux/suspend.h   |  1 +
>  kernel/power/suspend.c    |  4 ++++
>  4 files changed, 17 insertions(+)
>
> diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
> index 2963229062f8..d5aff194c736 100644
> --- a/drivers/acpi/x86/s2idle.c
> +++ b/drivers/acpi/x86/s2idle.c
> @@ -520,10 +520,21 @@ void acpi_s2idle_restore_early(void)
>                                         lps0_dsm_func_mask, lps0_dsm_guid);
>  }
>
> +static void acpi_s2idle_notify(void)
> +{
> +       struct acpi_s2idle_dev_ops *handler;
> +
> +       list_for_each_entry(handler, &lps0_s2idle_devops_head, list_node) {
> +               if (handler->notify)
> +                       handler->notify();
> +       }
> +}
> +
>  static const struct platform_s2idle_ops acpi_s2idle_ops_lps0 = {
>         .begin = acpi_s2idle_begin,
>         .prepare = acpi_s2idle_prepare,
>         .prepare_late = acpi_s2idle_prepare_late,
> +       .notify = acpi_s2idle_notify,
>         .wake = acpi_s2idle_wake,
>         .restore_early = acpi_s2idle_restore_early,
>         .restore = acpi_s2idle_restore,
> diff --git a/include/linux/acpi.h b/include/linux/acpi.h
> index 4f82a5bc6d98..b32c4baed99b 100644
> --- a/include/linux/acpi.h
> +++ b/include/linux/acpi.h
> @@ -1068,6 +1068,7 @@ struct acpi_s2idle_dev_ops {
>         struct list_head list_node;
>         void (*prepare)(void);
>         void (*restore)(void);
> +       void (*notify)(void);
>  };
>  int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg);
>  void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg);
> diff --git a/include/linux/suspend.h b/include/linux/suspend.h
> index 70f2921e2e70..16ef7f9d9a03 100644
> --- a/include/linux/suspend.h
> +++ b/include/linux/suspend.h
> @@ -191,6 +191,7 @@ struct platform_s2idle_ops {
>         int (*begin)(void);
>         int (*prepare)(void);
>         int (*prepare_late)(void);
> +       void (*notify)(void);
>         bool (*wake)(void);
>         void (*restore_early)(void);
>         void (*restore)(void);
> diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
> index 827075944d28..6ba211b94ed1 100644
> --- a/kernel/power/suspend.c
> +++ b/kernel/power/suspend.c
> @@ -100,6 +100,10 @@ static void s2idle_enter(void)
>
>         /* Push all the CPUs into the idle loop. */
>         wake_up_all_idle_cpus();
> +
> +       if (s2idle_ops && s2idle_ops->notify)
> +               s2idle_ops->notify();
> +
>         /* Make the current CPU wait so it can enter the idle loop too. */
>         swait_event_exclusive(s2idle_wait_head,
>                     s2idle_state == S2IDLE_STATE_WAKE);
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>
Grzegorz Jaszczyk July 20, 2022, 1:15 p.m. UTC | #2
wt., 19 lip 2022 o 20:09 Rafael J. Wysocki <rafael@kernel.org> napisał(a):
>
> On Thu, Jul 7, 2022 at 2:56 PM Grzegorz Jaszczyk <jaz@semihalf.com> wrote:
> >
> > Currently the LPS0 prepare_late callback is aimed to run as the very
> > last thing before entering the S2Idle state from LPS0 perspective,
> > nevertheless between this call and the system actually entering the
> > S2Idle state there are several places where the suspension process could
> > be canceled.
>
> And why is this a problem?
>
> The cancellation will occur only if there is a wakeup signal that
> would otherwise cause one of the CPUs to exit the idle state.  Such a
> wakeup signal can appear after calling the new notifier as well, so
> why does it make a difference?

It could also occur due to suspend_test. Additionally with new
notifier we could get notification when the system wakes up from
s2idle_loop and immediately goes to sleep again (due to e.g.
acpi_s2idle_wake condition not being met) - in this case relying on
prepare_late callback is not possible since it is not called in this
path.

>
> > In order to notify VMM about guest entering suspend, extend the S2Idle
> > ops by new notify callback, which will be really invoked as a very last
> > thing before guest actually enters S2Idle state.
>
> It is not guaranteed that "suspend" (defined as all CPUs entering idle
> states) will be actually entered even after this "last step".

Since this whole patchset is aimed at notifying the host about a guest
entering s2idle state, reaching this step can be considered as a
suspend "entry point" for VM IMO. It is because we are talking about
the vCPU not the real CPU. Therefore it seems to me, that even if some
other vCPUs could still get some wakeup signal they will not be able
to kick (through s2idle_wake->swake_up_one(&s2idle_wait_head);) the
original vCPU which entered s2idle_loop, triggered the new notifier
and is halted due to handling vCPU exit (and was about to trigger
swait_event_exclusive). So it will prevent the VM's resume process
from being started.

>
> > Additionally extend the acpi_s2idle_dev_ops by notify() callback so
> > any driver can hook into it and allow to implement its own notification.
> >
> > Taking advantage of e.g. existing acpi_s2idle_dev_ops's prepare/restore
> > hooks is not an option since it will not allow to prevent race
> > conditions:
> > - VM0 enters s2idle
> > - host notes about VM0 is in s2idle
> > - host continues with system suspension but in the meantime VM0 exits
> > s2idle and sends notification but it is already too late (VM could not
> > even send notification on time).
>
> Too late for what?

Too late to cancel the host suspend process, which thinks that the VM
is in s2idle state while it isn't.

>
> > Introducing notify() as a very last step before the system enters S2Idle
> > together with an assumption that the VMM has control over guest
> > resumption allows preventing mentioned races.
>
> How does it do that?

At the moment when VM triggers this new notifier we trap on MMIO
access and the VMM handles vCPU exit (so the vCPU is "halted").
Therefore the VMM could control when it finishes such handling and
releases the vCPU again.

Maybe adding some more context will be helpful. This patchset was
aimed for two different scenarios actually:
1) Host is about to enter the suspend state and needs first to suspend
VM with all pass-through devices. In this case the host waits for
s2idle notification from the guest and when it receives it, it
continues with its own suspend process.
2) Guest could be a "privileged" one (in terms of VMM) and when the
guest enters s2idle state it notifies the host, which in turn triggers
the suspend process of the host.

>
> It looks like you want suspend-to-idle to behave like S3 and it won't.

In a way, yes, we compensate for the lack of something like PM1_CNT to
trap on for detecting that the guest is suspending.
We could instead force the guest to use S3 but IMO it is undesirable,
since it generally does make a difference which suspend mode is used
in the guest, s2idle or S3, e.g some drivers check which suspend type
is used and based on that behaves differently during suspend. One of
the example is:
https://elixir.bootlin.com/linux/v5.18.12/source/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c#L2323
https://elixir.bootlin.com/linux/v5.18.12/source/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c#L1069
https://elixir.bootlin.com/linux/v5.18.12/source/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c#L583

Thank you,
Grzegorz








>
> > Signed-off-by: Grzegorz Jaszczyk <jaz@semihalf.com>
> > ---
> >  drivers/acpi/x86/s2idle.c | 11 +++++++++++
> >  include/linux/acpi.h      |  1 +
> >  include/linux/suspend.h   |  1 +
> >  kernel/power/suspend.c    |  4 ++++
> >  4 files changed, 17 insertions(+)
> >
> > diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
> > index 2963229062f8..d5aff194c736 100644
> > --- a/drivers/acpi/x86/s2idle.c
> > +++ b/drivers/acpi/x86/s2idle.c
> > @@ -520,10 +520,21 @@ void acpi_s2idle_restore_early(void)
> >                                         lps0_dsm_func_mask, lps0_dsm_guid);
> >  }
> >
> > +static void acpi_s2idle_notify(void)
> > +{
> > +       struct acpi_s2idle_dev_ops *handler;
> > +
> > +       list_for_each_entry(handler, &lps0_s2idle_devops_head, list_node) {
> > +               if (handler->notify)
> > +                       handler->notify();
> > +       }
> > +}
> > +
> >  static const struct platform_s2idle_ops acpi_s2idle_ops_lps0 = {
> >         .begin = acpi_s2idle_begin,
> >         .prepare = acpi_s2idle_prepare,
> >         .prepare_late = acpi_s2idle_prepare_late,
> > +       .notify = acpi_s2idle_notify,
> >         .wake = acpi_s2idle_wake,
> >         .restore_early = acpi_s2idle_restore_early,
> >         .restore = acpi_s2idle_restore,
> > diff --git a/include/linux/acpi.h b/include/linux/acpi.h
> > index 4f82a5bc6d98..b32c4baed99b 100644
> > --- a/include/linux/acpi.h
> > +++ b/include/linux/acpi.h
> > @@ -1068,6 +1068,7 @@ struct acpi_s2idle_dev_ops {
> >         struct list_head list_node;
> >         void (*prepare)(void);
> >         void (*restore)(void);
> > +       void (*notify)(void);
> >  };
> >  int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg);
> >  void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg);
> > diff --git a/include/linux/suspend.h b/include/linux/suspend.h
> > index 70f2921e2e70..16ef7f9d9a03 100644
> > --- a/include/linux/suspend.h
> > +++ b/include/linux/suspend.h
> > @@ -191,6 +191,7 @@ struct platform_s2idle_ops {
> >         int (*begin)(void);
> >         int (*prepare)(void);
> >         int (*prepare_late)(void);
> > +       void (*notify)(void);
> >         bool (*wake)(void);
> >         void (*restore_early)(void);
> >         void (*restore)(void);
> > diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
> > index 827075944d28..6ba211b94ed1 100644
> > --- a/kernel/power/suspend.c
> > +++ b/kernel/power/suspend.c
> > @@ -100,6 +100,10 @@ static void s2idle_enter(void)
> >
> >         /* Push all the CPUs into the idle loop. */
> >         wake_up_all_idle_cpus();
> > +
> > +       if (s2idle_ops && s2idle_ops->notify)
> > +               s2idle_ops->notify();
> > +
> >         /* Make the current CPU wait so it can enter the idle loop too. */
> >         swait_event_exclusive(s2idle_wait_head,
> >                     s2idle_state == S2IDLE_STATE_WAKE);
> > --
> > 2.37.0.rc0.161.g10f37bed90-goog
> >
Mario Limonciello July 20, 2022, 3:22 p.m. UTC | #3
>> It looks like you want suspend-to-idle to behave like S3 and it won't.
> 
> In a way, yes, we compensate for the lack of something like PM1_CNT to
> trap on for detecting that the guest is suspending.
> We could instead force the guest to use S3 but IMO it is undesirable,
> since it generally does make a difference which suspend mode is used
> in the guest, s2idle or S3, e.g some drivers check which suspend type
> is used and based on that behaves differently during suspend. One of
> the example is:
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Felixir.bootlin.com%2Flinux%2Fv5.18.12%2Fsource%2Fdrivers%2Fgpu%2Fdrm%2Famd%2Famdgpu%2Famdgpu_drv.c%23L2323&amp;data=05%7C01%7Cmario.limonciello%40amd.com%7C7bdd972291324d03847e08da6a51ff4f%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637939197694682503%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=5M1sn3iRybQzSFi3ojQ4YTJuW41DlgJNl5sxbWEvLBQ%3D&amp;reserved=0
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Felixir.bootlin.com%2Flinux%2Fv5.18.12%2Fsource%2Fdrivers%2Fgpu%2Fdrm%2Famd%2Famdgpu%2Famdgpu_acpi.c%23L1069&amp;data=05%7C01%7Cmario.limonciello%40amd.com%7C7bdd972291324d03847e08da6a51ff4f%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637939197694682503%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=fIrLmZAgpIRPYO4to4uYUoBSEWXmz1lr%2BTnR14kAfvM%3D&amp;reserved=0
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Felixir.bootlin.com%2Flinux%2Fv5.18.12%2Fsource%2Fdrivers%2Fgpu%2Fdrm%2Famd%2Famdgpu%2Famdgpu_gfx.c%23L583&amp;data=05%7C01%7Cmario.limonciello%40amd.com%7C7bdd972291324d03847e08da6a51ff4f%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637939197694682503%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=SNsbmpV4HrgA%2Bkff4JzRodNDzKvwM5tnkGDvrKO44dc%3D&amp;reserved=0
> 

Actually I recently was suggesting a change to add this detection to 
another driver to set a policy and Rafael pushed back.  He's actively 
removing it from other places in the kernel.

For amdgpu stuff you pointed above, are you wanting to pass through the 
PCIe GPU device to a guest and then suspend that guest? Or is this just 
illustrative?

For a dGPU I would expect it works, but I don't think passing an APU's 
GPU PCIe endpoint would functionally work (there were bugs reported on 
this I recall).

That code path you point out only has special handling for APU when 
headed to S0ix and that's because the GPU driver happens to be where the 
control point is for some common silicon functions.  If the bug I 
mentioned about PCIe passthrough of the APU GPU endpoint to the guest is 
fixed and the guest needs to do s0ix when the host doesn't we're going 
to have other breakage to worry about because of that common silicon 
functionality I mentioned.
Grzegorz Jaszczyk July 20, 2022, 3:54 p.m. UTC | #4
śr., 20 lip 2022 o 17:22 Limonciello, Mario
<mario.limonciello@amd.com> napisał(a):
>
> >> It looks like you want suspend-to-idle to behave like S3 and it won't.
> >
> > In a way, yes, we compensate for the lack of something like PM1_CNT to
> > trap on for detecting that the guest is suspending.
> > We could instead force the guest to use S3 but IMO it is undesirable,
> > since it generally does make a difference which suspend mode is used
> > in the guest, s2idle or S3, e.g some drivers check which suspend type
> > is used and based on that behaves differently during suspend. One of
> > the example is:
> > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Felixir.bootlin.com%2Flinux%2Fv5.18.12%2Fsource%2Fdrivers%2Fgpu%2Fdrm%2Famd%2Famdgpu%2Famdgpu_drv.c%23L2323&amp;data=05%7C01%7Cmario.limonciello%40amd.com%7C7bdd972291324d03847e08da6a51ff4f%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637939197694682503%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=5M1sn3iRybQzSFi3ojQ4YTJuW41DlgJNl5sxbWEvLBQ%3D&amp;reserved=0
> > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Felixir.bootlin.com%2Flinux%2Fv5.18.12%2Fsource%2Fdrivers%2Fgpu%2Fdrm%2Famd%2Famdgpu%2Famdgpu_acpi.c%23L1069&amp;data=05%7C01%7Cmario.limonciello%40amd.com%7C7bdd972291324d03847e08da6a51ff4f%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637939197694682503%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=fIrLmZAgpIRPYO4to4uYUoBSEWXmz1lr%2BTnR14kAfvM%3D&amp;reserved=0
> > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Felixir.bootlin.com%2Flinux%2Fv5.18.12%2Fsource%2Fdrivers%2Fgpu%2Fdrm%2Famd%2Famdgpu%2Famdgpu_gfx.c%23L583&amp;data=05%7C01%7Cmario.limonciello%40amd.com%7C7bdd972291324d03847e08da6a51ff4f%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637939197694682503%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=SNsbmpV4HrgA%2Bkff4JzRodNDzKvwM5tnkGDvrKO44dc%3D&amp;reserved=0
> >
>
> Actually I recently was suggesting a change to add this detection to
> another driver to set a policy and Rafael pushed back.  He's actively
> removing it from other places in the kernel.
>
> For amdgpu stuff you pointed above, are you wanting to pass through the
> PCIe GPU device to a guest and then suspend that guest? Or is this just
> illustrative?

Just illustrative. I am not focused on amdgpu stuff right now.

Thank you,
Grzegorz

>
> For a dGPU I would expect it works, but I don't think passing an APU's
> GPU PCIe endpoint would functionally work (there were bugs reported on
> this I recall).
>
> That code path you point out only has special handling for APU when
> headed to S0ix and that's because the GPU driver happens to be where the
> control point is for some common silicon functions.  If the bug I
> mentioned about PCIe passthrough of the APU GPU endpoint to the guest is
> fixed and the guest needs to do s0ix when the host doesn't we're going
> to have other breakage to worry about because of that common silicon
> functionality I mentioned.
Grzegorz Jaszczyk Aug. 22, 2022, 9:26 a.m. UTC | #5
Hi Rafael,

Could you please kindly comment on the above?

Thank you in advance,
Grzegorz

śr., 20 lip 2022 o 15:15 Grzegorz Jaszczyk <jaz@semihalf.com> napisał(a):
>
> wt., 19 lip 2022 o 20:09 Rafael J. Wysocki <rafael@kernel.org> napisał(a):
> >
> > On Thu, Jul 7, 2022 at 2:56 PM Grzegorz Jaszczyk <jaz@semihalf.com> wrote:
> > >
> > > Currently the LPS0 prepare_late callback is aimed to run as the very
> > > last thing before entering the S2Idle state from LPS0 perspective,
> > > nevertheless between this call and the system actually entering the
> > > S2Idle state there are several places where the suspension process could
> > > be canceled.
> >
> > And why is this a problem?
> >
> > The cancellation will occur only if there is a wakeup signal that
> > would otherwise cause one of the CPUs to exit the idle state.  Such a
> > wakeup signal can appear after calling the new notifier as well, so
> > why does it make a difference?
>
> It could also occur due to suspend_test. Additionally with new
> notifier we could get notification when the system wakes up from
> s2idle_loop and immediately goes to sleep again (due to e.g.
> acpi_s2idle_wake condition not being met) - in this case relying on
> prepare_late callback is not possible since it is not called in this
> path.
>
> >
> > > In order to notify VMM about guest entering suspend, extend the S2Idle
> > > ops by new notify callback, which will be really invoked as a very last
> > > thing before guest actually enters S2Idle state.
> >
> > It is not guaranteed that "suspend" (defined as all CPUs entering idle
> > states) will be actually entered even after this "last step".
>
> Since this whole patchset is aimed at notifying the host about a guest
> entering s2idle state, reaching this step can be considered as a
> suspend "entry point" for VM IMO. It is because we are talking about
> the vCPU not the real CPU. Therefore it seems to me, that even if some
> other vCPUs could still get some wakeup signal they will not be able
> to kick (through s2idle_wake->swake_up_one(&s2idle_wait_head);) the
> original vCPU which entered s2idle_loop, triggered the new notifier
> and is halted due to handling vCPU exit (and was about to trigger
> swait_event_exclusive). So it will prevent the VM's resume process
> from being started.
>
> >
> > > Additionally extend the acpi_s2idle_dev_ops by notify() callback so
> > > any driver can hook into it and allow to implement its own notification.
> > >
> > > Taking advantage of e.g. existing acpi_s2idle_dev_ops's prepare/restore
> > > hooks is not an option since it will not allow to prevent race
> > > conditions:
> > > - VM0 enters s2idle
> > > - host notes about VM0 is in s2idle
> > > - host continues with system suspension but in the meantime VM0 exits
> > > s2idle and sends notification but it is already too late (VM could not
> > > even send notification on time).
> >
> > Too late for what?
>
> Too late to cancel the host suspend process, which thinks that the VM
> is in s2idle state while it isn't.
>
> >
> > > Introducing notify() as a very last step before the system enters S2Idle
> > > together with an assumption that the VMM has control over guest
> > > resumption allows preventing mentioned races.
> >
> > How does it do that?
>
> At the moment when VM triggers this new notifier we trap on MMIO
> access and the VMM handles vCPU exit (so the vCPU is "halted").
> Therefore the VMM could control when it finishes such handling and
> releases the vCPU again.
>
> Maybe adding some more context will be helpful. This patchset was
> aimed for two different scenarios actually:
> 1) Host is about to enter the suspend state and needs first to suspend
> VM with all pass-through devices. In this case the host waits for
> s2idle notification from the guest and when it receives it, it
> continues with its own suspend process.
> 2) Guest could be a "privileged" one (in terms of VMM) and when the
> guest enters s2idle state it notifies the host, which in turn triggers
> the suspend process of the host.
>
> >
> > It looks like you want suspend-to-idle to behave like S3 and it won't.
>
> In a way, yes, we compensate for the lack of something like PM1_CNT to
> trap on for detecting that the guest is suspending.
> We could instead force the guest to use S3 but IMO it is undesirable,
> since it generally does make a difference which suspend mode is used
> in the guest, s2idle or S3, e.g some drivers check which suspend type
> is used and based on that behaves differently during suspend. One of
> the example is:
> https://elixir.bootlin.com/linux/v5.18.12/source/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c#L2323
> https://elixir.bootlin.com/linux/v5.18.12/source/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c#L1069
> https://elixir.bootlin.com/linux/v5.18.12/source/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c#L583
>
> Thank you,
> Grzegorz
Grzegorz Jaszczyk Sept. 12, 2022, 2:44 p.m. UTC | #6
Hi Rafael,

Gentle ping

Best regards,
Grzegorz

pon., 22 sie 2022 o 11:26 Grzegorz Jaszczyk <jaz@semihalf.com> napisał(a):
>
> Hi Rafael,
>
> Could you please kindly comment on the above?
>
> Thank you in advance,
> Grzegorz
>
> śr., 20 lip 2022 o 15:15 Grzegorz Jaszczyk <jaz@semihalf.com> napisał(a):
> >
> > wt., 19 lip 2022 o 20:09 Rafael J. Wysocki <rafael@kernel.org> napisał(a):
> > >
> > > On Thu, Jul 7, 2022 at 2:56 PM Grzegorz Jaszczyk <jaz@semihalf.com> wrote:
> > > >
> > > > Currently the LPS0 prepare_late callback is aimed to run as the very
> > > > last thing before entering the S2Idle state from LPS0 perspective,
> > > > nevertheless between this call and the system actually entering the
> > > > S2Idle state there are several places where the suspension process could
> > > > be canceled.
> > >
> > > And why is this a problem?
> > >
> > > The cancellation will occur only if there is a wakeup signal that
> > > would otherwise cause one of the CPUs to exit the idle state.  Such a
> > > wakeup signal can appear after calling the new notifier as well, so
> > > why does it make a difference?
> >
> > It could also occur due to suspend_test. Additionally with new
> > notifier we could get notification when the system wakes up from
> > s2idle_loop and immediately goes to sleep again (due to e.g.
> > acpi_s2idle_wake condition not being met) - in this case relying on
> > prepare_late callback is not possible since it is not called in this
> > path.
> >
> > >
> > > > In order to notify VMM about guest entering suspend, extend the S2Idle
> > > > ops by new notify callback, which will be really invoked as a very last
> > > > thing before guest actually enters S2Idle state.
> > >
> > > It is not guaranteed that "suspend" (defined as all CPUs entering idle
> > > states) will be actually entered even after this "last step".
> >
> > Since this whole patchset is aimed at notifying the host about a guest
> > entering s2idle state, reaching this step can be considered as a
> > suspend "entry point" for VM IMO. It is because we are talking about
> > the vCPU not the real CPU. Therefore it seems to me, that even if some
> > other vCPUs could still get some wakeup signal they will not be able
> > to kick (through s2idle_wake->swake_up_one(&s2idle_wait_head);) the
> > original vCPU which entered s2idle_loop, triggered the new notifier
> > and is halted due to handling vCPU exit (and was about to trigger
> > swait_event_exclusive). So it will prevent the VM's resume process
> > from being started.
> >
> > >
> > > > Additionally extend the acpi_s2idle_dev_ops by notify() callback so
> > > > any driver can hook into it and allow to implement its own notification.
> > > >
> > > > Taking advantage of e.g. existing acpi_s2idle_dev_ops's prepare/restore
> > > > hooks is not an option since it will not allow to prevent race
> > > > conditions:
> > > > - VM0 enters s2idle
> > > > - host notes about VM0 is in s2idle
> > > > - host continues with system suspension but in the meantime VM0 exits
> > > > s2idle and sends notification but it is already too late (VM could not
> > > > even send notification on time).
> > >
> > > Too late for what?
> >
> > Too late to cancel the host suspend process, which thinks that the VM
> > is in s2idle state while it isn't.
> >
> > >
> > > > Introducing notify() as a very last step before the system enters S2Idle
> > > > together with an assumption that the VMM has control over guest
> > > > resumption allows preventing mentioned races.
> > >
> > > How does it do that?
> >
> > At the moment when VM triggers this new notifier we trap on MMIO
> > access and the VMM handles vCPU exit (so the vCPU is "halted").
> > Therefore the VMM could control when it finishes such handling and
> > releases the vCPU again.
> >
> > Maybe adding some more context will be helpful. This patchset was
> > aimed for two different scenarios actually:
> > 1) Host is about to enter the suspend state and needs first to suspend
> > VM with all pass-through devices. In this case the host waits for
> > s2idle notification from the guest and when it receives it, it
> > continues with its own suspend process.
> > 2) Guest could be a "privileged" one (in terms of VMM) and when the
> > guest enters s2idle state it notifies the host, which in turn triggers
> > the suspend process of the host.
> >
> > >
> > > It looks like you want suspend-to-idle to behave like S3 and it won't.
> >
> > In a way, yes, we compensate for the lack of something like PM1_CNT to
> > trap on for detecting that the guest is suspending.
> > We could instead force the guest to use S3 but IMO it is undesirable,
> > since it generally does make a difference which suspend mode is used
> > in the guest, s2idle or S3, e.g some drivers check which suspend type
> > is used and based on that behaves differently during suspend. One of
> > the example is:
> > https://elixir.bootlin.com/linux/v5.18.12/source/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c#L2323
> > https://elixir.bootlin.com/linux/v5.18.12/source/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c#L1069
> > https://elixir.bootlin.com/linux/v5.18.12/source/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c#L583
> >
> > Thank you,
> > Grzegorz
diff mbox series

Patch

diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
index 2963229062f8..d5aff194c736 100644
--- a/drivers/acpi/x86/s2idle.c
+++ b/drivers/acpi/x86/s2idle.c
@@ -520,10 +520,21 @@  void acpi_s2idle_restore_early(void)
 					lps0_dsm_func_mask, lps0_dsm_guid);
 }
 
+static void acpi_s2idle_notify(void)
+{
+	struct acpi_s2idle_dev_ops *handler;
+
+	list_for_each_entry(handler, &lps0_s2idle_devops_head, list_node) {
+		if (handler->notify)
+			handler->notify();
+	}
+}
+
 static const struct platform_s2idle_ops acpi_s2idle_ops_lps0 = {
 	.begin = acpi_s2idle_begin,
 	.prepare = acpi_s2idle_prepare,
 	.prepare_late = acpi_s2idle_prepare_late,
+	.notify = acpi_s2idle_notify,
 	.wake = acpi_s2idle_wake,
 	.restore_early = acpi_s2idle_restore_early,
 	.restore = acpi_s2idle_restore,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4f82a5bc6d98..b32c4baed99b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1068,6 +1068,7 @@  struct acpi_s2idle_dev_ops {
 	struct list_head list_node;
 	void (*prepare)(void);
 	void (*restore)(void);
+	void (*notify)(void);
 };
 int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg);
 void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg);
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 70f2921e2e70..16ef7f9d9a03 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -191,6 +191,7 @@  struct platform_s2idle_ops {
 	int (*begin)(void);
 	int (*prepare)(void);
 	int (*prepare_late)(void);
+	void (*notify)(void);
 	bool (*wake)(void);
 	void (*restore_early)(void);
 	void (*restore)(void);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 827075944d28..6ba211b94ed1 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -100,6 +100,10 @@  static void s2idle_enter(void)
 
 	/* Push all the CPUs into the idle loop. */
 	wake_up_all_idle_cpus();
+
+	if (s2idle_ops && s2idle_ops->notify)
+		s2idle_ops->notify();
+
 	/* Make the current CPU wait so it can enter the idle loop too. */
 	swait_event_exclusive(s2idle_wait_head,
 		    s2idle_state == S2IDLE_STATE_WAKE);