diff mbox series

[net-next,v6,3/3] net: add sysfs attribute to control napi threaded mode

Message ID 20210115003123.1254314-4-weiwan@google.com
State New
Headers show
Series None | expand

Commit Message

Wei Wang Jan. 15, 2021, 12:31 a.m. UTC
This patch adds a new sysfs attribute to the network device class.
Said attribute provides a per-device control to enable/disable the
threaded mode for all the napi instances of the given network device.

Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Co-developed-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Wei Wang <weiwan@google.com>
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 28 +++++++++++++++++
 net/core/net-sysfs.c      | 63 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+)

Comments

Alexander Duyck Jan. 15, 2021, 3:34 a.m. UTC | #1
On Thu, Jan 14, 2021 at 4:34 PM Wei Wang <weiwan@google.com> wrote:
>

> This patch adds a new sysfs attribute to the network device class.

> Said attribute provides a per-device control to enable/disable the

> threaded mode for all the napi instances of the given network device.

>

> Co-developed-by: Paolo Abeni <pabeni@redhat.com>

> Signed-off-by: Paolo Abeni <pabeni@redhat.com>

> Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

> Co-developed-by: Felix Fietkau <nbd@nbd.name>

> Signed-off-by: Felix Fietkau <nbd@nbd.name>

> Signed-off-by: Wei Wang <weiwan@google.com>

> ---

>  include/linux/netdevice.h |  2 ++

>  net/core/dev.c            | 28 +++++++++++++++++

>  net/core/net-sysfs.c      | 63 +++++++++++++++++++++++++++++++++++++++

>  3 files changed, 93 insertions(+)

>

> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h

> index c24ed232c746..11ae0c9b9350 100644

> --- a/include/linux/netdevice.h

> +++ b/include/linux/netdevice.h

> @@ -497,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n)

>         return napi_complete_done(n, 0);

>  }

>

> +int dev_set_threaded(struct net_device *dev, bool threaded);

> +

>  /**

>   *     napi_disable - prevent NAPI from scheduling

>   *     @n: NAPI context

> diff --git a/net/core/dev.c b/net/core/dev.c

> index edcfec1361e9..d5fb95316ea8 100644

> --- a/net/core/dev.c

> +++ b/net/core/dev.c

> @@ -6754,6 +6754,34 @@ static int napi_set_threaded(struct napi_struct *n, bool threaded)

>         return err;

>  }

>

> +static void dev_disable_threaded_all(struct net_device *dev)

> +{

> +       struct napi_struct *napi;

> +

> +       list_for_each_entry(napi, &dev->napi_list, dev_list)

> +               napi_set_threaded(napi, false);

> +}

> +

> +int dev_set_threaded(struct net_device *dev, bool threaded)

> +{

> +       struct napi_struct *napi;

> +       int ret;

> +

> +       dev->threaded = threaded;

> +       list_for_each_entry(napi, &dev->napi_list, dev_list) {

> +               ret = napi_set_threaded(napi, threaded);

> +               if (ret) {

> +                       /* Error occurred on one of the napi,

> +                        * reset threaded mode on all napi.

> +                        */

> +                       dev_disable_threaded_all(dev);

> +                       break;

> +               }

> +       }

> +

> +       return ret;


This doesn't seem right. The NAPI instances can be active while this
is occuring can they not? I would think at a minimum you need to go
through a napi_disable/napi_enable in order to toggle this value for
each NAPI instance. Otherwise aren't you at risk for racing and having
a napi_schedule attempt to wake up the thread before it has been
allocated?

> +}

> +

>  void netif_napi_add(struct net_device *dev, struct napi_struct *napi,

>                     int (*poll)(struct napi_struct *, int), int weight)

>  {

> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c

> index daf502c13d6d..2017f8f07b8d 100644

> --- a/net/core/net-sysfs.c

> +++ b/net/core/net-sysfs.c

> @@ -538,6 +538,68 @@ static ssize_t phys_switch_id_show(struct device *dev,

>  }

>  static DEVICE_ATTR_RO(phys_switch_id);

>

> +static ssize_t threaded_show(struct device *dev,

> +                            struct device_attribute *attr, char *buf)

> +{

> +       struct net_device *netdev = to_net_dev(dev);

> +       struct napi_struct *n;

> +       bool enabled;

> +       int ret;

> +

> +       if (!rtnl_trylock())

> +               return restart_syscall();

> +

> +       if (!dev_isalive(netdev)) {

> +               ret = -EINVAL;

> +               goto unlock;

> +       }

> +

> +       if (list_empty(&netdev->napi_list)) {

> +               ret = -EOPNOTSUPP;

> +               goto unlock;

> +       }

> +

> +       /* Only return true if all napi have threaded mode.

> +        * The inconsistency could happen when the device driver calls

> +        * napi_disable()/napi_enable() with dev->threaded set to true,

> +        * but napi_kthread_create() fails.

> +        * We return false in this case to remind the user that one or

> +        * more napi did not have threaded mode enabled properly.

> +        */

> +       list_for_each_entry(n, &netdev->napi_list, dev_list) {

> +               enabled = !!test_bit(NAPI_STATE_THREADED, &n->state);

> +               if (!enabled)

> +                       break;

> +       }

> +


This logic seems backwards to me. If we have it enabled for any of
them it seems like we should report it was enabled. Otherwise we are
going to be leaking out instances of threaded napi and not be able to
easily find where they are coming from. If nothing else it might make
sense to have this as a ternary value where it is either enabled,
disabled, or partial/broken.

Also why bother testing each queue when you already have dev->threaded?

> +       ret = sprintf(buf, fmt_dec, enabled);

> +

> +unlock:

> +       rtnl_unlock();

> +       return ret;

> +}

> +

> +static int modify_napi_threaded(struct net_device *dev, unsigned long val)

> +{

> +       struct napi_struct *napi;

> +       int ret;

> +

> +       if (list_empty(&dev->napi_list))

> +               return -EOPNOTSUPP;

> +

> +       ret = dev_set_threaded(dev, !!val);

> +

> +       return ret;

> +}

> +

> +static ssize_t threaded_store(struct device *dev,

> +                             struct device_attribute *attr,

> +                             const char *buf, size_t len)

> +{

> +       return netdev_store(dev, attr, buf, len, modify_napi_threaded);

> +}

> +static DEVICE_ATTR_RW(threaded);

> +

>  static struct attribute *net_class_attrs[] __ro_after_init = {

>         &dev_attr_netdev_group.attr,

>         &dev_attr_type.attr,

> @@ -570,6 +632,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {

>         &dev_attr_proto_down.attr,

>         &dev_attr_carrier_up_count.attr,

>         &dev_attr_carrier_down_count.attr,

> +       &dev_attr_threaded.attr,

>         NULL,

>  };

>  ATTRIBUTE_GROUPS(net_class);

> --

> 2.30.0.284.gd98b1dd5eaa7-goog

>
Wei Wang Jan. 15, 2021, 9:54 p.m. UTC | #2
On Thu, Jan 14, 2021 at 7:34 PM Alexander Duyck
<alexander.duyck@gmail.com> wrote:
>

> On Thu, Jan 14, 2021 at 4:34 PM Wei Wang <weiwan@google.com> wrote:

> >

> > This patch adds a new sysfs attribute to the network device class.

> > Said attribute provides a per-device control to enable/disable the

> > threaded mode for all the napi instances of the given network device.

> >

> > Co-developed-by: Paolo Abeni <pabeni@redhat.com>

> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>

> > Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

> > Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

> > Co-developed-by: Felix Fietkau <nbd@nbd.name>

> > Signed-off-by: Felix Fietkau <nbd@nbd.name>

> > Signed-off-by: Wei Wang <weiwan@google.com>

> > ---

> >  include/linux/netdevice.h |  2 ++

> >  net/core/dev.c            | 28 +++++++++++++++++

> >  net/core/net-sysfs.c      | 63 +++++++++++++++++++++++++++++++++++++++

> >  3 files changed, 93 insertions(+)

> >

> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h

> > index c24ed232c746..11ae0c9b9350 100644

> > --- a/include/linux/netdevice.h

> > +++ b/include/linux/netdevice.h

> > @@ -497,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n)

> >         return napi_complete_done(n, 0);

> >  }

> >

> > +int dev_set_threaded(struct net_device *dev, bool threaded);

> > +

> >  /**

> >   *     napi_disable - prevent NAPI from scheduling

> >   *     @n: NAPI context

> > diff --git a/net/core/dev.c b/net/core/dev.c

> > index edcfec1361e9..d5fb95316ea8 100644

> > --- a/net/core/dev.c

> > +++ b/net/core/dev.c

> > @@ -6754,6 +6754,34 @@ static int napi_set_threaded(struct napi_struct *n, bool threaded)

> >         return err;

> >  }

> >

> > +static void dev_disable_threaded_all(struct net_device *dev)

> > +{

> > +       struct napi_struct *napi;

> > +

> > +       list_for_each_entry(napi, &dev->napi_list, dev_list)

> > +               napi_set_threaded(napi, false);

> > +}

> > +

> > +int dev_set_threaded(struct net_device *dev, bool threaded)

> > +{

> > +       struct napi_struct *napi;

> > +       int ret;

> > +

> > +       dev->threaded = threaded;

> > +       list_for_each_entry(napi, &dev->napi_list, dev_list) {

> > +               ret = napi_set_threaded(napi, threaded);

> > +               if (ret) {

> > +                       /* Error occurred on one of the napi,

> > +                        * reset threaded mode on all napi.

> > +                        */

> > +                       dev_disable_threaded_all(dev);

> > +                       break;

> > +               }

> > +       }

> > +

> > +       return ret;

>

> This doesn't seem right. The NAPI instances can be active while this

> is occuring can they not? I would think at a minimum you need to go

> through a napi_disable/napi_enable in order to toggle this value for

> each NAPI instance. Otherwise aren't you at risk for racing and having

> a napi_schedule attempt to wake up the thread before it has been

> allocated?



Yes. The napi instance could be active when this occurs. And I think
it is OK. It is cause napi_set_threaded() only sets
NAPI_STATE_THREADED bit after successfully created the kthread. And
___napi_schedule() only tries to wake up the kthread after testing the
THREADED bit.

>

>

> > +}

> > +

> >  void netif_napi_add(struct net_device *dev, struct napi_struct *napi,

> >                     int (*poll)(struct napi_struct *, int), int weight)

> >  {

> > diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c

> > index daf502c13d6d..2017f8f07b8d 100644

> > --- a/net/core/net-sysfs.c

> > +++ b/net/core/net-sysfs.c

> > @@ -538,6 +538,68 @@ static ssize_t phys_switch_id_show(struct device *dev,

> >  }

> >  static DEVICE_ATTR_RO(phys_switch_id);

> >

> > +static ssize_t threaded_show(struct device *dev,

> > +                            struct device_attribute *attr, char *buf)

> > +{

> > +       struct net_device *netdev = to_net_dev(dev);

> > +       struct napi_struct *n;

> > +       bool enabled;

> > +       int ret;

> > +

> > +       if (!rtnl_trylock())

> > +               return restart_syscall();

> > +

> > +       if (!dev_isalive(netdev)) {

> > +               ret = -EINVAL;

> > +               goto unlock;

> > +       }

> > +

> > +       if (list_empty(&netdev->napi_list)) {

> > +               ret = -EOPNOTSUPP;

> > +               goto unlock;

> > +       }

> > +

> > +       /* Only return true if all napi have threaded mode.

> > +        * The inconsistency could happen when the device driver calls

> > +        * napi_disable()/napi_enable() with dev->threaded set to true,

> > +        * but napi_kthread_create() fails.

> > +        * We return false in this case to remind the user that one or

> > +        * more napi did not have threaded mode enabled properly.

> > +        */

> > +       list_for_each_entry(n, &netdev->napi_list, dev_list) {

> > +               enabled = !!test_bit(NAPI_STATE_THREADED, &n->state);

> > +               if (!enabled)

> > +                       break;

> > +       }

> > +

>

> This logic seems backwards to me. If we have it enabled for any of

> them it seems like we should report it was enabled. Otherwise we are

> going to be leaking out instances of threaded napi and not be able to

> easily find where they are coming from. If nothing else it might make

> sense to have this as a ternary value where it is either enabled,

> disabled, or partial/broken.



Good point. The reason to return true only if all napi have threaded
enabled, is I would like this return value to serve as a signal to the
user to indicate that the threaded mode is not enabled successfully
for all napi instances, when the user tries to enable it, but then got
"disabled".
But maybe using a ternary value is a better idea. I will see how to change that.


>

> Also why bother testing each queue when you already have dev->threaded?



It is cause I use dev-> threaded to store what user wants to set the
threaded mode to. But if it is set partially or it is broken, I'd like
to return "disabled".
Again, I will see how to implement a ternary value.

>

>

> > +       ret = sprintf(buf, fmt_dec, enabled);

> > +

> > +unlock:

> > +       rtnl_unlock();

> > +       return ret;

> > +}

> > +

> > +static int modify_napi_threaded(struct net_device *dev, unsigned long val)

> > +{

> > +       struct napi_struct *napi;

> > +       int ret;

> > +

> > +       if (list_empty(&dev->napi_list))

> > +               return -EOPNOTSUPP;

> > +

> > +       ret = dev_set_threaded(dev, !!val);

> > +

> > +       return ret;

> > +}

> > +

> > +static ssize_t threaded_store(struct device *dev,

> > +                             struct device_attribute *attr,

> > +                             const char *buf, size_t len)

> > +{

> > +       return netdev_store(dev, attr, buf, len, modify_napi_threaded);

> > +}

> > +static DEVICE_ATTR_RW(threaded);

> > +

> >  static struct attribute *net_class_attrs[] __ro_after_init = {

> >         &dev_attr_netdev_group.attr,

> >         &dev_attr_type.attr,

> > @@ -570,6 +632,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {

> >         &dev_attr_proto_down.attr,

> >         &dev_attr_carrier_up_count.attr,

> >         &dev_attr_carrier_down_count.attr,

> > +       &dev_attr_threaded.attr,

> >         NULL,

> >  };

> >  ATTRIBUTE_GROUPS(net_class);

> > --

> > 2.30.0.284.gd98b1dd5eaa7-goog

> >
Alexander Duyck Jan. 15, 2021, 11:08 p.m. UTC | #3
On Fri, Jan 15, 2021 at 1:54 PM Wei Wang <weiwan@google.com> wrote:
>

> On Thu, Jan 14, 2021 at 7:34 PM Alexander Duyck

> <alexander.duyck@gmail.com> wrote:

> >

> > On Thu, Jan 14, 2021 at 4:34 PM Wei Wang <weiwan@google.com> wrote:

> > >

> > > This patch adds a new sysfs attribute to the network device class.

> > > Said attribute provides a per-device control to enable/disable the

> > > threaded mode for all the napi instances of the given network device.

> > >

> > > Co-developed-by: Paolo Abeni <pabeni@redhat.com>

> > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>

> > > Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

> > > Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

> > > Co-developed-by: Felix Fietkau <nbd@nbd.name>

> > > Signed-off-by: Felix Fietkau <nbd@nbd.name>

> > > Signed-off-by: Wei Wang <weiwan@google.com>

> > > ---

> > >  include/linux/netdevice.h |  2 ++

> > >  net/core/dev.c            | 28 +++++++++++++++++

> > >  net/core/net-sysfs.c      | 63 +++++++++++++++++++++++++++++++++++++++

> > >  3 files changed, 93 insertions(+)

> > >

> > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h

> > > index c24ed232c746..11ae0c9b9350 100644

> > > --- a/include/linux/netdevice.h

> > > +++ b/include/linux/netdevice.h

> > > @@ -497,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n)

> > >         return napi_complete_done(n, 0);

> > >  }

> > >

> > > +int dev_set_threaded(struct net_device *dev, bool threaded);

> > > +

> > >  /**

> > >   *     napi_disable - prevent NAPI from scheduling

> > >   *     @n: NAPI context

> > > diff --git a/net/core/dev.c b/net/core/dev.c

> > > index edcfec1361e9..d5fb95316ea8 100644

> > > --- a/net/core/dev.c

> > > +++ b/net/core/dev.c

> > > @@ -6754,6 +6754,34 @@ static int napi_set_threaded(struct napi_struct *n, bool threaded)

> > >         return err;

> > >  }

> > >

> > > +static void dev_disable_threaded_all(struct net_device *dev)

> > > +{

> > > +       struct napi_struct *napi;

> > > +

> > > +       list_for_each_entry(napi, &dev->napi_list, dev_list)

> > > +               napi_set_threaded(napi, false);

> > > +}

> > > +

> > > +int dev_set_threaded(struct net_device *dev, bool threaded)

> > > +{

> > > +       struct napi_struct *napi;

> > > +       int ret;

> > > +

> > > +       dev->threaded = threaded;

> > > +       list_for_each_entry(napi, &dev->napi_list, dev_list) {

> > > +               ret = napi_set_threaded(napi, threaded);

> > > +               if (ret) {

> > > +                       /* Error occurred on one of the napi,

> > > +                        * reset threaded mode on all napi.

> > > +                        */

> > > +                       dev_disable_threaded_all(dev);

> > > +                       break;

> > > +               }

> > > +       }

> > > +

> > > +       return ret;

> >

> > This doesn't seem right. The NAPI instances can be active while this

> > is occuring can they not? I would think at a minimum you need to go

> > through a napi_disable/napi_enable in order to toggle this value for

> > each NAPI instance. Otherwise aren't you at risk for racing and having

> > a napi_schedule attempt to wake up the thread before it has been

> > allocated?

>

>

> Yes. The napi instance could be active when this occurs. And I think

> it is OK. It is cause napi_set_threaded() only sets

> NAPI_STATE_THREADED bit after successfully created the kthread. And

> ___napi_schedule() only tries to wake up the kthread after testing the

> THREADED bit.


But what do you have guaranteeing that the kthread has been written to
memory? That is what I was getting at. Just because you have written
the value doesn't mean it is in memory yet so you would probably need
an smb_mb__before_atomic() barrier call before you set the bit.

Also I am not sure it is entirely safe to have the instance polling
while you are doing this. That is why I am thinking if the instance is
enabled then a napi_disable/napi_enable would be preferable.
Wei Wang Jan. 16, 2021, 12:44 a.m. UTC | #4
On Fri, Jan 15, 2021 at 3:08 PM Alexander Duyck
<alexander.duyck@gmail.com> wrote:
>

> On Fri, Jan 15, 2021 at 1:54 PM Wei Wang <weiwan@google.com> wrote:

> >

> > On Thu, Jan 14, 2021 at 7:34 PM Alexander Duyck

> > <alexander.duyck@gmail.com> wrote:

> > >

> > > On Thu, Jan 14, 2021 at 4:34 PM Wei Wang <weiwan@google.com> wrote:

> > > >

> > > > This patch adds a new sysfs attribute to the network device class.

> > > > Said attribute provides a per-device control to enable/disable the

> > > > threaded mode for all the napi instances of the given network device.

> > > >

> > > > Co-developed-by: Paolo Abeni <pabeni@redhat.com>

> > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>

> > > > Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

> > > > Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

> > > > Co-developed-by: Felix Fietkau <nbd@nbd.name>

> > > > Signed-off-by: Felix Fietkau <nbd@nbd.name>

> > > > Signed-off-by: Wei Wang <weiwan@google.com>

> > > > ---

> > > >  include/linux/netdevice.h |  2 ++

> > > >  net/core/dev.c            | 28 +++++++++++++++++

> > > >  net/core/net-sysfs.c      | 63 +++++++++++++++++++++++++++++++++++++++

> > > >  3 files changed, 93 insertions(+)

> > > >

> > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h

> > > > index c24ed232c746..11ae0c9b9350 100644

> > > > --- a/include/linux/netdevice.h

> > > > +++ b/include/linux/netdevice.h

> > > > @@ -497,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n)

> > > >         return napi_complete_done(n, 0);

> > > >  }

> > > >

> > > > +int dev_set_threaded(struct net_device *dev, bool threaded);

> > > > +

> > > >  /**

> > > >   *     napi_disable - prevent NAPI from scheduling

> > > >   *     @n: NAPI context

> > > > diff --git a/net/core/dev.c b/net/core/dev.c

> > > > index edcfec1361e9..d5fb95316ea8 100644

> > > > --- a/net/core/dev.c

> > > > +++ b/net/core/dev.c

> > > > @@ -6754,6 +6754,34 @@ static int napi_set_threaded(struct napi_struct *n, bool threaded)

> > > >         return err;

> > > >  }

> > > >

> > > > +static void dev_disable_threaded_all(struct net_device *dev)

> > > > +{

> > > > +       struct napi_struct *napi;

> > > > +

> > > > +       list_for_each_entry(napi, &dev->napi_list, dev_list)

> > > > +               napi_set_threaded(napi, false);

> > > > +}

> > > > +

> > > > +int dev_set_threaded(struct net_device *dev, bool threaded)

> > > > +{

> > > > +       struct napi_struct *napi;

> > > > +       int ret;

> > > > +

> > > > +       dev->threaded = threaded;

> > > > +       list_for_each_entry(napi, &dev->napi_list, dev_list) {

> > > > +               ret = napi_set_threaded(napi, threaded);

> > > > +               if (ret) {

> > > > +                       /* Error occurred on one of the napi,

> > > > +                        * reset threaded mode on all napi.

> > > > +                        */

> > > > +                       dev_disable_threaded_all(dev);

> > > > +                       break;

> > > > +               }

> > > > +       }

> > > > +

> > > > +       return ret;

> > >

> > > This doesn't seem right. The NAPI instances can be active while this

> > > is occuring can they not? I would think at a minimum you need to go

> > > through a napi_disable/napi_enable in order to toggle this value for

> > > each NAPI instance. Otherwise aren't you at risk for racing and having

> > > a napi_schedule attempt to wake up the thread before it has been

> > > allocated?

> >

> >

> > Yes. The napi instance could be active when this occurs. And I think

> > it is OK. It is cause napi_set_threaded() only sets

> > NAPI_STATE_THREADED bit after successfully created the kthread. And

> > ___napi_schedule() only tries to wake up the kthread after testing the

> > THREADED bit.

>

> But what do you have guaranteeing that the kthread has been written to

> memory? That is what I was getting at. Just because you have written

> the value doesn't mean it is in memory yet so you would probably need

> an smb_mb__before_atomic() barrier call before you set the bit.

>

Noted. Will look into this.

> Also I am not sure it is entirely safe to have the instance polling

> while you are doing this. That is why I am thinking if the instance is

> enabled then a napi_disable/napi_enable would be preferable.

When the napi is actively being polled in threaded mode, we will keep
rescheduling the kthread and calling __napi_poll() until
NAPI_SCHED_STATE is cleared by napi_complete_done(). And during the
next time napi_schedule() is called, we re-evaluate
NAPI_STATE_THREADED bit to see if we should wake up kthread, or
generate softirq.
And for the other way around, if napi is being handled during
net_rx_action(), toggling the bit won't cause immediate wake-up of the
kthread, but will wait for NAPI_SCHED_STATE to be cleared, and the
next time napi_schedule() is called.
I think it is OK. WDYT?
Alexander Duyck Jan. 16, 2021, 2:18 a.m. UTC | #5
On Fri, Jan 15, 2021 at 4:44 PM Wei Wang <weiwan@google.com> wrote:
>

> On Fri, Jan 15, 2021 at 3:08 PM Alexander Duyck

> <alexander.duyck@gmail.com> wrote:

> >

> > On Fri, Jan 15, 2021 at 1:54 PM Wei Wang <weiwan@google.com> wrote:

> > >

> > > On Thu, Jan 14, 2021 at 7:34 PM Alexander Duyck

> > > <alexander.duyck@gmail.com> wrote:

> > > >

> > > > On Thu, Jan 14, 2021 at 4:34 PM Wei Wang <weiwan@google.com> wrote:

> > > > >

> > > > > This patch adds a new sysfs attribute to the network device class.

> > > > > Said attribute provides a per-device control to enable/disable the

> > > > > threaded mode for all the napi instances of the given network device.

> > > > >

> > > > > Co-developed-by: Paolo Abeni <pabeni@redhat.com>

> > > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>

> > > > > Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

> > > > > Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

> > > > > Co-developed-by: Felix Fietkau <nbd@nbd.name>

> > > > > Signed-off-by: Felix Fietkau <nbd@nbd.name>

> > > > > Signed-off-by: Wei Wang <weiwan@google.com>

> > > > > ---

> > > > >  include/linux/netdevice.h |  2 ++

> > > > >  net/core/dev.c            | 28 +++++++++++++++++

> > > > >  net/core/net-sysfs.c      | 63 +++++++++++++++++++++++++++++++++++++++

> > > > >  3 files changed, 93 insertions(+)

> > > > >

> > > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h

> > > > > index c24ed232c746..11ae0c9b9350 100644

> > > > > --- a/include/linux/netdevice.h

> > > > > +++ b/include/linux/netdevice.h

> > > > > @@ -497,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n)

> > > > >         return napi_complete_done(n, 0);

> > > > >  }

> > > > >

> > > > > +int dev_set_threaded(struct net_device *dev, bool threaded);

> > > > > +

> > > > >  /**

> > > > >   *     napi_disable - prevent NAPI from scheduling

> > > > >   *     @n: NAPI context

> > > > > diff --git a/net/core/dev.c b/net/core/dev.c

> > > > > index edcfec1361e9..d5fb95316ea8 100644

> > > > > --- a/net/core/dev.c

> > > > > +++ b/net/core/dev.c

> > > > > @@ -6754,6 +6754,34 @@ static int napi_set_threaded(struct napi_struct *n, bool threaded)

> > > > >         return err;

> > > > >  }

> > > > >

> > > > > +static void dev_disable_threaded_all(struct net_device *dev)

> > > > > +{

> > > > > +       struct napi_struct *napi;

> > > > > +

> > > > > +       list_for_each_entry(napi, &dev->napi_list, dev_list)

> > > > > +               napi_set_threaded(napi, false);

> > > > > +}

> > > > > +

> > > > > +int dev_set_threaded(struct net_device *dev, bool threaded)

> > > > > +{

> > > > > +       struct napi_struct *napi;

> > > > > +       int ret;

> > > > > +

> > > > > +       dev->threaded = threaded;

> > > > > +       list_for_each_entry(napi, &dev->napi_list, dev_list) {

> > > > > +               ret = napi_set_threaded(napi, threaded);

> > > > > +               if (ret) {

> > > > > +                       /* Error occurred on one of the napi,

> > > > > +                        * reset threaded mode on all napi.

> > > > > +                        */

> > > > > +                       dev_disable_threaded_all(dev);

> > > > > +                       break;

> > > > > +               }

> > > > > +       }

> > > > > +

> > > > > +       return ret;

> > > >

> > > > This doesn't seem right. The NAPI instances can be active while this

> > > > is occuring can they not? I would think at a minimum you need to go

> > > > through a napi_disable/napi_enable in order to toggle this value for

> > > > each NAPI instance. Otherwise aren't you at risk for racing and having

> > > > a napi_schedule attempt to wake up the thread before it has been

> > > > allocated?

> > >

> > >

> > > Yes. The napi instance could be active when this occurs. And I think

> > > it is OK. It is cause napi_set_threaded() only sets

> > > NAPI_STATE_THREADED bit after successfully created the kthread. And

> > > ___napi_schedule() only tries to wake up the kthread after testing the

> > > THREADED bit.

> >

> > But what do you have guaranteeing that the kthread has been written to

> > memory? That is what I was getting at. Just because you have written

> > the value doesn't mean it is in memory yet so you would probably need

> > an smb_mb__before_atomic() barrier call before you set the bit.

> >

> Noted. Will look into this.

>

> > Also I am not sure it is entirely safe to have the instance polling

> > while you are doing this. That is why I am thinking if the instance is

> > enabled then a napi_disable/napi_enable would be preferable.

> When the napi is actively being polled in threaded mode, we will keep

> rescheduling the kthread and calling __napi_poll() until

> NAPI_SCHED_STATE is cleared by napi_complete_done(). And during the

> next time napi_schedule() is called, we re-evaluate

> NAPI_STATE_THREADED bit to see if we should wake up kthread, or

> generate softirq.

> And for the other way around, if napi is being handled during

> net_rx_action(), toggling the bit won't cause immediate wake-up of the

> kthread, but will wait for NAPI_SCHED_STATE to be cleared, and the

> next time napi_schedule() is called.

> I think it is OK. WDYT?


It is hard to say. The one spot that gives me a bit of concern is the
NAPIF_STATE_MISSED case in napi_complete_done. It is essentially would
become a switchover point between the two while we are actively
polling inside the driver. You end up with NAPI_SCHED_STATE not being
toggled but jumping from one to the other.
Wei Wang Jan. 18, 2021, 7:58 p.m. UTC | #6
On Fri, Jan 15, 2021 at 6:18 PM Alexander Duyck
<alexander.duyck@gmail.com> wrote:
>

> On Fri, Jan 15, 2021 at 4:44 PM Wei Wang <weiwan@google.com> wrote:

> >

> > On Fri, Jan 15, 2021 at 3:08 PM Alexander Duyck

> > <alexander.duyck@gmail.com> wrote:

> > >

> > > On Fri, Jan 15, 2021 at 1:54 PM Wei Wang <weiwan@google.com> wrote:

> > > >

> > > > On Thu, Jan 14, 2021 at 7:34 PM Alexander Duyck

> > > > <alexander.duyck@gmail.com> wrote:

> > > > >

> > > > > On Thu, Jan 14, 2021 at 4:34 PM Wei Wang <weiwan@google.com> wrote:

> > > > > >

> > > > > > This patch adds a new sysfs attribute to the network device class.

> > > > > > Said attribute provides a per-device control to enable/disable the

> > > > > > threaded mode for all the napi instances of the given network device.

> > > > > >

> > > > > > Co-developed-by: Paolo Abeni <pabeni@redhat.com>

> > > > > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>

> > > > > > Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

> > > > > > Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

> > > > > > Co-developed-by: Felix Fietkau <nbd@nbd.name>

> > > > > > Signed-off-by: Felix Fietkau <nbd@nbd.name>

> > > > > > Signed-off-by: Wei Wang <weiwan@google.com>

> > > > > > ---

> > > > > >  include/linux/netdevice.h |  2 ++

> > > > > >  net/core/dev.c            | 28 +++++++++++++++++

> > > > > >  net/core/net-sysfs.c      | 63 +++++++++++++++++++++++++++++++++++++++

> > > > > >  3 files changed, 93 insertions(+)

> > > > > >

> > > > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h

> > > > > > index c24ed232c746..11ae0c9b9350 100644

> > > > > > --- a/include/linux/netdevice.h

> > > > > > +++ b/include/linux/netdevice.h

> > > > > > @@ -497,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n)

> > > > > >         return napi_complete_done(n, 0);

> > > > > >  }

> > > > > >

> > > > > > +int dev_set_threaded(struct net_device *dev, bool threaded);

> > > > > > +

> > > > > >  /**

> > > > > >   *     napi_disable - prevent NAPI from scheduling

> > > > > >   *     @n: NAPI context

> > > > > > diff --git a/net/core/dev.c b/net/core/dev.c

> > > > > > index edcfec1361e9..d5fb95316ea8 100644

> > > > > > --- a/net/core/dev.c

> > > > > > +++ b/net/core/dev.c

> > > > > > @@ -6754,6 +6754,34 @@ static int napi_set_threaded(struct napi_struct *n, bool threaded)

> > > > > >         return err;

> > > > > >  }

> > > > > >

> > > > > > +static void dev_disable_threaded_all(struct net_device *dev)

> > > > > > +{

> > > > > > +       struct napi_struct *napi;

> > > > > > +

> > > > > > +       list_for_each_entry(napi, &dev->napi_list, dev_list)

> > > > > > +               napi_set_threaded(napi, false);

> > > > > > +}

> > > > > > +

> > > > > > +int dev_set_threaded(struct net_device *dev, bool threaded)

> > > > > > +{

> > > > > > +       struct napi_struct *napi;

> > > > > > +       int ret;

> > > > > > +

> > > > > > +       dev->threaded = threaded;

> > > > > > +       list_for_each_entry(napi, &dev->napi_list, dev_list) {

> > > > > > +               ret = napi_set_threaded(napi, threaded);

> > > > > > +               if (ret) {

> > > > > > +                       /* Error occurred on one of the napi,

> > > > > > +                        * reset threaded mode on all napi.

> > > > > > +                        */

> > > > > > +                       dev_disable_threaded_all(dev);

> > > > > > +                       break;

> > > > > > +               }

> > > > > > +       }

> > > > > > +

> > > > > > +       return ret;

> > > > >

> > > > > This doesn't seem right. The NAPI instances can be active while this

> > > > > is occuring can they not? I would think at a minimum you need to go

> > > > > through a napi_disable/napi_enable in order to toggle this value for

> > > > > each NAPI instance. Otherwise aren't you at risk for racing and having

> > > > > a napi_schedule attempt to wake up the thread before it has been

> > > > > allocated?

> > > >

> > > >

> > > > Yes. The napi instance could be active when this occurs. And I think

> > > > it is OK. It is cause napi_set_threaded() only sets

> > > > NAPI_STATE_THREADED bit after successfully created the kthread. And

> > > > ___napi_schedule() only tries to wake up the kthread after testing the

> > > > THREADED bit.

> > >

> > > But what do you have guaranteeing that the kthread has been written to

> > > memory? That is what I was getting at. Just because you have written

> > > the value doesn't mean it is in memory yet so you would probably need

> > > an smb_mb__before_atomic() barrier call before you set the bit.

> > >

> > Noted. Will look into this.

> >

> > > Also I am not sure it is entirely safe to have the instance polling

> > > while you are doing this. That is why I am thinking if the instance is

> > > enabled then a napi_disable/napi_enable would be preferable.

> > When the napi is actively being polled in threaded mode, we will keep

> > rescheduling the kthread and calling __napi_poll() until

> > NAPI_SCHED_STATE is cleared by napi_complete_done(). And during the

> > next time napi_schedule() is called, we re-evaluate

> > NAPI_STATE_THREADED bit to see if we should wake up kthread, or

> > generate softirq.

> > And for the other way around, if napi is being handled during

> > net_rx_action(), toggling the bit won't cause immediate wake-up of the

> > kthread, but will wait for NAPI_SCHED_STATE to be cleared, and the

> > next time napi_schedule() is called.

> > I think it is OK. WDYT?

>

> It is hard to say. The one spot that gives me a bit of concern is the

> NAPIF_STATE_MISSED case in napi_complete_done. It is essentially would

> become a switchover point between the two while we are actively

> polling inside the driver. You end up with NAPI_SCHED_STATE not being

> toggled but jumping from one to the other.


Hmm.. Right. That is the one case where NAPI_SCHED_STATE will not be
toggled, but could potentially change the processing mode.
But still, I don't see any race in this case. The napi instance will
still either be processed in softirq mode by net_rx_action(), or in
the kthread mode, after napi_complete_done() calls __napi_schedule().
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c24ed232c746..11ae0c9b9350 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -497,6 +497,8 @@  static inline bool napi_complete(struct napi_struct *n)
 	return napi_complete_done(n, 0);
 }
 
+int dev_set_threaded(struct net_device *dev, bool threaded);
+
 /**
  *	napi_disable - prevent NAPI from scheduling
  *	@n: NAPI context
diff --git a/net/core/dev.c b/net/core/dev.c
index edcfec1361e9..d5fb95316ea8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6754,6 +6754,34 @@  static int napi_set_threaded(struct napi_struct *n, bool threaded)
 	return err;
 }
 
+static void dev_disable_threaded_all(struct net_device *dev)
+{
+	struct napi_struct *napi;
+
+	list_for_each_entry(napi, &dev->napi_list, dev_list)
+		napi_set_threaded(napi, false);
+}
+
+int dev_set_threaded(struct net_device *dev, bool threaded)
+{
+	struct napi_struct *napi;
+	int ret;
+
+	dev->threaded = threaded;
+	list_for_each_entry(napi, &dev->napi_list, dev_list) {
+		ret = napi_set_threaded(napi, threaded);
+		if (ret) {
+			/* Error occurred on one of the napi,
+			 * reset threaded mode on all napi.
+			 */
+			dev_disable_threaded_all(dev);
+			break;
+		}
+	}
+
+	return ret;
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index daf502c13d6d..2017f8f07b8d 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -538,6 +538,68 @@  static ssize_t phys_switch_id_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(phys_switch_id);
 
+static ssize_t threaded_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	struct napi_struct *n;
+	bool enabled;
+	int ret;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (!dev_isalive(netdev)) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	if (list_empty(&netdev->napi_list)) {
+		ret = -EOPNOTSUPP;
+		goto unlock;
+	}
+
+	/* Only return true if all napi have threaded mode.
+	 * The inconsistency could happen when the device driver calls
+	 * napi_disable()/napi_enable() with dev->threaded set to true,
+	 * but napi_kthread_create() fails.
+	 * We return false in this case to remind the user that one or
+	 * more napi did not have threaded mode enabled properly.
+	 */
+	list_for_each_entry(n, &netdev->napi_list, dev_list) {
+		enabled = !!test_bit(NAPI_STATE_THREADED, &n->state);
+		if (!enabled)
+			break;
+	}
+
+	ret = sprintf(buf, fmt_dec, enabled);
+
+unlock:
+	rtnl_unlock();
+	return ret;
+}
+
+static int modify_napi_threaded(struct net_device *dev, unsigned long val)
+{
+	struct napi_struct *napi;
+	int ret;
+
+	if (list_empty(&dev->napi_list))
+		return -EOPNOTSUPP;
+
+	ret = dev_set_threaded(dev, !!val);
+
+	return ret;
+}
+
+static ssize_t threaded_store(struct device *dev,
+			      struct device_attribute *attr,
+			      const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, modify_napi_threaded);
+}
+static DEVICE_ATTR_RW(threaded);
+
 static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_netdev_group.attr,
 	&dev_attr_type.attr,
@@ -570,6 +632,7 @@  static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_proto_down.attr,
 	&dev_attr_carrier_up_count.attr,
 	&dev_attr_carrier_down_count.attr,
+	&dev_attr_threaded.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(net_class);