
[2/6] block/nvme: Map doorbells pages write-only

Message ID 20200921162949.553863-3-philmd@redhat.com
State New
Series block/nvme: Map doorbells pages write-only, remove magic from nvme_init

Commit Message

Philippe Mathieu-Daudé Sept. 21, 2020, 4:29 p.m. UTC
Per the datasheet sections 3.1.13/3.1.14:
  "The host should not read the doorbell registers."

As we don't need read access, map the doorbells with write-only
permission. We keep a reference to this mapped address in the
BDRVNVMeState structure.

Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
---
 block/nvme.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)
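
(An illustrative sketch follows, not part of the patch; the helper names are hypothetical.) Per the spec sections cited above, the doorbell pair for queue y starts at BAR0 offset 0x1000 with a stride of (4 << CAP.DSTRD) bytes. This is why the diff below can map a single 4 KiB write-only window at offset sizeof(NvmeBar) and index it by doorbell_scale, assuming sizeof(NvmeBar) is padded to 4 KiB, which the diff relies on.

#include <stddef.h>
#include <stdint.h>

#define NVME_DOORBELL_BASE 0x1000   /* doorbell registers start here in BAR0 */

/* Byte offset of the Submission Queue y Tail doorbell (spec 3.1.13). */
static inline size_t nvme_sq_tail_doorbell_offset(uint32_t qid, uint32_t cap_dstrd)
{
    return NVME_DOORBELL_BASE + (2 * qid) * (4u << cap_dstrd);
}

/* Byte offset of the Completion Queue y Head doorbell (spec 3.1.14). */
static inline size_t nvme_cq_head_doorbell_offset(uint32_t qid, uint32_t cap_dstrd)
{
    return NVME_DOORBELL_BASE + (2 * qid + 1) * (4u << cap_dstrd);
}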

Comments

Fam Zheng Sept. 22, 2020, 8:18 a.m. UTC | #1
On Mon, 2020-09-21 at 18:29 +0200, Philippe Mathieu-Daudé wrote:
> Per the datasheet sections 3.1.13/3.1.14:
>   "The host should not read the doorbell registers."
> 
> As we don't need read access, map the doorbells with write-only
> permission. We keep a reference to this mapped address in the
> BDRVNVMeState structure.

Besides looking more correct in access mode, is there any side effect
of WO mapping?

Fam

Philippe Mathieu-Daudé Sept. 22, 2020, 8:41 a.m. UTC | #2
Hi Fam,

+Paolo?

On 9/22/20 10:18 AM, Fam Zheng wrote:
> On Mon, 2020-09-21 at 18:29 +0200, Philippe Mathieu-Daudé wrote:
>> Per the datasheet sections 3.1.13/3.1.14:
>>   "The host should not read the doorbell registers."
>>
>> As we don't need read access, map the doorbells with write-only
>> permission. We keep a reference to this mapped address in the
>> BDRVNVMeState structure.
> 
> Besides looking more correct in access mode, is there any side effect
> of WO mapping?

TBH I don't have enough knowledge to answer this question.
I tested successfully on X86. I'm writing more tests.

Paolo Bonzini Sept. 22, 2020, 9:04 a.m. UTC | #3
On 22/09/20 10:41, Philippe Mathieu-Daudé wrote:
>> Besides looking more correct in access mode, is there any side effect
>> of WO mapping?
> TBH I don't have enough knowledge to answer this question.
> I tested successfully on X86. I'm writing more tests.

No problem with doing this, but PROT_WRITE does not work at all on x86.
:)  PROT_EXEC works if you have a machine with PKRU, but PROT_WRITE
silently becomes PROT_READ|PROT_WRITE because the processor does not
support it.
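
A minimal host-side sketch (hypothetical demo for Linux, not QEMU code) of that silent widening: a page requested write-only can still be read back on x86 without faulting.

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    /* Ask for a write-only anonymous page. */
    unsigned char *p = mmap(NULL, 4096, PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    p[0] = 0x42;                      /* writing is allowed, as requested */
    /* On x86 this read succeeds: the MMU cannot express write-without-read,
     * so PROT_WRITE is silently widened to PROT_READ | PROT_WRITE.  On an
     * MMU with true write-only mappings this access would fault instead. */
    printf("read back 0x%02x\n", p[0]);
    munmap(p, 4096);
    return 0;
}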

Paolo
Fam Zheng Sept. 22, 2020, 9:33 a.m. UTC | #4
On Tue, 2020-09-22 at 10:41 +0200, Philippe Mathieu-Daudé wrote:
> Hi Fam,
> 
> +Paolo?
> 
> On 9/22/20 10:18 AM, Fam Zheng wrote:
> > On Mon, 2020-09-21 at 18:29 +0200, Philippe Mathieu-Daudé wrote:
> > > Per the datasheet sections 3.1.13/3.1.14:
> > >   "The host should not read the doorbell registers."
> > > 
> > > As we don't need read access, map the doorbells with write-only
> > > permission. We keep a reference to this mapped address in the
> > > BDRVNVMeState structure.
> > 
> > Besides looking more correct in access mode, is there any side effect
> > of WO mapping?
> 
> TBH I don't have enough knowledge to answer this question.
> I tested successfully on X86. I'm writing more tests.

The reason I'm asking is more because x86 pages are either RO or RW. So
I'd like to see if there's a practical reason behind this patch (I have
no idea about the effects on MTRR and/or IOMMU).

Fam

Philippe Mathieu-Daudé Sept. 22, 2020, 9:36 a.m. UTC | #5
On 9/22/20 11:04 AM, Paolo Bonzini wrote:
> On 22/09/20 10:41, Philippe Mathieu-Daudé wrote:
>>> Besides looking more correct in access mode, is there any side effect
>>> of WO mapping?
>> TBH I don't have enough knowledge to answer this question.
>> I tested successfully on X86. I'm writing more tests.
> 
> No problem with doing this, but PROT_WRITE does not work at all on x86.
> :)  PROT_EXEC works if you have a machine with PKRU, but PROT_WRITE
> silently becomes PROT_READ|PROT_WRITE because the processor does not
> support it.

Ah, this is why it behaved the same way in my testing.

I'll run tests on ARM.

Thanks,

Phil.


Patch

diff --git a/block/nvme.c b/block/nvme.c
index 5a4dc6a722a..3c834da8fec 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -31,7 +31,7 @@ 
 #define NVME_SQ_ENTRY_BYTES 64
 #define NVME_CQ_ENTRY_BYTES 16
 #define NVME_QUEUE_SIZE 128
-#define NVME_BAR_SIZE 8192
+#define NVME_DOORBELL_SIZE 4096
 
 /*
  * We have to leave one slot empty as that is the full queue case where
@@ -84,10 +84,6 @@  typedef struct {
 /* Memory mapped registers */
 typedef volatile struct {
     NvmeBar ctrl;
-    struct {
-        uint32_t sq_tail;
-        uint32_t cq_head;
-    } doorbells[];
 } NVMeRegs;
 
 #define INDEX_ADMIN     0
@@ -103,6 +99,11 @@  struct BDRVNVMeState {
     AioContext *aio_context;
     QEMUVFIOState *vfio;
     NVMeRegs *regs;
+    /* Memory mapped registers */
+    volatile struct {
+        uint32_t sq_tail;
+        uint32_t cq_head;
+    } *doorbells;
     /* The submission/completion queue pairs.
      * [0]: admin queue.
      * [1..]: io queues.
@@ -247,14 +248,14 @@  static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
         error_propagate(errp, local_err);
         goto fail;
     }
-    q->sq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].sq_tail;
+    q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
 
     nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         goto fail;
     }
-    q->cq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].cq_head;
+    q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
 
     return q;
 fail:
@@ -712,13 +713,12 @@  static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         goto out;
     }
 
-    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE,
+    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar),
                                     PROT_READ | PROT_WRITE, errp);
     if (!s->regs) {
         ret = -EINVAL;
         goto out;
     }
-
     /* Perform initialize sequence as described in NVMe spec "7.6.1
      * Initialization". */
 
@@ -748,6 +748,13 @@  static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         }
     }
 
+    s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar),
+                                         NVME_DOORBELL_SIZE, PROT_WRITE, errp);
+    if (!s->doorbells) {
+        ret = -EINVAL;
+        goto out;
+    }
+
     /* Set up admin queue. */
     s->queues = g_new(NVMeQueuePair *, 1);
     s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
@@ -873,7 +880,9 @@  static void nvme_close(BlockDriverState *bs)
                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                            false, NULL, NULL);
     event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
-    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
+    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
+                            sizeof(NvmeBar), NVME_DOORBELL_SIZE);
+    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, sizeof(NvmeBar));
     qemu_vfio_close(s->vfio);
 
     g_free(s->device);