Message ID | 20200921162949.553863-3-philmd@redhat.com |
---|---|
State | New |
Series | block/nvme: Map doorbells pages write-only, remove magic from nvme_init |
On Mon, 2020-09-21 at 18:29 +0200, Philippe Mathieu-Daudé wrote:
> Per the datasheet sections 3.1.13/3.1.14:
> "The host should not read the doorbell registers."
>
> As we don't need read access, map the doorbells with write-only
> permission. We keep a reference to this mapped address in the
> BDRVNVMeState structure.

Besides looking more correct in access mode, is there any side effect
of WO mapping?

Fam

>
> Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
> [...]
Hi Fam,

+Paolo?

On 9/22/20 10:18 AM, Fam Zheng wrote:
> On Mon, 2020-09-21 at 18:29 +0200, Philippe Mathieu-Daudé wrote:
>> Per the datasheet sections 3.1.13/3.1.14:
>> "The host should not read the doorbell registers."
>>
>> As we don't need read access, map the doorbells with write-only
>> permission. We keep a reference to this mapped address in the
>> BDRVNVMeState structure.
>
> Besides looking more correct in access mode, is there any side effect
> of WO mapping?

TBH I don't have enough knowledge to answer this question.
I tested successfully on X86. I'm writing more tests.

> Fam
>
>> [...]
On 22/09/20 10:41, Philippe Mathieu-Daudé wrote:
>> Besides looking more correct in access mode, is there any side effect
>> of WO mapping?
> TBH I don't have enough knowledge to answer this question.
> I tested successfully on X86. I'm writing more tests.

No problem with doing this, but PROT_WRITE does not work at all on x86. :)
PROT_EXEC works if you have a machine with PKRU, but PROT_WRITE silently
becomes PROT_READ|PROT_WRITE because the processor does not support it.

Paolo
On Tue, 2020-09-22 at 10:41 +0200, Philippe Mathieu-Daudé wrote:
> Hi Fam,
>
> +Paolo?
>
> On 9/22/20 10:18 AM, Fam Zheng wrote:
> > On Mon, 2020-09-21 at 18:29 +0200, Philippe Mathieu-Daudé wrote:
> > > Per the datasheet sections 3.1.13/3.1.14:
> > > "The host should not read the doorbell registers."
> > >
> > > As we don't need read access, map the doorbells with write-only
> > > permission. We keep a reference to this mapped address in the
> > > BDRVNVMeState structure.
> >
> > Besides looking more correct in access mode, is there any side
> > effect of WO mapping?
>
> TBH I don't have enough knowledge to answer this question.
> I tested successfully on X86. I'm writing more tests.

The reason I'm asking is more because x86 pages are either RO or RW. So
I'd like to see if there's a practical reason behind this patch (I have
no idea about the effects on MTRR and/or IOMMU).

Fam

> > > [...]
On 9/22/20 11:04 AM, Paolo Bonzini wrote:
> On 22/09/20 10:41, Philippe Mathieu-Daudé wrote:
>>> Besides looking more correct in access mode, is there any side effect
>>> of WO mapping?
>> TBH I don't have enough knowledge to answer this question.
>> I tested successfully on X86. I'm writing more tests.
>
> No problem with doing this, but PROT_WRITE does not work at all on x86.
> :) PROT_EXEC works if you have a machine with PKRU, but PROT_WRITE
> silently becomes PROT_READ|PROT_WRITE because the processor does not
> support it.

Ah this is why it works the same way in my testing.
I'll run tests on ARM.

Thanks,

Phil.

> Paolo
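Paolo's point is easy to check with a short standalone program (a minimal sketch, assuming Linux on x86-64; the file name and messages are illustrative and not part of this series): a mapping requested with PROT_WRITE alone remains readable because the x86 MMU has no write-only encoding, which is also why the tests above behaved the same with either mapping.

```c
/* prot_write_probe.c (illustrative name, not part of this series).
 * Build: cc -O2 -o prot_write_probe prot_write_probe.c */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
    /* Request a write-only anonymous page. */
    volatile unsigned char *p = mmap(NULL, 4096, PROT_WRITE,
                                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return EXIT_FAILURE;
    }

    p[0] = 42;      /* writing is allowed by construction */

    /* On x86 this read also succeeds: the page table entry is silently
     * read+write because the hardware has no write-only permission,
     * even though /proc/self/maps still reports the VMA as "-w-p".
     * On hardware that could enforce write-only mappings, this access
     * could fault instead. */
    printf("read back: %d\n", p[0]);

    munmap((void *)p, 4096);
    return EXIT_SUCCESS;
}
```

On an architecture whose MMU could express a write-only user page the read would fault; whether any of QEMU's host targets actually can is exactly what testing beyond x86 would show.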
diff --git a/block/nvme.c b/block/nvme.c
index 5a4dc6a722a..3c834da8fec 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -31,7 +31,7 @@
 #define NVME_SQ_ENTRY_BYTES 64
 #define NVME_CQ_ENTRY_BYTES 16
 #define NVME_QUEUE_SIZE 128
-#define NVME_BAR_SIZE 8192
+#define NVME_DOORBELL_SIZE 4096
 
 /*
  * We have to leave one slot empty as that is the full queue case where
@@ -84,10 +84,6 @@ typedef struct {
 /* Memory mapped registers */
 typedef volatile struct {
     NvmeBar ctrl;
-    struct {
-        uint32_t sq_tail;
-        uint32_t cq_head;
-    } doorbells[];
 } NVMeRegs;
 
 #define INDEX_ADMIN 0
@@ -103,6 +99,11 @@ struct BDRVNVMeState {
     AioContext *aio_context;
     QEMUVFIOState *vfio;
     NVMeRegs *regs;
+    /* Memory mapped registers */
+    volatile struct {
+        uint32_t sq_tail;
+        uint32_t cq_head;
+    } *doorbells;
     /* The submission/completion queue pairs.
      * [0]: admin queue.
      * [1..]: io queues.
@@ -247,14 +248,14 @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
         error_propagate(errp, local_err);
         goto fail;
     }
-    q->sq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].sq_tail;
+    q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
 
     nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         goto fail;
     }
-    q->cq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].cq_head;
+    q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
 
     return q;
 fail:
@@ -712,13 +713,12 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         goto out;
     }
 
-    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE,
+    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar),
                                     PROT_READ | PROT_WRITE, errp);
     if (!s->regs) {
         ret = -EINVAL;
         goto out;
     }
-
     /* Perform initialize sequence as described in NVMe spec "7.6.1
      * Initialization". */
 
@@ -748,6 +748,13 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
         }
     }
 
+    s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar),
+                                         NVME_DOORBELL_SIZE, PROT_WRITE, errp);
+    if (!s->doorbells) {
+        ret = -EINVAL;
+        goto out;
+    }
+
     /* Set up admin queue. */
     s->queues = g_new(NVMeQueuePair *, 1);
     s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
@@ -873,7 +880,9 @@ static void nvme_close(BlockDriverState *bs)
                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                            false, NULL, NULL);
     event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
-    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
+    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
+                            sizeof(NvmeBar), NVME_DOORBELL_SIZE);
+    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, sizeof(NvmeBar));
     qemu_vfio_close(s->vfio);
 
     g_free(s->device);
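For context on the offsets used above: the patch splits the old 8 KiB NVME_BAR_SIZE mapping into a read-write register page plus a write-only 4 KiB NVME_DOORBELL_SIZE window, because the NVMe spec places the doorbells right after the controller register page (which the driver addresses as sizeof(NvmeBar)) and spaces them by 4 << CAP.DSTRD bytes. The sketch below is spec-level arithmetic only, not code from block/nvme.c; the function names are illustrative.

```c
/* Spec-level sketch of the NVMe doorbell layout (NVMe 1.x, section 3.1).
 * Not code from block/nvme.c; names are illustrative only. */
#include <stddef.h>

#define NVME_DOORBELL_BASE 0x1000u  /* first byte after the controller register page */

/* Submission Queue y Tail doorbell: 0x1000 + (2y)     * (4 << CAP.DSTRD) */
static inline size_t nvme_sq_tail_doorbell_offset(unsigned qid, unsigned dstrd)
{
    return NVME_DOORBELL_BASE + (2u * qid) * (4u << dstrd);
}

/* Completion Queue y Head doorbell:  0x1000 + (2y + 1) * (4 << CAP.DSTRD) */
static inline size_t nvme_cq_head_doorbell_offset(unsigned qid, unsigned dstrd)
{
    return NVME_DOORBELL_BASE + (2u * qid + 1u) * (4u << dstrd);
}

/* With DSTRD == 0 each queue pair occupies 8 bytes, matching the
 * { sq_tail, cq_head } struct the driver indexes via doorbell_scale,
 * so a 4 KiB window covers doorbells for 512 queue pairs. */
```

That is why mapping only sizeof(NvmeBar) bytes of registers plus a separate 4 KiB doorbell window is sufficient for the queue counts this driver creates, and why the magic 8192 constant can go away.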
Per the datasheet sections 3.1.13/3.1.14:
"The host should not read the doorbell registers."

As we don't need read access, map the doorbells with write-only
permission. We keep a reference to this mapped address in the
BDRVNVMeState structure.

Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
---
 block/nvme.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)