
[0/3] nvme-core: restructure nvme_init_ctrl()

Message ID 20230802032629.24309-1-kch@nvidia.com
State New

Commit Message

Chaitanya Kulkarni Aug. 2, 2023, 3:26 a.m. UTC
Hi,

Restructure nvme_init_ctrl() for better initialization flow.

Currently nvme_init_ctrl() initializes nvme authentication, fault
injection, and device PM QoS after adding the controller device with
a call to cdev_device_add(). This leads to additional code complexity,
as each of these initializations has to be unwound if any of them
fails.

This series addresses the issue by restructuring nvme_init_ctrl() to
perform the required initializations before adding the controller
device. With this, potential failures are caught earlier in the
initialization process, preventing the controller from becoming
visible in userspace until it is fully initialized.

It also avoids exposing a half-initialized device to userspace, which
could lead to problems such as applications assuming the device is
fully initialized while it is still prone to failure.
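
In code terms, the end state looks roughly like this (a sketch using
the existing helpers from drivers/nvme/host/core.c; out_cleanup is a
placeholder label, and the diffs below show the real changes):

	/* do all fallible setup first ... */
	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
	nvme_mpath_init_ctrl(ctrl);
	ret = nvme_auth_init_ctrl(ctrl);
	if (ret)
		goto out_cleanup;

	/* ... and only then make the controller visible to userspace */
	nvme_get_ctrl(ctrl);
	cdev_init(&ctrl->cdev, &nvme_dev_fops);
	ctrl->cdev.owner = ops->module;
	ret = cdev_device_add(&ctrl->cdev, ctrl->device);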

I've done basic testing with blktests and module load/unload; both
seem to pass.

Instead of sending one patch I've purposely made 3 separate patches,
since they are not all related; however, I've merged the fault
injection and mpath changes into one patch.

-ck

Chaitanya Kulkarni (3):
  nvme-core: init auth ctrl early
  nvme-core: init fault injection & mpath ctrl early
  nvme-core: init power qos ops early

 drivers/nvme/host/core.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

nvme (nvme-6.6) # 
nvme (nvme-6.6) # gitlog -3
218a1f775461 (HEAD -> nvme-6.6) nvme-core: init power qos ops early
d7240009e752 nvme-core: init fault injection & mpath ctrl early
063f3e74a3a2 nvme-core: init auth ctrl early
nvme (nvme-6.6) # ./compile_nvme.sh 
+ umount /mnt/nvme0n1
+ clear_dmesg
./compile_nvme.sh: line 3: clear_dmesg: command not found
umount: /mnt/nvme0n1: no mount point specified.
+ ./delete.sh
+ NQN=testnqn
+ nvme disconnect -n testnqn
NQN:testnqn disconnected 0 controller(s)

real	0m0.012s
user	0m0.000s
sys	0m0.005s
+ rm -fr '/sys/kernel/config/nvmet/ports/1/subsystems/*'
+ rmdir /sys/kernel/config/nvmet/ports/1
rmdir: failed to remove '/sys/kernel/config/nvmet/ports/1': No such file or directory
+ for subsys in /sys/kernel/config/nvmet/subsystems/*
+ for ns in ${subsys}/namespaces/*
+ echo 0
./delete.sh: line 14: /sys/kernel/config/nvmet/subsystems/*/namespaces/*/enable: No such file or directory
+ rmdir '/sys/kernel/config/nvmet/subsystems/*/namespaces/*'
rmdir: failed to remove '/sys/kernel/config/nvmet/subsystems/*/namespaces/*': No such file or directory
+ rmdir '/sys/kernel/config/nvmet/subsystems/*'
rmdir: failed to remove '/sys/kernel/config/nvmet/subsystems/*': No such file or directory
+ rmdir 'config/nullb/nullb*'
rmdir: failed to remove 'config/nullb/nullb*': No such file or directory
+ umount /mnt/nvme0n1
umount: /mnt/nvme0n1: no mount point specified.
+ umount /mnt/backend
umount: /mnt/backend: not mounted.
+ modprobe -r nvme_loop
+ modprobe -r nvme_fabrics
+ modprobe -r nvmet
+ modprobe -r nvme
+ modprobe -r null_blk
+ tree /sys/kernel/config
/sys/kernel/config
└── pci_ep
    ├── controllers
    └── functions

3 directories, 0 files
+ sleep 1
+ modprobe -r nvme-core
+ lsmod
+ grep nvme
+ sleep 1
+ git diff
++ nproc
+ make -j 48 M=drivers/nvme/ modules
  CC [M]  drivers/nvme/host/core.o
  LD [M]  drivers/nvme/host/nvme-core.o
  MODPOST drivers/nvme/Module.symvers
  CC [M]  drivers/nvme/host/nvme-core.mod.o
  CC [M]  drivers/nvme/host/nvme.mod.o
  CC [M]  drivers/nvme/host/nvme-fabrics.mod.o
  CC [M]  drivers/nvme/host/nvme-rdma.mod.o
  CC [M]  drivers/nvme/host/nvme-fc.mod.o
  CC [M]  drivers/nvme/host/nvme-tcp.mod.o
  CC [M]  drivers/nvme/target/nvmet.mod.o
  CC [M]  drivers/nvme/target/nvme-loop.mod.o
  CC [M]  drivers/nvme/target/nvmet-fc.mod.o
  CC [M]  drivers/nvme/target/nvmet-rdma.mod.o
  CC [M]  drivers/nvme/target/nvme-fcloop.mod.o
  CC [M]  drivers/nvme/common/nvme-common.mod.o
  CC [M]  drivers/nvme/target/nvmet-tcp.mod.o
  LD [M]  drivers/nvme/host/nvme-tcp.ko
  LD [M]  drivers/nvme/target/nvmet.ko
  LD [M]  drivers/nvme/host/nvme-fabrics.ko
  LD [M]  drivers/nvme/target/nvmet-fc.ko
  LD [M]  drivers/nvme/host/nvme.ko
  LD [M]  drivers/nvme/host/nvme-rdma.ko
  LD [M]  drivers/nvme/host/nvme-fc.ko
  LD [M]  drivers/nvme/target/nvme-fcloop.ko
  LD [M]  drivers/nvme/target/nvmet-rdma.ko
  LD [M]  drivers/nvme/target/nvme-loop.ko
  LD [M]  drivers/nvme/host/nvme-core.ko
  LD [M]  drivers/nvme/target/nvmet-tcp.ko
  LD [M]  drivers/nvme/common/nvme-common.ko
+ HOST=drivers/nvme/host
+ TARGET=drivers/nvme/target
++ uname -r
+ HOST_DEST=/lib/modules/6.5.0-rc2nvme+/kernel/drivers/nvme/host/
++ uname -r
+ TARGET_DEST=/lib/modules/6.5.0-rc2nvme+/kernel/drivers/nvme/target/
+ cp drivers/nvme/host/nvme-core.ko drivers/nvme/host/nvme-fabrics.ko drivers/nvme/host/nvme-fc.ko drivers/nvme/host/nvme.ko drivers/nvme/host/nvme-rdma.ko drivers/nvme/host/nvme-tcp.ko /lib/modules/6.5.0-rc2nvme+/kernel/drivers/nvme/host//
+ cp drivers/nvme/target/nvme-fcloop.ko drivers/nvme/target/nvme-loop.ko drivers/nvme/target/nvmet-fc.ko drivers/nvme/target/nvmet.ko drivers/nvme/target/nvmet-rdma.ko drivers/nvme/target/nvmet-tcp.ko /lib/modules/6.5.0-rc2nvme+/kernel/drivers/nvme/target//
+ ls -lrth /lib/modules/6.5.0-rc2nvme+/kernel/drivers/nvme/host/ /lib/modules/6.5.0-rc2nvme+/kernel/drivers/nvme/target//
/lib/modules/6.5.0-rc2nvme+/kernel/drivers/nvme/host/:
total 8.1M
-rw-r--r--. 1 root root  4.1M Aug  1 18:34 nvme-core.ko
-rw-r--r--. 1 root root  495K Aug  1 18:34 nvme-fabrics.ko
-rw-r--r--. 1 root root 1005K Aug  1 18:34 nvme-fc.ko
-rw-r--r--. 1 root root  788K Aug  1 18:34 nvme.ko
-rw-r--r--. 1 root root  931K Aug  1 18:34 nvme-rdma.ko
-rw-r--r--. 1 root root  914K Aug  1 18:34 nvme-tcp.ko

/lib/modules/6.5.0-rc2nvme+/kernel/drivers/nvme/target//:
total 7.5M
-rw-r--r--. 1 root root 541K Aug  1 18:34 nvme-fcloop.ko
-rw-r--r--. 1 root root 476K Aug  1 18:34 nvme-loop.ko
-rw-r--r--. 1 root root 827K Aug  1 18:34 nvmet-fc.ko
-rw-r--r--. 1 root root 4.0M Aug  1 18:34 nvmet.ko
-rw-r--r--. 1 root root 903K Aug  1 18:34 nvmet-rdma.ko
-rw-r--r--. 1 root root 769K Aug  1 18:34 nvmet-tcp.ko
+ sync
+ sync
+ sync
+ modprobe nvme
+ echo 'Press enter to continue ...'
Press enter to continue ...
nvme (nvme-6.6) # modprobe nvme 
nvme (nvme-6.6) # dmesg  -c 
[  147.346136] nvme 0000:00:04.0: vgaarb: pci_notify
[  147.403997] power/qos: dev_pm_qos_update_user_latency_tolerance 912
[  147.404001] power/qos: dev_pm_qos_update_user_latency_tolerance 940
[  147.404002] power/qos: dev_pm_qos_update_user_latency_tolerance 942
[  147.404211] pci 0000:00:04.0: vgaarb: pci_notify
[  154.598489] nvme_core: loading out-of-tree module taints kernel.
[  154.607823] nvme 0000:00:04.0: vgaarb: pci_notify
[  154.607857] nvme 0000:00:04.0: runtime IRQ mapping not provided by arch
[  154.607938] power/qos: dev_pm_qos_update_user_latency_tolerance 912
[  154.607974] power/qos: dev_pm_qos_update_user_latency_tolerance 917
[  154.607976] power/qos: dev_pm_qos_update_user_latency_tolerance 925
[  154.607978] power/qos: dev_pm_qos_update_user_latency_tolerance 931
[  154.607981] power/qos: dev_pm_qos_update_user_latency_tolerance 938
[  154.608624] nvme nvme0: pci function 0000:00:04.0
[  154.624312] nvme 0000:00:04.0: enabling bus mastering
[  154.624900] nvme 0000:00:04.0: saving config space at offset 0x0 (reading 0x101b36)
[  154.624907] nvme 0000:00:04.0: saving config space at offset 0x4 (reading 0x100507)
[  154.624912] nvme 0000:00:04.0: saving config space at offset 0x8 (reading 0x1080202)
[  154.624916] nvme 0000:00:04.0: saving config space at offset 0xc (reading 0x0)
[  154.624921] nvme 0000:00:04.0: saving config space at offset 0x10 (reading 0xfebd0004)
[  154.624926] nvme 0000:00:04.0: saving config space at offset 0x14 (reading 0x0)
[  154.624930] nvme 0000:00:04.0: saving config space at offset 0x18 (reading 0x0)
[  154.624935] nvme 0000:00:04.0: saving config space at offset 0x1c (reading 0x0)
[  154.624939] nvme 0000:00:04.0: saving config space at offset 0x20 (reading 0x0)
[  154.624944] nvme 0000:00:04.0: saving config space at offset 0x24 (reading 0x0)
[  154.624948] nvme 0000:00:04.0: saving config space at offset 0x28 (reading 0x0)
[  154.624952] nvme 0000:00:04.0: saving config space at offset 0x2c (reading 0x11001af4)
[  154.624957] nvme 0000:00:04.0: saving config space at offset 0x30 (reading 0x0)
[  154.624961] nvme 0000:00:04.0: saving config space at offset 0x34 (reading 0x40)
[  154.624966] nvme 0000:00:04.0: saving config space at offset 0x38 (reading 0x0)
[  154.624970] nvme 0000:00:04.0: saving config space at offset 0x3c (reading 0x10b)
[  154.636004] nvme nvme0: 48/0/0 default/read/poll queues
[  154.639365] nvme nvme0: Ignoring bogus Namespace Identifiers
[  154.642471] nvme 0000:00:04.0: vgaarb: pci_notify
nvme (nvme-6.6) # modprobe  -r nvme 
nvme (nvme-6.6) # cdblktests 
blktests (master) # ./check nvme
nvme/002 (create many subsystems and test discovery)         [passed]
    runtime  18.857s  ...  18.964s
nvme/003 (test if we're sending keep-alives to a discovery controller) [passed]
    runtime  10.082s  ...  10.075s
nvme/004 (test nvme and nvmet UUID NS descriptors)           [passed]
    runtime  1.434s  ...  1.437s
nvme/005 (reset local loopback target)                       [passed]
    runtime  1.800s  ...  1.793s
nvme/006 (create an NVMeOF target with a block device-backed ns) [passed]
    runtime  0.066s  ...  0.074s
nvme/007 (create an NVMeOF target with a file-backed ns)     [passed]
    runtime  0.033s  ...  0.037s
nvme/008 (create an NVMeOF host with a block device-backed ns) [passed]
    runtime  1.448s  ...  1.452s
nvme/009 (create an NVMeOF host with a file-backed ns)       [passed]
    runtime  1.420s  ...  1.434s
nvme/010 (run data verification fio job on NVMeOF block device-backed ns) [passed]
    runtime  58.052s  ...  59.627s
nvme/011 (run data verification fio job on NVMeOF file-backed ns) [passed]
    runtime  89.351s  ...  90.811s
nvme/012 (run mkfs and data verification fio job on NVMeOF block device-backed ns) [passed]
    runtime  52.261s  ...  54.646s
nvme/013 (run mkfs and data verification fio job on NVMeOF file-backed ns) [passed]
    runtime  82.256s  ...  82.792s
nvme/014 (flush a NVMeOF block device-backed ns)             [passed]
    runtime  6.966s  ...  7.131s
nvme/015 (unit test for NVMe flush for file backed ns)       [passed]
    runtime  6.135s  ...  6.227s
nvme/016 (create/delete many NVMeOF block device-backed ns and test discovery) [passed]
    runtime  12.594s  ...  12.457s
nvme/017 (create/delete many file-ns and test discovery)     [passed]
    runtime  12.636s  ...  12.562s
nvme/018 (unit test NVMe-oF out of range access on a file backend) [passed]
    runtime  1.429s  ...  1.416s
nvme/019 (test NVMe DSM Discard command on NVMeOF block-device ns) [passed]
    runtime  1.440s  ...  1.445s
nvme/020 (test NVMe DSM Discard command on NVMeOF file-backed ns) [passed]
    runtime  1.431s  ...  1.425s
nvme/021 (test NVMe list command on NVMeOF file-backed ns)   [passed]
    runtime  1.430s  ...  1.408s
nvme/022 (test NVMe reset command on NVMeOF file-backed ns)  [passed]
    runtime  1.761s  ...  1.779s
nvme/023 (test NVMe smart-log command on NVMeOF block-device ns) [passed]
    runtime  1.448s  ...  1.427s
nvme/024 (test NVMe smart-log command on NVMeOF file-backed ns) [passed]
    runtime  1.431s  ...  1.410s
nvme/025 (test NVMe effects-log command on NVMeOF file-backed ns) [passed]
    runtime  1.435s  ...  1.419s
nvme/026 (test NVMe ns-descs command on NVMeOF file-backed ns) [passed]
    runtime  1.415s  ...  1.429s
nvme/027 (test NVMe ns-rescan command on NVMeOF file-backed ns) [passed]
    runtime  1.436s  ...  1.431s
nvme/028 (test NVMe list-subsys command on NVMeOF file-backed ns) [passed]
    runtime  1.430s  ...  1.418s
nvme/029 (test userspace IO via nvme-cli read/write interface) [passed]
    runtime  1.563s  ...  1.554s
nvme/030 (ensure the discovery generation counter is updated appropriately) [passed]
    runtime  0.205s  ...  0.189s
nvme/031 (test deletion of NVMeOF controllers immediately after setup) [passed]
    runtime  3.899s  ...  3.959s
nvme/038 (test deletion of NVMeOF subsystem without enabling) [passed]
    runtime  0.013s  ...  0.013s
nvme/040 (test nvme fabrics controller reset/disconnect operation during I/O) [passed]
    runtime  7.953s  ...  7.992s
nvme/041 (Create authenticated connections)                  [passed]
    runtime  0.733s  ...  0.731s
nvme/042 (Test dhchap key types for authenticated connections) [passed]
    runtime  4.755s  ...  4.768s
nvme/043 (Test hash and DH group variations for authenticated connections) [passed]
    runtime  23.182s  ...  23.268s
nvme/044 (Test bi-directional authentication)                [passed]
    runtime  1.814s  ...  1.813s
nvme/045 (Test re-authentication)                            [passed]
    runtime  4.272s  ...  3.809s
nvme/047 (test different queue types for fabric transports)  [not run]
    nvme_trtype=loop is not supported in this test
nvme/048 (Test queue count changes on reconnect)             [not run]
    nvme_trtype=loop is not supported in this test
blktests (master) #

Comments

Christoph Hellwig Aug. 2, 2023, 12:27 p.m. UTC | #1
On Tue, Aug 01, 2023 at 08:26:26PM -0700, Chaitanya Kulkarni wrote:
> Hi,
> 
> Restructure nvme_init_ctrl() for better initialization flow.
> 
> Currently nvme_init_ctrl() initializes nvme authentication, fault
> injection, and device PM QoS after adding the controller device with
> a call to cdev_device_add(). This leads to additional code complexity,
> as each of these initializations has to be unwound if any of them
> fails.

The current code is in fact also broken, as after device_add
(which cdev_device_add does underneath) fails we can't just clean up,
but must call put_device.  I think this single patch is what we should
be doing, but I don't feel fully confident in it without some extra
error injection:

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 47d7ba2827ff29..5da55a271a5ab0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4405,14 +4405,12 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
 			PAGE_SIZE);
 	ctrl->discard_page = alloc_page(GFP_KERNEL);
-	if (!ctrl->discard_page) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!ctrl->discard_page)
+		return -ENOMEM;
 
 	ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
 	if (ret < 0)
-		goto out;
+		goto out_free_discard_page;
 	ctrl->instance = ret;
 
 	device_initialize(&ctrl->ctrl_device);
@@ -4431,13 +4429,6 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	if (ret)
 		goto out_release_instance;
 
-	nvme_get_ctrl(ctrl);
-	cdev_init(&ctrl->cdev, &nvme_dev_fops);
-	ctrl->cdev.owner = ops->module;
-	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
-	if (ret)
-		goto out_free_name;
-
 	/*
 	 * Initialize latency tolerance controls.  The sysfs files won't
 	 * be visible to userspace unless the device actually supports APST.
@@ -4448,23 +4439,27 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 
 	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
 	nvme_mpath_init_ctrl(ctrl);
+
 	ret = nvme_auth_init_ctrl(ctrl);
 	if (ret)
-		goto out_free_cdev;
+		goto out_fault_inject_fini;
 
-	return 0;
-out_free_cdev:
+	nvme_get_ctrl(ctrl);
+	cdev_init(&ctrl->cdev, &nvme_dev_fops);
+	ctrl->cdev.owner = ops->module;
+	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
+	if (ret)
+		put_device(ctrl->device);
+	return ret;
+
+out_fault_inject_fini:
 	nvme_fault_inject_fini(&ctrl->fault_inject);
 	dev_pm_qos_hide_latency_tolerance(ctrl->device);
-	cdev_device_del(&ctrl->cdev, ctrl->device);
-out_free_name:
-	nvme_put_ctrl(ctrl);
 	kfree_const(ctrl->device->kobj.name);
 out_release_instance:
 	ida_free(&nvme_instance_ida, ctrl->instance);
-out:
-	if (ctrl->discard_page)
-		__free_page(ctrl->discard_page);
+out_free_discard_page:
+	__free_page(ctrl->discard_page);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
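
Background on the put_device() point above: once device_initialize()
has run, the embedded struct device is reference counted, so even a
failed device_add() (which cdev_device_add() calls underneath) must be
unwound by dropping that reference, never by freeing the containing
structure directly.  A generic sketch of the rule, using a hypothetical
struct foo rather than the nvme code:

	struct foo {
		struct device dev;
	};

	static void foo_release(struct device *dev)
	{
		kfree(container_of(dev, struct foo, dev));
	}

	/* ... */
	foo->dev.release = foo_release;
	device_initialize(&foo->dev);
	ret = device_add(&foo->dev);
	if (ret) {
		/*
		 * The kobject now owns the structure: drop the initial
		 * reference and let foo_release() free it.  A direct
		 * kfree(foo) here would skip the kobject cleanup and can
		 * become a use-after-free if anything else took a
		 * reference in the meantime.
		 */
		put_device(&foo->dev);
		return ret;
	}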

Patch

(The diff below is the debug instrumentation against
drivers/base/power/qos.c that produced the "power/qos:" lines in the
dmesg output above; it is separate from the 3-patch nvme series, which
only touches drivers/nvme/host/core.c.)

diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 8e93167f1783..169ec6d02ffb 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -39,6 +39,9 @@ 
 
 #include "power.h"
 
+#undef pr_fmt
+#define pr_fmt(fmt)    "power/qos: " fmt
+
 static DEFINE_MUTEX(dev_pm_qos_mtx);
 static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx);
 
@@ -906,11 +909,12 @@  int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val)
        int ret;
 
        mutex_lock(&dev_pm_qos_mtx);
-
+       pr_info("%s %d\n", __func__, __LINE__);
        if (IS_ERR_OR_NULL(dev->power.qos)
            || !dev->power.qos->latency_tolerance_req) {
                struct dev_pm_qos_request *req;
 
+               pr_info("%s %d\n", __func__, __LINE__);
                if (val < 0) {
                        if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT)
                                ret = 0;
@@ -918,19 +922,24 @@  int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val)
                                ret = -EINVAL;
                        goto out;
                }
+               pr_info("%s %d\n", __func__, __LINE__);
                req = kzalloc(sizeof(*req), GFP_KERNEL);
                if (!req) {
                        ret = -ENOMEM;
                        goto out;
                }
+               pr_info("%s %d\n", __func__, __LINE__);
                ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val);
                if (ret < 0) {
                        kfree(req);
                        goto out;
                }
                dev->power.qos->latency_tolerance_req = req;
+               pr_info("%s %d\n", __func__, __LINE__);
        } else {
+               pr_info("%s %d\n", __func__, __LINE__);
                if (val < 0) {
+                       pr_info("%s %d\n", __func__, __LINE__);
                        __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE);
                        ret = 0;
                } else {