diff mbox series

[RFC,v5,04/27] nvme-tcp-offload: Add controller level implementation

Message ID 20210519111340.20613-5-smalin@marvell.com
State New
Headers show
Series NVMeTCP Offload ULP and QEDN Device Driver | expand

Commit Message

Shai Malin May 19, 2021, 11:13 a.m. UTC
From: Arie Gershberg <agershberg@marvell.com>

In this patch we implement controller level functionality including:
- create_ctrl.
- delete_ctrl.
- free_ctrl.

The implementation is similar to other nvme fabrics modules, the main
difference being that the nvme-tcp-offload ULP calls the vendor specific
claim_dev() op with the given TCP/IP parameters to determine which device
will be used for this controller.
Once found, the vendor specific device and controller will be paired and
kept in a controller list managed by the ULP.

Acked-by: Igor Russkikh <irusskikh@marvell.com>
Signed-off-by: Arie Gershberg <agershberg@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Michal Kalderon <mkalderon@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
---
 drivers/nvme/host/tcp-offload.c | 475 +++++++++++++++++++++++++++++++-
 1 file changed, 467 insertions(+), 8 deletions(-)

Comments

Himanshu Madhani May 21, 2021, 5:19 p.m. UTC | #1
On 5/19/21 6:13 AM, Shai Malin wrote:
> From: Arie Gershberg <agershberg@marvell.com>

> 

> In this patch we implement controller level functionality including:

> - create_ctrl.

> - delete_ctrl.

> - free_ctrl.

> 

> The implementation is similar to other nvme fabrics modules, the main

> difference being that the nvme-tcp-offload ULP calls the vendor specific

> claim_dev() op with the given TCP/IP parameters to determine which device

> will be used for this controller.

> Once found, the vendor specific device and controller will be paired and

> kept in a controller list managed by the ULP.

> 

> Acked-by: Igor Russkikh <irusskikh@marvell.com>

> Signed-off-by: Arie Gershberg <agershberg@marvell.com>

> Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>

> Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>

> Signed-off-by: Michal Kalderon <mkalderon@marvell.com>

> Signed-off-by: Ariel Elior <aelior@marvell.com>

> Signed-off-by: Shai Malin <smalin@marvell.com>

> ---

>   drivers/nvme/host/tcp-offload.c | 475 +++++++++++++++++++++++++++++++-

>   1 file changed, 467 insertions(+), 8 deletions(-)

> 

> diff --git a/drivers/nvme/host/tcp-offload.c b/drivers/nvme/host/tcp-offload.c

> index aa7cc239abf2..f7e0dc79bedd 100644

> --- a/drivers/nvme/host/tcp-offload.c

> +++ b/drivers/nvme/host/tcp-offload.c

> @@ -12,6 +12,10 @@

>   

>   static LIST_HEAD(nvme_tcp_ofld_devices);

>   static DECLARE_RWSEM(nvme_tcp_ofld_devices_rwsem);

> +static LIST_HEAD(nvme_tcp_ofld_ctrl_list);

> +static DECLARE_RWSEM(nvme_tcp_ofld_ctrl_rwsem);

> +static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops;

> +static struct blk_mq_ops nvme_tcp_ofld_mq_ops;

>   

>   static inline struct nvme_tcp_ofld_ctrl *to_tcp_ofld_ctrl(struct nvme_ctrl *nctrl)

>   {

> @@ -128,28 +132,435 @@ nvme_tcp_ofld_lookup_dev(struct nvme_tcp_ofld_ctrl *ctrl)

>   	return dev;

>   }

>   

> +static struct blk_mq_tag_set *

> +nvme_tcp_ofld_alloc_tagset(struct nvme_ctrl *nctrl, bool admin)

> +{

> +	struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);

> +	struct blk_mq_tag_set *set;

> +	int rc;

> +

> +	if (admin) {

> +		set = &ctrl->admin_tag_set;

> +		memset(set, 0, sizeof(*set));

> +		set->ops = &nvme_tcp_ofld_admin_mq_ops;

> +		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;

> +		set->reserved_tags = NVMF_RESERVED_TAGS;

> +		set->numa_node = nctrl->numa_node;

> +		set->flags = BLK_MQ_F_BLOCKING;

> +		set->cmd_size = sizeof(struct nvme_tcp_ofld_req);

> +		set->driver_data = ctrl;

> +		set->nr_hw_queues = 1;

> +		set->timeout = NVME_ADMIN_TIMEOUT;

> +	} else {

> +		set = &ctrl->tag_set;

> +		memset(set, 0, sizeof(*set));

> +		set->ops = &nvme_tcp_ofld_mq_ops;

> +		set->queue_depth = nctrl->sqsize + 1;

> +		set->reserved_tags = NVMF_RESERVED_TAGS;

> +		set->numa_node = nctrl->numa_node;

> +		set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;

> +		set->cmd_size = sizeof(struct nvme_tcp_ofld_req);

> +		set->driver_data = ctrl;

> +		set->nr_hw_queues = nctrl->queue_count - 1;

> +		set->timeout = NVME_IO_TIMEOUT;

> +		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;

> +	}

> +

> +	rc = blk_mq_alloc_tag_set(set);

> +	if (rc)

> +		return ERR_PTR(rc);

> +

> +	return set;

> +}

> +

> +static int nvme_tcp_ofld_configure_admin_queue(struct nvme_ctrl *nctrl,

> +					       bool new)

> +{

> +	int rc;

> +

> +	/* Placeholder - alloc_admin_queue */

> +	if (new) {

> +		nctrl->admin_tagset =

> +				nvme_tcp_ofld_alloc_tagset(nctrl, true);

> +		if (IS_ERR(nctrl->admin_tagset)) {

> +			rc = PTR_ERR(nctrl->admin_tagset);

> +			nctrl->admin_tagset = NULL;

> +			goto out_free_queue;

> +		}

> +

> +		nctrl->fabrics_q = blk_mq_init_queue(nctrl->admin_tagset);

> +		if (IS_ERR(nctrl->fabrics_q)) {

> +			rc = PTR_ERR(nctrl->fabrics_q);

> +			nctrl->fabrics_q = NULL;

> +			goto out_free_tagset;

> +		}

> +

> +		nctrl->admin_q = blk_mq_init_queue(nctrl->admin_tagset);

> +		if (IS_ERR(nctrl->admin_q)) {

> +			rc = PTR_ERR(nctrl->admin_q);

> +			nctrl->admin_q = NULL;

> +			goto out_cleanup_fabrics_q;

> +		}

> +	}

> +

> +	/* Placeholder - nvme_tcp_ofld_start_queue */

> +

> +	rc = nvme_enable_ctrl(nctrl);

> +	if (rc)

> +		goto out_stop_queue;

> +

> +	blk_mq_unquiesce_queue(nctrl->admin_q);

> +

> +	rc = nvme_init_ctrl_finish(nctrl);

> +	if (rc)

> +		goto out_quiesce_queue;

> +

> +	return 0;

> +

> +out_quiesce_queue:

> +	blk_mq_quiesce_queue(nctrl->admin_q);

> +	blk_sync_queue(nctrl->admin_q);

> +

> +out_stop_queue:

> +	/* Placeholder - stop offload queue */

> +	nvme_cancel_admin_tagset(nctrl);

> +

> +out_cleanup_fabrics_q:

> +	if (new)

> +		blk_cleanup_queue(nctrl->fabrics_q);

> +out_free_tagset:

> +	if (new)

> +		blk_mq_free_tag_set(nctrl->admin_tagset);

> +out_free_queue:

> +	/* Placeholder - free admin queue */

> +

> +	return rc;

> +}

> +

> +static int

> +nvme_tcp_ofld_configure_io_queues(struct nvme_ctrl *nctrl, bool new)

> +{

> +	int rc;

> +

> +	/* Placeholder - alloc_io_queues */

> +

> +	if (new) {

> +		nctrl->tagset = nvme_tcp_ofld_alloc_tagset(nctrl, false);

> +		if (IS_ERR(nctrl->tagset)) {

> +			rc = PTR_ERR(nctrl->tagset);

> +			nctrl->tagset = NULL;

> +			goto out_free_io_queues;

> +		}

> +

> +		nctrl->connect_q = blk_mq_init_queue(nctrl->tagset);

> +		if (IS_ERR(nctrl->connect_q)) {

> +			rc = PTR_ERR(nctrl->connect_q);

> +			nctrl->connect_q = NULL;

> +			goto out_free_tag_set;

> +		}

> +	}

> +

> +	/* Placeholder - start_io_queues */

> +

> +	if (!new) {

> +		nvme_start_queues(nctrl);

> +		if (!nvme_wait_freeze_timeout(nctrl, NVME_IO_TIMEOUT)) {

> +			/*

> +			 * If we timed out waiting for freeze we are likely to

> +			 * be stuck.  Fail the controller initialization just

> +			 * to be safe.

> +			 */

> +			rc = -ENODEV;

> +			goto out_wait_freeze_timed_out;

> +		}

> +		blk_mq_update_nr_hw_queues(nctrl->tagset, nctrl->queue_count - 1);

> +		nvme_unfreeze(nctrl);

> +	}

> +

> +	return 0;

> +

> +out_wait_freeze_timed_out:

> +	nvme_stop_queues(nctrl);

> +	nvme_sync_io_queues(nctrl);

> +

> +	/* Placeholder - Stop IO queues */

> +

> +	if (new)

> +		blk_cleanup_queue(nctrl->connect_q);

> +out_free_tag_set:

> +	if (new)

> +		blk_mq_free_tag_set(nctrl->tagset);

> +out_free_io_queues:

> +	/* Placeholder - free_io_queues */

> +

> +	return rc;

> +}

> +

>   static int nvme_tcp_ofld_setup_ctrl(struct nvme_ctrl *nctrl, bool new)

>   {

> -	/* Placeholder - validates inputs and creates admin and IO queues */

> +	struct nvmf_ctrl_options *opts = nctrl->opts;

> +	int rc;

> +

> +	rc = nvme_tcp_ofld_configure_admin_queue(nctrl, new);

> +	if (rc)

> +		return rc;

> +

> +	if (nctrl->icdoff) {

> +		dev_err(nctrl->device, "icdoff is not supported!\n");

> +		rc = -EINVAL;

> +		goto destroy_admin;

> +	}

> +

> +	if (!(nctrl->sgls & ((1 << 0) | (1 << 1)))) {

> +		dev_err(nctrl->device, "Mandatory sgls are not supported!\n");

> +		goto destroy_admin;

> +	}

> +

> +	if (opts->queue_size > nctrl->sqsize + 1)

> +		dev_warn(nctrl->device,

> +			 "queue_size %zu > ctrl sqsize %u, clamping down\n",

> +			 opts->queue_size, nctrl->sqsize + 1);

> +

> +	if (nctrl->sqsize + 1 > nctrl->maxcmd) {

> +		dev_warn(nctrl->device,

> +			 "sqsize %u > ctrl maxcmd %u, clamping down\n",

> +			 nctrl->sqsize + 1, nctrl->maxcmd);

> +		nctrl->sqsize = nctrl->maxcmd - 1;

> +	}

> +

> +	if (nctrl->queue_count > 1) {

> +		rc = nvme_tcp_ofld_configure_io_queues(nctrl, new);

> +		if (rc)

> +			goto destroy_admin;

> +	}

> +

> +	if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_LIVE)) {

> +		/*

> +		 * state change failure is ok if we started ctrl delete,

> +		 * unless we're during creation of a new controller to

> +		 * avoid races with teardown flow.

> +		 */

> +		WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING &&

> +			     nctrl->state != NVME_CTRL_DELETING_NOIO);

> +		WARN_ON_ONCE(new);

> +		rc = -EINVAL;

> +		goto destroy_io;

> +	}

> +

> +	nvme_start_ctrl(nctrl);

> +

> +	return 0;

> +

> +destroy_io:

> +	/* Placeholder - stop and destroy io queues*/

> +destroy_admin:

> +	/* Placeholder - stop and destroy admin queue*/

> +

> +	return rc;

> +}

> +

> +static int

> +nvme_tcp_ofld_check_dev_opts(struct nvmf_ctrl_options *opts,

> +			     struct nvme_tcp_ofld_ops *ofld_ops)

> +{

> +	unsigned int nvme_tcp_ofld_opt_mask = NVMF_ALLOWED_OPTS |

> +			ofld_ops->allowed_opts | ofld_ops->required_opts;

> +	if (opts->mask & ~nvme_tcp_ofld_opt_mask) {

> +		pr_warn("One or more of the nvmf options isn't supported by %s.\n",

> +			ofld_ops->name);

> +

> +		return -EINVAL;

> +	}

> +

> +	return 0;

> +}

> +

> +static void nvme_tcp_ofld_free_ctrl(struct nvme_ctrl *nctrl)

> +{

> +	struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);

> +	struct nvme_tcp_ofld_dev *dev = ctrl->dev;

> +

> +	if (list_empty(&ctrl->list))

> +		goto free_ctrl;

> +

> +	down_write(&nvme_tcp_ofld_ctrl_rwsem);

> +	ctrl->dev->ops->release_ctrl(ctrl);

> +	list_del(&ctrl->list);

> +	up_write(&nvme_tcp_ofld_ctrl_rwsem);

> +

> +	nvmf_free_options(nctrl->opts);

> +free_ctrl:

> +	module_put(dev->ops->module);

> +	kfree(ctrl->queues);

> +	kfree(ctrl);

> +}

> +

> +static void nvme_tcp_ofld_submit_async_event(struct nvme_ctrl *arg)

> +{

> +	/* Placeholder - submit_async_event */

> +}

> +

> +static void

> +nvme_tcp_ofld_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove)

> +{

> +	/* Placeholder - teardown_admin_queue */

> +}

> +

> +static void

> +nvme_tcp_ofld_teardown_io_queues(struct nvme_ctrl *nctrl, bool remove)

> +{

> +	/* Placeholder - teardown_io_queues */

> +}

> +

> +static void

> +nvme_tcp_ofld_teardown_ctrl(struct nvme_ctrl *nctrl, bool shutdown)

> +{

> +	/* Placeholder - err_work and connect_work */

> +	nvme_tcp_ofld_teardown_io_queues(nctrl, shutdown);

> +	blk_mq_quiesce_queue(nctrl->admin_q);

> +	if (shutdown)

> +		nvme_shutdown_ctrl(nctrl);

> +	else

> +		nvme_disable_ctrl(nctrl);

> +	nvme_tcp_ofld_teardown_admin_queue(nctrl, shutdown);

> +}

> +

> +static void nvme_tcp_ofld_delete_ctrl(struct nvme_ctrl *nctrl)

> +{

> +	nvme_tcp_ofld_teardown_ctrl(nctrl, true);

> +}

> +

> +static int

> +nvme_tcp_ofld_init_request(struct blk_mq_tag_set *set,

> +			   struct request *rq,

> +			   unsigned int hctx_idx,

> +			   unsigned int numa_node)

> +{

> +	struct nvme_tcp_ofld_req *req = blk_mq_rq_to_pdu(rq);

> +	struct nvme_tcp_ofld_ctrl *ctrl = set->driver_data;

> +

> +	/* Placeholder - init request */

> +

> +	req->done = nvme_tcp_ofld_req_done;

> +	ctrl->dev->ops->init_req(req);

>   

>   	return 0;

>   }

>   

> +static blk_status_t

> +nvme_tcp_ofld_queue_rq(struct blk_mq_hw_ctx *hctx,

> +		       const struct blk_mq_queue_data *bd)

> +{

> +	/* Call nvme_setup_cmd(...) */

> +

> +	/* Call ops->send_req(...) */

> +

> +	return BLK_STS_OK;

> +}

> +

> +static struct blk_mq_ops nvme_tcp_ofld_mq_ops = {

> +	.queue_rq	= nvme_tcp_ofld_queue_rq,

> +	.init_request	= nvme_tcp_ofld_init_request,

> +	/*

> +	 * All additional ops will be also implemented and registered similar to

> +	 * tcp.c

> +	 */

> +};

> +

> +static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops = {

> +	.queue_rq	= nvme_tcp_ofld_queue_rq,

> +	.init_request	= nvme_tcp_ofld_init_request,

> +	/*

> +	 * All additional ops will be also implemented and registered similar to

> +	 * tcp.c

> +	 */

> +};

> +

> +static const struct nvme_ctrl_ops nvme_tcp_ofld_ctrl_ops = {

> +	.name			= "tcp_offload",

> +	.module			= THIS_MODULE,

> +	.flags			= NVME_F_FABRICS,

> +	.reg_read32		= nvmf_reg_read32,

> +	.reg_read64		= nvmf_reg_read64,

> +	.reg_write32		= nvmf_reg_write32,

> +	.free_ctrl		= nvme_tcp_ofld_free_ctrl,

> +	.submit_async_event	= nvme_tcp_ofld_submit_async_event,

> +	.delete_ctrl		= nvme_tcp_ofld_delete_ctrl,

> +	.get_address		= nvmf_get_address,

> +};

> +

> +static bool

> +nvme_tcp_ofld_existing_controller(struct nvmf_ctrl_options *opts)

> +{

> +	struct nvme_tcp_ofld_ctrl *ctrl;

> +	bool found = false;

> +

> +	down_read(&nvme_tcp_ofld_ctrl_rwsem);

> +	list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list) {

> +		found = nvmf_ip_options_match(&ctrl->nctrl, opts);

> +		if (found)

> +			break;

> +	}

> +	up_read(&nvme_tcp_ofld_ctrl_rwsem);

> +

> +	return found;

> +}

> +

>   static struct nvme_ctrl *

>   nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)

>   {

> +	struct nvme_tcp_ofld_queue *queue;

>   	struct nvme_tcp_ofld_ctrl *ctrl;

>   	struct nvme_tcp_ofld_dev *dev;

>   	struct nvme_ctrl *nctrl;

> -	int rc = 0;

> +	int i, rc = 0;

>   

>   	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);

>   	if (!ctrl)

>   		return ERR_PTR(-ENOMEM);

>   

> +	INIT_LIST_HEAD(&ctrl->list);

>   	nctrl = &ctrl->nctrl;

> +	nctrl->opts = opts;

> +	nctrl->queue_count = opts->nr_io_queues + opts->nr_write_queues +

> +			     opts->nr_poll_queues + 1;

> +	nctrl->sqsize = opts->queue_size - 1;

> +	nctrl->kato = opts->kato;

> +	if (!(opts->mask & NVMF_OPT_TRSVCID)) {

> +		opts->trsvcid =

> +			kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);

> +		if (!opts->trsvcid) {

> +			rc = -ENOMEM;

> +			goto out_free_ctrl;

> +		}

> +		opts->mask |= NVMF_OPT_TRSVCID;

> +	}

>   

> -	/* Init nvme_tcp_ofld_ctrl and nvme_ctrl params based on received opts */

> +	rc = inet_pton_with_scope(&init_net, AF_UNSPEC, opts->traddr,

> +				  opts->trsvcid,

> +				  &ctrl->conn_params.remote_ip_addr);

> +	if (rc) {

> +		pr_err("malformed address passed: %s:%s\n",

> +		       opts->traddr, opts->trsvcid);

> +		goto out_free_ctrl;

> +	}

> +

> +	if (opts->mask & NVMF_OPT_HOST_TRADDR) {

> +		rc = inet_pton_with_scope(&init_net, AF_UNSPEC,

> +					  opts->host_traddr, NULL,

> +					  &ctrl->conn_params.local_ip_addr);

> +		if (rc) {

> +			pr_err("malformed src address passed: %s\n",

> +			       opts->host_traddr);

> +			goto out_free_ctrl;

> +		}

> +	}

> +

> +	if (!opts->duplicate_connect &&

> +	    nvme_tcp_ofld_existing_controller(opts)) {

> +		rc = -EALREADY;

> +		goto out_free_ctrl;

> +	}

>   

>   	/* Find device that can reach the dest addr */

>   	dev = nvme_tcp_ofld_lookup_dev(ctrl);

> @@ -160,6 +571,10 @@ nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)

>   		goto out_free_ctrl;

>   	}

>   

> +	rc = nvme_tcp_ofld_check_dev_opts(opts, dev->ops);

> +	if (rc)

> +		goto out_module_put;

> +

>   	ctrl->dev = dev;

>   

>   	if (ctrl->dev->ops->max_hw_sectors)

> @@ -167,22 +582,58 @@ nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)

>   	if (ctrl->dev->ops->max_segments)

>   		nctrl->max_segments = ctrl->dev->ops->max_segments;

>   

> -	/* Init queues */

> +	ctrl->queues = kcalloc(nctrl->queue_count,

> +			       sizeof(struct nvme_tcp_ofld_queue),

> +			       GFP_KERNEL);

> +	if (!ctrl->queues) {

> +		rc = -ENOMEM;

> +		goto out_module_put;

> +	}

> +

> +	for (i = 0; i < nctrl->queue_count; ++i) {

> +		queue = &ctrl->queues[i];

> +		queue->ctrl = ctrl;

> +		queue->dev = dev;

> +		queue->report_err = nvme_tcp_ofld_report_queue_err;

> +		queue->hdr_digest = nctrl->opts->hdr_digest;

> +		queue->data_digest = nctrl->opts->data_digest;

> +		queue->tos = nctrl->opts->tos;

> +	}

> +

> +	rc = nvme_init_ctrl(nctrl, ndev, &nvme_tcp_ofld_ctrl_ops, 0);

> +	if (rc)

> +		goto out_free_queues;

>   

> -	/* Call nvme_init_ctrl */

> +	if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) {

> +		WARN_ON_ONCE(1);

> +		rc = -EINTR;

> +		goto out_uninit_ctrl;

> +	}

>   

>   	rc = ctrl->dev->ops->setup_ctrl(ctrl, true);

>   	if (rc)

> -		goto out_module_put;

> +		goto out_uninit_ctrl;

>   

>   	rc = nvme_tcp_ofld_setup_ctrl(nctrl, true);

>   	if (rc)

> -		goto out_uninit_ctrl;

> +		goto out_release_ctrl;

> +

> +	dev_info(nctrl->device, "new ctrl: NQN \"%s\", addr %pISp\n",

> +		 opts->subsysnqn, &ctrl->conn_params.remote_ip_addr);

> +

> +	down_write(&nvme_tcp_ofld_ctrl_rwsem);

> +	list_add_tail(&ctrl->list, &nvme_tcp_ofld_ctrl_list);

> +	up_write(&nvme_tcp_ofld_ctrl_rwsem);

>   

>   	return nctrl;

>   

> -out_uninit_ctrl:

> +out_release_ctrl:

>   	ctrl->dev->ops->release_ctrl(ctrl);

> +out_uninit_ctrl:

> +	nvme_uninit_ctrl(nctrl);

> +	nvme_put_ctrl(nctrl);

> +out_free_queues:

> +	kfree(ctrl->queues);

>   out_module_put:

>   	module_put(dev->ops->module);

>   out_free_ctrl:

> @@ -212,7 +663,15 @@ static int __init nvme_tcp_ofld_init_module(void)

>   

>   static void __exit nvme_tcp_ofld_cleanup_module(void)

>   {

> +	struct nvme_tcp_ofld_ctrl *ctrl;

> +

>   	nvmf_unregister_transport(&nvme_tcp_ofld_transport);

> +

> +	down_write(&nvme_tcp_ofld_ctrl_rwsem);

> +	list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list)

> +		nvme_delete_ctrl(&ctrl->nctrl);

> +	up_write(&nvme_tcp_ofld_ctrl_rwsem);

> +	flush_workqueue(nvme_delete_wq);

>   }

>   

>   module_init(nvme_tcp_ofld_init_module);

> 


Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>


-- 
Himanshu Madhani                                Oracle Linux Engineering
Sagi Grimberg May 21, 2021, 10:31 p.m. UTC | #2
On 5/19/21 4:13 AM, Shai Malin wrote:
> From: Arie Gershberg <agershberg@marvell.com>

> 

> In this patch we implement controller level functionality including:

> - create_ctrl.

> - delete_ctrl.

> - free_ctrl.

> 

> The implementation is similar to other nvme fabrics modules, the main

> difference being that the nvme-tcp-offload ULP calls the vendor specific

> claim_dev() op with the given TCP/IP parameters to determine which device

> will be used for this controller.

> Once found, the vendor specific device and controller will be paired and

> kept in a controller list managed by the ULP.

> 

> Acked-by: Igor Russkikh <irusskikh@marvell.com>

> Signed-off-by: Arie Gershberg <agershberg@marvell.com>

> Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>

> Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>

> Signed-off-by: Michal Kalderon <mkalderon@marvell.com>

> Signed-off-by: Ariel Elior <aelior@marvell.com>

> Signed-off-by: Shai Malin <smalin@marvell.com>

> ---

>   drivers/nvme/host/tcp-offload.c | 475 +++++++++++++++++++++++++++++++-

>   1 file changed, 467 insertions(+), 8 deletions(-)

> 

> diff --git a/drivers/nvme/host/tcp-offload.c b/drivers/nvme/host/tcp-offload.c

> index aa7cc239abf2..f7e0dc79bedd 100644

> --- a/drivers/nvme/host/tcp-offload.c

> +++ b/drivers/nvme/host/tcp-offload.c

> @@ -12,6 +12,10 @@

>   

>   static LIST_HEAD(nvme_tcp_ofld_devices);

>   static DECLARE_RWSEM(nvme_tcp_ofld_devices_rwsem);

> +static LIST_HEAD(nvme_tcp_ofld_ctrl_list);

> +static DECLARE_RWSEM(nvme_tcp_ofld_ctrl_rwsem);

> +static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops;

> +static struct blk_mq_ops nvme_tcp_ofld_mq_ops;

>   

>   static inline struct nvme_tcp_ofld_ctrl *to_tcp_ofld_ctrl(struct nvme_ctrl *nctrl)

>   {

> @@ -128,28 +132,435 @@ nvme_tcp_ofld_lookup_dev(struct nvme_tcp_ofld_ctrl *ctrl)

>   	return dev;

>   }

>   

> +static struct blk_mq_tag_set *

> +nvme_tcp_ofld_alloc_tagset(struct nvme_ctrl *nctrl, bool admin)

> +{

> +	struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);

> +	struct blk_mq_tag_set *set;

> +	int rc;

> +

> +	if (admin) {

> +		set = &ctrl->admin_tag_set;

> +		memset(set, 0, sizeof(*set));

> +		set->ops = &nvme_tcp_ofld_admin_mq_ops;

> +		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;

> +		set->reserved_tags = NVMF_RESERVED_TAGS;

> +		set->numa_node = nctrl->numa_node;

> +		set->flags = BLK_MQ_F_BLOCKING;

> +		set->cmd_size = sizeof(struct nvme_tcp_ofld_req);

> +		set->driver_data = ctrl;

> +		set->nr_hw_queues = 1;

> +		set->timeout = NVME_ADMIN_TIMEOUT;

> +	} else {

> +		set = &ctrl->tag_set;

> +		memset(set, 0, sizeof(*set));

> +		set->ops = &nvme_tcp_ofld_mq_ops;

> +		set->queue_depth = nctrl->sqsize + 1;

> +		set->reserved_tags = NVMF_RESERVED_TAGS;

> +		set->numa_node = nctrl->numa_node;

> +		set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;


Why do you set BLK_MQ_F_BLOCKING? do you schedule in the submission path?

> +		set->cmd_size = sizeof(struct nvme_tcp_ofld_req);

> +		set->driver_data = ctrl;

> +		set->nr_hw_queues = nctrl->queue_count - 1;

> +		set->timeout = NVME_IO_TIMEOUT;

> +		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;

> +	}

> +

> +	rc = blk_mq_alloc_tag_set(set);

> +	if (rc)

> +		return ERR_PTR(rc);

> +

> +	return set;

> +}

> +

> +static int nvme_tcp_ofld_configure_admin_queue(struct nvme_ctrl *nctrl,

> +					       bool new)

> +{

> +	int rc;

> +

> +	/* Placeholder - alloc_admin_queue */

> +	if (new) {

> +		nctrl->admin_tagset =

> +				nvme_tcp_ofld_alloc_tagset(nctrl, true);

> +		if (IS_ERR(nctrl->admin_tagset)) {

> +			rc = PTR_ERR(nctrl->admin_tagset);

> +			nctrl->admin_tagset = NULL;

> +			goto out_free_queue;

> +		}

> +

> +		nctrl->fabrics_q = blk_mq_init_queue(nctrl->admin_tagset);

> +		if (IS_ERR(nctrl->fabrics_q)) {

> +			rc = PTR_ERR(nctrl->fabrics_q);

> +			nctrl->fabrics_q = NULL;

> +			goto out_free_tagset;

> +		}

> +

> +		nctrl->admin_q = blk_mq_init_queue(nctrl->admin_tagset);

> +		if (IS_ERR(nctrl->admin_q)) {

> +			rc = PTR_ERR(nctrl->admin_q);

> +			nctrl->admin_q = NULL;

> +			goto out_cleanup_fabrics_q;

> +		}

> +	}

> +

> +	/* Placeholder - nvme_tcp_ofld_start_queue */

> +

> +	rc = nvme_enable_ctrl(nctrl);

> +	if (rc)

> +		goto out_stop_queue;

> +

> +	blk_mq_unquiesce_queue(nctrl->admin_q);

> +

> +	rc = nvme_init_ctrl_finish(nctrl);

> +	if (rc)

> +		goto out_quiesce_queue;

> +

> +	return 0;

> +

> +out_quiesce_queue:

> +	blk_mq_quiesce_queue(nctrl->admin_q);

> +	blk_sync_queue(nctrl->admin_q);

> +

> +out_stop_queue:

> +	/* Placeholder - stop offload queue */

> +	nvme_cancel_admin_tagset(nctrl);

> +

> +out_cleanup_fabrics_q:

> +	if (new)

> +		blk_cleanup_queue(nctrl->fabrics_q);

> +out_free_tagset:

> +	if (new)

> +		blk_mq_free_tag_set(nctrl->admin_tagset);

> +out_free_queue:

> +	/* Placeholder - free admin queue */

> +

> +	return rc;

> +}

> +

> +static int

> +nvme_tcp_ofld_configure_io_queues(struct nvme_ctrl *nctrl, bool new)

> +{

> +	int rc;

> +

> +	/* Placeholder - alloc_io_queues */

> +

> +	if (new) {

> +		nctrl->tagset = nvme_tcp_ofld_alloc_tagset(nctrl, false);

> +		if (IS_ERR(nctrl->tagset)) {

> +			rc = PTR_ERR(nctrl->tagset);

> +			nctrl->tagset = NULL;

> +			goto out_free_io_queues;

> +		}

> +

> +		nctrl->connect_q = blk_mq_init_queue(nctrl->tagset);

> +		if (IS_ERR(nctrl->connect_q)) {

> +			rc = PTR_ERR(nctrl->connect_q);

> +			nctrl->connect_q = NULL;

> +			goto out_free_tag_set;

> +		}

> +	}

> +

> +	/* Placeholder - start_io_queues */

> +

> +	if (!new) {

> +		nvme_start_queues(nctrl);

> +		if (!nvme_wait_freeze_timeout(nctrl, NVME_IO_TIMEOUT)) {

> +			/*

> +			 * If we timed out waiting for freeze we are likely to

> +			 * be stuck.  Fail the controller initialization just

> +			 * to be safe.

> +			 */

> +			rc = -ENODEV;

> +			goto out_wait_freeze_timed_out;

> +		}

> +		blk_mq_update_nr_hw_queues(nctrl->tagset, nctrl->queue_count - 1);

> +		nvme_unfreeze(nctrl);

> +	}

> +

> +	return 0;

> +

> +out_wait_freeze_timed_out:

> +	nvme_stop_queues(nctrl);

> +	nvme_sync_io_queues(nctrl);

> +

> +	/* Placeholder - Stop IO queues */

> +

> +	if (new)

> +		blk_cleanup_queue(nctrl->connect_q);

> +out_free_tag_set:

> +	if (new)

> +		blk_mq_free_tag_set(nctrl->tagset);

> +out_free_io_queues:

> +	/* Placeholder - free_io_queues */

> +

> +	return rc;

> +}

> +

>   static int nvme_tcp_ofld_setup_ctrl(struct nvme_ctrl *nctrl, bool new)

>   {

> -	/* Placeholder - validates inputs and creates admin and IO queues */

> +	struct nvmf_ctrl_options *opts = nctrl->opts;

> +	int rc;

> +

> +	rc = nvme_tcp_ofld_configure_admin_queue(nctrl, new);

> +	if (rc)

> +		return rc;

> +

> +	if (nctrl->icdoff) {

> +		dev_err(nctrl->device, "icdoff is not supported!\n");

> +		rc = -EINVAL;

> +		goto destroy_admin;

> +	}

> +

> +	if (!(nctrl->sgls & ((1 << 0) | (1 << 1)))) {

> +		dev_err(nctrl->device, "Mandatory sgls are not supported!\n");

> +		goto destroy_admin;

> +	}

> +

> +	if (opts->queue_size > nctrl->sqsize + 1)

> +		dev_warn(nctrl->device,

> +			 "queue_size %zu > ctrl sqsize %u, clamping down\n",

> +			 opts->queue_size, nctrl->sqsize + 1);

> +

> +	if (nctrl->sqsize + 1 > nctrl->maxcmd) {

> +		dev_warn(nctrl->device,

> +			 "sqsize %u > ctrl maxcmd %u, clamping down\n",

> +			 nctrl->sqsize + 1, nctrl->maxcmd);

> +		nctrl->sqsize = nctrl->maxcmd - 1;

> +	}

> +

> +	if (nctrl->queue_count > 1) {

> +		rc = nvme_tcp_ofld_configure_io_queues(nctrl, new);

> +		if (rc)

> +			goto destroy_admin;

> +	}

> +

> +	if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_LIVE)) {

> +		/*

> +		 * state change failure is ok if we started ctrl delete,

> +		 * unless we're during creation of a new controller to

> +		 * avoid races with teardown flow.

> +		 */

> +		WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING &&

> +			     nctrl->state != NVME_CTRL_DELETING_NOIO);

> +		WARN_ON_ONCE(new);

> +		rc = -EINVAL;

> +		goto destroy_io;

> +	}

> +

> +	nvme_start_ctrl(nctrl);

> +

> +	return 0;

> +

> +destroy_io:

> +	/* Placeholder - stop and destroy io queues*/

> +destroy_admin:

> +	/* Placeholder - stop and destroy admin queue*/

> +

> +	return rc;

> +}

> +

> +static int

> +nvme_tcp_ofld_check_dev_opts(struct nvmf_ctrl_options *opts,

> +			     struct nvme_tcp_ofld_ops *ofld_ops)

> +{

> +	unsigned int nvme_tcp_ofld_opt_mask = NVMF_ALLOWED_OPTS |

> +			ofld_ops->allowed_opts | ofld_ops->required_opts;

> +	if (opts->mask & ~nvme_tcp_ofld_opt_mask) {

> +		pr_warn("One or more of the nvmf options isn't supported by %s.\n",

> +			ofld_ops->name);


Which one(s)?

> +

> +		return -EINVAL;

> +	}

> +

> +	return 0;

> +}

> +

> +static void nvme_tcp_ofld_free_ctrl(struct nvme_ctrl *nctrl)

> +{

> +	struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);

> +	struct nvme_tcp_ofld_dev *dev = ctrl->dev;

> +

> +	if (list_empty(&ctrl->list))

> +		goto free_ctrl;

> +

> +	down_write(&nvme_tcp_ofld_ctrl_rwsem);

> +	ctrl->dev->ops->release_ctrl(ctrl);


Why is release called under the lock? specific reason?

> +	list_del(&ctrl->list);

> +	up_write(&nvme_tcp_ofld_ctrl_rwsem);

> +

> +	nvmf_free_options(nctrl->opts);

> +free_ctrl:

> +	module_put(dev->ops->module);

> +	kfree(ctrl->queues);

> +	kfree(ctrl);

> +}

> +

> +static void nvme_tcp_ofld_submit_async_event(struct nvme_ctrl *arg)

> +{

> +	/* Placeholder - submit_async_event */

> +}

> +

> +static void

> +nvme_tcp_ofld_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove)

> +{

> +	/* Placeholder - teardown_admin_queue */

> +}

> +

> +static void

> +nvme_tcp_ofld_teardown_io_queues(struct nvme_ctrl *nctrl, bool remove)

> +{

> +	/* Placeholder - teardown_io_queues */

> +}

> +

> +static void

> +nvme_tcp_ofld_teardown_ctrl(struct nvme_ctrl *nctrl, bool shutdown)

> +{

> +	/* Placeholder - err_work and connect_work */

> +	nvme_tcp_ofld_teardown_io_queues(nctrl, shutdown);

> +	blk_mq_quiesce_queue(nctrl->admin_q);

> +	if (shutdown)

> +		nvme_shutdown_ctrl(nctrl);

> +	else

> +		nvme_disable_ctrl(nctrl);

> +	nvme_tcp_ofld_teardown_admin_queue(nctrl, shutdown);

> +}

> +

> +static void nvme_tcp_ofld_delete_ctrl(struct nvme_ctrl *nctrl)

> +{

> +	nvme_tcp_ofld_teardown_ctrl(nctrl, true);

> +}

> +

> +static int

> +nvme_tcp_ofld_init_request(struct blk_mq_tag_set *set,

> +			   struct request *rq,

> +			   unsigned int hctx_idx,

> +			   unsigned int numa_node)

> +{

> +	struct nvme_tcp_ofld_req *req = blk_mq_rq_to_pdu(rq);

> +	struct nvme_tcp_ofld_ctrl *ctrl = set->driver_data;

> +

> +	/* Placeholder - init request */

> +

> +	req->done = nvme_tcp_ofld_req_done;

> +	ctrl->dev->ops->init_req(req);

>   

>   	return 0;

>   }

>   

> +static blk_status_t

> +nvme_tcp_ofld_queue_rq(struct blk_mq_hw_ctx *hctx,

> +		       const struct blk_mq_queue_data *bd)

> +{

> +	/* Call nvme_setup_cmd(...) */

> +

> +	/* Call ops->send_req(...) */

> +

> +	return BLK_STS_OK;

> +}

> +

> +static struct blk_mq_ops nvme_tcp_ofld_mq_ops = {

> +	.queue_rq	= nvme_tcp_ofld_queue_rq,

> +	.init_request	= nvme_tcp_ofld_init_request,

> +	/*

> +	 * All additional ops will be also implemented and registered similar to

> +	 * tcp.c

> +	 */

> +};

> +

> +static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops = {

> +	.queue_rq	= nvme_tcp_ofld_queue_rq,

> +	.init_request	= nvme_tcp_ofld_init_request,

> +	/*

> +	 * All additional ops will be also implemented and registered similar to

> +	 * tcp.c

> +	 */

> +};

> +

> +static const struct nvme_ctrl_ops nvme_tcp_ofld_ctrl_ops = {

> +	.name			= "tcp_offload",

> +	.module			= THIS_MODULE,

> +	.flags			= NVME_F_FABRICS,

> +	.reg_read32		= nvmf_reg_read32,

> +	.reg_read64		= nvmf_reg_read64,

> +	.reg_write32		= nvmf_reg_write32,

> +	.free_ctrl		= nvme_tcp_ofld_free_ctrl,

> +	.submit_async_event	= nvme_tcp_ofld_submit_async_event,


Its pretty difficult to review when you have declarations of functions
that don't exist yet. It also hurts bisectability.

> +	.delete_ctrl		= nvme_tcp_ofld_delete_ctrl,

> +	.get_address		= nvmf_get_address,

> +};

> +

> +static bool

> +nvme_tcp_ofld_existing_controller(struct nvmf_ctrl_options *opts)

> +{

> +	struct nvme_tcp_ofld_ctrl *ctrl;

> +	bool found = false;

> +

> +	down_read(&nvme_tcp_ofld_ctrl_rwsem);

> +	list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list) {

> +		found = nvmf_ip_options_match(&ctrl->nctrl, opts);

> +		if (found)

> +			break;

> +	}

> +	up_read(&nvme_tcp_ofld_ctrl_rwsem);

> +

> +	return found;

> +}

> +

>   static struct nvme_ctrl *

>   nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)

>   {

> +	struct nvme_tcp_ofld_queue *queue;

>   	struct nvme_tcp_ofld_ctrl *ctrl;

>   	struct nvme_tcp_ofld_dev *dev;

>   	struct nvme_ctrl *nctrl;

> -	int rc = 0;

> +	int i, rc = 0;

>   

>   	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);

>   	if (!ctrl)

>   		return ERR_PTR(-ENOMEM);

>   

> +	INIT_LIST_HEAD(&ctrl->list);

>   	nctrl = &ctrl->nctrl;

> +	nctrl->opts = opts;

> +	nctrl->queue_count = opts->nr_io_queues + opts->nr_write_queues +

> +			     opts->nr_poll_queues + 1;

> +	nctrl->sqsize = opts->queue_size - 1;

> +	nctrl->kato = opts->kato;

> +	if (!(opts->mask & NVMF_OPT_TRSVCID)) {

> +		opts->trsvcid =

> +			kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);

> +		if (!opts->trsvcid) {

> +			rc = -ENOMEM;

> +			goto out_free_ctrl;

> +		}

> +		opts->mask |= NVMF_OPT_TRSVCID;

> +	}

>   

> -	/* Init nvme_tcp_ofld_ctrl and nvme_ctrl params based on received opts */

> +	rc = inet_pton_with_scope(&init_net, AF_UNSPEC, opts->traddr,

> +				  opts->trsvcid,

> +				  &ctrl->conn_params.remote_ip_addr);

> +	if (rc) {

> +		pr_err("malformed address passed: %s:%s\n",

> +		       opts->traddr, opts->trsvcid);

> +		goto out_free_ctrl;

> +	}

> +

> +	if (opts->mask & NVMF_OPT_HOST_TRADDR) {

> +		rc = inet_pton_with_scope(&init_net, AF_UNSPEC,

> +					  opts->host_traddr, NULL,

> +					  &ctrl->conn_params.local_ip_addr);

> +		if (rc) {

> +			pr_err("malformed src address passed: %s\n",

> +			       opts->host_traddr);

> +			goto out_free_ctrl;

> +		}

> +	}

> +

> +	if (!opts->duplicate_connect &&

> +	    nvme_tcp_ofld_existing_controller(opts)) {

> +		rc = -EALREADY;

> +		goto out_free_ctrl;

> +	}

>   

>   	/* Find device that can reach the dest addr */

>   	dev = nvme_tcp_ofld_lookup_dev(ctrl);

> @@ -160,6 +571,10 @@ nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)

>   		goto out_free_ctrl;

>   	}

>   

> +	rc = nvme_tcp_ofld_check_dev_opts(opts, dev->ops);

> +	if (rc)

> +		goto out_module_put;

> +

>   	ctrl->dev = dev;

>   

>   	if (ctrl->dev->ops->max_hw_sectors)

> @@ -167,22 +582,58 @@ nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)

>   	if (ctrl->dev->ops->max_segments)

>   		nctrl->max_segments = ctrl->dev->ops->max_segments;

>   

> -	/* Init queues */

> +	ctrl->queues = kcalloc(nctrl->queue_count,

> +			       sizeof(struct nvme_tcp_ofld_queue),

> +			       GFP_KERNEL);

> +	if (!ctrl->queues) {

> +		rc = -ENOMEM;

> +		goto out_module_put;

> +	}

> +

> +	for (i = 0; i < nctrl->queue_count; ++i) {

> +		queue = &ctrl->queues[i];

> +		queue->ctrl = ctrl;

> +		queue->dev = dev;

> +		queue->report_err = nvme_tcp_ofld_report_queue_err;

> +		queue->hdr_digest = nctrl->opts->hdr_digest;

> +		queue->data_digest = nctrl->opts->data_digest;

> +		queue->tos = nctrl->opts->tos;


Why is this here and not in the queue initialization?

> +	}

> +

> +	rc = nvme_init_ctrl(nctrl, ndev, &nvme_tcp_ofld_ctrl_ops, 0);

> +	if (rc)

> +		goto out_free_queues;

>   

> -	/* Call nvme_init_ctrl */

> +	if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) {

> +		WARN_ON_ONCE(1);

> +		rc = -EINTR;

> +		goto out_uninit_ctrl;

> +	}

>   

>   	rc = ctrl->dev->ops->setup_ctrl(ctrl, true);

>   	if (rc)

> -		goto out_module_put;

> +		goto out_uninit_ctrl;

>   

>   	rc = nvme_tcp_ofld_setup_ctrl(nctrl, true);

>   	if (rc)

> -		goto out_uninit_ctrl;

> +		goto out_release_ctrl;


Its kinda confusing that a patch series adds code that is
replaced later in the series...

> +

> +	dev_info(nctrl->device, "new ctrl: NQN \"%s\", addr %pISp\n",

> +		 opts->subsysnqn, &ctrl->conn_params.remote_ip_addr);

> +

> +	down_write(&nvme_tcp_ofld_ctrl_rwsem);

> +	list_add_tail(&ctrl->list, &nvme_tcp_ofld_ctrl_list);

> +	up_write(&nvme_tcp_ofld_ctrl_rwsem);

>   

>   	return nctrl;

>   

> -out_uninit_ctrl:

> +out_release_ctrl:

>   	ctrl->dev->ops->release_ctrl(ctrl);

> +out_uninit_ctrl:

> +	nvme_uninit_ctrl(nctrl);

> +	nvme_put_ctrl(nctrl);

> +out_free_queues:

> +	kfree(ctrl->queues);

>   out_module_put:

>   	module_put(dev->ops->module);

>   out_free_ctrl:

> @@ -212,7 +663,15 @@ static int __init nvme_tcp_ofld_init_module(void)

>   

>   static void __exit nvme_tcp_ofld_cleanup_module(void)

>   {

> +	struct nvme_tcp_ofld_ctrl *ctrl;

> +

>   	nvmf_unregister_transport(&nvme_tcp_ofld_transport);

> +

> +	down_write(&nvme_tcp_ofld_ctrl_rwsem);

> +	list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list)

> +		nvme_delete_ctrl(&ctrl->nctrl);

> +	up_write(&nvme_tcp_ofld_ctrl_rwsem);

> +	flush_workqueue(nvme_delete_wq);

>   }

>   

>   module_init(nvme_tcp_ofld_init_module);

>
Shai Malin May 27, 2021, 8:03 p.m. UTC | #3
On 5/22/21 1:31 AM, Sagi Grimberg wrote:
> On 5/19/21 4:13 AM, Shai Malin wrote:

> > From: Arie Gershberg <agershberg@marvell.com>

> >

> > In this patch we implement controller level functionality including:

> > - create_ctrl.

> > - delete_ctrl.

> > - free_ctrl.

> >

> > The implementation is similar to other nvme fabrics modules, the main

> > difference being that the nvme-tcp-offload ULP calls the vendor specific

> > claim_dev() op with the given TCP/IP parameters to determine which device

> > will be used for this controller.

> > Once found, the vendor specific device and controller will be paired and

> > kept in a controller list managed by the ULP.

> >

> > Acked-by: Igor Russkikh <irusskikh@marvell.com>

> > Signed-off-by: Arie Gershberg <agershberg@marvell.com>

> > Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>

> > Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>

> > Signed-off-by: Michal Kalderon <mkalderon@marvell.com>

> > Signed-off-by: Ariel Elior <aelior@marvell.com>

> > Signed-off-by: Shai Malin <smalin@marvell.com>

> > ---

> >   drivers/nvme/host/tcp-offload.c | 475 +++++++++++++++++++++++++++++++-

> >   1 file changed, 467 insertions(+), 8 deletions(-)

> >

> > diff --git a/drivers/nvme/host/tcp-offload.c b/drivers/nvme/host/tcp-offload.c

> > index aa7cc239abf2..f7e0dc79bedd 100644

> > --- a/drivers/nvme/host/tcp-offload.c

> > +++ b/drivers/nvme/host/tcp-offload.c

> > @@ -12,6 +12,10 @@

> >

> >   static LIST_HEAD(nvme_tcp_ofld_devices);

> >   static DECLARE_RWSEM(nvme_tcp_ofld_devices_rwsem);

> > +static LIST_HEAD(nvme_tcp_ofld_ctrl_list);

> > +static DECLARE_RWSEM(nvme_tcp_ofld_ctrl_rwsem);

> > +static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops;

> > +static struct blk_mq_ops nvme_tcp_ofld_mq_ops;

> >

> >   static inline struct nvme_tcp_ofld_ctrl *to_tcp_ofld_ctrl(struct nvme_ctrl *nctrl)

> >   {

> > @@ -128,28 +132,435 @@ nvme_tcp_ofld_lookup_dev(struct nvme_tcp_ofld_ctrl *ctrl)

> >       return dev;

> >   }

> >

> > +static struct blk_mq_tag_set *

> > +nvme_tcp_ofld_alloc_tagset(struct nvme_ctrl *nctrl, bool admin)

> > +{

> > +     struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);

> > +     struct blk_mq_tag_set *set;

> > +     int rc;

> > +

> > +     if (admin) {

> > +             set = &ctrl->admin_tag_set;

> > +             memset(set, 0, sizeof(*set));

> > +             set->ops = &nvme_tcp_ofld_admin_mq_ops;

> > +             set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;

> > +             set->reserved_tags = NVMF_RESERVED_TAGS;

> > +             set->numa_node = nctrl->numa_node;

> > +             set->flags = BLK_MQ_F_BLOCKING;

> > +             set->cmd_size = sizeof(struct nvme_tcp_ofld_req);

> > +             set->driver_data = ctrl;

> > +             set->nr_hw_queues = 1;

> > +             set->timeout = NVME_ADMIN_TIMEOUT;

> > +     } else {

> > +             set = &ctrl->tag_set;

> > +             memset(set, 0, sizeof(*set));

> > +             set->ops = &nvme_tcp_ofld_mq_ops;

> > +             set->queue_depth = nctrl->sqsize + 1;

> > +             set->reserved_tags = NVMF_RESERVED_TAGS;

> > +             set->numa_node = nctrl->numa_node;

> > +             set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;

>

> Why do you set BLK_MQ_F_BLOCKING? do you schedule in the submission path?


No. Will be fixed.

>

> > +             set->cmd_size = sizeof(struct nvme_tcp_ofld_req);

> > +             set->driver_data = ctrl;

> > +             set->nr_hw_queues = nctrl->queue_count - 1;

> > +             set->timeout = NVME_IO_TIMEOUT;

> > +             set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;

> > +     }

> > +

> > +     rc = blk_mq_alloc_tag_set(set);

> > +     if (rc)

> > +             return ERR_PTR(rc);

> > +

> > +     return set;

> > +}

> > +

> > +static int nvme_tcp_ofld_configure_admin_queue(struct nvme_ctrl *nctrl,

> > +                                            bool new)

> > +{

> > +     int rc;

> > +

> > +     /* Placeholder - alloc_admin_queue */

> > +     if (new) {

> > +             nctrl->admin_tagset =

> > +                             nvme_tcp_ofld_alloc_tagset(nctrl, true);

> > +             if (IS_ERR(nctrl->admin_tagset)) {

> > +                     rc = PTR_ERR(nctrl->admin_tagset);

> > +                     nctrl->admin_tagset = NULL;

> > +                     goto out_free_queue;

> > +             }

> > +

> > +             nctrl->fabrics_q = blk_mq_init_queue(nctrl->admin_tagset);

> > +             if (IS_ERR(nctrl->fabrics_q)) {

> > +                     rc = PTR_ERR(nctrl->fabrics_q);

> > +                     nctrl->fabrics_q = NULL;

> > +                     goto out_free_tagset;

> > +             }

> > +

> > +             nctrl->admin_q = blk_mq_init_queue(nctrl->admin_tagset);

> > +             if (IS_ERR(nctrl->admin_q)) {

> > +                     rc = PTR_ERR(nctrl->admin_q);

> > +                     nctrl->admin_q = NULL;

> > +                     goto out_cleanup_fabrics_q;

> > +             }

> > +     }

> > +

> > +     /* Placeholder - nvme_tcp_ofld_start_queue */

> > +

> > +     rc = nvme_enable_ctrl(nctrl);

> > +     if (rc)

> > +             goto out_stop_queue;

> > +

> > +     blk_mq_unquiesce_queue(nctrl->admin_q);

> > +

> > +     rc = nvme_init_ctrl_finish(nctrl);

> > +     if (rc)

> > +             goto out_quiesce_queue;

> > +

> > +     return 0;

> > +

> > +out_quiesce_queue:

> > +     blk_mq_quiesce_queue(nctrl->admin_q);

> > +     blk_sync_queue(nctrl->admin_q);

> > +

> > +out_stop_queue:

> > +     /* Placeholder - stop offload queue */

> > +     nvme_cancel_admin_tagset(nctrl);

> > +

> > +out_cleanup_fabrics_q:

> > +     if (new)

> > +             blk_cleanup_queue(nctrl->fabrics_q);

> > +out_free_tagset:

> > +     if (new)

> > +             blk_mq_free_tag_set(nctrl->admin_tagset);

> > +out_free_queue:

> > +     /* Placeholder - free admin queue */

> > +

> > +     return rc;

> > +}

> > +

> > +static int

> > +nvme_tcp_ofld_configure_io_queues(struct nvme_ctrl *nctrl, bool new)

> > +{

> > +     int rc;

> > +

> > +     /* Placeholder - alloc_io_queues */

> > +

> > +     if (new) {

> > +             nctrl->tagset = nvme_tcp_ofld_alloc_tagset(nctrl, false);

> > +             if (IS_ERR(nctrl->tagset)) {

> > +                     rc = PTR_ERR(nctrl->tagset);

> > +                     nctrl->tagset = NULL;

> > +                     goto out_free_io_queues;

> > +             }

> > +

> > +             nctrl->connect_q = blk_mq_init_queue(nctrl->tagset);

> > +             if (IS_ERR(nctrl->connect_q)) {

> > +                     rc = PTR_ERR(nctrl->connect_q);

> > +                     nctrl->connect_q = NULL;

> > +                     goto out_free_tag_set;

> > +             }

> > +     }

> > +

> > +     /* Placeholder - start_io_queues */

> > +

> > +     if (!new) {

> > +             nvme_start_queues(nctrl);

> > +             if (!nvme_wait_freeze_timeout(nctrl, NVME_IO_TIMEOUT)) {

> > +                     /*

> > +                      * If we timed out waiting for freeze we are likely to

> > +                      * be stuck.  Fail the controller initialization just

> > +                      * to be safe.

> > +                      */

> > +                     rc = -ENODEV;

> > +                     goto out_wait_freeze_timed_out;

> > +             }

> > +             blk_mq_update_nr_hw_queues(nctrl->tagset, nctrl->queue_count - 1);

> > +             nvme_unfreeze(nctrl);

> > +     }

> > +

> > +     return 0;

> > +

> > +out_wait_freeze_timed_out:

> > +     nvme_stop_queues(nctrl);

> > +     nvme_sync_io_queues(nctrl);

> > +

> > +     /* Placeholder - Stop IO queues */

> > +

> > +     if (new)

> > +             blk_cleanup_queue(nctrl->connect_q);

> > +out_free_tag_set:

> > +     if (new)

> > +             blk_mq_free_tag_set(nctrl->tagset);

> > +out_free_io_queues:

> > +     /* Placeholder - free_io_queues */

> > +

> > +     return rc;

> > +}

> > +

> >   static int nvme_tcp_ofld_setup_ctrl(struct nvme_ctrl *nctrl, bool new)

> >   {

> > -     /* Placeholder - validates inputs and creates admin and IO queues */

> > +     struct nvmf_ctrl_options *opts = nctrl->opts;

> > +     int rc;

> > +

> > +     rc = nvme_tcp_ofld_configure_admin_queue(nctrl, new);

> > +     if (rc)

> > +             return rc;

> > +

> > +     if (nctrl->icdoff) {

> > +             dev_err(nctrl->device, "icdoff is not supported!\n");

> > +             rc = -EINVAL;

> > +             goto destroy_admin;

> > +     }

> > +

> > +     if (!(nctrl->sgls & ((1 << 0) | (1 << 1)))) {

> > +             dev_err(nctrl->device, "Mandatory sgls are not supported!\n");

> > +             goto destroy_admin;

> > +     }

> > +

> > +     if (opts->queue_size > nctrl->sqsize + 1)

> > +             dev_warn(nctrl->device,

> > +                      "queue_size %zu > ctrl sqsize %u, clamping down\n",

> > +                      opts->queue_size, nctrl->sqsize + 1);

> > +

> > +     if (nctrl->sqsize + 1 > nctrl->maxcmd) {

> > +             dev_warn(nctrl->device,

> > +                      "sqsize %u > ctrl maxcmd %u, clamping down\n",

> > +                      nctrl->sqsize + 1, nctrl->maxcmd);

> > +             nctrl->sqsize = nctrl->maxcmd - 1;

> > +     }

> > +

> > +     if (nctrl->queue_count > 1) {

> > +             rc = nvme_tcp_ofld_configure_io_queues(nctrl, new);

> > +             if (rc)

> > +                     goto destroy_admin;

> > +     }

> > +

> > +     if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_LIVE)) {

> > +             /*

> > +              * state change failure is ok if we started ctrl delete,

> > +              * unless we're during creation of a new controller to

> > +              * avoid races with teardown flow.

> > +              */

> > +             WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING &&

> > +                          nctrl->state != NVME_CTRL_DELETING_NOIO);

> > +             WARN_ON_ONCE(new);

> > +             rc = -EINVAL;

> > +             goto destroy_io;

> > +     }

> > +

> > +     nvme_start_ctrl(nctrl);

> > +

> > +     return 0;

> > +

> > +destroy_io:

> > +     /* Placeholder - stop and destroy io queues*/

> > +destroy_admin:

> > +     /* Placeholder - stop and destroy admin queue*/

> > +

> > +     return rc;

> > +}

> > +

> > +static int

> > +nvme_tcp_ofld_check_dev_opts(struct nvmf_ctrl_options *opts,

> > +                          struct nvme_tcp_ofld_ops *ofld_ops)

> > +{

> > +     unsigned int nvme_tcp_ofld_opt_mask = NVMF_ALLOWED_OPTS |

> > +                     ofld_ops->allowed_opts | ofld_ops->required_opts;

> > +     if (opts->mask & ~nvme_tcp_ofld_opt_mask) {

> > +             pr_warn("One or more of the nvmf options isn't supported by %s.\n",

> > +                     ofld_ops->name);

>

> Which one(s)?


Will be fixed.

>

> > +

> > +             return -EINVAL;

> > +     }

> > +

> > +     return 0;

> > +}

> > +

> > +static void nvme_tcp_ofld_free_ctrl(struct nvme_ctrl *nctrl)

> > +{

> > +     struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);

> > +     struct nvme_tcp_ofld_dev *dev = ctrl->dev;

> > +

> > +     if (list_empty(&ctrl->list))

> > +             goto free_ctrl;

> > +

> > +     down_write(&nvme_tcp_ofld_ctrl_rwsem);

> > +     ctrl->dev->ops->release_ctrl(ctrl);

>

> Why is release called under the lock? specific reason?


It's not needed. Will be fixed.

>

> > +     list_del(&ctrl->list);

> > +     up_write(&nvme_tcp_ofld_ctrl_rwsem);

> > +

> > +     nvmf_free_options(nctrl->opts);

> > +free_ctrl:

> > +     module_put(dev->ops->module);

> > +     kfree(ctrl->queues);

> > +     kfree(ctrl);

> > +}

> > +

> > +static void nvme_tcp_ofld_submit_async_event(struct nvme_ctrl *arg)

> > +{

> > +     /* Placeholder - submit_async_event */

> > +}

> > +

> > +static void

> > +nvme_tcp_ofld_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove)

> > +{

> > +     /* Placeholder - teardown_admin_queue */

> > +}

> > +

> > +static void

> > +nvme_tcp_ofld_teardown_io_queues(struct nvme_ctrl *nctrl, bool remove)

> > +{

> > +     /* Placeholder - teardown_io_queues */

> > +}

> > +

> > +static void

> > +nvme_tcp_ofld_teardown_ctrl(struct nvme_ctrl *nctrl, bool shutdown)

> > +{

> > +     /* Placeholder - err_work and connect_work */

> > +     nvme_tcp_ofld_teardown_io_queues(nctrl, shutdown);

> > +     blk_mq_quiesce_queue(nctrl->admin_q);

> > +     if (shutdown)

> > +             nvme_shutdown_ctrl(nctrl);

> > +     else

> > +             nvme_disable_ctrl(nctrl);

> > +     nvme_tcp_ofld_teardown_admin_queue(nctrl, shutdown);

> > +}

> > +

> > +static void nvme_tcp_ofld_delete_ctrl(struct nvme_ctrl *nctrl)

> > +{

> > +     nvme_tcp_ofld_teardown_ctrl(nctrl, true);

> > +}

> > +

> > +static int

> > +nvme_tcp_ofld_init_request(struct blk_mq_tag_set *set,

> > +                        struct request *rq,

> > +                        unsigned int hctx_idx,

> > +                        unsigned int numa_node)

> > +{

> > +     struct nvme_tcp_ofld_req *req = blk_mq_rq_to_pdu(rq);

> > +     struct nvme_tcp_ofld_ctrl *ctrl = set->driver_data;

> > +

> > +     /* Placeholder - init request */

> > +

> > +     req->done = nvme_tcp_ofld_req_done;

> > +     ctrl->dev->ops->init_req(req);

> >

> >       return 0;

> >   }

> >

> > +static blk_status_t

> > +nvme_tcp_ofld_queue_rq(struct blk_mq_hw_ctx *hctx,

> > +                    const struct blk_mq_queue_data *bd)

> > +{

> > +     /* Call nvme_setup_cmd(...) */

> > +

> > +     /* Call ops->send_req(...) */

> > +

> > +     return BLK_STS_OK;

> > +}

> > +

> > +static struct blk_mq_ops nvme_tcp_ofld_mq_ops = {

> > +     .queue_rq       = nvme_tcp_ofld_queue_rq,

> > +     .init_request   = nvme_tcp_ofld_init_request,

> > +     /*

> > +      * All additional ops will be also implemented and registered similar to

> > +      * tcp.c

> > +      */

> > +};

> > +

> > +static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops = {

> > +     .queue_rq       = nvme_tcp_ofld_queue_rq,

> > +     .init_request   = nvme_tcp_ofld_init_request,

> > +     /*

> > +      * All additional ops will be also implemented and registered similar to

> > +      * tcp.c

> > +      */

> > +};

> > +

> > +static const struct nvme_ctrl_ops nvme_tcp_ofld_ctrl_ops = {

> > +     .name                   = "tcp_offload",

> > +     .module                 = THIS_MODULE,

> > +     .flags                  = NVME_F_FABRICS,

> > +     .reg_read32             = nvmf_reg_read32,

> > +     .reg_read64             = nvmf_reg_read64,

> > +     .reg_write32            = nvmf_reg_write32,

> > +     .free_ctrl              = nvme_tcp_ofld_free_ctrl,

> > +     .submit_async_event     = nvme_tcp_ofld_submit_async_event,

>

> Its pretty difficult to review when you have declarations of functions

> that don't exist yet. It also hurts bisectability.


Will be fixed.

>

> > +     .delete_ctrl            = nvme_tcp_ofld_delete_ctrl,

> > +     .get_address            = nvmf_get_address,

> > +};

> > +

> > +static bool

> > +nvme_tcp_ofld_existing_controller(struct nvmf_ctrl_options *opts)

> > +{

> > +     struct nvme_tcp_ofld_ctrl *ctrl;

> > +     bool found = false;

> > +

> > +     down_read(&nvme_tcp_ofld_ctrl_rwsem);

> > +     list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list) {

> > +             found = nvmf_ip_options_match(&ctrl->nctrl, opts);

> > +             if (found)

> > +                     break;

> > +     }

> > +     up_read(&nvme_tcp_ofld_ctrl_rwsem);

> > +

> > +     return found;

> > +}

> > +

> >   static struct nvme_ctrl *

> >   nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)

> >   {

> > +     struct nvme_tcp_ofld_queue *queue;

> >       struct nvme_tcp_ofld_ctrl *ctrl;

> >       struct nvme_tcp_ofld_dev *dev;

> >       struct nvme_ctrl *nctrl;

> > -     int rc = 0;

> > +     int i, rc = 0;

> >

> >       ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);

> >       if (!ctrl)

> >               return ERR_PTR(-ENOMEM);

> >

> > +     INIT_LIST_HEAD(&ctrl->list);

> >       nctrl = &ctrl->nctrl;

> > +     nctrl->opts = opts;

> > +     nctrl->queue_count = opts->nr_io_queues + opts->nr_write_queues +

> > +                          opts->nr_poll_queues + 1;

> > +     nctrl->sqsize = opts->queue_size - 1;

> > +     nctrl->kato = opts->kato;

> > +     if (!(opts->mask & NVMF_OPT_TRSVCID)) {

> > +             opts->trsvcid =

> > +                     kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);

> > +             if (!opts->trsvcid) {

> > +                     rc = -ENOMEM;

> > +                     goto out_free_ctrl;

> > +             }

> > +             opts->mask |= NVMF_OPT_TRSVCID;

> > +     }

> >

> > -     /* Init nvme_tcp_ofld_ctrl and nvme_ctrl params based on received opts */

> > +     rc = inet_pton_with_scope(&init_net, AF_UNSPEC, opts->traddr,

> > +                               opts->trsvcid,

> > +                               &ctrl->conn_params.remote_ip_addr);

> > +     if (rc) {

> > +             pr_err("malformed address passed: %s:%s\n",

> > +                    opts->traddr, opts->trsvcid);

> > +             goto out_free_ctrl;

> > +     }

> > +

> > +     if (opts->mask & NVMF_OPT_HOST_TRADDR) {

> > +             rc = inet_pton_with_scope(&init_net, AF_UNSPEC,

> > +                                       opts->host_traddr, NULL,

> > +                                       &ctrl->conn_params.local_ip_addr);

> > +             if (rc) {

> > +                     pr_err("malformed src address passed: %s\n",

> > +                            opts->host_traddr);

> > +                     goto out_free_ctrl;

> > +             }

> > +     }

> > +

> > +     if (!opts->duplicate_connect &&

> > +         nvme_tcp_ofld_existing_controller(opts)) {

> > +             rc = -EALREADY;

> > +             goto out_free_ctrl;

> > +     }

> >

> >       /* Find device that can reach the dest addr */

> >       dev = nvme_tcp_ofld_lookup_dev(ctrl);

> > @@ -160,6 +571,10 @@ nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)

> >               goto out_free_ctrl;

> >       }

> >

> > +     rc = nvme_tcp_ofld_check_dev_opts(opts, dev->ops);

> > +     if (rc)

> > +             goto out_module_put;

> > +

> >       ctrl->dev = dev;

> >

> >       if (ctrl->dev->ops->max_hw_sectors)

> > @@ -167,22 +582,58 @@ nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)

> >       if (ctrl->dev->ops->max_segments)

> >               nctrl->max_segments = ctrl->dev->ops->max_segments;

> >

> > -     /* Init queues */

> > +     ctrl->queues = kcalloc(nctrl->queue_count,

> > +                            sizeof(struct nvme_tcp_ofld_queue),

> > +                            GFP_KERNEL);

> > +     if (!ctrl->queues) {

> > +             rc = -ENOMEM;

> > +             goto out_module_put;

> > +     }

> > +

> > +     for (i = 0; i < nctrl->queue_count; ++i) {

> > +             queue = &ctrl->queues[i];

> > +             queue->ctrl = ctrl;

> > +             queue->dev = dev;

> > +             queue->report_err = nvme_tcp_ofld_report_queue_err;

> > +             queue->hdr_digest = nctrl->opts->hdr_digest;

> > +             queue->data_digest = nctrl->opts->data_digest;

> > +             queue->tos = nctrl->opts->tos;

>

> Why is this here and not in the queue initialization?


It's in order to not duplicate the initializations between the admin queue
and IO queue. We will move the digest and tos initialization into the vendor
driver create_queue().

>

> > +     }

> > +

> > +     rc = nvme_init_ctrl(nctrl, ndev, &nvme_tcp_ofld_ctrl_ops, 0);

> > +     if (rc)

> > +             goto out_free_queues;

> >

> > -     /* Call nvme_init_ctrl */

> > +     if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) {

> > +             WARN_ON_ONCE(1);

> > +             rc = -EINTR;

> > +             goto out_uninit_ctrl;

> > +     }

> >

> >       rc = ctrl->dev->ops->setup_ctrl(ctrl, true);

> >       if (rc)

> > -             goto out_module_put;

> > +             goto out_uninit_ctrl;

> >

> >       rc = nvme_tcp_ofld_setup_ctrl(nctrl, true);

> >       if (rc)

> > -             goto out_uninit_ctrl;

> > +             goto out_release_ctrl;

>

> Its kinda confusing that a patch series adds code that is

> replaced later in the series...


Will be fixed.

>

> > +

> > +     dev_info(nctrl->device, "new ctrl: NQN \"%s\", addr %pISp\n",

> > +              opts->subsysnqn, &ctrl->conn_params.remote_ip_addr);

> > +

> > +     down_write(&nvme_tcp_ofld_ctrl_rwsem);

> > +     list_add_tail(&ctrl->list, &nvme_tcp_ofld_ctrl_list);

> > +     up_write(&nvme_tcp_ofld_ctrl_rwsem);

> >

> >       return nctrl;

> >

> > -out_uninit_ctrl:

> > +out_release_ctrl:

> >       ctrl->dev->ops->release_ctrl(ctrl);

> > +out_uninit_ctrl:

> > +     nvme_uninit_ctrl(nctrl);

> > +     nvme_put_ctrl(nctrl);

> > +out_free_queues:

> > +     kfree(ctrl->queues);

> >   out_module_put:

> >       module_put(dev->ops->module);

> >   out_free_ctrl:

> > @@ -212,7 +663,15 @@ static int __init nvme_tcp_ofld_init_module(void)

> >

> >   static void __exit nvme_tcp_ofld_cleanup_module(void)

> >   {

> > +     struct nvme_tcp_ofld_ctrl *ctrl;

> > +

> >       nvmf_unregister_transport(&nvme_tcp_ofld_transport);

> > +

> > +     down_write(&nvme_tcp_ofld_ctrl_rwsem);

> > +     list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list)

> > +             nvme_delete_ctrl(&ctrl->nctrl);

> > +     up_write(&nvme_tcp_ofld_ctrl_rwsem);

> > +     flush_workqueue(nvme_delete_wq);

> >   }

> >

> >   module_init(nvme_tcp_ofld_init_module);

> >
diff mbox series

Patch

diff --git a/drivers/nvme/host/tcp-offload.c b/drivers/nvme/host/tcp-offload.c
index aa7cc239abf2..f7e0dc79bedd 100644
--- a/drivers/nvme/host/tcp-offload.c
+++ b/drivers/nvme/host/tcp-offload.c
@@ -12,6 +12,10 @@ 
 
 static LIST_HEAD(nvme_tcp_ofld_devices);
 static DECLARE_RWSEM(nvme_tcp_ofld_devices_rwsem);
+static LIST_HEAD(nvme_tcp_ofld_ctrl_list);
+static DECLARE_RWSEM(nvme_tcp_ofld_ctrl_rwsem);
+static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops;
+static struct blk_mq_ops nvme_tcp_ofld_mq_ops;
 
 static inline struct nvme_tcp_ofld_ctrl *to_tcp_ofld_ctrl(struct nvme_ctrl *nctrl)
 {
@@ -128,28 +132,435 @@  nvme_tcp_ofld_lookup_dev(struct nvme_tcp_ofld_ctrl *ctrl)
 	return dev;
 }
 
+static struct blk_mq_tag_set *
+nvme_tcp_ofld_alloc_tagset(struct nvme_ctrl *nctrl, bool admin)
+{
+	struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);
+	struct blk_mq_tag_set *set;
+	int rc;
+
+	if (admin) {
+		set = &ctrl->admin_tag_set;
+		memset(set, 0, sizeof(*set));
+		set->ops = &nvme_tcp_ofld_admin_mq_ops;
+		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+		set->reserved_tags = NVMF_RESERVED_TAGS;
+		set->numa_node = nctrl->numa_node;
+		set->flags = BLK_MQ_F_BLOCKING;
+		set->cmd_size = sizeof(struct nvme_tcp_ofld_req);
+		set->driver_data = ctrl;
+		set->nr_hw_queues = 1;
+		set->timeout = NVME_ADMIN_TIMEOUT;
+	} else {
+		set = &ctrl->tag_set;
+		memset(set, 0, sizeof(*set));
+		set->ops = &nvme_tcp_ofld_mq_ops;
+		set->queue_depth = nctrl->sqsize + 1;
+		set->reserved_tags = NVMF_RESERVED_TAGS;
+		set->numa_node = nctrl->numa_node;
+		set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+		set->cmd_size = sizeof(struct nvme_tcp_ofld_req);
+		set->driver_data = ctrl;
+		set->nr_hw_queues = nctrl->queue_count - 1;
+		set->timeout = NVME_IO_TIMEOUT;
+		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
+	}
+
+	rc = blk_mq_alloc_tag_set(set);
+	if (rc)
+		return ERR_PTR(rc);
+
+	return set;
+}
+
+static int nvme_tcp_ofld_configure_admin_queue(struct nvme_ctrl *nctrl,
+					       bool new)
+{
+	int rc;
+
+	/* Placeholder - alloc_admin_queue */
+	if (new) {
+		nctrl->admin_tagset =
+				nvme_tcp_ofld_alloc_tagset(nctrl, true);
+		if (IS_ERR(nctrl->admin_tagset)) {
+			rc = PTR_ERR(nctrl->admin_tagset);
+			nctrl->admin_tagset = NULL;
+			goto out_free_queue;
+		}
+
+		nctrl->fabrics_q = blk_mq_init_queue(nctrl->admin_tagset);
+		if (IS_ERR(nctrl->fabrics_q)) {
+			rc = PTR_ERR(nctrl->fabrics_q);
+			nctrl->fabrics_q = NULL;
+			goto out_free_tagset;
+		}
+
+		nctrl->admin_q = blk_mq_init_queue(nctrl->admin_tagset);
+		if (IS_ERR(nctrl->admin_q)) {
+			rc = PTR_ERR(nctrl->admin_q);
+			nctrl->admin_q = NULL;
+			goto out_cleanup_fabrics_q;
+		}
+	}
+
+	/* Placeholder - nvme_tcp_ofld_start_queue */
+
+	rc = nvme_enable_ctrl(nctrl);
+	if (rc)
+		goto out_stop_queue;
+
+	blk_mq_unquiesce_queue(nctrl->admin_q);
+
+	rc = nvme_init_ctrl_finish(nctrl);
+	if (rc)
+		goto out_quiesce_queue;
+
+	return 0;
+
+out_quiesce_queue:
+	blk_mq_quiesce_queue(nctrl->admin_q);
+	blk_sync_queue(nctrl->admin_q);
+
+out_stop_queue:
+	/* Placeholder - stop offload queue */
+	nvme_cancel_admin_tagset(nctrl);
+
+out_cleanup_fabrics_q:
+	if (new)
+		blk_cleanup_queue(nctrl->fabrics_q);
+out_free_tagset:
+	if (new)
+		blk_mq_free_tag_set(nctrl->admin_tagset);
+out_free_queue:
+	/* Placeholder - free admin queue */
+
+	return rc;
+}
+
+static int
+nvme_tcp_ofld_configure_io_queues(struct nvme_ctrl *nctrl, bool new)
+{
+	int rc;
+
+	/* Placeholder - alloc_io_queues */
+
+	if (new) {
+		nctrl->tagset = nvme_tcp_ofld_alloc_tagset(nctrl, false);
+		if (IS_ERR(nctrl->tagset)) {
+			rc = PTR_ERR(nctrl->tagset);
+			nctrl->tagset = NULL;
+			goto out_free_io_queues;
+		}
+
+		nctrl->connect_q = blk_mq_init_queue(nctrl->tagset);
+		if (IS_ERR(nctrl->connect_q)) {
+			rc = PTR_ERR(nctrl->connect_q);
+			nctrl->connect_q = NULL;
+			goto out_free_tag_set;
+		}
+	}
+
+	/* Placeholder - start_io_queues */
+
+	if (!new) {
+		nvme_start_queues(nctrl);
+		if (!nvme_wait_freeze_timeout(nctrl, NVME_IO_TIMEOUT)) {
+			/*
+			 * If we timed out waiting for freeze we are likely to
+			 * be stuck.  Fail the controller initialization just
+			 * to be safe.
+			 */
+			rc = -ENODEV;
+			goto out_wait_freeze_timed_out;
+		}
+		blk_mq_update_nr_hw_queues(nctrl->tagset, nctrl->queue_count - 1);
+		nvme_unfreeze(nctrl);
+	}
+
+	return 0;
+
+out_wait_freeze_timed_out:
+	nvme_stop_queues(nctrl);
+	nvme_sync_io_queues(nctrl);
+
+	/* Placeholder - Stop IO queues */
+
+	if (new)
+		blk_cleanup_queue(nctrl->connect_q);
+out_free_tag_set:
+	if (new)
+		blk_mq_free_tag_set(nctrl->tagset);
+out_free_io_queues:
+	/* Placeholder - free_io_queues */
+
+	return rc;
+}
+
 static int nvme_tcp_ofld_setup_ctrl(struct nvme_ctrl *nctrl, bool new)
 {
-	/* Placeholder - validates inputs and creates admin and IO queues */
+	struct nvmf_ctrl_options *opts = nctrl->opts;
+	int rc;
+
+	rc = nvme_tcp_ofld_configure_admin_queue(nctrl, new);
+	if (rc)
+		return rc;
+
+	if (nctrl->icdoff) {
+		dev_err(nctrl->device, "icdoff is not supported!\n");
+		rc = -EINVAL;
+		goto destroy_admin;
+	}
+
+	if (!(nctrl->sgls & ((1 << 0) | (1 << 1)))) {
+		dev_err(nctrl->device, "Mandatory sgls are not supported!\n");
+		goto destroy_admin;
+	}
+
+	if (opts->queue_size > nctrl->sqsize + 1)
+		dev_warn(nctrl->device,
+			 "queue_size %zu > ctrl sqsize %u, clamping down\n",
+			 opts->queue_size, nctrl->sqsize + 1);
+
+	if (nctrl->sqsize + 1 > nctrl->maxcmd) {
+		dev_warn(nctrl->device,
+			 "sqsize %u > ctrl maxcmd %u, clamping down\n",
+			 nctrl->sqsize + 1, nctrl->maxcmd);
+		nctrl->sqsize = nctrl->maxcmd - 1;
+	}
+
+	if (nctrl->queue_count > 1) {
+		rc = nvme_tcp_ofld_configure_io_queues(nctrl, new);
+		if (rc)
+			goto destroy_admin;
+	}
+
+	if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_LIVE)) {
+		/*
+		 * state change failure is ok if we started ctrl delete,
+		 * unless we're during creation of a new controller to
+		 * avoid races with teardown flow.
+		 */
+		WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING &&
+			     nctrl->state != NVME_CTRL_DELETING_NOIO);
+		WARN_ON_ONCE(new);
+		rc = -EINVAL;
+		goto destroy_io;
+	}
+
+	nvme_start_ctrl(nctrl);
+
+	return 0;
+
+destroy_io:
+	/* Placeholder - stop and destroy io queues*/
+destroy_admin:
+	/* Placeholder - stop and destroy admin queue*/
+
+	return rc;
+}
+
+static int
+nvme_tcp_ofld_check_dev_opts(struct nvmf_ctrl_options *opts,
+			     struct nvme_tcp_ofld_ops *ofld_ops)
+{
+	unsigned int nvme_tcp_ofld_opt_mask = NVMF_ALLOWED_OPTS |
+			ofld_ops->allowed_opts | ofld_ops->required_opts;
+	if (opts->mask & ~nvme_tcp_ofld_opt_mask) {
+		pr_warn("One or more of the nvmf options isn't supported by %s.\n",
+			ofld_ops->name);
+
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void nvme_tcp_ofld_free_ctrl(struct nvme_ctrl *nctrl)
+{
+	struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);
+	struct nvme_tcp_ofld_dev *dev = ctrl->dev;
+
+	if (list_empty(&ctrl->list))
+		goto free_ctrl;
+
+	down_write(&nvme_tcp_ofld_ctrl_rwsem);
+	ctrl->dev->ops->release_ctrl(ctrl);
+	list_del(&ctrl->list);
+	up_write(&nvme_tcp_ofld_ctrl_rwsem);
+
+	nvmf_free_options(nctrl->opts);
+free_ctrl:
+	module_put(dev->ops->module);
+	kfree(ctrl->queues);
+	kfree(ctrl);
+}
+
+static void nvme_tcp_ofld_submit_async_event(struct nvme_ctrl *arg)
+{
+	/* Placeholder - submit_async_event */
+}
+
+static void
+nvme_tcp_ofld_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove)
+{
+	/* Placeholder - teardown_admin_queue */
+}
+
+static void
+nvme_tcp_ofld_teardown_io_queues(struct nvme_ctrl *nctrl, bool remove)
+{
+	/* Placeholder - teardown_io_queues */
+}
+
+static void
+nvme_tcp_ofld_teardown_ctrl(struct nvme_ctrl *nctrl, bool shutdown)
+{
+	/* Placeholder - err_work and connect_work */
+	nvme_tcp_ofld_teardown_io_queues(nctrl, shutdown);
+	blk_mq_quiesce_queue(nctrl->admin_q);
+	if (shutdown)
+		nvme_shutdown_ctrl(nctrl);
+	else
+		nvme_disable_ctrl(nctrl);
+	nvme_tcp_ofld_teardown_admin_queue(nctrl, shutdown);
+}
+
+static void nvme_tcp_ofld_delete_ctrl(struct nvme_ctrl *nctrl)
+{
+	nvme_tcp_ofld_teardown_ctrl(nctrl, true);
+}
+
+static int
+nvme_tcp_ofld_init_request(struct blk_mq_tag_set *set,
+			   struct request *rq,
+			   unsigned int hctx_idx,
+			   unsigned int numa_node)
+{
+	struct nvme_tcp_ofld_req *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_tcp_ofld_ctrl *ctrl = set->driver_data;
+
+	/* Placeholder - init request */
+
+	req->done = nvme_tcp_ofld_req_done;
+	ctrl->dev->ops->init_req(req);
 
 	return 0;
 }
 
+static blk_status_t
+nvme_tcp_ofld_queue_rq(struct blk_mq_hw_ctx *hctx,
+		       const struct blk_mq_queue_data *bd)
+{
+	/* Call nvme_setup_cmd(...) */
+
+	/* Call ops->send_req(...) */
+
+	return BLK_STS_OK;
+}
+
+static struct blk_mq_ops nvme_tcp_ofld_mq_ops = {
+	.queue_rq	= nvme_tcp_ofld_queue_rq,
+	.init_request	= nvme_tcp_ofld_init_request,
+	/*
+	 * All additional ops will be also implemented and registered similar to
+	 * tcp.c
+	 */
+};
+
+static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops = {
+	.queue_rq	= nvme_tcp_ofld_queue_rq,
+	.init_request	= nvme_tcp_ofld_init_request,
+	/*
+	 * All additional ops will be also implemented and registered similar to
+	 * tcp.c
+	 */
+};
+
+static const struct nvme_ctrl_ops nvme_tcp_ofld_ctrl_ops = {
+	.name			= "tcp_offload",
+	.module			= THIS_MODULE,
+	.flags			= NVME_F_FABRICS,
+	.reg_read32		= nvmf_reg_read32,
+	.reg_read64		= nvmf_reg_read64,
+	.reg_write32		= nvmf_reg_write32,
+	.free_ctrl		= nvme_tcp_ofld_free_ctrl,
+	.submit_async_event	= nvme_tcp_ofld_submit_async_event,
+	.delete_ctrl		= nvme_tcp_ofld_delete_ctrl,
+	.get_address		= nvmf_get_address,
+};
+
+static bool
+nvme_tcp_ofld_existing_controller(struct nvmf_ctrl_options *opts)
+{
+	struct nvme_tcp_ofld_ctrl *ctrl;
+	bool found = false;
+
+	down_read(&nvme_tcp_ofld_ctrl_rwsem);
+	list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list) {
+		found = nvmf_ip_options_match(&ctrl->nctrl, opts);
+		if (found)
+			break;
+	}
+	up_read(&nvme_tcp_ofld_ctrl_rwsem);
+
+	return found;
+}
+
 static struct nvme_ctrl *
 nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)
 {
+	struct nvme_tcp_ofld_queue *queue;
 	struct nvme_tcp_ofld_ctrl *ctrl;
 	struct nvme_tcp_ofld_dev *dev;
 	struct nvme_ctrl *nctrl;
-	int rc = 0;
+	int i, rc = 0;
 
 	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
 	if (!ctrl)
 		return ERR_PTR(-ENOMEM);
 
+	INIT_LIST_HEAD(&ctrl->list);
 	nctrl = &ctrl->nctrl;
+	nctrl->opts = opts;
+	nctrl->queue_count = opts->nr_io_queues + opts->nr_write_queues +
+			     opts->nr_poll_queues + 1;
+	nctrl->sqsize = opts->queue_size - 1;
+	nctrl->kato = opts->kato;
+	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
+		opts->trsvcid =
+			kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
+		if (!opts->trsvcid) {
+			rc = -ENOMEM;
+			goto out_free_ctrl;
+		}
+		opts->mask |= NVMF_OPT_TRSVCID;
+	}
 
-	/* Init nvme_tcp_ofld_ctrl and nvme_ctrl params based on received opts */
+	rc = inet_pton_with_scope(&init_net, AF_UNSPEC, opts->traddr,
+				  opts->trsvcid,
+				  &ctrl->conn_params.remote_ip_addr);
+	if (rc) {
+		pr_err("malformed address passed: %s:%s\n",
+		       opts->traddr, opts->trsvcid);
+		goto out_free_ctrl;
+	}
+
+	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
+		rc = inet_pton_with_scope(&init_net, AF_UNSPEC,
+					  opts->host_traddr, NULL,
+					  &ctrl->conn_params.local_ip_addr);
+		if (rc) {
+			pr_err("malformed src address passed: %s\n",
+			       opts->host_traddr);
+			goto out_free_ctrl;
+		}
+	}
+
+	if (!opts->duplicate_connect &&
+	    nvme_tcp_ofld_existing_controller(opts)) {
+		rc = -EALREADY;
+		goto out_free_ctrl;
+	}
 
 	/* Find device that can reach the dest addr */
 	dev = nvme_tcp_ofld_lookup_dev(ctrl);
@@ -160,6 +571,10 @@  nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)
 		goto out_free_ctrl;
 	}
 
+	rc = nvme_tcp_ofld_check_dev_opts(opts, dev->ops);
+	if (rc)
+		goto out_module_put;
+
 	ctrl->dev = dev;
 
 	if (ctrl->dev->ops->max_hw_sectors)
@@ -167,22 +582,58 @@  nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)
 	if (ctrl->dev->ops->max_segments)
 		nctrl->max_segments = ctrl->dev->ops->max_segments;
 
-	/* Init queues */
+	ctrl->queues = kcalloc(nctrl->queue_count,
+			       sizeof(struct nvme_tcp_ofld_queue),
+			       GFP_KERNEL);
+	if (!ctrl->queues) {
+		rc = -ENOMEM;
+		goto out_module_put;
+	}
+
+	for (i = 0; i < nctrl->queue_count; ++i) {
+		queue = &ctrl->queues[i];
+		queue->ctrl = ctrl;
+		queue->dev = dev;
+		queue->report_err = nvme_tcp_ofld_report_queue_err;
+		queue->hdr_digest = nctrl->opts->hdr_digest;
+		queue->data_digest = nctrl->opts->data_digest;
+		queue->tos = nctrl->opts->tos;
+	}
+
+	rc = nvme_init_ctrl(nctrl, ndev, &nvme_tcp_ofld_ctrl_ops, 0);
+	if (rc)
+		goto out_free_queues;
 
-	/* Call nvme_init_ctrl */
+	if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) {
+		WARN_ON_ONCE(1);
+		rc = -EINTR;
+		goto out_uninit_ctrl;
+	}
 
 	rc = ctrl->dev->ops->setup_ctrl(ctrl, true);
 	if (rc)
-		goto out_module_put;
+		goto out_uninit_ctrl;
 
 	rc = nvme_tcp_ofld_setup_ctrl(nctrl, true);
 	if (rc)
-		goto out_uninit_ctrl;
+		goto out_release_ctrl;
+
+	dev_info(nctrl->device, "new ctrl: NQN \"%s\", addr %pISp\n",
+		 opts->subsysnqn, &ctrl->conn_params.remote_ip_addr);
+
+	down_write(&nvme_tcp_ofld_ctrl_rwsem);
+	list_add_tail(&ctrl->list, &nvme_tcp_ofld_ctrl_list);
+	up_write(&nvme_tcp_ofld_ctrl_rwsem);
 
 	return nctrl;
 
-out_uninit_ctrl:
+out_release_ctrl:
 	ctrl->dev->ops->release_ctrl(ctrl);
+out_uninit_ctrl:
+	nvme_uninit_ctrl(nctrl);
+	nvme_put_ctrl(nctrl);
+out_free_queues:
+	kfree(ctrl->queues);
 out_module_put:
 	module_put(dev->ops->module);
 out_free_ctrl:
@@ -212,7 +663,15 @@  static int __init nvme_tcp_ofld_init_module(void)
 
 static void __exit nvme_tcp_ofld_cleanup_module(void)
 {
+	struct nvme_tcp_ofld_ctrl *ctrl;
+
 	nvmf_unregister_transport(&nvme_tcp_ofld_transport);
+
+	down_write(&nvme_tcp_ofld_ctrl_rwsem);
+	list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list)
+		nvme_delete_ctrl(&ctrl->nctrl);
+	up_write(&nvme_tcp_ofld_ctrl_rwsem);
+	flush_workqueue(nvme_delete_wq);
 }
 
 module_init(nvme_tcp_ofld_init_module);