diff mbox series

[v3,5/5] ceph: convert to sparse reads

Message ID 20220318135013.43934-6-jlayton@kernel.org
State New
Headers show
Series ceph/libceph: add support for sparse reads to msgr2 crc codepath | expand

Commit Message

Jeff Layton March 18, 2022, 1:50 p.m. UTC
Have ceph issue sparse reads instead of normal ones. The callers now
preallocate a sparse extent buffer that the libceph receive code can
populate and hand back after the operation completes.

After a successful read, we can't use the req->r_result value to
determine the amount of data "read", so instead we set the received
length to be from the end of the last extent in the buffer. Any
interstitial holes will have been filled by the receive code.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/addr.c  | 13 +++++++++++--
 fs/ceph/file.c  | 41 ++++++++++++++++++++++++++++++++++-------
 fs/ceph/super.h |  7 +++++++
 3 files changed, 52 insertions(+), 9 deletions(-)

Comments

Jeff Layton March 21, 2022, 12:30 p.m. UTC | #1
On Fri, 2022-03-18 at 09:50 -0400, Jeff Layton wrote:
> Have ceph issue sparse reads instead of normal ones. The callers now
> preallocate a sparse extent buffer that the libceph receive code can
> populate and hand back after the operation completes.
> 
> After a successful read, we can't use the req->r_result value to
> determine the amount of data "read", so instead we set the received
> length to be from the end of the last extent in the buffer. Any
> interstitial holes will have been filled by the receive code.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/addr.c  | 13 +++++++++++--
>  fs/ceph/file.c  | 41 ++++++++++++++++++++++++++++++++++-------
>  fs/ceph/super.h |  7 +++++++
>  3 files changed, 52 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 752c421c9922..6d4f9fbf22ce 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -220,6 +220,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
>  	struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
>  	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
>  	struct netfs_read_subrequest *subreq = req->r_priv;
> +	struct ceph_osd_req_op *op = &req->r_ops[0];
>  	int num_pages;
>  	int err = req->r_result;
>  
> @@ -230,7 +231,9 @@ static void finish_netfs_read(struct ceph_osd_request *req)
>  	     subreq->len, i_size_read(req->r_inode));
>  
>  	/* no object means success but no data */
> -	if (err == -ENOENT)
> +	if (err >= 0)
> +		err = ceph_sparse_ext_map_end(op);
> +	else if (err == -ENOENT)
>  		err = 0;
>  	else if (err == -EBLOCKLISTED)
>  		fsc->blocklisted = true;
> @@ -317,7 +320,7 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
>  		return;
>  
>  	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
> -			0, 1, CEPH_OSD_OP_READ,
> +			0, 1, CEPH_OSD_OP_SPARSE_READ,
>  			CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
>  			NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
>  	if (IS_ERR(req)) {
> @@ -326,6 +329,12 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
>  		goto out;
>  	}
>  
> +	err = ceph_alloc_sparse_ext_map(&req->r_ops[0], CEPH_SPARSE_EXT_ARRAY_INITIAL);
> +	if (err) {
> +		ceph_osdc_put_request(req);
> +		goto out;
> +	}
> +
>  	dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
>  	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
>  	err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off);
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index feb75eb1cd82..deba39989a07 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -931,10 +931,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>  		bool more;
>  		int idx;
>  		size_t left;
> +		struct ceph_osd_req_op *op;
>  
>  		req = ceph_osdc_new_request(osdc, &ci->i_layout,
>  					ci->i_vino, off, &len, 0, 1,
> -					CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
> +					CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ,
>  					NULL, ci->i_truncate_seq,
>  					ci->i_truncate_size, false);
>  		if (IS_ERR(req)) {
> @@ -955,6 +956,14 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>  
>  		osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
>  						 false, false);
> +
> +		op = &req->r_ops[0];
> +		ret = ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL);
> +		if (ret) {
> +			ceph_osdc_put_request(req);
> +			break;
> +		}
> +
>  		ret = ceph_osdc_start_request(osdc, req, false);
>  		if (!ret)
>  			ret = ceph_osdc_wait_request(osdc, req);
> @@ -964,23 +973,28 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>  					 req->r_end_latency,
>  					 len, ret);
>  
> -		ceph_osdc_put_request(req);
> -
>  		i_size = i_size_read(inode);
>  		dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
>  		     off, len, ret, i_size, (more ? " MORE" : ""));
>  
> -		if (ret == -ENOENT)
> +		/* Fix it to go to end of extent map */
> +		if (ret >= 0)
> +			ret = ceph_sparse_ext_map_end(op);
> +		else if (ret == -ENOENT)
>  			ret = 0;
> +
>  		if (ret >= 0 && ret < len && (off + ret < i_size)) {
>  			int zlen = min(len - ret, i_size - off - ret);
>  			int zoff = page_off + ret;
> +
>  			dout("sync_read zero gap %llu~%llu\n",
> -                             off + ret, off + ret + zlen);
> +				off + ret, off + ret + zlen);
>  			ceph_zero_page_vector_range(zoff, zlen, pages);
>  			ret += zlen;
>  		}
>  
> +		ceph_osdc_put_request(req);
> +
>  		idx = 0;
>  		left = ret > 0 ? ret : 0;
>  		while (left > 0) {
> @@ -1095,6 +1109,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
>  	struct inode *inode = req->r_inode;
>  	struct ceph_aio_request *aio_req = req->r_priv;
>  	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
> +	struct ceph_osd_req_op *op = &req->r_ops[0];
>  	struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
>  	unsigned int len = osd_data->bvec_pos.iter.bi_size;
>  
> @@ -1117,6 +1132,8 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
>  		}
>  		rc = -ENOMEM;
>  	} else if (!aio_req->write) {
> +		if (rc >= 0)
> +			rc = ceph_sparse_ext_map_end(op);
>  		if (rc == -ENOENT)
>  			rc = 0;
>  		if (rc >= 0 && len > rc) {
> @@ -1280,6 +1297,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  	while (iov_iter_count(iter) > 0) {
>  		u64 size = iov_iter_count(iter);
>  		ssize_t len;
> +		struct ceph_osd_req_op *op;
>  
>  		if (write)
>  			size = min_t(u64, size, fsc->mount_options->wsize);
> @@ -1291,7 +1309,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  					    vino, pos, &size, 0,
>  					    1,
>  					    write ? CEPH_OSD_OP_WRITE :
> -						    CEPH_OSD_OP_READ,
> +						    CEPH_OSD_OP_SPARSE_READ,
>  					    flags, snapc,
>  					    ci->i_truncate_seq,
>  					    ci->i_truncate_size,
> @@ -1342,6 +1360,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  		}
>  
>  		osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
> +		op = &req->r_ops[0];
> +		ret = ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL);
> +		if (ret) {
> +			ceph_osdc_put_request(req);
> +			break;
> +		}
>  
>  		if (aio_req) {
>  			aio_req->total_len += len;
> @@ -1370,8 +1394,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  
>  		size = i_size_read(inode);
>  		if (!write) {
> -			if (ret == -ENOENT)
> +			if (ret >= 0)
> +				ret = ceph_sparse_ext_map_end(op);
> +			else if (ret == -ENOENT)
>  				ret = 0;
> +
>  			if (ret >= 0 && ret < len && pos + ret < size) {
>  				struct iov_iter i;
>  				int zlen = min_t(size_t, len - ret,
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 250aefecd628..ad09c26afac6 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -75,6 +75,13 @@
>  #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
>  #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
>  
> +/*
> + * How big an extent array should we preallocate for a sparse read? This is
> + * just a starting value.  If we get more than this back from the OSD, the
> + * receiver will reallocate.
> + */
> +#define CEPH_SPARSE_EXT_ARRAY_INITIAL	16
> +
>  struct ceph_mount_options {
>  	unsigned int flags;
>  

For the record, I don't see us merging this patch as-is. This is just
what I was using for testing; in practice, we may want to use sparse
reads only when necessary (i.e. only with fscrypt enabled).
diff mbox series

Patch

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 752c421c9922..6d4f9fbf22ce 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -220,6 +220,7 @@  static void finish_netfs_read(struct ceph_osd_request *req)
 	struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
 	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
 	struct netfs_read_subrequest *subreq = req->r_priv;
+	struct ceph_osd_req_op *op = &req->r_ops[0];
 	int num_pages;
 	int err = req->r_result;
 
@@ -230,7 +231,9 @@  static void finish_netfs_read(struct ceph_osd_request *req)
 	     subreq->len, i_size_read(req->r_inode));
 
 	/* no object means success but no data */
-	if (err == -ENOENT)
+	if (err >= 0)
+		err = ceph_sparse_ext_map_end(op);
+	else if (err == -ENOENT)
 		err = 0;
 	else if (err == -EBLOCKLISTED)
 		fsc->blocklisted = true;
@@ -317,7 +320,7 @@  static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
 		return;
 
 	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
-			0, 1, CEPH_OSD_OP_READ,
+			0, 1, CEPH_OSD_OP_SPARSE_READ,
 			CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
 			NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
 	if (IS_ERR(req)) {
@@ -326,6 +329,12 @@  static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
 		goto out;
 	}
 
+	err = ceph_alloc_sparse_ext_map(&req->r_ops[0], CEPH_SPARSE_EXT_ARRAY_INITIAL);
+	if (err) {
+		ceph_osdc_put_request(req);
+		goto out;
+	}
+
 	dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
 	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
 	err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index feb75eb1cd82..deba39989a07 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -931,10 +931,11 @@  static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 		bool more;
 		int idx;
 		size_t left;
+		struct ceph_osd_req_op *op;
 
 		req = ceph_osdc_new_request(osdc, &ci->i_layout,
 					ci->i_vino, off, &len, 0, 1,
-					CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+					CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ,
 					NULL, ci->i_truncate_seq,
 					ci->i_truncate_size, false);
 		if (IS_ERR(req)) {
@@ -955,6 +956,14 @@  static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 
 		osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
 						 false, false);
+
+		op = &req->r_ops[0];
+		ret = ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL);
+		if (ret) {
+			ceph_osdc_put_request(req);
+			break;
+		}
+
 		ret = ceph_osdc_start_request(osdc, req, false);
 		if (!ret)
 			ret = ceph_osdc_wait_request(osdc, req);
@@ -964,23 +973,28 @@  static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 					 req->r_end_latency,
 					 len, ret);
 
-		ceph_osdc_put_request(req);
-
 		i_size = i_size_read(inode);
 		dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
 		     off, len, ret, i_size, (more ? " MORE" : ""));
 
-		if (ret == -ENOENT)
+		/* Fix it to go to end of extent map */
+		if (ret >= 0)
+			ret = ceph_sparse_ext_map_end(op);
+		else if (ret == -ENOENT)
 			ret = 0;
+
 		if (ret >= 0 && ret < len && (off + ret < i_size)) {
 			int zlen = min(len - ret, i_size - off - ret);
 			int zoff = page_off + ret;
+
 			dout("sync_read zero gap %llu~%llu\n",
-                             off + ret, off + ret + zlen);
+				off + ret, off + ret + zlen);
 			ceph_zero_page_vector_range(zoff, zlen, pages);
 			ret += zlen;
 		}
 
+		ceph_osdc_put_request(req);
+
 		idx = 0;
 		left = ret > 0 ? ret : 0;
 		while (left > 0) {
@@ -1095,6 +1109,7 @@  static void ceph_aio_complete_req(struct ceph_osd_request *req)
 	struct inode *inode = req->r_inode;
 	struct ceph_aio_request *aio_req = req->r_priv;
 	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+	struct ceph_osd_req_op *op = &req->r_ops[0];
 	struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
 	unsigned int len = osd_data->bvec_pos.iter.bi_size;
 
@@ -1117,6 +1132,8 @@  static void ceph_aio_complete_req(struct ceph_osd_request *req)
 		}
 		rc = -ENOMEM;
 	} else if (!aio_req->write) {
+		if (rc >= 0)
+			rc = ceph_sparse_ext_map_end(op);
 		if (rc == -ENOENT)
 			rc = 0;
 		if (rc >= 0 && len > rc) {
@@ -1280,6 +1297,7 @@  ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 	while (iov_iter_count(iter) > 0) {
 		u64 size = iov_iter_count(iter);
 		ssize_t len;
+		struct ceph_osd_req_op *op;
 
 		if (write)
 			size = min_t(u64, size, fsc->mount_options->wsize);
@@ -1291,7 +1309,7 @@  ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 					    vino, pos, &size, 0,
 					    1,
 					    write ? CEPH_OSD_OP_WRITE :
-						    CEPH_OSD_OP_READ,
+						    CEPH_OSD_OP_SPARSE_READ,
 					    flags, snapc,
 					    ci->i_truncate_seq,
 					    ci->i_truncate_size,
@@ -1342,6 +1360,12 @@  ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 		}
 
 		osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
+		op = &req->r_ops[0];
+		ret = ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL);
+		if (ret) {
+			ceph_osdc_put_request(req);
+			break;
+		}
 
 		if (aio_req) {
 			aio_req->total_len += len;
@@ -1370,8 +1394,11 @@  ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 
 		size = i_size_read(inode);
 		if (!write) {
-			if (ret == -ENOENT)
+			if (ret >= 0)
+				ret = ceph_sparse_ext_map_end(op);
+			else if (ret == -ENOENT)
 				ret = 0;
+
 			if (ret >= 0 && ret < len && pos + ret < size) {
 				struct iov_iter i;
 				int zlen = min_t(size_t, len - ret,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 250aefecd628..ad09c26afac6 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -75,6 +75,13 @@ 
 #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
 #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
 
+/*
+ * How big an extent array should we preallocate for a sparse read? This is
+ * just a starting value.  If we get more than this back from the OSD, the
+ * receiver will reallocate.
+ */
+#define CEPH_SPARSE_EXT_ARRAY_INITIAL	16
+
 struct ceph_mount_options {
 	unsigned int flags;