diff mbox series

[v2,36/51] netfs: Support encryption on Unbuffered/DIO write

Message ID 20231117211544.1740466-37-dhowells@redhat.com
State Superseded
Headers show
Series netfs, afs, cifs: Delegate high-level I/O to netfslib | expand

Commit Message

David Howells Nov. 17, 2023, 9:15 p.m. UTC
Support unbuffered and direct I/O writes to an encrypted file.  This may
require a read-modify-write (RMW) cycle if the write is not aligned to the
crypto block boundaries.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/direct_read.c       |   2 +-
 fs/netfs/direct_write.c      | 210 ++++++++++++++++++++++++++++++++++-
 fs/netfs/internal.h          |   8 ++
 fs/netfs/io.c                | 117 +++++++++++++++++++
 fs/netfs/main.c              |   1 +
 include/linux/netfs.h        |   4 +
 include/trace/events/netfs.h |   1 +
 7 files changed, 337 insertions(+), 6 deletions(-)
diff mbox series

Patch

diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index 158719b56900..c01cbe42db8a 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -88,7 +88,7 @@  static int netfs_copy_xarray_to_iter(struct netfs_io_request *rreq,
  * If we did a direct read to a bounce buffer (say we needed to decrypt it),
  * copy the data obtained to the destination iterator.
  */
-static int netfs_dio_copy_bounce_to_dest(struct netfs_io_request *rreq)
+int netfs_dio_copy_bounce_to_dest(struct netfs_io_request *rreq)
 {
 	struct iov_iter *dest_iter = &rreq->iter;
 	struct kiocb *iocb = rreq->iocb;
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index b1a4921ac4a2..f9dea801d6dd 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -23,6 +23,100 @@  static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
 	}
 }
 
+/*
+ * Allocate nr_pages pages and add them into the xarray buffer at consecutive
+ * indices starting at the given index.
+ *
+ * Returns 0 if every page was allocated and inserted, -ENOMEM if the bulk
+ * allocator returned fewer pages than requested (in which case nothing is
+ * inserted), or the error from xa_insert().  Any page that was not inserted is
+ * freed before returning; pages already inserted are left in the xarray.
+ */
+static int netfs_alloc_buffer(struct xarray *xa, pgoff_t index, unsigned int nr_pages)
+{
+	struct page *page;
+	unsigned int n;
+	int ret = 0;
+	LIST_HEAD(list);
+
+	/* A short allocation must fail the call.  Don't start inserting, or a
+	 * later successful xa_insert() would overwrite the error and the
+	 * caller would be handed a buffer with missing pages.
+	 */
+	n = alloc_pages_bulk_list(GFP_NOIO, nr_pages, &list);
+	if (n < nr_pages)
+		ret = -ENOMEM;
+
+	while (ret == 0 &&
+	       (page = list_first_entry_or_null(&list, struct page, lru))) {
+		list_del(&page->lru);
+		page->index = index;
+		ret = xa_insert(xa, index++, page, GFP_NOIO);
+		if (ret < 0)
+			__free_page(page); /* Off the list, so free it here */
+	}
+
+	/* Release whatever was not handed over to the xarray. */
+	while ((page = list_first_entry_or_null(&list, struct page, lru))) {
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	return ret;
+}
+
+/*
+ * Copy all of the data from the source iterator into folios in the destination
+ * xarray.  We cannot step through and kmap the source iterator if it's an
+ * iovec, so we have to step through the xarray and drop the RCU lock each
+ * time.
+ *
+ * @src:   Source iterator; iov_iter_count(src) bytes are to be copied.
+ * @xa:    Destination xarray, walked from page index start / PAGE_SIZE.
+ * @start: File position at which the first byte is to be placed.
+ *
+ * Returns 0 if everything was copied, or -EIO if the source was empty, a copy
+ * came up short or the xarray ran out of folios before the source ran out of
+ * data.
+ */
+static int netfs_copy_iter_to_xarray(struct iov_iter *src, struct xarray *xa,
+				     unsigned long long start)
+{
+	struct folio *folio;
+	void *base;
+	pgoff_t index = start / PAGE_SIZE;
+	size_t len, copied, count = iov_iter_count(src);
+
+	XA_STATE(xas, xa, index);
+
+	_enter("%zx", count);
+
+	/* An empty source is reported as an error rather than as success. */
+	if (!count)
+		return -EIO;
+
+	/* The first chunk only runs to the end of the page containing start;
+	 * every subsequent chunk is a whole page (clamped to what remains).
+	 */
+	len = PAGE_SIZE - offset_in_page(start);
+	rcu_read_lock();
+	xas_for_each(&xas, folio, ULONG_MAX) {
+		size_t offset;
+
+		if (xas_retry(&xas, folio))
+			continue;
+
+		/* There shouldn't be a need to call xas_pause() as no one else
+		 * can see the xarray we're iterating over.
+		 */
+		rcu_read_unlock();
+
+		offset = offset_in_folio(folio, start);
+		_debug("folio %lx +%zx [%llx]", folio->index, offset, start);
+
+		/* Fill this folio a chunk at a time.  The RCU read lock is not
+		 * held in here, so both "goto out" exits correctly skip the
+		 * rcu_read_unlock() that follows the outer loop.
+		 */
+		while (offset < folio_size(folio)) {
+			len = min(count, len);
+
+			base = kmap_local_folio(folio, offset);
+			copied = copy_from_iter(base, len, src);
+			kunmap_local(base);
+			/* On a short copy, count is left non-zero so that
+			 * -EIO gets returned.
+			 */
+			if (copied != len)
+				goto out;
+			count -= len;
+			if (count == 0)
+				goto out;
+
+			start += len;
+			offset += len;
+			len = PAGE_SIZE;
+		}
+
+		/* Retake the lock before advancing the xarray iteration. */
+		rcu_read_lock();
+	}
+
+	/* Ran off the end of the xarray; any data left over is an error. */
+	rcu_read_unlock();
+out:
+	_leave(" = %zx", count);
+	return count ? -EIO : 0;
+}
+
 /*
  * Perform an unbuffered write where we may have to do an RMW operation on an
  * encrypted file.  This can also be used for direct I/O writes.
@@ -31,20 +125,47 @@  ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
 					   struct netfs_group *netfs_group)
 {
 	struct netfs_io_request *wreq;
+	struct netfs_inode *ctx = netfs_inode(file_inode(iocb->ki_filp));
+	unsigned long long real_size = ctx->remote_i_size;
 	unsigned long long start = iocb->ki_pos;
 	unsigned long long end = start + iov_iter_count(iter);
 	ssize_t ret, n;
-	bool async = !is_sync_kiocb(iocb);
+	size_t min_bsize = 1UL << ctx->min_bshift;
+	size_t bmask = min_bsize - 1;
+	size_t gap_before = start & bmask;
+	size_t gap_after = (min_bsize - end) & bmask;
+	bool use_bounce, async = !is_sync_kiocb(iocb);
+	enum {
+		DIRECT_IO, COPY_TO_BOUNCE, ENC_TO_BOUNCE, COPY_THEN_ENC,
+	} buffering;
 
 	_enter("");
 
+	/* The real size must be rounded out to the crypto block size plus
+	 * any trailer we might want to attach.
+	 */
+	if (real_size && ctx->crypto_bshift) {
+		size_t cmask = 1UL << ctx->crypto_bshift;
+
+		if (real_size < ctx->crypto_trailer)
+			return -EIO;
+		if ((real_size - ctx->crypto_trailer) & cmask)
+			return -EIO;
+		real_size -= ctx->crypto_trailer;
+	}
+
 	/* We're going to need a bounce buffer if what we transmit is going to
 	 * be different in some way to the source buffer, e.g. because it gets
 	 * encrypted/compressed or because it needs expanding to a block size.
 	 */
-	// TODO
+	use_bounce = test_bit(NETFS_ICTX_ENCRYPTED, &ctx->flags);
+	if (gap_before || gap_after) {
+		if (iocb->ki_flags & IOCB_DIRECT)
+			return -EINVAL;
+		use_bounce = true;
+	}
 
-	_debug("uw %llx-%llx", start, end);
+	_debug("uw %llx-%llx +%zx,%zx", start, end, gap_before, gap_after);
 
 	wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
 				   start, end - start,
@@ -53,7 +174,57 @@  ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
 	if (IS_ERR(wreq))
 		return PTR_ERR(wreq);
 
-	{
+	if (use_bounce) {
+		unsigned long long bstart = start - gap_before;
+		unsigned long long bend = end + gap_after;
+		pgoff_t first = bstart / PAGE_SIZE;
+		pgoff_t last  = (bend - 1) / PAGE_SIZE;
+
+		_debug("bounce %llx-%llx %lx-%lx", bstart, bend, first, last);
+
+		ret = netfs_alloc_buffer(&wreq->bounce, first, last - first + 1);
+		if (ret < 0)
+			goto out;
+
+		iov_iter_xarray(&wreq->io_iter, READ, &wreq->bounce,
+				bstart, bend - bstart);
+
+		if (gap_before || gap_after)
+			async = false; /* We may have to repeat the RMW cycle */
+	}
+
+repeat_rmw_cycle:
+	if (use_bounce) {
+		/* If we're going to need to do an RMW cycle, fill in the gaps
+		 * at the ends of the buffer.
+		 */
+		if (gap_before || gap_after) {
+			struct iov_iter buffer = wreq->io_iter;
+
+			if ((gap_before && start - gap_before < real_size) ||
+			    (gap_after && end < real_size)) {
+				ret = netfs_rmw_read(wreq, iocb->ki_filp,
+						     start - gap_before, gap_before,
+						     end, end < real_size ? gap_after : 0);
+				if (ret < 0)
+					goto out;
+			}
+
+			if (gap_before && start - gap_before >= real_size)
+				iov_iter_zero(gap_before, &buffer);
+			if (gap_after && end >= real_size) {
+				iov_iter_advance(&buffer, end - start);
+				iov_iter_zero(gap_after, &buffer);
+			}
+		}
+
+		if (!test_bit(NETFS_RREQ_CONTENT_ENCRYPTION, &wreq->flags))
+			buffering = COPY_TO_BOUNCE;
+		else if (!gap_before && !gap_after && netfs_is_crypto_aligned(wreq, iter))
+			buffering = ENC_TO_BOUNCE;
+		else
+			buffering = COPY_THEN_ENC;
+	} else {
 		/* If this is an async op and we're not using a bounce buffer,
 		 * we have to save the source buffer as the iterator is only
 		 * good until we return.  In such a case, extract an iterator
@@ -77,10 +248,25 @@  ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
 		}
 
 		wreq->io_iter = wreq->iter;
+		buffering = DIRECT_IO;
 	}
 
 	/* Copy the data into the bounce buffer and encrypt it. */
-	// TODO
+	if (buffering == COPY_TO_BOUNCE ||
+	    buffering == COPY_THEN_ENC) {
+		ret = netfs_copy_iter_to_xarray(iter, &wreq->bounce, wreq->start);
+		if (ret < 0)
+			goto out;
+		wreq->iter = wreq->io_iter;
+		wreq->start -= gap_before;
+		wreq->len += gap_before + gap_after;
+	}
+
+	if (buffering == COPY_THEN_ENC ||
+	    buffering == ENC_TO_BOUNCE) {
+		if (!netfs_encrypt(wreq))
+			goto out;
+	}
 
 	/* Dispatch the write. */
 	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
@@ -101,6 +287,20 @@  ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
 		wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
 			    TASK_UNINTERRUPTIBLE);
 
+		/* See if the write failed due to a 3rd party race when doing
+		 * an RMW on a partially modified block in an encrypted file.
+		 */
+		if (test_and_clear_bit(NETFS_RREQ_REPEAT_RMW, &wreq->flags)) {
+			netfs_clear_subrequests(wreq, false);
+			iov_iter_revert(iter, end - start);
+			wreq->error = 0;
+			wreq->start = start;
+			wreq->len = end - start;
+			wreq->transferred = 0;
+			wreq->submitted = 0;
+			goto repeat_rmw_cycle;
+		}
+
 		ret = wreq->error;
 		_debug("waited = %zd", ret);
 		if (ret == 0) {
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 447a67301329..782b73b1f5a7 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -28,6 +28,11 @@  int netfs_prefetch_for_write(struct file *file, struct folio *folio,
 bool netfs_encrypt(struct netfs_io_request *wreq);
 void netfs_decrypt(struct netfs_io_request *rreq);
 
+/*
+ * direct_read.c
+ */
+int netfs_dio_copy_bounce_to_dest(struct netfs_io_request *rreq);
+
 /*
  * direct_write.c
  */
@@ -38,6 +43,9 @@  ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
  * io.c
  */
 int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
+ssize_t netfs_rmw_read(struct netfs_io_request *wreq, struct file *file,
+		       unsigned long long start1, size_t len1,
+		       unsigned long long start2, size_t len2);
 
 /*
  * main.c
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 9887b22e4cb3..14a9f3312d3b 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -775,3 +775,120 @@  int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 out:
 	return ret;
 }
+
+/*
+ * Submit the read for one of the regions needed by an RMW cycle.
+ *
+ * The requested region [start, start + len) is rounded outwards to the inode's
+ * minimum block size, folios are added to rreq->bounce to cover it, rreq's
+ * start/len/submitted and io_iter are pointed at the rounded region and the
+ * read is then submitted slice by slice.
+ *
+ * Returns true if the whole rounded region was submitted.  Returns false if
+ * the folios could not be added (rreq->error is set) or if submission stopped
+ * early (a ref on rreq is put in that case).
+ */
+static bool netfs_rmw_read_one(struct netfs_io_request *rreq,
+			       unsigned long long start, size_t len)
+{
+	struct netfs_inode *ctx = netfs_inode(rreq->inode);
+	struct iov_iter io_iter;
+	unsigned long long pstart, end = start + len;
+	pgoff_t first, last;
+	ssize_t ret;
+	size_t min_bsize = 1UL << ctx->min_bshift;
+
+	/* Determine the block we need to load.  The length has to be
+	 * recalculated after rounding, otherwise the read would start at the
+	 * rounded-down position but only span the caller's original length
+	 * and so not reach the end of the block.
+	 */
+	end = round_up(end, min_bsize);
+	start = round_down(start, min_bsize);
+	len = end - start;
+
+	/* Determine the folios we need to insert.
+	 * NOTE(review): last is one past the final page index here; confirm
+	 * that netfs_add_folios_to_buffer() takes an exclusive upper bound.
+	 */
+	pstart = round_down(start, PAGE_SIZE);
+	first = pstart / PAGE_SIZE;
+	last = DIV_ROUND_UP(end, PAGE_SIZE);
+
+	ret = netfs_add_folios_to_buffer(&rreq->bounce, rreq->mapping,
+					 first, last, GFP_NOFS);
+	if (ret < 0) {
+		rreq->error = ret;
+		return false;
+	}
+
+	rreq->start = start;
+	rreq->len = len;
+	rreq->submitted = 0;
+	iov_iter_xarray(&rreq->io_iter, ITER_DEST, &rreq->bounce, start, len);
+
+	/* Submit slices from a copy of the iterator until the region is
+	 * covered, the end of the file is reached or submission fails.
+	 */
+	io_iter = rreq->io_iter;
+	do {
+		_debug("submit %llx + %zx >= %llx",
+		       rreq->start, rreq->submitted, rreq->i_size);
+		if (rreq->start + rreq->submitted >= rreq->i_size)
+			break;
+		if (!netfs_rreq_submit_slice(rreq, &io_iter, &rreq->subreq_counter))
+			break;
+	} while (rreq->submitted < rreq->len);
+
+	/* NOTE(review): the caller carries on using rreq and puts it again
+	 * after this returns false - confirm this put doesn't unbalance the
+	 * refcount.
+	 */
+	if (rreq->submitted < rreq->len) {
+		netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Begin the process of reading in one or two chunks of data for use by
+ * unbuffered write to perform an RMW cycle.  We don't read directly into the
+ * write buffer as this may get called to redo the read in the case that a
+ * conditional write fails due to conflicting 3rd-party modifications.
+ *
+ * @wreq:   The write request the data is being read for.
+ * @file:   The file to read from.
+ * @start1: Start of the first region to read.
+ * @len1:   Length of the first region, or 0 to skip it.
+ * @start2: Start of the second region to read.
+ * @len2:   Length of the second region, or 0 to skip it.
+ *
+ * A separate read request is allocated, the regions are submitted and then
+ * waited for synchronously.
+ *
+ * Returns the error from allocating the request, the read request's error,
+ * -EIO if less was submitted than was asked for, or otherwise the result of
+ * netfs_dio_copy_bounce_to_dest().
+ */
+ssize_t netfs_rmw_read(struct netfs_io_request *wreq, struct file *file,
+		       unsigned long long start1, size_t len1,
+		       unsigned long long start2, size_t len2)
+{
+	struct netfs_io_request *rreq;
+	ssize_t ret;
+
+	/* The read request doesn't exist yet, so log the write request's ID. */
+	_enter("RMW:R=%x %llx-%llx %llx-%llx",
+	       wreq->debug_id, start1, start1 + len1 - 1, start2, start2 + len2 - 1);
+
+	rreq = netfs_alloc_request(wreq->mapping, file,
+				   start1, start2 - start1 + len2, NETFS_RMW_READ);
+	if (IS_ERR(rreq))
+		return PTR_ERR(rreq);
+
+	INIT_WORK(&rreq->work, netfs_rreq_work);
+
+	rreq->iter = wreq->io_iter;
+	__set_bit(NETFS_RREQ_CRYPT_IN_PLACE, &rreq->flags);
+	__set_bit(NETFS_RREQ_USE_BOUNCE_BUFFER, &rreq->flags);
+
+	/* Chop the reads into slices according to what the netfs wants and
+	 * submit each one.
+	 */
+	netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding);
+	atomic_set(&rreq->nr_outstanding, 1);
+	if (len1 && !netfs_rmw_read_one(rreq, start1, len1))
+		goto wait;
+	if (len2)
+		netfs_rmw_read_one(rreq, start2, len2);
+
+wait:
+	/* Keep nr_outstanding incremented so that the ref always belongs to us
+	 * and the service code isn't punted off to a random thread pool to
+	 * process.
+	 */
+	for (;;) {
+		wait_var_event(&rreq->nr_outstanding,
+			       atomic_read(&rreq->nr_outstanding) == 1);
+		netfs_rreq_assess(rreq, false);
+		if (atomic_read(&rreq->nr_outstanding) == 1)
+			break;
+		cond_resched();
+	}
+
+	/* Wait for the read request to finish.  This must be rreq, not wreq:
+	 * the write hasn't been dispatched yet, so waiting on it here would
+	 * be waiting on the wrong request.
+	 */
+	trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
+	wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS,
+		    TASK_UNINTERRUPTIBLE);
+
+	ret = rreq->error;
+	if (ret == 0 && rreq->submitted < rreq->len) {
+		trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
+		ret = -EIO;
+	}
+
+	if (ret == 0)
+		ret = netfs_dio_copy_bounce_to_dest(rreq);
+
+	/* NOTE(review): netfs_rmw_read_one() also puts a ref on rreq when it
+	 * fails to submit everything - confirm that this final put is still
+	 * balanced on that path.
+	 */
+	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+	return ret;
+}
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 1cf10f9c4c1f..b335e6a50f9c 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -33,6 +33,7 @@  static const char *netfs_origins[nr__netfs_io_origin] = {
 	[NETFS_READPAGE]		= "RP",
 	[NETFS_READ_FOR_WRITE]		= "RW",
 	[NETFS_WRITEBACK]		= "WB",
+	[NETFS_RMW_READ]		= "RM",
 	[NETFS_UNBUFFERED_WRITE]	= "UW",
 	[NETFS_DIO_READ]		= "DR",
 	[NETFS_DIO_WRITE]		= "DW",
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 364361cc93be..c3d1eac1ce51 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -145,6 +145,7 @@  struct netfs_inode {
 #define NETFS_ICTX_ENCRYPTED	2		/* The file contents are encrypted */
 	unsigned char		min_bshift;	/* log2 min block size for bounding box or 0 */
 	unsigned char		crypto_bshift;	/* log2 of crypto block size */
+	unsigned char		crypto_trailer;	/* Size of crypto trailer */
 };
 
 /*
@@ -233,6 +234,7 @@  enum netfs_io_origin {
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
+	NETFS_RMW_READ,			/* This is an unbuffered read for RMW */
 	NETFS_UNBUFFERED_WRITE,		/* This is an unbuffered write */
 	NETFS_DIO_READ,			/* This is a direct I/O read */
 	NETFS_DIO_WRITE,		/* This is a direct I/O write */
@@ -291,6 +293,7 @@  struct netfs_io_request {
 #define NETFS_RREQ_BLOCKED		10	/* We blocked */
 #define NETFS_RREQ_CONTENT_ENCRYPTION	11	/* Content encryption is in use */
 #define NETFS_RREQ_CRYPT_IN_PLACE	12	/* Enc/dec in place in ->io_iter */
+#define NETFS_RREQ_REPEAT_RMW		13	/* Need to repeat RMW cycle */
 	const struct netfs_request_ops *netfs_ops;
 	void (*cleanup)(struct netfs_io_request *req);
 };
@@ -479,6 +482,7 @@  static inline void netfs_inode_init(struct netfs_inode *ctx,
 	ctx->flags = 0;
 	ctx->min_bshift = 0;
 	ctx->crypto_bshift = 0;
+	ctx->crypto_trailer = 0;
 #if IS_ENABLED(CONFIG_FSCACHE)
 	ctx->cache = NULL;
 #endif
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 2f35057602fa..825946f510ee 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -33,6 +33,7 @@ 
 	EM(NETFS_READPAGE,			"RP")		\
 	EM(NETFS_READ_FOR_WRITE,		"RW")		\
 	EM(NETFS_WRITEBACK,			"WB")		\
+	EM(NETFS_RMW_READ,			"RM")		\
 	EM(NETFS_UNBUFFERED_WRITE,		"UW")		\
 	EM(NETFS_DIO_READ,			"DR")		\
 	E_(NETFS_DIO_WRITE,			"DW")