@@ -135,6 +135,233 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL(blkdev_issue_discard);
+/*
+ * Wait on and process all in-flight BIOs. This must only be called once
+ * all bios have been issued so that the refcount can only decrease.
+ * This just waits for all bios to make it through bio_copy_end_io. IO
+ * errors are propagated through cio->io_error.
+ */
+static int cio_await_completion(struct cio *cio)
+{
+ int ret = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cio->lock, flags);
+ if (cio->refcount) {
+ cio->waiter = current;
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock_irqrestore(&cio->lock, flags);
+ blk_io_schedule();
+ /* wake up sets us TASK_RUNNING */
+ spin_lock_irqsave(&cio->lock, flags);
+ cio->waiter = NULL;
+ ret = cio->io_err;
+ }
+ spin_unlock_irqrestore(&cio->lock, flags);
+ kvfree(cio);
+
+ return ret;
+}
+
+static void bio_copy_end_io(struct bio *bio)
+{
+ struct copy_ctx *ctx = bio->bi_private;
+ struct cio *cio = ctx->cio;
+ sector_t clen;
+ int ri = ctx->range_idx;
+ unsigned long flags;
+
+ if (bio->bi_status) {
+ cio->io_err = bio->bi_status;
+ clen = (bio->bi_iter.bi_sector - ctx->start_sec) << SECTOR_SHIFT;
+ cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
+ }
+ __free_page(bio->bi_io_vec[0].bv_page);
+ kfree(ctx);
+ bio_put(bio);
+
+ spin_lock_irqsave(&cio->lock, flags);
+ if (!--cio->refcount && cio->waiter)
+ wake_up_process(cio->waiter);
+ spin_unlock_irqrestore(&cio->lock, flags);
+}
+
+/*
+ * blk_copy_offload - Use device's native copy offload feature
+ * Go through user provide payload, prepare new payload based on device's copy offload limits.
+ */
+int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
+ struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
+{
+ struct request_queue *sq = bdev_get_queue(src_bdev);
+ struct request_queue *dq = bdev_get_queue(dst_bdev);
+ struct bio *read_bio, *write_bio;
+ struct copy_ctx *ctx;
+ struct cio *cio;
+ struct page *token;
+ sector_t src_blk, copy_len, dst_blk;
+ sector_t remaining, max_copy_len = LONG_MAX;
+ unsigned long flags;
+ int ri = 0, ret = 0;
+
+ cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
+ if (!cio)
+ return -ENOMEM;
+ cio->rlist = rlist;
+ spin_lock_init(&cio->lock);
+
+ max_copy_len = min_t(sector_t, sq->limits.max_copy_sectors, dq->limits.max_copy_sectors);
+ max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
+ (sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
+
+ for (ri = 0; ri < nr_srcs; ri++) {
+ cio->rlist[ri].comp_len = rlist[ri].len;
+ src_blk = rlist[ri].src;
+ dst_blk = rlist[ri].dst;
+ for (remaining = rlist[ri].len; remaining > 0; remaining -= copy_len) {
+ copy_len = min(remaining, max_copy_len);
+
+ token = alloc_page(gfp_mask);
+ if (unlikely(!token)) {
+ ret = -ENOMEM;
+ goto err_token;
+ }
+
+ ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto err_ctx;
+ }
+ ctx->cio = cio;
+ ctx->range_idx = ri;
+ ctx->start_sec = rlist[ri].src;
+
+ read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
+ gfp_mask);
+ if (!read_bio) {
+ ret = -ENOMEM;
+ goto err_read_bio;
+ }
+ read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
+ read_bio->bi_iter.bi_size = copy_len;
+ __bio_add_page(read_bio, token, PAGE_SIZE, 0);
+ ret = submit_bio_wait(read_bio);
+ bio_put(read_bio);
+ if (ret)
+ goto err_read_bio;
+
+ write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
+ gfp_mask);
+ if (!write_bio) {
+ ret = -ENOMEM;
+ goto err_read_bio;
+ }
+ write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
+ write_bio->bi_iter.bi_size = copy_len;
+ __bio_add_page(write_bio, token, PAGE_SIZE, 0);
+ write_bio->bi_end_io = bio_copy_end_io;
+ write_bio->bi_private = ctx;
+
+ spin_lock_irqsave(&cio->lock, flags);
+ ++cio->refcount;
+ spin_unlock_irqrestore(&cio->lock, flags);
+
+ submit_bio(write_bio);
+ src_blk += copy_len;
+ dst_blk += copy_len;
+ }
+ }
+
+ /* Wait for completion of all IO's*/
+ return cio_await_completion(cio);
+
+err_read_bio:
+ kfree(ctx);
+err_ctx:
+ __free_page(token);
+err_token:
+ rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len - remaining));
+
+ cio->io_err = ret;
+ return cio_await_completion(cio);
+}
+
+static inline int blk_copy_sanity_check(struct block_device *src_bdev,
+ struct block_device *dst_bdev, struct range_entry *rlist, int nr)
+{
+ unsigned int align_mask = max(
+ bdev_logical_block_size(dst_bdev), bdev_logical_block_size(src_bdev)) - 1;
+ sector_t len = 0;
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ if (rlist[i].len)
+ len += rlist[i].len;
+ else
+ return -EINVAL;
+ if ((rlist[i].dst & align_mask) || (rlist[i].src & align_mask) ||
+ (rlist[i].len & align_mask))
+ return -EINVAL;
+ rlist[i].comp_len = 0;
+ }
+
+ if (len && len >= MAX_COPY_TOTAL_LENGTH)
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline bool blk_check_copy_offload(struct request_queue *src_q,
+ struct request_queue *dest_q)
+{
+ if (blk_queue_copy(dest_q) && blk_queue_copy(src_q))
+ return true;
+
+ return false;
+}
+
+/*
+ * blkdev_issue_copy - queue a copy
+ * @src_bdev: source block device
+ * @nr_srcs: number of source ranges to copy
+ * @rlist: array of source/dest/len
+ * @dest_bdev: destination block device
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ * Copy source ranges from source block device to destination block device.
+ * length of a source range cannot be zero.
+ */
+int blkdev_issue_copy(struct block_device *src_bdev, int nr,
+ struct range_entry *rlist, struct block_device *dest_bdev, gfp_t gfp_mask)
+{
+ struct request_queue *src_q = bdev_get_queue(src_bdev);
+ struct request_queue *dest_q = bdev_get_queue(dest_bdev);
+ int ret = -EINVAL;
+
+ if (!src_q || !dest_q)
+ return -ENXIO;
+
+ if (!nr)
+ return -EINVAL;
+
+ if (nr >= MAX_COPY_NR_RANGE)
+ return -EINVAL;
+
+ if (bdev_read_only(dest_bdev))
+ return -EPERM;
+
+ ret = blk_copy_sanity_check(src_bdev, dest_bdev, rlist, nr);
+ if (ret)
+ return ret;
+
+ if (blk_check_copy_offload(src_q, dest_q))
+ ret = blk_copy_offload(src_bdev, nr, rlist, dest_bdev, gfp_mask);
+
+ return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_copy);
+
/**
* __blkdev_issue_write_same - generate number of bios with same page
* @bdev: target blockdev
@@ -292,6 +292,8 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
break;
}
+ if (unlikely(op_is_copy(bio->bi_opf)))
+ return false;
/*
* All drivers must accept single-segments bios that are <= PAGE_SIZE.
* This is a quick and dirty check that relies on the fact that
@@ -418,6 +418,7 @@ enum req_flag_bits {
/* for driver use */
__REQ_DRV,
__REQ_SWAP, /* swapping request. */
+ __REQ_COPY, /* copy request*/
__REQ_NR_BITS, /* stops here */
};
@@ -442,6 +443,7 @@ enum req_flag_bits {
#define REQ_DRV (1ULL << __REQ_DRV)
#define REQ_SWAP (1ULL << __REQ_SWAP)
+#define REQ_COPY (1ULL << __REQ_COPY)
#define REQ_FAILFAST_MASK \
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -498,6 +500,11 @@ static inline bool op_is_discard(unsigned int op)
return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
}
+static inline bool op_is_copy(unsigned int op)
+{
+ return (op & REQ_COPY);
+}
+
/*
* Check if a bio or request operation is a zone management operation, with
* the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
@@ -532,4 +539,18 @@ struct blk_rq_stat {
u64 batch;
};
+struct cio {
+ struct range_entry *rlist;
+ struct task_struct *waiter; /* waiting task (NULL if none) */
+ spinlock_t lock; /* protects refcount and waiter */
+ int refcount;
+ blk_status_t io_err;
+};
+
+struct copy_ctx {
+ int range_idx;
+ sector_t start_sec;
+ struct cio *cio;
+};
+
#endif /* __LINUX_BLK_TYPES_H */
@@ -1121,6 +1121,8 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
struct bio **biop);
struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
gfp_t gfp_mask);
+int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
+ struct range_entry *src_rlist, struct block_device *dest_bdev, gfp_t gfp_mask);
#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
@@ -64,6 +64,20 @@ struct fstrim_range {
__u64 minlen;
};
+/* Maximum no of entries supported */
+#define MAX_COPY_NR_RANGE (1 << 12)
+
+/* maximum total copy length */
+#define MAX_COPY_TOTAL_LENGTH (1 << 21)
+
+/* Source range entry for copy */
+struct range_entry {
+ __u64 src;
+ __u64 dst;
+ __u64 len;
+ __u64 comp_len;
+};
+
/* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
#define FILE_DEDUPE_RANGE_SAME 0
#define FILE_DEDUPE_RANGE_DIFFERS 1