mbox series

[v5,0/3] libceph: fix sparse-read failure bug

Message ID 20240123131204.1166101-1-xiubli@redhat.com
Headers show
Series libceph: fix sparse-read failure bug | expand

Message

Xiubo Li Jan. 23, 2024, 1:12 p.m. UTC
From: Xiubo Li <xiubli@redhat.com>

V5:
- remove sr_total_resid and reuse total_resid instead.

V4:
- remove the sr_resid_elen field
- improved the 'read_partial_sparse_msg_data()'

V3:
- rename read_sparse_msg_XX to read_partial_sparse_msg_XX
- fix the sparse-read bug in the messager v1 code.

V2:
- fix the sparse-read bug in the sparse-read code instead


Xiubo Li (3):
  libceph: fail the sparse-read if there still has data in socket
  libceph: rename read_sparse_msg_XX to read_partial_sparse_msg_XX
  libceph: just wait for more data to be available on the socket

 include/linux/ceph/messenger.h |  2 +-
 net/ceph/messenger_v1.c        | 33 +++++++++++++++++----------------
 net/ceph/messenger_v2.c        |  4 ++--
 net/ceph/osd_client.c          | 22 ++++++++++++++--------
 4 files changed, 34 insertions(+), 27 deletions(-)

Comments

Jeff Layton Jan. 23, 2024, 2:50 p.m. UTC | #1
On Tue, 2024-01-23 at 21:12 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> Once this happens that means there have bugs.
> 
> URL: https://tracker.ceph.com/issues/63586
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  net/ceph/osd_client.c | 13 +++++++++++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 9be80d01c1dc..6beab9be51e2 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -5857,8 +5857,8 @@ static int osd_sparse_read(struct ceph_connection *con,
>  	struct ceph_osd *o = con->private;
>  	struct ceph_sparse_read *sr = &o->o_sparse_read;
>  	u32 count = sr->sr_count;
> -	u64 eoff, elen;
> -	int ret;
> +	u64 eoff, elen, len = 0;
> +	int i, ret;
>  
>  	switch (sr->sr_state) {
>  	case CEPH_SPARSE_READ_HDR:
> @@ -5909,6 +5909,13 @@ static int osd_sparse_read(struct ceph_connection *con,
>  		/* Convert sr_datalen to host-endian */
>  		sr->sr_datalen = le32_to_cpu((__force __le32)sr->sr_datalen);
>  		sr->sr_state = CEPH_SPARSE_READ_DATA;
> +		for (i = 0; i < count; i++)
> +			len += sr->sr_extent[i].len;
> +		if (sr->sr_datalen != len) {
> +			pr_warn_ratelimited("data len %u != extent len %llu\n",
> +					    sr->sr_datalen, len);
> +			return -EREMOTEIO;
> +		}
>  		fallthrough;
>  	case CEPH_SPARSE_READ_DATA:
>  		if (sr->sr_index >= count) {
> @@ -5919,6 +5926,8 @@ static int osd_sparse_read(struct ceph_connection *con,
>  		eoff = sr->sr_extent[sr->sr_index].off;
>  		elen = sr->sr_extent[sr->sr_index].len;
>  
> +		sr->sr_datalen -= elen;
> +
>  		dout("[%d] ext %d off 0x%llx len 0x%llx\n",
>  		     o->o_osd, sr->sr_index, eoff, elen);
>  

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Xiubo Li Jan. 24, 2024, 1:20 a.m. UTC | #2
On 1/23/24 22:47, Jeff Layton wrote:
> On Tue, 2024-01-23 at 21:12 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> A short read may occur while reading the message footer from the
>> socket.  Later, when the socket is ready for another read, the
>> messenger shoudl invoke all read_partial* handlers, except the
>> read_partial_sparse_msg_data().  The contract between the messenger
>> and these handlers is that the handlers should bail if the area
>> of the message is responsible for is already processed.  So,
>> in this case, it's expected that read_sparse_msg_data() would bail,
>> allowing the messenger to invoke read_partial() for the footer and
>> pick up where it left off.
>>
>> However read_partial_sparse_msg_data() violates that contract and
>> ends up calling into the state machine in the OSD client. The
>> sparse-read state machine just assumes that it's a new op and
>> interprets some piece of the footer as the sparse-read extents/data
>> and then returns bogus extents/data length, etc.
>>
>> This will just reuse the 'total_resid' to determine whether should
>> the read_partial_sparse_msg_data() bail out or not. Because once
>> it reaches to zero that means all the extents and data have been
>> successfully received in last read, else it could break out when
>> partially reading any of the extents and data. And then the
>> osd_sparse_read() could continue where it left off.
>>
> Thanks for the detailed description. That really helps!
>
I just copied from Ilya's comments and with some changes.

>> URL: https://tracker.ceph.com/issues/63586
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   include/linux/ceph/messenger.h |  2 +-
>>   net/ceph/messenger_v1.c        | 25 +++++++++++++------------
>>   net/ceph/messenger_v2.c        |  4 ++--
>>   net/ceph/osd_client.c          |  9 +++------
>>   4 files changed, 19 insertions(+), 21 deletions(-)
>>
>> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
>> index 2eaaabbe98cb..1717cc57cdac 100644
>> --- a/include/linux/ceph/messenger.h
>> +++ b/include/linux/ceph/messenger.h
>> @@ -283,7 +283,7 @@ struct ceph_msg {
>>   	struct kref kref;
>>   	bool more_to_follow;
>>   	bool needs_out_seq;
>> -	bool sparse_read;
>> +	u64 sparse_read_total;
>>   	int front_alloc_len;
>>   
>>   	struct ceph_msgpool *pool;
>> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
>> index 4cb60bacf5f5..4c76c8390de1 100644
>> --- a/net/ceph/messenger_v1.c
>> +++ b/net/ceph/messenger_v1.c
>> @@ -160,8 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
>>   static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
>>   {
>>   	/* Initialize data cursor if it's not a sparse read */
>> -	if (!msg->sparse_read)
>> -		ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
>> +	u64 len = msg->sparse_read_total ? : data_len;
>> +
>> +	ceph_msg_data_cursor_init(&msg->cursor, msg, len);
>>   }
>>   
>>   /*
>> @@ -1036,7 +1037,7 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
>>   	if (do_datacrc)
>>   		crc = con->in_data_crc;
>>   
>> -	do {
>> +	while (cursor->total_resid) {
>>   		if (con->v1.in_sr_kvec.iov_base)
>>   			ret = read_partial_message_chunk(con,
>>   							 &con->v1.in_sr_kvec,
>> @@ -1044,23 +1045,23 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
>>   							 &crc);
>>   		else if (cursor->sr_resid > 0)
>>   			ret = read_partial_sparse_msg_extent(con, &crc);
>> -
>> -		if (ret <= 0) {
>> -			if (do_datacrc)
>> -				con->in_data_crc = crc;
>> -			return ret;
>> -		}
>> +		if (ret <= 0)
>> +			break;
>>   
>>   		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
>>   		ret = con->ops->sparse_read(con, cursor,
>>   				(char **)&con->v1.in_sr_kvec.iov_base);
>> +		if (ret <= 0) {
>> +			ret = ret ? : 1; /* must return > 0 to indicate success */
> nit: this syntax is a gcc-ism (AIUI) and is not preferred. It'd be
> better spell it out in this case (particularly since it's only 4 extra
> chars:
>
> 			ret = ret ? ret : 1;

Sure.

Thanks Jeff.

- Xiubo


>> +			break;
>> +		}
>>   		con->v1.in_sr_len = ret;
>> -	} while (ret > 0);
>> +	}
>>   
>>   	if (do_datacrc)
>>   		con->in_data_crc = crc;
>>   
>> -	return ret < 0 ? ret : 1;  /* must return > 0 to indicate success */
>> +	return ret;
>>   }
>>   
>>   static int read_partial_msg_data(struct ceph_connection *con)
>> @@ -1253,7 +1254,7 @@ static int read_partial_message(struct ceph_connection *con)
>>   		if (!m->num_data_items)
>>   			return -EIO;
>>   
>> -		if (m->sparse_read)
>> +		if (m->sparse_read_total)
>>   			ret = read_partial_sparse_msg_data(con);
>>   		else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE))
>>   			ret = read_partial_msg_data_bounce(con);
>> diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
>> index f8ec60e1aba3..a0ca5414b333 100644
>> --- a/net/ceph/messenger_v2.c
>> +++ b/net/ceph/messenger_v2.c
>> @@ -1128,7 +1128,7 @@ static int decrypt_tail(struct ceph_connection *con)
>>   	struct sg_table enc_sgt = {};
>>   	struct sg_table sgt = {};
>>   	struct page **pages = NULL;
>> -	bool sparse = con->in_msg->sparse_read;
>> +	bool sparse = !!con->in_msg->sparse_read_total;
>>   	int dpos = 0;
>>   	int tail_len;
>>   	int ret;
>> @@ -2060,7 +2060,7 @@ static int prepare_read_tail_plain(struct ceph_connection *con)
>>   	}
>>   
>>   	if (data_len(msg)) {
>> -		if (msg->sparse_read)
>> +		if (msg->sparse_read_total)
>>   			con->v2.in_state = IN_S_PREPARE_SPARSE_DATA;
>>   		else
>>   			con->v2.in_state = IN_S_PREPARE_READ_DATA;
>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>> index 6beab9be51e2..1a5b1e1e24ca 100644
>> --- a/net/ceph/osd_client.c
>> +++ b/net/ceph/osd_client.c
>> @@ -5510,7 +5510,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
>>   	}
>>   
>>   	m = ceph_msg_get(req->r_reply);
>> -	m->sparse_read = (bool)srlen;
>> +	m->sparse_read_total = srlen;
>>   
>>   	dout("get_reply tid %lld %p\n", tid, m);
>>   
>> @@ -5777,11 +5777,8 @@ static int prep_next_sparse_read(struct ceph_connection *con,
>>   	}
>>   
>>   	if (o->o_sparse_op_idx < 0) {
>> -		u64 srlen = sparse_data_requested(req);
>> -
>> -		dout("%s: [%d] starting new sparse read req. srlen=0x%llx\n",
>> -		     __func__, o->o_osd, srlen);
>> -		ceph_msg_data_cursor_init(cursor, con->in_msg, srlen);
>> +		dout("%s: [%d] starting new sparse read req\n",
>> +		     __func__, o->o_osd);
>>   	} else {
>>   		u64 end;
>>   
> The patch itself looks fine though.
>
> Reviewed-by: Jeff Layton <jlayton@kernel.org>
>