Message ID | 20230725104438.386461-1-xiubli@redhat.com |
---|---|
State | Superseded |
Headers | show |
Series | ceph: make num_fwd and num_retry to __u32 | expand |
Hi Xiubo! On Tue, Jul 25, 2023 at 12:46 PM <xiubli@redhat.com> wrote: > > From: Xiubo Li <xiubli@redhat.com> > > The num_fwd in MClientRequestForward is int32_t, while the num_fwd > in ceph_mds_request_head is __u8. This is buggy when the num_fwd > is larger than 256 it will always be truncate to 0 again. But the > client couldn't recoginize this. > > This will make them to __u32 instead. Because the old cephs will > directly copy the raw memories when decoding the reqeust's head, > so we need to make sure this kclient will be compatible with old > cephs. For newer cephs they will decode the requests depending > the version, which will be much simpler and easier to extend new > members. > > URL: https://tracker.ceph.com/issues/62145 > Signed-off-by: Xiubo Li <xiubli@redhat.com> > --- > fs/ceph/mds_client.c | 191 ++++++++++++++++++----------------- > fs/ceph/mds_client.h | 4 +- > include/linux/ceph/ceph_fs.h | 23 ++++- > 3 files changed, 124 insertions(+), 94 deletions(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index 70987b3c198a..191bae3a4ee6 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -2897,6 +2897,18 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request * > } > } > > +static struct ceph_mds_request_head_legacy * > +find_legacy_request_head(void *p, u64 features) > +{ > + bool legacy = !(features & CEPH_FEATURE_FS_BTIME); > + struct ceph_mds_request_head_old *ohead; > + > + if (legacy) > + return (struct ceph_mds_request_head_legacy *)p; > + ohead = (struct ceph_mds_request_head_old *)p; > + return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid; > +} > + > /* > * called under mdsc->mutex > */ > @@ -2907,7 +2919,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > int mds = session->s_mds; > struct ceph_mds_client *mdsc = session->s_mdsc; > struct ceph_msg *msg; > - struct ceph_mds_request_head_old *head; > + struct ceph_mds_request_head_legacy *lhead; > const char *path1 = NULL; > const char *path2 = NULL; > u64 ino1 = 0, ino2 = 0; > @@ -2919,6 +2931,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > void *p, *end; > int ret; > bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); > + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features); > > ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, > req->r_parent, req->r_path1, req->r_ino1.ino, > @@ -2950,7 +2963,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > goto out_free2; > } > > - len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head); > + /* > + * For old cephs without supporting the 32bit retry/fwd feature > + * it will copy the raw memories directly when decoding the > + * requests. While new cephs will decode the head depending the > + * version member, so we need to make sure it will be compatible > + * with them both. > + */ > + if (legacy) > + len = sizeof(struct ceph_mds_request_head_legacy); > + else if (old_version) > + len = sizeof(struct ceph_mds_request_head_old); > + else > + len = sizeof(struct ceph_mds_request_head); > > /* filepaths */ > len += 2 * (1 + sizeof(u32) + sizeof(u64)); > @@ -2995,33 +3020,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > > msg->hdr.tid = cpu_to_le64(req->r_tid); > > + lhead = find_legacy_request_head(msg->front.iov_base, > + session->s_con.peer_features); > + > /* > - * The old ceph_mds_request_head didn't contain a version field, and > + * The ceph_mds_request_head_legacy didn't contain a version field, and > * one was added when we moved the message version from 3->4. > */ > if (legacy) { > msg->hdr.version = cpu_to_le16(3); > - head = msg->front.iov_base; > - p = msg->front.iov_base + sizeof(*head); > + p = msg->front.iov_base + sizeof(*lhead); > + } else if (old_version) { > + struct ceph_mds_request_head_old *ohead = msg->front.iov_base; > + > + msg->hdr.version = cpu_to_le16(4); > + ohead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); Don't we want to use the old mds request head version here: ohead->version = cpu_to_le16(1); ? As far as I understand the idea here is to skip new fields ext_num_retry/ext_num_fwd in case when the old_version is true. Would it be incorrect to set version to the latest one (CEPH_MDS_REQUEST_HEAD_VERSION) and at the same time skip the setting of new fields? Kind regards, Alex > + p = msg->front.iov_base + sizeof(*ohead); > } else { > - struct ceph_mds_request_head *new_head = msg->front.iov_base; > + struct ceph_mds_request_head *nhead = msg->front.iov_base; > > msg->hdr.version = cpu_to_le16(6); > - new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); > - head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; > - p = msg->front.iov_base + sizeof(*new_head); > + nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); > + p = msg->front.iov_base + sizeof(*nhead); > } > > end = msg->front.iov_base + msg->front.iov_len; > > - head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); > - head->op = cpu_to_le32(req->r_op); > - head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, > - req->r_cred->fsuid)); > - head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, > - req->r_cred->fsgid)); > - head->ino = cpu_to_le64(req->r_deleg_ino); > - head->args = req->r_args; > + lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); > + lhead->op = cpu_to_le32(req->r_op); > + lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, > + req->r_cred->fsuid)); > + lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, > + req->r_cred->fsgid)); > + lhead->ino = cpu_to_le64(req->r_deleg_ino); > + lhead->args = req->r_args; > > ceph_encode_filepath(&p, end, ino1, path1); > ceph_encode_filepath(&p, end, ino2, path2); > @@ -3063,7 +3095,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > p = msg->front.iov_base + req->r_request_release_offset; > } > > - head->num_releases = cpu_to_le16(releases); > + lhead->num_releases = cpu_to_le16(releases); > > encode_mclientrequest_tail(&p, req); > > @@ -3114,18 +3146,6 @@ static void complete_request(struct ceph_mds_client *mdsc, > complete_all(&req->r_completion); > } > > -static struct ceph_mds_request_head_old * > -find_old_request_head(void *p, u64 features) > -{ > - bool legacy = !(features & CEPH_FEATURE_FS_BTIME); > - struct ceph_mds_request_head *new_head; > - > - if (legacy) > - return (struct ceph_mds_request_head_old *)p; > - new_head = (struct ceph_mds_request_head *)p; > - return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; > -} > - > /* > * called under mdsc->mutex > */ > @@ -3136,30 +3156,26 @@ static int __prepare_send_request(struct ceph_mds_session *session, > int mds = session->s_mds; > struct ceph_mds_client *mdsc = session->s_mdsc; > struct ceph_client *cl = mdsc->fsc->client; > - struct ceph_mds_request_head_old *rhead; > + struct ceph_mds_request_head_legacy *lhead; > + struct ceph_mds_request_head *nhead; > struct ceph_msg *msg; > - int flags = 0, max_retry; > + int flags = 0, old_max_retry; > + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features); > > - /* > - * The type of 'r_attempts' in kernel 'ceph_mds_request' > - * is 'int', while in 'ceph_mds_request_head' the type of > - * 'num_retry' is '__u8'. So in case the request retries > - * exceeding 256 times, the MDS will receive a incorrect > - * retry seq. > - * > - * In this case it's ususally a bug in MDS and continue > - * retrying the request makes no sense. > - * > - * In future this could be fixed in ceph code, so avoid > - * using the hardcode here. > + /* Avoid inifinite retrying after overflow. The client will > + * increase the retry count and if the MDS is old version, > + * so we limit to retry at most 256 times. > */ > - max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); > - max_retry = 1 << (max_retry * BITS_PER_BYTE); > - if (req->r_attempts >= max_retry) { > - pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n", > - req->r_tid); > - return -EMULTIHOP; > - } > + if (req->r_attempts) { > + old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry); > + old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); > + if ((old_version && req->r_attempts >= old_max_retry) || > + ((uint32_t)req->r_attempts >= U32_MAX)) { > + pr_warn_ratelimited_client(cl, "%s request tid %llu seq overflow\n", > + __func__, req->r_tid); > + return -EMULTIHOP; > + } > + } > > req->r_attempts++; > if (req->r_inode) { > @@ -3184,20 +3200,24 @@ static int __prepare_send_request(struct ceph_mds_session *session, > * d_move mangles the src name. > */ > msg = req->r_request; > - rhead = find_old_request_head(msg->front.iov_base, > - session->s_con.peer_features); > + lhead = find_legacy_request_head(msg->front.iov_base, > + session->s_con.peer_features); > > - flags = le32_to_cpu(rhead->flags); > + flags = le32_to_cpu(lhead->flags); > flags |= CEPH_MDS_FLAG_REPLAY; > - rhead->flags = cpu_to_le32(flags); > + lhead->flags = cpu_to_le32(flags); > > if (req->r_target_inode) > - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); > + lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); > > - rhead->num_retry = req->r_attempts - 1; > + lhead->num_retry = req->r_attempts - 1; > + if (!old_version) { > + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; > + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); > + } > > /* remove cap/dentry releases from message */ > - rhead->num_releases = 0; > + lhead->num_releases = 0; > > p = msg->front.iov_base + req->r_request_release_offset; > encode_mclientrequest_tail(&p, req); > @@ -3218,18 +3238,23 @@ static int __prepare_send_request(struct ceph_mds_session *session, > } > req->r_request = msg; > > - rhead = find_old_request_head(msg->front.iov_base, > - session->s_con.peer_features); > - rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); > + lhead = find_legacy_request_head(msg->front.iov_base, > + session->s_con.peer_features); > + lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); > if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) > flags |= CEPH_MDS_FLAG_REPLAY; > if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) > flags |= CEPH_MDS_FLAG_ASYNC; > if (req->r_parent) > flags |= CEPH_MDS_FLAG_WANT_DENTRY; > - rhead->flags = cpu_to_le32(flags); > - rhead->num_fwd = req->r_num_fwd; > - rhead->num_retry = req->r_attempts - 1; > + lhead->flags = cpu_to_le32(flags); > + lhead->num_fwd = req->r_num_fwd; > + lhead->num_retry = req->r_attempts - 1; > + if (!old_version) { > + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; > + nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd); > + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); > + } > > doutc(cl, " r_parent = %p\n", req->r_parent); > return 0; > @@ -3893,34 +3918,20 @@ static void handle_forward(struct ceph_mds_client *mdsc, > if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { > doutc(cl, "forward tid %llu aborted, unregistering\n", tid); > __unregister_request(mdsc, req); > - } else if (fwd_seq <= req->r_num_fwd) { > - /* > - * The type of 'num_fwd' in ceph 'MClientRequestForward' > - * is 'int32_t', while in 'ceph_mds_request_head' the > - * type is '__u8'. So in case the request bounces between > - * MDSes exceeding 256 times, the client will get stuck. > - * > - * In this case it's ususally a bug in MDS and continue > - * bouncing the request makes no sense. > + } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { > + /* Avoid inifinite retrying after overflow. > * > - * In future this could be fixed in ceph code, so avoid > - * using the hardcode here. > + * The MDS will increase the fwd count and in client side > + * if the num_fwd is less than the one saved in request > + * that means the MDS is an old version and overflowed of > + * 8 bits. > */ > - int max = sizeof_field(struct ceph_mds_request_head, num_fwd); > - max = 1 << (max * BITS_PER_BYTE); > - if (req->r_num_fwd >= max) { > - mutex_lock(&req->r_fill_mutex); > - req->r_err = -EMULTIHOP; > - set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); > - mutex_unlock(&req->r_fill_mutex); > - aborted = true; > - pr_warn_ratelimited_client(cl, > - "forward tid %llu seq overflow\n", > - tid); > - } else { > - doutc(cl, "forward tid %llu to mds%d - old seq %d <= %d\n", > - tid, next_mds, req->r_num_fwd, fwd_seq); > - } > + mutex_lock(&req->r_fill_mutex); > + req->r_err = -EMULTIHOP; > + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); > + mutex_unlock(&req->r_fill_mutex); > + aborted = true; > + pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid); > } else { > /* resend. forward race not possible; mds would drop */ > doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds); > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > index befbd384428e..717a7399bacb 100644 > --- a/fs/ceph/mds_client.h > +++ b/fs/ceph/mds_client.h > @@ -32,8 +32,9 @@ enum ceph_feature_type { > CEPHFS_FEATURE_ALTERNATE_NAME, > CEPHFS_FEATURE_NOTIFY_SESSION_STATE, > CEPHFS_FEATURE_OP_GETVXATTR, > + CEPHFS_FEATURE_32BITS_RETRY_FWD, > > - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, > + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD, > }; > > #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ > @@ -47,6 +48,7 @@ enum ceph_feature_type { > CEPHFS_FEATURE_ALTERNATE_NAME, \ > CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ > CEPHFS_FEATURE_OP_GETVXATTR, \ > + CEPHFS_FEATURE_32BITS_RETRY_FWD, \ > } > > /* > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h > index 45f8ce61e103..f3b3593254b9 100644 > --- a/include/linux/ceph/ceph_fs.h > +++ b/include/linux/ceph/ceph_fs.h > @@ -484,7 +484,7 @@ union ceph_mds_request_args_ext { > #define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ > #define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */ > > -struct ceph_mds_request_head_old { > +struct ceph_mds_request_head_legacy { > __le64 oldest_client_tid; > __le32 mdsmap_epoch; /* on client */ > __le32 flags; /* CEPH_MDS_FLAG_* */ > @@ -497,9 +497,9 @@ struct ceph_mds_request_head_old { > union ceph_mds_request_args args; > } __attribute__ ((packed)); > > -#define CEPH_MDS_REQUEST_HEAD_VERSION 1 > +#define CEPH_MDS_REQUEST_HEAD_VERSION 2 > > -struct ceph_mds_request_head { > +struct ceph_mds_request_head_old { > __le16 version; /* struct version */ > __le64 oldest_client_tid; > __le32 mdsmap_epoch; /* on client */ > @@ -513,6 +513,23 @@ struct ceph_mds_request_head { > union ceph_mds_request_args_ext args; > } __attribute__ ((packed)); > > +struct ceph_mds_request_head { > + __le16 version; /* struct version */ > + __le64 oldest_client_tid; > + __le32 mdsmap_epoch; /* on client */ > + __le32 flags; /* CEPH_MDS_FLAG_* */ > + __u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */ > + __le16 num_releases; /* # include cap/lease release records */ > + __le32 op; /* mds op code */ > + __le32 caller_uid, caller_gid; > + __le64 ino; /* use this ino for openc, mkdir, mknod, > + etc. (if replaying) */ > + union ceph_mds_request_args_ext args; > + > + __le32 ext_num_retry; /* new count retry attempts */ > + __le32 ext_num_fwd; /* new count fwd attempts */ > +} __attribute__ ((packed)); > + > /* cap/lease release record */ > struct ceph_mds_request_release { > __le64 ino, cap_id; /* ino and unique cap id */ > -- > 2.40.1 >
On 7/26/23 18:06, Aleksandr Mikhalitsyn wrote: > Hi Xiubo! > > On Tue, Jul 25, 2023 at 12:46 PM <xiubli@redhat.com> wrote: >> From: Xiubo Li <xiubli@redhat.com> >> >> The num_fwd in MClientRequestForward is int32_t, while the num_fwd >> in ceph_mds_request_head is __u8. This is buggy when the num_fwd >> is larger than 256 it will always be truncate to 0 again. But the >> client couldn't recoginize this. >> >> This will make them to __u32 instead. Because the old cephs will >> directly copy the raw memories when decoding the reqeust's head, >> so we need to make sure this kclient will be compatible with old >> cephs. For newer cephs they will decode the requests depending >> the version, which will be much simpler and easier to extend new >> members. >> >> URL: https://tracker.ceph.com/issues/62145 >> Signed-off-by: Xiubo Li <xiubli@redhat.com> >> --- >> fs/ceph/mds_client.c | 191 ++++++++++++++++++----------------- >> fs/ceph/mds_client.h | 4 +- >> include/linux/ceph/ceph_fs.h | 23 ++++- >> 3 files changed, 124 insertions(+), 94 deletions(-) >> >> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c >> index 70987b3c198a..191bae3a4ee6 100644 >> --- a/fs/ceph/mds_client.c >> +++ b/fs/ceph/mds_client.c >> @@ -2897,6 +2897,18 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request * >> } >> } >> >> +static struct ceph_mds_request_head_legacy * >> +find_legacy_request_head(void *p, u64 features) >> +{ >> + bool legacy = !(features & CEPH_FEATURE_FS_BTIME); >> + struct ceph_mds_request_head_old *ohead; >> + >> + if (legacy) >> + return (struct ceph_mds_request_head_legacy *)p; >> + ohead = (struct ceph_mds_request_head_old *)p; >> + return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid; >> +} >> + >> /* >> * called under mdsc->mutex >> */ >> @@ -2907,7 +2919,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, >> int mds = session->s_mds; >> struct ceph_mds_client *mdsc = session->s_mdsc; >> struct ceph_msg *msg; >> - struct ceph_mds_request_head_old *head; >> + struct ceph_mds_request_head_legacy *lhead; >> const char *path1 = NULL; >> const char *path2 = NULL; >> u64 ino1 = 0, ino2 = 0; >> @@ -2919,6 +2931,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, >> void *p, *end; >> int ret; >> bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); >> + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features); >> >> ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, >> req->r_parent, req->r_path1, req->r_ino1.ino, >> @@ -2950,7 +2963,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, >> goto out_free2; >> } >> >> - len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head); >> + /* >> + * For old cephs without supporting the 32bit retry/fwd feature >> + * it will copy the raw memories directly when decoding the >> + * requests. While new cephs will decode the head depending the >> + * version member, so we need to make sure it will be compatible >> + * with them both. >> + */ >> + if (legacy) >> + len = sizeof(struct ceph_mds_request_head_legacy); >> + else if (old_version) >> + len = sizeof(struct ceph_mds_request_head_old); >> + else >> + len = sizeof(struct ceph_mds_request_head); >> >> /* filepaths */ >> len += 2 * (1 + sizeof(u32) + sizeof(u64)); >> @@ -2995,33 +3020,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, >> >> msg->hdr.tid = cpu_to_le64(req->r_tid); >> >> + lhead = find_legacy_request_head(msg->front.iov_base, >> + session->s_con.peer_features); >> + >> /* >> - * The old ceph_mds_request_head didn't contain a version field, and >> + * The ceph_mds_request_head_legacy didn't contain a version field, and >> * one was added when we moved the message version from 3->4. >> */ >> if (legacy) { >> msg->hdr.version = cpu_to_le16(3); >> - head = msg->front.iov_base; >> - p = msg->front.iov_base + sizeof(*head); >> + p = msg->front.iov_base + sizeof(*lhead); >> + } else if (old_version) { >> + struct ceph_mds_request_head_old *ohead = msg->front.iov_base; >> + >> + msg->hdr.version = cpu_to_le16(4); >> + ohead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); > Don't we want to use the old mds request head version here: > ohead->version = cpu_to_le16(1); > ? > > As far as I understand the idea here is to skip new fields > ext_num_retry/ext_num_fwd in case > when the old_version is true. Would it be incorrect to set version to > the latest one (CEPH_MDS_REQUEST_HEAD_VERSION) > and at the same time skip the setting of new fields? Hi Alex, I noticed that, but this doesn't matter, because for the old version cephs they will do nothing with the version field. I am just following the user space code. But to make the code to be more readable and easier to understand I will revise this. And also will raise one PR to fix the user space code. Thanks - Xiubo > Kind regards, > Alex > >> + p = msg->front.iov_base + sizeof(*ohead); >> } else { >> - struct ceph_mds_request_head *new_head = msg->front.iov_base; >> + struct ceph_mds_request_head *nhead = msg->front.iov_base; >> >> msg->hdr.version = cpu_to_le16(6); >> - new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); >> - head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; >> - p = msg->front.iov_base + sizeof(*new_head); >> + nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); >> + p = msg->front.iov_base + sizeof(*nhead); >> } >> >> end = msg->front.iov_base + msg->front.iov_len; >> >> - head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); >> - head->op = cpu_to_le32(req->r_op); >> - head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, >> - req->r_cred->fsuid)); >> - head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, >> - req->r_cred->fsgid)); >> - head->ino = cpu_to_le64(req->r_deleg_ino); >> - head->args = req->r_args; >> + lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); >> + lhead->op = cpu_to_le32(req->r_op); >> + lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, >> + req->r_cred->fsuid)); >> + lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, >> + req->r_cred->fsgid)); >> + lhead->ino = cpu_to_le64(req->r_deleg_ino); >> + lhead->args = req->r_args; >> >> ceph_encode_filepath(&p, end, ino1, path1); >> ceph_encode_filepath(&p, end, ino2, path2); >> @@ -3063,7 +3095,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, >> p = msg->front.iov_base + req->r_request_release_offset; >> } >> >> - head->num_releases = cpu_to_le16(releases); >> + lhead->num_releases = cpu_to_le16(releases); >> >> encode_mclientrequest_tail(&p, req); >> >> @@ -3114,18 +3146,6 @@ static void complete_request(struct ceph_mds_client *mdsc, >> complete_all(&req->r_completion); >> } >> >> -static struct ceph_mds_request_head_old * >> -find_old_request_head(void *p, u64 features) >> -{ >> - bool legacy = !(features & CEPH_FEATURE_FS_BTIME); >> - struct ceph_mds_request_head *new_head; >> - >> - if (legacy) >> - return (struct ceph_mds_request_head_old *)p; >> - new_head = (struct ceph_mds_request_head *)p; >> - return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; >> -} >> - >> /* >> * called under mdsc->mutex >> */ >> @@ -3136,30 +3156,26 @@ static int __prepare_send_request(struct ceph_mds_session *session, >> int mds = session->s_mds; >> struct ceph_mds_client *mdsc = session->s_mdsc; >> struct ceph_client *cl = mdsc->fsc->client; >> - struct ceph_mds_request_head_old *rhead; >> + struct ceph_mds_request_head_legacy *lhead; >> + struct ceph_mds_request_head *nhead; >> struct ceph_msg *msg; >> - int flags = 0, max_retry; >> + int flags = 0, old_max_retry; >> + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features); >> >> - /* >> - * The type of 'r_attempts' in kernel 'ceph_mds_request' >> - * is 'int', while in 'ceph_mds_request_head' the type of >> - * 'num_retry' is '__u8'. So in case the request retries >> - * exceeding 256 times, the MDS will receive a incorrect >> - * retry seq. >> - * >> - * In this case it's ususally a bug in MDS and continue >> - * retrying the request makes no sense. >> - * >> - * In future this could be fixed in ceph code, so avoid >> - * using the hardcode here. >> + /* Avoid inifinite retrying after overflow. The client will >> + * increase the retry count and if the MDS is old version, >> + * so we limit to retry at most 256 times. >> */ >> - max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); >> - max_retry = 1 << (max_retry * BITS_PER_BYTE); >> - if (req->r_attempts >= max_retry) { >> - pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n", >> - req->r_tid); >> - return -EMULTIHOP; >> - } >> + if (req->r_attempts) { >> + old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry); >> + old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); >> + if ((old_version && req->r_attempts >= old_max_retry) || >> + ((uint32_t)req->r_attempts >= U32_MAX)) { >> + pr_warn_ratelimited_client(cl, "%s request tid %llu seq overflow\n", >> + __func__, req->r_tid); >> + return -EMULTIHOP; >> + } >> + } >> >> req->r_attempts++; >> if (req->r_inode) { >> @@ -3184,20 +3200,24 @@ static int __prepare_send_request(struct ceph_mds_session *session, >> * d_move mangles the src name. >> */ >> msg = req->r_request; >> - rhead = find_old_request_head(msg->front.iov_base, >> - session->s_con.peer_features); >> + lhead = find_legacy_request_head(msg->front.iov_base, >> + session->s_con.peer_features); >> >> - flags = le32_to_cpu(rhead->flags); >> + flags = le32_to_cpu(lhead->flags); >> flags |= CEPH_MDS_FLAG_REPLAY; >> - rhead->flags = cpu_to_le32(flags); >> + lhead->flags = cpu_to_le32(flags); >> >> if (req->r_target_inode) >> - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); >> + lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); >> >> - rhead->num_retry = req->r_attempts - 1; >> + lhead->num_retry = req->r_attempts - 1; >> + if (!old_version) { >> + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; >> + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); >> + } >> >> /* remove cap/dentry releases from message */ >> - rhead->num_releases = 0; >> + lhead->num_releases = 0; >> >> p = msg->front.iov_base + req->r_request_release_offset; >> encode_mclientrequest_tail(&p, req); >> @@ -3218,18 +3238,23 @@ static int __prepare_send_request(struct ceph_mds_session *session, >> } >> req->r_request = msg; >> >> - rhead = find_old_request_head(msg->front.iov_base, >> - session->s_con.peer_features); >> - rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); >> + lhead = find_legacy_request_head(msg->front.iov_base, >> + session->s_con.peer_features); >> + lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); >> if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) >> flags |= CEPH_MDS_FLAG_REPLAY; >> if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) >> flags |= CEPH_MDS_FLAG_ASYNC; >> if (req->r_parent) >> flags |= CEPH_MDS_FLAG_WANT_DENTRY; >> - rhead->flags = cpu_to_le32(flags); >> - rhead->num_fwd = req->r_num_fwd; >> - rhead->num_retry = req->r_attempts - 1; >> + lhead->flags = cpu_to_le32(flags); >> + lhead->num_fwd = req->r_num_fwd; >> + lhead->num_retry = req->r_attempts - 1; >> + if (!old_version) { >> + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; >> + nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd); >> + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); >> + } >> >> doutc(cl, " r_parent = %p\n", req->r_parent); >> return 0; >> @@ -3893,34 +3918,20 @@ static void handle_forward(struct ceph_mds_client *mdsc, >> if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { >> doutc(cl, "forward tid %llu aborted, unregistering\n", tid); >> __unregister_request(mdsc, req); >> - } else if (fwd_seq <= req->r_num_fwd) { >> - /* >> - * The type of 'num_fwd' in ceph 'MClientRequestForward' >> - * is 'int32_t', while in 'ceph_mds_request_head' the >> - * type is '__u8'. So in case the request bounces between >> - * MDSes exceeding 256 times, the client will get stuck. >> - * >> - * In this case it's ususally a bug in MDS and continue >> - * bouncing the request makes no sense. >> + } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { >> + /* Avoid inifinite retrying after overflow. >> * >> - * In future this could be fixed in ceph code, so avoid >> - * using the hardcode here. >> + * The MDS will increase the fwd count and in client side >> + * if the num_fwd is less than the one saved in request >> + * that means the MDS is an old version and overflowed of >> + * 8 bits. >> */ >> - int max = sizeof_field(struct ceph_mds_request_head, num_fwd); >> - max = 1 << (max * BITS_PER_BYTE); >> - if (req->r_num_fwd >= max) { >> - mutex_lock(&req->r_fill_mutex); >> - req->r_err = -EMULTIHOP; >> - set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); >> - mutex_unlock(&req->r_fill_mutex); >> - aborted = true; >> - pr_warn_ratelimited_client(cl, >> - "forward tid %llu seq overflow\n", >> - tid); >> - } else { >> - doutc(cl, "forward tid %llu to mds%d - old seq %d <= %d\n", >> - tid, next_mds, req->r_num_fwd, fwd_seq); >> - } >> + mutex_lock(&req->r_fill_mutex); >> + req->r_err = -EMULTIHOP; >> + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); >> + mutex_unlock(&req->r_fill_mutex); >> + aborted = true; >> + pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid); >> } else { >> /* resend. forward race not possible; mds would drop */ >> doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds); >> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h >> index befbd384428e..717a7399bacb 100644 >> --- a/fs/ceph/mds_client.h >> +++ b/fs/ceph/mds_client.h >> @@ -32,8 +32,9 @@ enum ceph_feature_type { >> CEPHFS_FEATURE_ALTERNATE_NAME, >> CEPHFS_FEATURE_NOTIFY_SESSION_STATE, >> CEPHFS_FEATURE_OP_GETVXATTR, >> + CEPHFS_FEATURE_32BITS_RETRY_FWD, >> >> - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, >> + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD, >> }; >> >> #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ >> @@ -47,6 +48,7 @@ enum ceph_feature_type { >> CEPHFS_FEATURE_ALTERNATE_NAME, \ >> CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ >> CEPHFS_FEATURE_OP_GETVXATTR, \ >> + CEPHFS_FEATURE_32BITS_RETRY_FWD, \ >> } >> >> /* >> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h >> index 45f8ce61e103..f3b3593254b9 100644 >> --- a/include/linux/ceph/ceph_fs.h >> +++ b/include/linux/ceph/ceph_fs.h >> @@ -484,7 +484,7 @@ union ceph_mds_request_args_ext { >> #define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ >> #define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */ >> >> -struct ceph_mds_request_head_old { >> +struct ceph_mds_request_head_legacy { >> __le64 oldest_client_tid; >> __le32 mdsmap_epoch; /* on client */ >> __le32 flags; /* CEPH_MDS_FLAG_* */ >> @@ -497,9 +497,9 @@ struct ceph_mds_request_head_old { >> union ceph_mds_request_args args; >> } __attribute__ ((packed)); >> >> -#define CEPH_MDS_REQUEST_HEAD_VERSION 1 >> +#define CEPH_MDS_REQUEST_HEAD_VERSION 2 >> >> -struct ceph_mds_request_head { >> +struct ceph_mds_request_head_old { >> __le16 version; /* struct version */ >> __le64 oldest_client_tid; >> __le32 mdsmap_epoch; /* on client */ >> @@ -513,6 +513,23 @@ struct ceph_mds_request_head { >> union ceph_mds_request_args_ext args; >> } __attribute__ ((packed)); >> >> +struct ceph_mds_request_head { >> + __le16 version; /* struct version */ >> + __le64 oldest_client_tid; >> + __le32 mdsmap_epoch; /* on client */ >> + __le32 flags; /* CEPH_MDS_FLAG_* */ >> + __u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */ >> + __le16 num_releases; /* # include cap/lease release records */ >> + __le32 op; /* mds op code */ >> + __le32 caller_uid, caller_gid; >> + __le64 ino; /* use this ino for openc, mkdir, mknod, >> + etc. (if replaying) */ >> + union ceph_mds_request_args_ext args; >> + >> + __le32 ext_num_retry; /* new count retry attempts */ >> + __le32 ext_num_fwd; /* new count fwd attempts */ >> +} __attribute__ ((packed)); >> + >> /* cap/lease release record */ >> struct ceph_mds_request_release { >> __le64 ino, cap_id; /* ino and unique cap id */ >> -- >> 2.40.1 >>
On Wed, Jul 26, 2023 at 3:16 PM Xiubo Li <xiubli@redhat.com> wrote: > > > On 7/26/23 18:06, Aleksandr Mikhalitsyn wrote: > > Hi Xiubo! > > > > On Tue, Jul 25, 2023 at 12:46 PM <xiubli@redhat.com> wrote: > >> From: Xiubo Li <xiubli@redhat.com> > >> > >> The num_fwd in MClientRequestForward is int32_t, while the num_fwd > >> in ceph_mds_request_head is __u8. This is buggy when the num_fwd > >> is larger than 256 it will always be truncate to 0 again. But the > >> client couldn't recoginize this. > >> > >> This will make them to __u32 instead. Because the old cephs will > >> directly copy the raw memories when decoding the reqeust's head, > >> so we need to make sure this kclient will be compatible with old > >> cephs. For newer cephs they will decode the requests depending > >> the version, which will be much simpler and easier to extend new > >> members. > >> > >> URL: https://tracker.ceph.com/issues/62145 > >> Signed-off-by: Xiubo Li <xiubli@redhat.com> > >> --- > >> fs/ceph/mds_client.c | 191 ++++++++++++++++++----------------- > >> fs/ceph/mds_client.h | 4 +- > >> include/linux/ceph/ceph_fs.h | 23 ++++- > >> 3 files changed, 124 insertions(+), 94 deletions(-) > >> > >> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > >> index 70987b3c198a..191bae3a4ee6 100644 > >> --- a/fs/ceph/mds_client.c > >> +++ b/fs/ceph/mds_client.c > >> @@ -2897,6 +2897,18 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request * > >> } > >> } > >> > >> +static struct ceph_mds_request_head_legacy * > >> +find_legacy_request_head(void *p, u64 features) > >> +{ > >> + bool legacy = !(features & CEPH_FEATURE_FS_BTIME); > >> + struct ceph_mds_request_head_old *ohead; > >> + > >> + if (legacy) > >> + return (struct ceph_mds_request_head_legacy *)p; > >> + ohead = (struct ceph_mds_request_head_old *)p; > >> + return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid; > >> +} > >> + > >> /* > >> * called under mdsc->mutex > >> */ > >> @@ -2907,7 +2919,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > >> int mds = session->s_mds; > >> struct ceph_mds_client *mdsc = session->s_mdsc; > >> struct ceph_msg *msg; > >> - struct ceph_mds_request_head_old *head; > >> + struct ceph_mds_request_head_legacy *lhead; > >> const char *path1 = NULL; > >> const char *path2 = NULL; > >> u64 ino1 = 0, ino2 = 0; > >> @@ -2919,6 +2931,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > >> void *p, *end; > >> int ret; > >> bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); > >> + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features); > >> > >> ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, > >> req->r_parent, req->r_path1, req->r_ino1.ino, > >> @@ -2950,7 +2963,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > >> goto out_free2; > >> } > >> > >> - len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head); > >> + /* > >> + * For old cephs without supporting the 32bit retry/fwd feature > >> + * it will copy the raw memories directly when decoding the > >> + * requests. While new cephs will decode the head depending the > >> + * version member, so we need to make sure it will be compatible > >> + * with them both. > >> + */ > >> + if (legacy) > >> + len = sizeof(struct ceph_mds_request_head_legacy); > >> + else if (old_version) > >> + len = sizeof(struct ceph_mds_request_head_old); > >> + else > >> + len = sizeof(struct ceph_mds_request_head); > >> > >> /* filepaths */ > >> len += 2 * (1 + sizeof(u32) + sizeof(u64)); > >> @@ -2995,33 +3020,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > >> > >> msg->hdr.tid = cpu_to_le64(req->r_tid); > >> > >> + lhead = find_legacy_request_head(msg->front.iov_base, > >> + session->s_con.peer_features); > >> + > >> /* > >> - * The old ceph_mds_request_head didn't contain a version field, and > >> + * The ceph_mds_request_head_legacy didn't contain a version field, and > >> * one was added when we moved the message version from 3->4. > >> */ > >> if (legacy) { > >> msg->hdr.version = cpu_to_le16(3); > >> - head = msg->front.iov_base; > >> - p = msg->front.iov_base + sizeof(*head); > >> + p = msg->front.iov_base + sizeof(*lhead); > >> + } else if (old_version) { > >> + struct ceph_mds_request_head_old *ohead = msg->front.iov_base; > >> + > >> + msg->hdr.version = cpu_to_le16(4); > >> + ohead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); > > Don't we want to use the old mds request head version here: > > ohead->version = cpu_to_le16(1); > > ? > > > > As far as I understand the idea here is to skip new fields > > ext_num_retry/ext_num_fwd in case > > when the old_version is true. Would it be incorrect to set version to > > the latest one (CEPH_MDS_REQUEST_HEAD_VERSION) > > and at the same time skip the setting of new fields? > > Hi Alex, > > I noticed that, but this doesn't matter, because for the old version > cephs they will do nothing with the version field. I am just following > the user space code. Got it. Thanks! Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com> > > But to make the code to be more readable and easier to understand I will > revise this. And also will raise one PR to fix the user space code. > > Thanks > > - Xiubo > > > > Kind regards, > > Alex > > > >> + p = msg->front.iov_base + sizeof(*ohead); > >> } else { > >> - struct ceph_mds_request_head *new_head = msg->front.iov_base; > >> + struct ceph_mds_request_head *nhead = msg->front.iov_base; > >> > >> msg->hdr.version = cpu_to_le16(6); > >> - new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); > >> - head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; > >> - p = msg->front.iov_base + sizeof(*new_head); > >> + nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); > >> + p = msg->front.iov_base + sizeof(*nhead); > >> } > >> > >> end = msg->front.iov_base + msg->front.iov_len; > >> > >> - head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); > >> - head->op = cpu_to_le32(req->r_op); > >> - head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, > >> - req->r_cred->fsuid)); > >> - head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, > >> - req->r_cred->fsgid)); > >> - head->ino = cpu_to_le64(req->r_deleg_ino); > >> - head->args = req->r_args; > >> + lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); > >> + lhead->op = cpu_to_le32(req->r_op); > >> + lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, > >> + req->r_cred->fsuid)); > >> + lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, > >> + req->r_cred->fsgid)); > >> + lhead->ino = cpu_to_le64(req->r_deleg_ino); > >> + lhead->args = req->r_args; > >> > >> ceph_encode_filepath(&p, end, ino1, path1); > >> ceph_encode_filepath(&p, end, ino2, path2); > >> @@ -3063,7 +3095,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > >> p = msg->front.iov_base + req->r_request_release_offset; > >> } > >> > >> - head->num_releases = cpu_to_le16(releases); > >> + lhead->num_releases = cpu_to_le16(releases); > >> > >> encode_mclientrequest_tail(&p, req); > >> > >> @@ -3114,18 +3146,6 @@ static void complete_request(struct ceph_mds_client *mdsc, > >> complete_all(&req->r_completion); > >> } > >> > >> -static struct ceph_mds_request_head_old * > >> -find_old_request_head(void *p, u64 features) > >> -{ > >> - bool legacy = !(features & CEPH_FEATURE_FS_BTIME); > >> - struct ceph_mds_request_head *new_head; > >> - > >> - if (legacy) > >> - return (struct ceph_mds_request_head_old *)p; > >> - new_head = (struct ceph_mds_request_head *)p; > >> - return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; > >> -} > >> - > >> /* > >> * called under mdsc->mutex > >> */ > >> @@ -3136,30 +3156,26 @@ static int __prepare_send_request(struct ceph_mds_session *session, > >> int mds = session->s_mds; > >> struct ceph_mds_client *mdsc = session->s_mdsc; > >> struct ceph_client *cl = mdsc->fsc->client; > >> - struct ceph_mds_request_head_old *rhead; > >> + struct ceph_mds_request_head_legacy *lhead; > >> + struct ceph_mds_request_head *nhead; > >> struct ceph_msg *msg; > >> - int flags = 0, max_retry; > >> + int flags = 0, old_max_retry; > >> + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features); > >> > >> - /* > >> - * The type of 'r_attempts' in kernel 'ceph_mds_request' > >> - * is 'int', while in 'ceph_mds_request_head' the type of > >> - * 'num_retry' is '__u8'. So in case the request retries > >> - * exceeding 256 times, the MDS will receive a incorrect > >> - * retry seq. > >> - * > >> - * In this case it's ususally a bug in MDS and continue > >> - * retrying the request makes no sense. > >> - * > >> - * In future this could be fixed in ceph code, so avoid > >> - * using the hardcode here. > >> + /* Avoid inifinite retrying after overflow. The client will > >> + * increase the retry count and if the MDS is old version, > >> + * so we limit to retry at most 256 times. > >> */ > >> - max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); > >> - max_retry = 1 << (max_retry * BITS_PER_BYTE); > >> - if (req->r_attempts >= max_retry) { > >> - pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n", > >> - req->r_tid); > >> - return -EMULTIHOP; > >> - } > >> + if (req->r_attempts) { > >> + old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry); > >> + old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); > >> + if ((old_version && req->r_attempts >= old_max_retry) || > >> + ((uint32_t)req->r_attempts >= U32_MAX)) { > >> + pr_warn_ratelimited_client(cl, "%s request tid %llu seq overflow\n", > >> + __func__, req->r_tid); > >> + return -EMULTIHOP; > >> + } > >> + } > >> > >> req->r_attempts++; > >> if (req->r_inode) { > >> @@ -3184,20 +3200,24 @@ static int __prepare_send_request(struct ceph_mds_session *session, > >> * d_move mangles the src name. > >> */ > >> msg = req->r_request; > >> - rhead = find_old_request_head(msg->front.iov_base, > >> - session->s_con.peer_features); > >> + lhead = find_legacy_request_head(msg->front.iov_base, > >> + session->s_con.peer_features); > >> > >> - flags = le32_to_cpu(rhead->flags); > >> + flags = le32_to_cpu(lhead->flags); > >> flags |= CEPH_MDS_FLAG_REPLAY; > >> - rhead->flags = cpu_to_le32(flags); > >> + lhead->flags = cpu_to_le32(flags); > >> > >> if (req->r_target_inode) > >> - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); > >> + lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); > >> > >> - rhead->num_retry = req->r_attempts - 1; > >> + lhead->num_retry = req->r_attempts - 1; > >> + if (!old_version) { > >> + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; > >> + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); > >> + } > >> > >> /* remove cap/dentry releases from message */ > >> - rhead->num_releases = 0; > >> + lhead->num_releases = 0; > >> > >> p = msg->front.iov_base + req->r_request_release_offset; > >> encode_mclientrequest_tail(&p, req); > >> @@ -3218,18 +3238,23 @@ static int __prepare_send_request(struct ceph_mds_session *session, > >> } > >> req->r_request = msg; > >> > >> - rhead = find_old_request_head(msg->front.iov_base, > >> - session->s_con.peer_features); > >> - rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); > >> + lhead = find_legacy_request_head(msg->front.iov_base, > >> + session->s_con.peer_features); > >> + lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); > >> if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) > >> flags |= CEPH_MDS_FLAG_REPLAY; > >> if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) > >> flags |= CEPH_MDS_FLAG_ASYNC; > >> if (req->r_parent) > >> flags |= CEPH_MDS_FLAG_WANT_DENTRY; > >> - rhead->flags = cpu_to_le32(flags); > >> - rhead->num_fwd = req->r_num_fwd; > >> - rhead->num_retry = req->r_attempts - 1; > >> + lhead->flags = cpu_to_le32(flags); > >> + lhead->num_fwd = req->r_num_fwd; > >> + lhead->num_retry = req->r_attempts - 1; > >> + if (!old_version) { > >> + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; > >> + nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd); > >> + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); > >> + } > >> > >> doutc(cl, " r_parent = %p\n", req->r_parent); > >> return 0; > >> @@ -3893,34 +3918,20 @@ static void handle_forward(struct ceph_mds_client *mdsc, > >> if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { > >> doutc(cl, "forward tid %llu aborted, unregistering\n", tid); > >> __unregister_request(mdsc, req); > >> - } else if (fwd_seq <= req->r_num_fwd) { > >> - /* > >> - * The type of 'num_fwd' in ceph 'MClientRequestForward' > >> - * is 'int32_t', while in 'ceph_mds_request_head' the > >> - * type is '__u8'. So in case the request bounces between > >> - * MDSes exceeding 256 times, the client will get stuck. > >> - * > >> - * In this case it's ususally a bug in MDS and continue > >> - * bouncing the request makes no sense. > >> + } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { > >> + /* Avoid inifinite retrying after overflow. > >> * > >> - * In future this could be fixed in ceph code, so avoid > >> - * using the hardcode here. > >> + * The MDS will increase the fwd count and in client side > >> + * if the num_fwd is less than the one saved in request > >> + * that means the MDS is an old version and overflowed of > >> + * 8 bits. > >> */ > >> - int max = sizeof_field(struct ceph_mds_request_head, num_fwd); > >> - max = 1 << (max * BITS_PER_BYTE); > >> - if (req->r_num_fwd >= max) { > >> - mutex_lock(&req->r_fill_mutex); > >> - req->r_err = -EMULTIHOP; > >> - set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); > >> - mutex_unlock(&req->r_fill_mutex); > >> - aborted = true; > >> - pr_warn_ratelimited_client(cl, > >> - "forward tid %llu seq overflow\n", > >> - tid); > >> - } else { > >> - doutc(cl, "forward tid %llu to mds%d - old seq %d <= %d\n", > >> - tid, next_mds, req->r_num_fwd, fwd_seq); > >> - } > >> + mutex_lock(&req->r_fill_mutex); > >> + req->r_err = -EMULTIHOP; > >> + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); > >> + mutex_unlock(&req->r_fill_mutex); > >> + aborted = true; > >> + pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid); > >> } else { > >> /* resend. forward race not possible; mds would drop */ > >> doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds); > >> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > >> index befbd384428e..717a7399bacb 100644 > >> --- a/fs/ceph/mds_client.h > >> +++ b/fs/ceph/mds_client.h > >> @@ -32,8 +32,9 @@ enum ceph_feature_type { > >> CEPHFS_FEATURE_ALTERNATE_NAME, > >> CEPHFS_FEATURE_NOTIFY_SESSION_STATE, > >> CEPHFS_FEATURE_OP_GETVXATTR, > >> + CEPHFS_FEATURE_32BITS_RETRY_FWD, > >> > >> - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, > >> + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD, > >> }; > >> > >> #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ > >> @@ -47,6 +48,7 @@ enum ceph_feature_type { > >> CEPHFS_FEATURE_ALTERNATE_NAME, \ > >> CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ > >> CEPHFS_FEATURE_OP_GETVXATTR, \ > >> + CEPHFS_FEATURE_32BITS_RETRY_FWD, \ > >> } > >> > >> /* > >> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h > >> index 45f8ce61e103..f3b3593254b9 100644 > >> --- a/include/linux/ceph/ceph_fs.h > >> +++ b/include/linux/ceph/ceph_fs.h > >> @@ -484,7 +484,7 @@ union ceph_mds_request_args_ext { > >> #define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ > >> #define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */ > >> > >> -struct ceph_mds_request_head_old { > >> +struct ceph_mds_request_head_legacy { > >> __le64 oldest_client_tid; > >> __le32 mdsmap_epoch; /* on client */ > >> __le32 flags; /* CEPH_MDS_FLAG_* */ > >> @@ -497,9 +497,9 @@ struct ceph_mds_request_head_old { > >> union ceph_mds_request_args args; > >> } __attribute__ ((packed)); > >> > >> -#define CEPH_MDS_REQUEST_HEAD_VERSION 1 > >> +#define CEPH_MDS_REQUEST_HEAD_VERSION 2 > >> > >> -struct ceph_mds_request_head { > >> +struct ceph_mds_request_head_old { > >> __le16 version; /* struct version */ > >> __le64 oldest_client_tid; > >> __le32 mdsmap_epoch; /* on client */ > >> @@ -513,6 +513,23 @@ struct ceph_mds_request_head { > >> union ceph_mds_request_args_ext args; > >> } __attribute__ ((packed)); > >> > >> +struct ceph_mds_request_head { > >> + __le16 version; /* struct version */ > >> + __le64 oldest_client_tid; > >> + __le32 mdsmap_epoch; /* on client */ > >> + __le32 flags; /* CEPH_MDS_FLAG_* */ > >> + __u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */ > >> + __le16 num_releases; /* # include cap/lease release records */ > >> + __le32 op; /* mds op code */ > >> + __le32 caller_uid, caller_gid; > >> + __le64 ino; /* use this ino for openc, mkdir, mknod, > >> + etc. (if replaying) */ > >> + union ceph_mds_request_args_ext args; > >> + > >> + __le32 ext_num_retry; /* new count retry attempts */ > >> + __le32 ext_num_fwd; /* new count fwd attempts */ > >> +} __attribute__ ((packed)); > >> + > >> /* cap/lease release record */ > >> struct ceph_mds_request_release { > >> __le64 ino, cap_id; /* ino and unique cap id */ > >> -- > >> 2.40.1 > >> >
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 70987b3c198a..191bae3a4ee6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2897,6 +2897,18 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request * } } +static struct ceph_mds_request_head_legacy * +find_legacy_request_head(void *p, u64 features) +{ + bool legacy = !(features & CEPH_FEATURE_FS_BTIME); + struct ceph_mds_request_head_old *ohead; + + if (legacy) + return (struct ceph_mds_request_head_legacy *)p; + ohead = (struct ceph_mds_request_head_old *)p; + return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid; +} + /* * called under mdsc->mutex */ @@ -2907,7 +2919,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_msg *msg; - struct ceph_mds_request_head_old *head; + struct ceph_mds_request_head_legacy *lhead; const char *path1 = NULL; const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; @@ -2919,6 +2931,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, void *p, *end; int ret; bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features); ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, @@ -2950,7 +2963,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, goto out_free2; } - len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head); + /* + * For old cephs without supporting the 32bit retry/fwd feature + * it will copy the raw memories directly when decoding the + * requests. While new cephs will decode the head depending the + * version member, so we need to make sure it will be compatible + * with them both. + */ + if (legacy) + len = sizeof(struct ceph_mds_request_head_legacy); + else if (old_version) + len = sizeof(struct ceph_mds_request_head_old); + else + len = sizeof(struct ceph_mds_request_head); /* filepaths */ len += 2 * (1 + sizeof(u32) + sizeof(u64)); @@ -2995,33 +3020,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.tid = cpu_to_le64(req->r_tid); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); + /* - * The old ceph_mds_request_head didn't contain a version field, and + * The ceph_mds_request_head_legacy didn't contain a version field, and * one was added when we moved the message version from 3->4. */ if (legacy) { msg->hdr.version = cpu_to_le16(3); - head = msg->front.iov_base; - p = msg->front.iov_base + sizeof(*head); + p = msg->front.iov_base + sizeof(*lhead); + } else if (old_version) { + struct ceph_mds_request_head_old *ohead = msg->front.iov_base; + + msg->hdr.version = cpu_to_le16(4); + ohead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); + p = msg->front.iov_base + sizeof(*ohead); } else { - struct ceph_mds_request_head *new_head = msg->front.iov_base; + struct ceph_mds_request_head *nhead = msg->front.iov_base; msg->hdr.version = cpu_to_le16(6); - new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); - head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; - p = msg->front.iov_base + sizeof(*new_head); + nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); + p = msg->front.iov_base + sizeof(*nhead); } end = msg->front.iov_base + msg->front.iov_len; - head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); - head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, - req->r_cred->fsuid)); - head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, - req->r_cred->fsgid)); - head->ino = cpu_to_le64(req->r_deleg_ino); - head->args = req->r_args; + lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); + lhead->op = cpu_to_le32(req->r_op); + lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, + req->r_cred->fsuid)); + lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, + req->r_cred->fsgid)); + lhead->ino = cpu_to_le64(req->r_deleg_ino); + lhead->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); ceph_encode_filepath(&p, end, ino2, path2); @@ -3063,7 +3095,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, p = msg->front.iov_base + req->r_request_release_offset; } - head->num_releases = cpu_to_le16(releases); + lhead->num_releases = cpu_to_le16(releases); encode_mclientrequest_tail(&p, req); @@ -3114,18 +3146,6 @@ static void complete_request(struct ceph_mds_client *mdsc, complete_all(&req->r_completion); } -static struct ceph_mds_request_head_old * -find_old_request_head(void *p, u64 features) -{ - bool legacy = !(features & CEPH_FEATURE_FS_BTIME); - struct ceph_mds_request_head *new_head; - - if (legacy) - return (struct ceph_mds_request_head_old *)p; - new_head = (struct ceph_mds_request_head *)p; - return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; -} - /* * called under mdsc->mutex */ @@ -3136,30 +3156,26 @@ static int __prepare_send_request(struct ceph_mds_session *session, int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; - struct ceph_mds_request_head_old *rhead; + struct ceph_mds_request_head_legacy *lhead; + struct ceph_mds_request_head *nhead; struct ceph_msg *msg; - int flags = 0, max_retry; + int flags = 0, old_max_retry; + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features); - /* - * The type of 'r_attempts' in kernel 'ceph_mds_request' - * is 'int', while in 'ceph_mds_request_head' the type of - * 'num_retry' is '__u8'. So in case the request retries - * exceeding 256 times, the MDS will receive a incorrect - * retry seq. - * - * In this case it's ususally a bug in MDS and continue - * retrying the request makes no sense. - * - * In future this could be fixed in ceph code, so avoid - * using the hardcode here. + /* Avoid inifinite retrying after overflow. The client will + * increase the retry count and if the MDS is old version, + * so we limit to retry at most 256 times. */ - max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); - max_retry = 1 << (max_retry * BITS_PER_BYTE); - if (req->r_attempts >= max_retry) { - pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n", - req->r_tid); - return -EMULTIHOP; - } + if (req->r_attempts) { + old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry); + old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); + if ((old_version && req->r_attempts >= old_max_retry) || + ((uint32_t)req->r_attempts >= U32_MAX)) { + pr_warn_ratelimited_client(cl, "%s request tid %llu seq overflow\n", + __func__, req->r_tid); + return -EMULTIHOP; + } + } req->r_attempts++; if (req->r_inode) { @@ -3184,20 +3200,24 @@ static int __prepare_send_request(struct ceph_mds_session *session, * d_move mangles the src name. */ msg = req->r_request; - rhead = find_old_request_head(msg->front.iov_base, - session->s_con.peer_features); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); - flags = le32_to_cpu(rhead->flags); + flags = le32_to_cpu(lhead->flags); flags |= CEPH_MDS_FLAG_REPLAY; - rhead->flags = cpu_to_le32(flags); + lhead->flags = cpu_to_le32(flags); if (req->r_target_inode) - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); - rhead->num_retry = req->r_attempts - 1; + lhead->num_retry = req->r_attempts - 1; + if (!old_version) { + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); + } /* remove cap/dentry releases from message */ - rhead->num_releases = 0; + lhead->num_releases = 0; p = msg->front.iov_base + req->r_request_release_offset; encode_mclientrequest_tail(&p, req); @@ -3218,18 +3238,23 @@ static int __prepare_send_request(struct ceph_mds_session *session, } req->r_request = msg; - rhead = find_old_request_head(msg->front.iov_base, - session->s_con.peer_features); - rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); + lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_ASYNC; if (req->r_parent) flags |= CEPH_MDS_FLAG_WANT_DENTRY; - rhead->flags = cpu_to_le32(flags); - rhead->num_fwd = req->r_num_fwd; - rhead->num_retry = req->r_attempts - 1; + lhead->flags = cpu_to_le32(flags); + lhead->num_fwd = req->r_num_fwd; + lhead->num_retry = req->r_attempts - 1; + if (!old_version) { + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; + nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd); + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); + } doutc(cl, " r_parent = %p\n", req->r_parent); return 0; @@ -3893,34 +3918,20 @@ static void handle_forward(struct ceph_mds_client *mdsc, if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { doutc(cl, "forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); - } else if (fwd_seq <= req->r_num_fwd) { - /* - * The type of 'num_fwd' in ceph 'MClientRequestForward' - * is 'int32_t', while in 'ceph_mds_request_head' the - * type is '__u8'. So in case the request bounces between - * MDSes exceeding 256 times, the client will get stuck. - * - * In this case it's ususally a bug in MDS and continue - * bouncing the request makes no sense. + } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { + /* Avoid inifinite retrying after overflow. * - * In future this could be fixed in ceph code, so avoid - * using the hardcode here. + * The MDS will increase the fwd count and in client side + * if the num_fwd is less than the one saved in request + * that means the MDS is an old version and overflowed of + * 8 bits. */ - int max = sizeof_field(struct ceph_mds_request_head, num_fwd); - max = 1 << (max * BITS_PER_BYTE); - if (req->r_num_fwd >= max) { - mutex_lock(&req->r_fill_mutex); - req->r_err = -EMULTIHOP; - set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); - mutex_unlock(&req->r_fill_mutex); - aborted = true; - pr_warn_ratelimited_client(cl, - "forward tid %llu seq overflow\n", - tid); - } else { - doutc(cl, "forward tid %llu to mds%d - old seq %d <= %d\n", - tid, next_mds, req->r_num_fwd, fwd_seq); - } + mutex_lock(&req->r_fill_mutex); + req->r_err = -EMULTIHOP; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + aborted = true; + pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid); } else { /* resend. forward race not possible; mds would drop */ doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index befbd384428e..717a7399bacb 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -32,8 +32,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_ALTERNATE_NAME, CEPHFS_FEATURE_NOTIFY_SESSION_STATE, CEPHFS_FEATURE_OP_GETVXATTR, + CEPHFS_FEATURE_32BITS_RETRY_FWD, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ @@ -47,6 +48,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_ALTERNATE_NAME, \ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ CEPHFS_FEATURE_OP_GETVXATTR, \ + CEPHFS_FEATURE_32BITS_RETRY_FWD, \ } /* diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 45f8ce61e103..f3b3593254b9 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -484,7 +484,7 @@ union ceph_mds_request_args_ext { #define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ #define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */ -struct ceph_mds_request_head_old { +struct ceph_mds_request_head_legacy { __le64 oldest_client_tid; __le32 mdsmap_epoch; /* on client */ __le32 flags; /* CEPH_MDS_FLAG_* */ @@ -497,9 +497,9 @@ struct ceph_mds_request_head_old { union ceph_mds_request_args args; } __attribute__ ((packed)); -#define CEPH_MDS_REQUEST_HEAD_VERSION 1 +#define CEPH_MDS_REQUEST_HEAD_VERSION 2 -struct ceph_mds_request_head { +struct ceph_mds_request_head_old { __le16 version; /* struct version */ __le64 oldest_client_tid; __le32 mdsmap_epoch; /* on client */ @@ -513,6 +513,23 @@ struct ceph_mds_request_head { union ceph_mds_request_args_ext args; } __attribute__ ((packed)); +struct ceph_mds_request_head { + __le16 version; /* struct version */ + __le64 oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ + __le32 flags; /* CEPH_MDS_FLAG_* */ + __u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */ + __le16 num_releases; /* # include cap/lease release records */ + __le32 op; /* mds op code */ + __le32 caller_uid, caller_gid; + __le64 ino; /* use this ino for openc, mkdir, mknod, + etc. (if replaying) */ + union ceph_mds_request_args_ext args; + + __le32 ext_num_retry; /* new count retry attempts */ + __le32 ext_num_fwd; /* new count fwd attempts */ +} __attribute__ ((packed)); + /* cap/lease release record */ struct ceph_mds_request_release { __le64 ino, cap_id; /* ino and unique cap id */