[v3,0/2] ceph: fix fscrypt_destroy_keyring use-after-free bug

Message ID 20230608022959.45134-1-xiubli@redhat.com

Message

Xiubo Li June 8, 2023, 2:29 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

V3:
- reuse the mount_timeout when unmounting.

Xiubo Li (2):
  ceph: drop the messages from MDS when unmounting
  ceph: just wait the osd requests' callbacks to finish when unmounting

 fs/ceph/addr.c       | 10 +++++
 fs/ceph/caps.c       |  6 ++-
 fs/ceph/mds_client.c | 14 +++++--
 fs/ceph/mds_client.h | 11 +++++-
 fs/ceph/quota.c      | 14 +++----
 fs/ceph/snap.c       | 10 +++--
 fs/ceph/super.c      | 87 +++++++++++++++++++++++++++++++++++++++++++-
 fs/ceph/super.h      |  5 +++
 8 files changed, 140 insertions(+), 17 deletions(-)
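
At a high level, patch 1 gates every incoming MDS message handler
behind a blocker count that ceph_kill_sb() drains after syncing the
filesystem. Below is a minimal, self-contained userspace sketch of that
gate pattern; stop_gate, gate_enter(), gate_exit() and
gate_flush_and_wait() are illustrative names, and a pthread condvar
stands in for the kernel's spinlock + atomic counter + completion (the
kernel code additionally uses a killable wait bounded by mount_timeout):

#include <pthread.h>
#include <stdbool.h>

/* Simplified userspace model of the mdsc stopping gate: message
 * handlers enter the gate before touching inodes; unmount bumps
 * the stage to FLUSHED and waits for in-flight handlers to drain. */
enum { STOPPING_NONE, STOPPING_BEGIN, STOPPING_FLUSHED };

struct stop_gate {
	pthread_mutex_t lock;
	pthread_cond_t  drained;
	int stopping;   /* shutdown stage */
	int blockers;   /* in-flight message handlers */
};

#define STOP_GATE_INIT { PTHREAD_MUTEX_INITIALIZER, \
			 PTHREAD_COND_INITIALIZER, STOPPING_NONE, 0 }

/* Returns false once the fs is flushed: the caller must drop the
 * message instead of handling it. */
static bool gate_enter(struct stop_gate *g)
{
	bool ok;

	pthread_mutex_lock(&g->lock);
	ok = g->stopping < STOPPING_FLUSHED;
	if (ok)
		g->blockers++;
	pthread_mutex_unlock(&g->lock);
	return ok;
}

static void gate_exit(struct stop_gate *g)
{
	pthread_mutex_lock(&g->lock);
	if (--g->blockers == 0 && g->stopping >= STOPPING_FLUSHED)
		pthread_cond_broadcast(&g->drained);
	pthread_mutex_unlock(&g->lock);
}

/* Unmount side: called after sync_filesystem(). New handlers are
 * refused from here on, and we wait for the running ones. */
static void gate_flush_and_wait(struct stop_gate *g)
{
	pthread_mutex_lock(&g->lock);
	g->stopping = STOPPING_FLUSHED;
	while (g->blockers > 0)
		pthread_cond_wait(&g->drained, &g->lock);
	pthread_mutex_unlock(&g->lock);
}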

Comments

Xiubo Li June 12, 2023, 12:43 a.m. UTC | #1
On 6/9/23 22:49, Luís Henriques wrote:
> On Thu, Jun 08, 2023 at 10:29:58AM +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> When unmounting, all the dirty buffers will be flushed and, after
>> the last osd request finishes, the last i_count reference will be
>> released. The unmount then flushes the dirty caps/snaps to the MDSes
>> without waiting for the possible acks. Handling those acks would
>> ihold the inodes while updating the metadata locally, which makes no
>> sense any more at this point, and it causes evict_inodes() to skip
>> these inodes.
>>
>> If encryption is enabled, the kernel generates a warning when
>> removing the encryption keys while the skipped inodes still hold
>> the keyring:
>>
>> WARNING: CPU: 4 PID: 168846 at fs/crypto/keyring.c:242 fscrypt_destroy_keyring+0x7e/0xd0
>> CPU: 4 PID: 168846 Comm: umount Tainted: G S  6.1.0-rc5-ceph-g72ead199864c #1
>> Hardware name: Supermicro SYS-5018R-WR/X10SRW-F, BIOS 2.0 12/17/2015
>> RIP: 0010:fscrypt_destroy_keyring+0x7e/0xd0
>> RSP: 0018:ffffc9000b277e28 EFLAGS: 00010202
>> RAX: 0000000000000002 RBX: ffff88810d52ac00 RCX: ffff88810b56aa00
>> RDX: 0000000080000000 RSI: ffffffff822f3a09 RDI: ffff888108f59000
>> RBP: ffff8881d394fb88 R08: 0000000000000028 R09: 0000000000000000
>> R10: 0000000000000001 R11: 11ff4fe6834fcd91 R12: ffff8881d394fc40
>> R13: ffff888108f59000 R14: ffff8881d394f800 R15: 0000000000000000
>> FS:  00007fd83f6f1080(0000) GS:ffff88885fd00000(0000) knlGS:0000000000000000
>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> CR2: 00007f918d417000 CR3: 000000017f89a005 CR4: 00000000003706e0
>> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>> Call Trace:
>> <TASK>
>> generic_shutdown_super+0x47/0x120
>> kill_anon_super+0x14/0x30
>> ceph_kill_sb+0x36/0x90 [ceph]
>> deactivate_locked_super+0x29/0x60
>> cleanup_mnt+0xb8/0x140
>> task_work_run+0x67/0xb0
>> exit_to_user_mode_prepare+0x23d/0x240
>> syscall_exit_to_user_mode+0x25/0x60
>> do_syscall_64+0x40/0x80
>> entry_SYSCALL_64_after_hwframe+0x63/0xcd
>> RIP: 0033:0x7fd83dc39e9b
>>
>> Later the kernel will crash when iput() evicts those inodes and
>> dereferences "sb->s_master_keys", which has already been released
>> by generic_shutdown_super().
>>
>> URL: https://tracker.ceph.com/issues/58126
>> URL: https://tracker.ceph.com/issues/59162
>> Reviewed-by: Milind Changire <mchangir@redhat.com>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/caps.c       |  6 +++-
>>   fs/ceph/mds_client.c | 14 ++++++--
>>   fs/ceph/mds_client.h | 11 ++++++-
>>   fs/ceph/quota.c      | 14 ++++----
>>   fs/ceph/snap.c       | 10 +++---
>>   fs/ceph/super.c      | 76 +++++++++++++++++++++++++++++++++++++++++++-
>>   fs/ceph/super.h      |  3 ++
>>   7 files changed, 117 insertions(+), 17 deletions(-)
>>
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index 5324727a26d8..246999780800 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -4222,6 +4222,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
>>   
>>   	dout("handle_caps from mds%d\n", session->s_mds);
>>   
>> +	if (!ceph_inc_mds_stopping_blocker(mdsc, session))
>> +		return;
>> +
>>   	/* decode */
>>   	end = msg->front.iov_base + msg->front.iov_len;
>>   	if (msg->front.iov_len < sizeof(*h))
>> @@ -4323,7 +4326,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
>>   	     vino.snap, inode);
>>   
>>   	mutex_lock(&session->s_mutex);
>> -	inc_session_sequence(session);
>>   	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
>>   	     (unsigned)seq);
>>   
>> @@ -4435,6 +4437,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
>>   done_unlocked:
>>   	iput(inode);
>>   out:
>> +	ceph_dec_mds_stopping_blocker(mdsc);
>> +
>>   	ceph_put_string(extra_info.pool_ns);
>>   
>>   	/* Defer closing the sessions after s_mutex lock being released */
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index cd7b25f6b908..679f433a60a8 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -4902,6 +4902,9 @@ static void handle_lease(struct ceph_mds_client *mdsc,
>>   
>>   	dout("handle_lease from mds%d\n", mds);
>>   
>> +	if (!ceph_inc_mds_stopping_blocker(mdsc, session))
>> +		return;
>> +
>>   	/* decode */
>>   	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
>>   		goto bad;
>> @@ -4920,8 +4923,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
>>   	     dname.len, dname.name);
>>   
>>   	mutex_lock(&session->s_mutex);
>> -	inc_session_sequence(session);
>> -
>>   	if (!inode) {
>>   		dout("handle_lease no inode %llx\n", vino.ino);
>>   		goto release;
>> @@ -4983,9 +4984,13 @@ static void handle_lease(struct ceph_mds_client *mdsc,
>>   out:
>>   	mutex_unlock(&session->s_mutex);
>>   	iput(inode);
>> +
>> +	ceph_dec_mds_stopping_blocker(mdsc);
>>   	return;
>>   
>>   bad:
>> +	ceph_dec_mds_stopping_blocker(mdsc);
>> +
>>   	pr_err("corrupt lease message\n");
>>   	ceph_msg_dump(msg);
>>   }
>> @@ -5181,6 +5186,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>>   	}
>>   
>>   	init_completion(&mdsc->safe_umount_waiters);
>> +	spin_lock_init(&mdsc->stopping_lock);
>> +	atomic_set(&mdsc->stopping_blockers, 0);
>> +	init_completion(&mdsc->stopping_waiter);
>>   	init_waitqueue_head(&mdsc->session_close_wq);
>>   	INIT_LIST_HEAD(&mdsc->waiting_for_map);
>>   	mdsc->quotarealms_inodes = RB_ROOT;
>> @@ -5295,7 +5303,7 @@ void send_flush_mdlog(struct ceph_mds_session *s)
>>   void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
>>   {
>>   	dout("pre_umount\n");
>> -	mdsc->stopping = 1;
>> +	mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
>>   
>>   	ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
>>   	ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index 82165f09a516..351d92f7fc4f 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -398,6 +398,11 @@ struct cap_wait {
>>   	int			want;
>>   };
>>   
>> +enum {
>> +	CEPH_MDSC_STOPPING_BEGIN = 1,
>> +	CEPH_MDSC_STOPPING_FLUSHED = 2,
>> +};
>> +
>>   /*
>>    * mds client state
>>    */
>> @@ -414,7 +419,11 @@ struct ceph_mds_client {
>>   	struct ceph_mds_session **sessions;    /* NULL for mds if no session */
>>   	atomic_t		num_sessions;
>>   	int                     max_sessions;  /* len of sessions array */
>> -	int                     stopping;      /* true if shutting down */
>> +
>> +	spinlock_t              stopping_lock;  /* protect stopping and blockers */
>> +	int                     stopping;      /* the stage of shutting down */
>> +	atomic_t                stopping_blockers;
>> +	struct completion	stopping_waiter;
>>   
>>   	atomic64_t		quotarealms_count; /* # realms with quota */
>>   	/*
>> diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
>> index 64592adfe48f..f7fcf7f08ec6 100644
>> --- a/fs/ceph/quota.c
>> +++ b/fs/ceph/quota.c
>> @@ -47,25 +47,23 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
>>   	struct inode *inode;
>>   	struct ceph_inode_info *ci;
>>   
>> +	if (!ceph_inc_mds_stopping_blocker(mdsc, session))
>> +		return;
>> +
>>   	if (msg->front.iov_len < sizeof(*h)) {
>>   		pr_err("%s corrupt message mds%d len %d\n", __func__,
>>   		       session->s_mds, (int)msg->front.iov_len);
>>   		ceph_msg_dump(msg);
>> -		return;
>> +		goto out;
>>   	}
>>   
>> -	/* increment msg sequence number */
>> -	mutex_lock(&session->s_mutex);
>> -	inc_session_sequence(session);
>> -	mutex_unlock(&session->s_mutex);
>> -
>>   	/* lookup inode */
>>   	vino.ino = le64_to_cpu(h->ino);
>>   	vino.snap = CEPH_NOSNAP;
>>   	inode = ceph_find_inode(sb, vino);
>>   	if (!inode) {
>>   		pr_warn("Failed to find inode %llu\n", vino.ino);
>> -		return;
>> +		goto out;
>>   	}
>>   	ci = ceph_inode(inode);
>>   
>> @@ -78,6 +76,8 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
>>   	spin_unlock(&ci->i_ceph_lock);
>>   
>>   	iput(inode);
>> +out:
>> +	ceph_dec_mds_stopping_blocker(mdsc);
>>   }
>>   
>>   static struct ceph_quotarealm_inode *
>> diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
>> index 8c5a949d8ca8..f94b0745957c 100644
>> --- a/fs/ceph/snap.c
>> +++ b/fs/ceph/snap.c
>> @@ -1011,6 +1011,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
>>   	int locked_rwsem = 0;
>>   	bool close_sessions = false;
>>   
>> +	if (!ceph_inc_mds_stopping_blocker(mdsc, session))
>> +		return;
>> +
>>   	/* decode */
>>   	if (msg->front.iov_len < sizeof(*h))
>>   		goto bad;
>> @@ -1026,10 +1029,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
>>   	dout("%s from mds%d op %s split %llx tracelen %d\n", __func__,
>>   	     mds, ceph_snap_op_name(op), split, trace_len);
>>   
>> -	mutex_lock(&session->s_mutex);
>> -	inc_session_sequence(session);
>> -	mutex_unlock(&session->s_mutex);
>> -
>>   	down_write(&mdsc->snap_rwsem);
>>   	locked_rwsem = 1;
>>   
>> @@ -1147,12 +1146,15 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
>>   	up_write(&mdsc->snap_rwsem);
>>   
>>   	flush_snaps(mdsc);
>> +	ceph_dec_mds_stopping_blocker(mdsc);
>>   	return;
>>   
>>   bad:
>>   	pr_err("%s corrupt snap message from mds%d\n", __func__, mds);
>>   	ceph_msg_dump(msg);
>>   out:
>> +	ceph_dec_mds_stopping_blocker(mdsc);
>> +
>>   	if (locked_rwsem)
>>   		up_write(&mdsc->snap_rwsem);
>>
> I'm not really sure whether this can cause any trouble, but I *think*
> the call to ceph_dec_mds_stopping_blocker() should be done after the
> up_write(&mdsc->snap_rwsem).  Since ceph_inc_mds_stopping_blocker() is
> done before the down_write(), that would at least make the ordering
> more logical.

Thanks Luís. This is okay as it stands, but I will adjust the order so 
the dec mirrors the inc and the logic stays consistent.

-Xiubo
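
For reference, the reordering agreed on here would make the snap.c exit
path look roughly like the sketch below; this illustrates the intended
order only, not the actual v4 hunk:

out:
	if (locked_rwsem)
		up_write(&mdsc->snap_rwsem);
	/* Drop the blocker only after releasing snap_rwsem, mirroring
	 * the inc-before-down_write ordering at the handler entry. */
	ceph_dec_mds_stopping_blocker(mdsc);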

> Other than this, feel free to add my:
>
> Tested-by: Luís Henriques <lhenriques@suse.de>
> Reviewed-by: Luís Henriques <lhenriques@suse.de>
>
> Cheers,
> --
> Luís
>
>
>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>> index b9dd2fa36d8b..48da49e21466 100644
>> --- a/fs/ceph/super.c
>> +++ b/fs/ceph/super.c
>> @@ -1461,15 +1461,89 @@ static int ceph_init_fs_context(struct fs_context *fc)
>>   	return -ENOMEM;
>>   }
>>   
>> +/*
>> + * Return true if it successfully increases the blocker counter,
>> + * or false if the mdsc has reached the stopping-flushed stage.
>> + */
>> +static bool __inc_stopping_blocker(struct ceph_mds_client *mdsc)
>> +{
>> +	spin_lock(&mdsc->stopping_lock);
>> +	if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED) {
>> +		spin_unlock(&mdsc->stopping_lock);
>> +		return false;
>> +	}
>> +	atomic_inc(&mdsc->stopping_blockers);
>> +	spin_unlock(&mdsc->stopping_lock);
>> +	return true;
>> +}
>> +
>> +static void __dec_stopping_blocker(struct ceph_mds_client *mdsc)
>> +{
>> +	spin_lock(&mdsc->stopping_lock);
>> +	if (!atomic_dec_return(&mdsc->stopping_blockers) &&
>> +	    mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
>> +		complete_all(&mdsc->stopping_waiter);
>> +	spin_unlock(&mdsc->stopping_lock);
>> +}
>> +
>> +/* For metadata IO requests */
>> +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc,
>> +				   struct ceph_mds_session *session)
>> +{
>> +	mutex_lock(&session->s_mutex);
>> +	inc_session_sequence(session);
>> +	mutex_unlock(&session->s_mutex);
>> +
>> +	return __inc_stopping_blocker(mdsc);
>> +}
>> +
>> +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc)
>> +{
>> +	__dec_stopping_blocker(mdsc);
>> +}
>> +
>>   static void ceph_kill_sb(struct super_block *s)
>>   {
>>   	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
>> +	struct ceph_mds_client *mdsc = fsc->mdsc;
>> +	bool wait;
>>   
>>   	dout("kill_sb %p\n", s);
>>   
>> -	ceph_mdsc_pre_umount(fsc->mdsc);
>> +	ceph_mdsc_pre_umount(mdsc);
>>   	flush_fs_workqueues(fsc);
>>   
>> +	/*
>> +	 * Though kill_anon_super() will eventually trigger
>> +	 * sync_filesystem() anyway, we still need to do it here and
>> +	 * then bump the shutdown stage. This allows us to drop any
>> +	 * further messages from the MDSes, which would only bump the
>> +	 * inodes' i_count reference counters to no useful end at
>> +	 * this point.
>> +	 *
>> +	 * Without this, evicting the inodes may fail in
>> +	 * kill_anon_super(), which triggers a warning when destroying
>> +	 * the fscrypt keyring and can then trigger a further crash
>> +	 * in the ceph module when iput() tries to evict the inodes
>> +	 * later.
>> +	 */
>> +	sync_filesystem(s);
>> +
>> +	spin_lock(&mdsc->stopping_lock);
>> +	mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
>> +	wait = !!atomic_read(&mdsc->stopping_blockers);
>> +	spin_unlock(&mdsc->stopping_lock);
>> +
>> +	if (wait && atomic_read(&mdsc->stopping_blockers)) {
>> +		long timeleft = wait_for_completion_killable_timeout(
>> +				        &mdsc->stopping_waiter,
>> +					fsc->client->options->mount_timeout);
>> +		if (!timeleft) /* timed out */
>> +			pr_warn("umount timed out, %ld\n", timeleft);
>> +		else if (timeleft < 0) /* killed */
>> +			pr_warn("umount was killed, %ld\n", timeleft);
>> +	}
>> +
>>   	kill_anon_super(s);
>>   
>>   	fsc->client->extra_mon_dispatch = NULL;
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 47d86068b92a..789b47c4e2b3 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -1399,4 +1399,7 @@ extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
>>   				     struct kstatfs *buf);
>>   extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc);
>>   
>> +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc,
>> +			       struct ceph_mds_session *session);
>> +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc);
>>   #endif /* _FS_CEPH_SUPER_H */
>> -- 
>> 2.40.1
>>
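
Taken together, every MDS message handler touched by this series
(ceph_handle_caps, handle_lease, ceph_handle_quota, ceph_handle_snap)
follows the same pattern. Below is a condensed sketch of what the hunks
above add; handle_some_mds_message is a placeholder name and the
decode/apply body is elided:

static void handle_some_mds_message(struct ceph_mds_client *mdsc,
				    struct ceph_mds_session *session,
				    struct ceph_msg *msg)
{
	/*
	 * Bumps the session sequence and takes the stopping blocker,
	 * or returns false once the fs has been flushed, in which
	 * case the message is simply dropped.
	 */
	if (!ceph_inc_mds_stopping_blocker(mdsc, session))
		return;

	/* ... decode the message and update local state ... */

	ceph_dec_mds_stopping_blocker(mdsc);
}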