diff mbox series

scsi: target: tcmu: Fix wrong uio handling causing big memory leak

Message ID 20201218141534.9918-1-bostroesser@gmail.com
State New
Headers show
Series scsi: target: tcmu: Fix wrong uio handling causing big memory leak | expand

Commit Message

Bodo Stroesser Dec. 18, 2020, 2:15 p.m. UTC
tcmu calls uio_unregister_device from tcmu_destroy_device.
After that uio will never call tcmu_release for this device.
If userspace still had the uio device open and / or mmap'ed
during uio_unregister_device, tcmu_release will not be called and
udev->kref will never go down to 0.

So tcmu in this situation will not free:
 - cmds or tmrs still in the queue or the ring
 - all pages allocated for mailbox and cmd_ring (vmalloc)
 - all pages allocated for data space
 - the udev itself

The vmalloc'ed area is 8 MB, amount of pages allocated for data
space depends on previous usage of the tcmu device. Theoretically
that can be up to 1GB.

This patch moves the call of uio_unregister_device to the
beginning of tcmu_dev_kref_release, which is called when
udev->kref drops down to zero. So we know, that uio device is
closed and unmap'ed.

In case tcmu_realease drops the last kref, we would end up doing
the uio_unregister_device from a function called by uio_release,
which causes the process to block forever.
So we now do the kref_put from new worker function
tcmu_release_work_fn which is scheduled by tcmu_release.

To make userspace still aware of the device being deleted,
tcmu_destroy_device instead of uio_unregister_device now does:
 - sets a bit in udev, so that tcmu_open and tcmu_mmap can check
   and fail with -EIO
 - resets udev->uio_info->irq to 0, so uio will fail read() and
   write() with -EIO
 - wakes up userspace possibly waiting in read(), so the read
   fails with -EIO

Avoid possible races in tcmu_open by replacing kref_get with
kref_get_unless_zero.

Signed-off-by: Bodo Stroesser <bostroesser@gmail.com>
---
 drivers/target/target_core_user.c | 54 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 4 deletions(-)

Comments

Bodo Stroesser Jan. 11, 2021, 6:22 p.m. UTC | #1
I'm sending a gentle ping for the below patch.

Especially I'm wondering whether the way how tcmu_destroy_device()
now 'stops' the uio device is ok. Should we better have something
like uio_stop_device() instead?

Thank you,
Bodo


On 18.12.20 15:15, Bodo Stroesser wrote:
> tcmu calls uio_unregister_device from tcmu_destroy_device.

> After that uio will never call tcmu_release for this device.

> If userspace still had the uio device open and / or mmap'ed

> during uio_unregister_device, tcmu_release will not be called and

> udev->kref will never go down to 0.

> 

> So tcmu in this situation will not free:

>   - cmds or tmrs still in the queue or the ring

>   - all pages allocated for mailbox and cmd_ring (vmalloc)

>   - all pages allocated for data space

>   - the udev itself

> 

> The vmalloc'ed area is 8 MB, amount of pages allocated for data

> space depends on previous usage of the tcmu device. Theoretically

> that can be up to 1GB.

> 

> This patch moves the call of uio_unregister_device to the

> beginning of tcmu_dev_kref_release, which is called when

> udev->kref drops down to zero. So we know, that uio device is

> closed and unmap'ed.

> 

> In case tcmu_realease drops the last kref, we would end up doing

> the uio_unregister_device from a function called by uio_release,

> which causes the process to block forever.

> So we now do the kref_put from new worker function

> tcmu_release_work_fn which is scheduled by tcmu_release.

> 

> To make userspace still aware of the device being deleted,

> tcmu_destroy_device instead of uio_unregister_device now does:

>   - sets a bit in udev, so that tcmu_open and tcmu_mmap can check

>     and fail with -EIO

>   - resets udev->uio_info->irq to 0, so uio will fail read() and

>     write() with -EIO

>   - wakes up userspace possibly waiting in read(), so the read

>     fails with -EIO

> 

> Avoid possible races in tcmu_open by replacing kref_get with

> kref_get_unless_zero.

> 

> Signed-off-by: Bodo Stroesser <bostroesser@gmail.com>

> ---

>   drivers/target/target_core_user.c | 54 ++++++++++++++++++++++++++++++++++++---

>   1 file changed, 50 insertions(+), 4 deletions(-)

> 

> diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c

> index 0458bfb143f8..080760985ebf 100644

> --- a/drivers/target/target_core_user.c

> +++ b/drivers/target/target_core_user.c

> @@ -21,6 +21,7 @@

>   #include <linux/configfs.h>

>   #include <linux/mutex.h>

>   #include <linux/workqueue.h>

> +#include <linux/delay.h>

>   #include <net/genetlink.h>

>   #include <scsi/scsi_common.h>

>   #include <scsi/scsi_proto.h>

> @@ -109,6 +110,7 @@ struct tcmu_nl_cmd {

>   struct tcmu_dev {

>   	struct list_head node;

>   	struct kref kref;

> +	struct work_struct release_work;

>   

>   	struct se_device se_dev;

>   

> @@ -119,6 +121,7 @@ struct tcmu_dev {

>   #define TCMU_DEV_BIT_BROKEN 1

>   #define TCMU_DEV_BIT_BLOCKED 2

>   #define TCMU_DEV_BIT_TMR_NOTIFY 3

> +#define TCMU_DEV_BIT_GOING_DOWN 4

>   	unsigned long flags;

>   

>   	struct uio_info uio_info;

> @@ -1527,6 +1530,8 @@ static void tcmu_detach_hba(struct se_hba *hba)

>   	hba->hba_ptr = NULL;

>   }

>   

> +static void tcmu_release_work_fn(struct work_struct *work);

> +

>   static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)

>   {

>   	struct tcmu_dev *udev;

> @@ -1542,6 +1547,8 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)

>   		return NULL;

>   	}

>   

> +	INIT_WORK(&udev->release_work, tcmu_release_work_fn);

> +

>   	udev->hba = hba;

>   	udev->cmd_time_out = TCMU_TIME_OUT;

>   	udev->qfull_time_out = -1;

> @@ -1719,6 +1726,9 @@ static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma)

>   {

>   	struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info);

>   

> +	if (test_bit(TCMU_DEV_BIT_GOING_DOWN, &udev->flags))

> +		return -EIO;

> +

>   	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;

>   	vma->vm_ops = &tcmu_vm_ops;

>   

> @@ -1735,12 +1745,17 @@ static int tcmu_open(struct uio_info *info, struct inode *inode)

>   {

>   	struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info);

>   

> +	if (test_bit(TCMU_DEV_BIT_GOING_DOWN, &udev->flags))

> +		return -EIO;

> +

>   	/* O_EXCL not supported for char devs, so fake it? */

>   	if (test_and_set_bit(TCMU_DEV_BIT_OPEN, &udev->flags))

>   		return -EBUSY;

>   

>   	udev->inode = inode;

> -	kref_get(&udev->kref);

> +	if (!kref_get_unless_zero(&udev->kref))

> +		/* Race between open and device going down */

> +		return -EIO;

>   

>   	pr_debug("open\n");

>   

> @@ -1799,6 +1814,8 @@ static void tcmu_dev_kref_release(struct kref *kref)

>   	bool all_expired = true;

>   	int i;

>   

> +	uio_unregister_device(&udev->uio_info);

> +

>   	vfree(udev->mb_addr);

>   	udev->mb_addr = NULL;

>   

> @@ -1827,6 +1844,15 @@ static void tcmu_dev_kref_release(struct kref *kref)

>   	call_rcu(&dev->rcu_head, tcmu_dev_call_rcu);

>   }

>   

> +static void tcmu_release_work_fn(struct work_struct *work)

> +{

> +	struct tcmu_dev *udev = container_of(work, struct tcmu_dev,

> +					     release_work);

> +

> +	/* release ref from open */

> +	kref_put(&udev->kref, tcmu_dev_kref_release);

> +}

> +

>   static int tcmu_release(struct uio_info *info, struct inode *inode)

>   {

>   	struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info);

> @@ -1834,8 +1860,17 @@ static int tcmu_release(struct uio_info *info, struct inode *inode)

>   	clear_bit(TCMU_DEV_BIT_OPEN, &udev->flags);

>   

>   	pr_debug("close\n");

> -	/* release ref from open */

> -	kref_put(&udev->kref, tcmu_dev_kref_release);

> +

> +	/*

> +	 * We must not put kref directly from here, since dropping down kref to

> +	 * zero would implicitly call tcmu_dev_kref_release, which calls

> +	 * uio_unregister_device --> process hangs forever, since tcmu_release

> +	 * is called from uio.

> +	 * So we leave it to tcmu_release_work_fn to put the kref.

> +	 */

> +	while (!schedule_work(&udev->release_work))

> +		usleep_range(1000, 5000);

> +

>   	return 0;

>   }

>   

> @@ -2166,7 +2201,18 @@ static void tcmu_destroy_device(struct se_device *dev)

>   

>   	tcmu_send_dev_remove_event(udev);

>   

> -	uio_unregister_device(&udev->uio_info);

> +	/*

> +	 * We must not call uio_unregister_device here. If there is a userspace

> +	 * process with open or mmap'ed uio device, uio would not call

> +	 * tcmu_release on later unmap or close.

> +	 */

> +

> +	/* reset uio_info->irq, so uio will reject read() and write() */

> +	udev->uio_info.irq = 0;

> +	/* Set bit, so we can reject later calls to tcmu_open and tcmu_mmap */

> +	set_bit(TCMU_DEV_BIT_GOING_DOWN, &udev->flags);

> +	/* wake up possible sleeper in uio_read(), it will return -EIO */

> +	uio_event_notify(&udev->uio_info);

>   

>   	/* release ref from configure */

>   	kref_put(&udev->kref, tcmu_dev_kref_release);

>
Mike Christie Jan. 12, 2021, 6:36 p.m. UTC | #2
On 12/18/20 8:15 AM, Bodo Stroesser wrote:
> tcmu calls uio_unregister_device from tcmu_destroy_device.

> After that uio will never call tcmu_release for this device.

> If userspace still had the uio device open and / or mmap'ed

> during uio_unregister_device, tcmu_release will not be called and

> udev->kref will never go down to 0.

> 


I didn't get why the release function is not called if you call
uio_unregister_device while a device is open. Does the device_destroy call in
uio_unregister_device completely free the device or does it set some bits so
uio_release is not called later?

Do other drivers hit this? Should uio have refcounting so uio_release is called
when the last ref (from userspace open/close/mmap calls and from the kernel by
drivers like target_core_user) is done?
Bodo Stroesser Jan. 13, 2021, 5:59 p.m. UTC | #3
On 12.01.21 19:36, Mike Christie wrote:
> On 12/18/20 8:15 AM, Bodo Stroesser wrote:

>> tcmu calls uio_unregister_device from tcmu_destroy_device.

>> After that uio will never call tcmu_release for this device.

>> If userspace still had the uio device open and / or mmap'ed

>> during uio_unregister_device, tcmu_release will not be called and

>> udev->kref will never go down to 0.

>>

> 

> I didn't get why the release function is not called if you call

> uio_unregister_device while a device is open. Does the device_destroy call in

> uio_unregister_device completely free the device or does it set some bits so

> uio_release is not called later?


uio_unregister_device() resets the pointer (idev->info) to the struct 
uio_info which tcmu provided in uio_register_device().
The uio device itself AFAICS is kept while it is open / mmap'ed.
But no matter what userspace does, uio will not call tcmu's callbacks
since info pointer now is NULL.

When userspace finally closes the uio device, uio_release is called, but
tcmu_release can not be called.

> 

> Do other drivers hit this? Should uio have refcounting so uio_release is called

> when the last ref (from userspace open/close/mmap calls and from the kernel by

> drivers like target_core_user) is done?

> 


To be honest I don't know exactly.
tcmu seems to be a special case in that is has it's own mmap callback.
That allows us to map pages allocated by tcmu.
As long as userspace still holds the mapping, we should not unmap those
pages, because userspace then could get killed by SIGSEGV.
So we have to wait for userspace closing uio before we may unmap and
free the pages.
Mike Christie Jan. 13, 2021, 9:04 p.m. UTC | #4
On 1/13/21 11:59 AM, Bodo Stroesser wrote:
> On 12.01.21 19:36, Mike Christie wrote:

>> On 12/18/20 8:15 AM, Bodo Stroesser wrote:

>>> tcmu calls uio_unregister_device from tcmu_destroy_device.

>>> After that uio will never call tcmu_release for this device.

>>> If userspace still had the uio device open and / or mmap'ed

>>> during uio_unregister_device, tcmu_release will not be called and

>>> udev->kref will never go down to 0.

>>>

>>

>> I didn't get why the release function is not called if you call

>> uio_unregister_device while a device is open. Does the device_destroy call in

>> uio_unregister_device completely free the device or does it set some bits so

>> uio_release is not called later?

> 

> uio_unregister_device() resets the pointer (idev->info) to the struct uio_info which tcmu provided in uio_register_device().

> The uio device itself AFAICS is kept while it is open / mmap'ed.

> But no matter what userspace does, uio will not call tcmu's callbacks

> since info pointer now is NULL.

> 

> When userspace finally closes the uio device, uio_release is called, but

> tcmu_release can not be called.

> 

>>

>> Do other drivers hit this? Should uio have refcounting so uio_release is called

>> when the last ref (from userspace open/close/mmap calls and from the kernel by

>> drivers like target_core_user) is done?

>>

> 

> To be honest I don't know exactly.

> tcmu seems to be a special case in that is has it's own mmap callback.

> That allows us to map pages allocated by tcmu.

> As long as userspace still holds the mapping, we should not unmap those

> pages, because userspace then could get killed by SIGSEGV.

> So we have to wait for userspace closing uio before we may unmap and

> free the pages.



If we removed the clearing of idev->info in uio_unregister_device, and
then moved the idev->info->release call from uio_release to
uio_device_release it would work like you need right? The release callback
would always be called and called when userspace has dropped it's refs.
You need to also fix up the module refcount and some other bits because
it looks like uio uses the uio->info check to determine if the device is
being removed.

I don't know if that is the correct approach. It looks like non
target_core_user drivers could hit a similar issue. It seems like drivers
like qedi/bnx2i could hit the issue if their port is removed from the
kernel before their uio daemon closes the device. However, they also
could do a driver specific fix and handle the issue by adding some extra
kernel/userspace bits to sync the port removal.
Bodo Stroesser Jan. 14, 2021, 4:50 p.m. UTC | #5
On 13.01.21 22:04, Mike Christie wrote:
> On 1/13/21 11:59 AM, Bodo Stroesser wrote:

>> On 12.01.21 19:36, Mike Christie wrote:

>>> On 12/18/20 8:15 AM, Bodo Stroesser wrote:

>>>> tcmu calls uio_unregister_device from tcmu_destroy_device.

>>>> After that uio will never call tcmu_release for this device.

>>>> If userspace still had the uio device open and / or mmap'ed

>>>> during uio_unregister_device, tcmu_release will not be called and

>>>> udev->kref will never go down to 0.

>>>>

>>>

>>> I didn't get why the release function is not called if you call

>>> uio_unregister_device while a device is open. Does the device_destroy call in

>>> uio_unregister_device completely free the device or does it set some bits so

>>> uio_release is not called later?

>>

>> uio_unregister_device() resets the pointer (idev->info) to the struct uio_info which tcmu provided in uio_register_device().

>> The uio device itself AFAICS is kept while it is open / mmap'ed.

>> But no matter what userspace does, uio will not call tcmu's callbacks

>> since info pointer now is NULL.

>>

>> When userspace finally closes the uio device, uio_release is called, but

>> tcmu_release can not be called.

>>

>>>

>>> Do other drivers hit this? Should uio have refcounting so uio_release is called

>>> when the last ref (from userspace open/close/mmap calls and from the kernel by

>>> drivers like target_core_user) is done?

>>>

>>

>> To be honest I don't know exactly.

>> tcmu seems to be a special case in that is has it's own mmap callback.

>> That allows us to map pages allocated by tcmu.

>> As long as userspace still holds the mapping, we should not unmap those

>> pages, because userspace then could get killed by SIGSEGV.

>> So we have to wait for userspace closing uio before we may unmap and

>> free the pages.

> 

> 

> If we removed the clearing of idev->info in uio_unregister_device, and

> then moved the idev->info->release call from uio_release to

> uio_device_release it would work like you need right? The release callback

> would always be called and called when userspace has dropped it's refs.

> You need to also fix up the module refcount and some other bits because

> it looks like uio uses the uio->info check to determine if the device is

> being removed.


I fear that would not work, because uio_release must be called always,
no matter whether userspace closes the device before or after
uio_unregister_device.

But we could add a new callback pointer 'late_release' to struct
uio_info and struct uio_device. During uio_register_device we would
copy the pointer from info to idev.

If info == NULL, uio_release calls idev->late_release if != NULL.

tcmu would of course set info->release and ->late_release both to
tcmu_release.

> 

> I don't know if that is the correct approach. It looks like non

> target_core_user drivers could hit a similar issue. It seems like drivers

> like qedi/bnx2i could hit the issue if their port is removed from the

> kernel before their uio daemon closes the device. However, they also

> could do a driver specific fix and handle the issue by adding some extra

> kernel/userspace bits to sync the port removal.

> 


I had a closer look into qedi. I assume there might be a leak also,
because qedi_uio_close calls "qedi_ll2_free_skbs(qedi)".

Unfortunately my above proposal would not work here without adding a
new refcount to qedi_uio_dev, because currently in __qedi_free_uio
the udev is freed shortly after uio_unregister_device. So later calls
of qedi_uio_close(udev) would be harmful.

But I guess the leak can be fixed by adding two lines after the
uio_unregister_device() in __qedi_free_uio:

	if (test_bit(UIO_DEV_OPENED, &udev->qedi->flags)
		qedi_ll2_free_skbs(qedi);
diff mbox series

Patch

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 0458bfb143f8..080760985ebf 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -21,6 +21,7 @@ 
 #include <linux/configfs.h>
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
+#include <linux/delay.h>
 #include <net/genetlink.h>
 #include <scsi/scsi_common.h>
 #include <scsi/scsi_proto.h>
@@ -109,6 +110,7 @@  struct tcmu_nl_cmd {
 struct tcmu_dev {
 	struct list_head node;
 	struct kref kref;
+	struct work_struct release_work;
 
 	struct se_device se_dev;
 
@@ -119,6 +121,7 @@  struct tcmu_dev {
 #define TCMU_DEV_BIT_BROKEN 1
 #define TCMU_DEV_BIT_BLOCKED 2
 #define TCMU_DEV_BIT_TMR_NOTIFY 3
+#define TCMU_DEV_BIT_GOING_DOWN 4
 	unsigned long flags;
 
 	struct uio_info uio_info;
@@ -1527,6 +1530,8 @@  static void tcmu_detach_hba(struct se_hba *hba)
 	hba->hba_ptr = NULL;
 }
 
+static void tcmu_release_work_fn(struct work_struct *work);
+
 static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 {
 	struct tcmu_dev *udev;
@@ -1542,6 +1547,8 @@  static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 		return NULL;
 	}
 
+	INIT_WORK(&udev->release_work, tcmu_release_work_fn);
+
 	udev->hba = hba;
 	udev->cmd_time_out = TCMU_TIME_OUT;
 	udev->qfull_time_out = -1;
@@ -1719,6 +1726,9 @@  static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma)
 {
 	struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info);
 
+	if (test_bit(TCMU_DEV_BIT_GOING_DOWN, &udev->flags))
+		return -EIO;
+
 	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &tcmu_vm_ops;
 
@@ -1735,12 +1745,17 @@  static int tcmu_open(struct uio_info *info, struct inode *inode)
 {
 	struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info);
 
+	if (test_bit(TCMU_DEV_BIT_GOING_DOWN, &udev->flags))
+		return -EIO;
+
 	/* O_EXCL not supported for char devs, so fake it? */
 	if (test_and_set_bit(TCMU_DEV_BIT_OPEN, &udev->flags))
 		return -EBUSY;
 
 	udev->inode = inode;
-	kref_get(&udev->kref);
+	if (!kref_get_unless_zero(&udev->kref))
+		/* Race between open and device going down */
+		return -EIO;
 
 	pr_debug("open\n");
 
@@ -1799,6 +1814,8 @@  static void tcmu_dev_kref_release(struct kref *kref)
 	bool all_expired = true;
 	int i;
 
+	uio_unregister_device(&udev->uio_info);
+
 	vfree(udev->mb_addr);
 	udev->mb_addr = NULL;
 
@@ -1827,6 +1844,15 @@  static void tcmu_dev_kref_release(struct kref *kref)
 	call_rcu(&dev->rcu_head, tcmu_dev_call_rcu);
 }
 
+static void tcmu_release_work_fn(struct work_struct *work)
+{
+	struct tcmu_dev *udev = container_of(work, struct tcmu_dev,
+					     release_work);
+
+	/* release ref from open */
+	kref_put(&udev->kref, tcmu_dev_kref_release);
+}
+
 static int tcmu_release(struct uio_info *info, struct inode *inode)
 {
 	struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info);
@@ -1834,8 +1860,17 @@  static int tcmu_release(struct uio_info *info, struct inode *inode)
 	clear_bit(TCMU_DEV_BIT_OPEN, &udev->flags);
 
 	pr_debug("close\n");
-	/* release ref from open */
-	kref_put(&udev->kref, tcmu_dev_kref_release);
+
+	/*
+	 * We must not put kref directly from here, since dropping down kref to
+	 * zero would implicitly call tcmu_dev_kref_release, which calls
+	 * uio_unregister_device --> process hangs forever, since tcmu_release
+	 * is called from uio.
+	 * So we leave it to tcmu_release_work_fn to put the kref.
+	 */
+	while (!schedule_work(&udev->release_work))
+		usleep_range(1000, 5000);
+
 	return 0;
 }
 
@@ -2166,7 +2201,18 @@  static void tcmu_destroy_device(struct se_device *dev)
 
 	tcmu_send_dev_remove_event(udev);
 
-	uio_unregister_device(&udev->uio_info);
+	/*
+	 * We must not call uio_unregister_device here. If there is a userspace
+	 * process with open or mmap'ed uio device, uio would not call
+	 * tcmu_release on later unmap or close.
+	 */
+
+	/* reset uio_info->irq, so uio will reject read() and write() */
+	udev->uio_info.irq = 0;
+	/* Set bit, so we can reject later calls to tcmu_open and tcmu_mmap */
+	set_bit(TCMU_DEV_BIT_GOING_DOWN, &udev->flags);
+	/* wake up possible sleeper in uio_read(), it will return -EIO */
+	uio_event_notify(&udev->uio_info);
 
 	/* release ref from configure */
 	kref_put(&udev->kref, tcmu_dev_kref_release);