diff mbox series

[6/6] efivarfs: fix error on write to new variable leaving remnants

Message ID 20241210170224.19159-7-James.Bottomley@HansenPartnership.com
State New
Headers show
Series convert efivarfs to manage object data correctly | expand

Commit Message

James Bottomley Dec. 10, 2024, 5:02 p.m. UTC
Make variable cleanup go through the fops release mechanism and use
zero inode size as the indicator to delete the file.  Since all EFI
variables must have an initial u32 attribute, zero size occurs either
because the update deleted the variable or because an unsuccessful
write after create caused the size never to be set in the first place.

Even though this fixes the bug that a create either not followed by a
write or followed by a write that errored would leave a remnant file
for the variable, the file will appear momentarily globally visible
until the close of the fd deletes it.  This is safe because the normal
filesystem operations will mediate any races; however, it is still
possible for a directory listing at that instant between create and
close contain a variable that doesn't exist in the EFI table.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 fs/efivarfs/file.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

Comments

Christian Brauner Dec. 11, 2024, 11:16 a.m. UTC | #1
On Tue, Dec 10, 2024 at 12:02:24PM -0500, James Bottomley wrote:
> Make variable cleanup go through the fops release mechanism and use
> zero inode size as the indicator to delete the file.  Since all EFI
> variables must have an initial u32 attribute, zero size occurs either
> because the update deleted the variable or because an unsuccessful
> write after create caused the size never to be set in the first place.
> 
> Even though this fixes the bug that a create either not followed by a
> write or followed by a write that errored would leave a remnant file
> for the variable, the file will appear momentarily globally visible
> until the close of the fd deletes it.  This is safe because the normal
> filesystem operations will mediate any races; however, it is still
> possible for a directory listing at that instant between create and
> close contain a variable that doesn't exist in the EFI table.
> 
> Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
> ---
>  fs/efivarfs/file.c | 31 ++++++++++++++++++++++---------
>  1 file changed, 22 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
> index 23c51d62f902..edf363f395f5 100644
> --- a/fs/efivarfs/file.c
> +++ b/fs/efivarfs/file.c
> @@ -38,22 +38,24 @@ static ssize_t efivarfs_file_write(struct file *file,
>  
>  	bytes = efivar_entry_set_get_size(var, attributes, &datasize,
>  					  data, &set);
> -	if (!set && bytes) {
> +	if (!set) {
>  		if (bytes == -ENOENT)
>  			bytes = -EIO;
>  		goto out;
>  	}
>  
> +	inode_lock(inode);
>  	if (bytes == -ENOENT) {
> -		drop_nlink(inode);
> -		d_delete(file->f_path.dentry);
> -		dput(file->f_path.dentry);
> +		/*
> +		 * zero size signals to release that the write deleted
> +		 * the variable
> +		 */
> +		i_size_write(inode, 0);
>  	} else {
> -		inode_lock(inode);
>  		i_size_write(inode, datasize + sizeof(attributes));
>  		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
> -		inode_unlock(inode);
>  	}
> +	inode_unlock(inode);
>  
>  	bytes = count;
>  
> @@ -106,8 +108,19 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
>  	return size;
>  }
>  
> +static int efivarfs_file_release(struct inode *inode, struct file *file)
> +{
> +	if (i_size_read(inode) == 0) {
> +		drop_nlink(inode);
> +		d_delete(file->f_path.dentry);
> +		dput(file->f_path.dentry);
> +	}

Without wider context the dput() looks UAF-y as __fput() will do:

struct dentry *dentry = file->f_path.dentry;
if (file->f_op->release)
        file->f_op->release(inode, file);
dput(dentry);

Is there an extra reference on file->f_path.dentry taken somewhere?
James Bottomley Dec. 11, 2024, 12:39 p.m. UTC | #2
On Wed, 2024-12-11 at 12:16 +0100, Christian Brauner wrote:
> On Tue, Dec 10, 2024 at 12:02:24PM -0500, James Bottomley wrote:
[...]
> > +static int efivarfs_file_release(struct inode *inode, struct file
> > *file)
> > +{
> > +       if (i_size_read(inode) == 0) {
> > +               drop_nlink(inode);
> > +               d_delete(file->f_path.dentry);
> > +               dput(file->f_path.dentry);
> > +       }
> 
> Without wider context the dput() looks UAF-y as __fput() will do:
> 
> struct dentry *dentry = file->f_path.dentry;
> if (file->f_op->release)
>         file->f_op->release(inode, file);
> dput(dentry);
> 
> Is there an extra reference on file->f_path.dentry taken somewhere?

Heh, well, this is why I cc'd fsdevel to make sure I got all the fs
bits I used to be familiar with, but knowledge of which has atrophied,
correct.

I think it's paired with the extra dget() just after d_instantiate() in
fs/efivarfs/inode.c:efivarfs_create().  The reason being this is a
pseudo-filesystem so all the dentries representing objects have to be
born with a positive reference count to prevent them being reclaimed
under memory pressure.

Regards,

James
James Bottomley Dec. 19, 2024, 5:14 p.m. UTC | #3
On Tue, 2024-12-10 at 12:02 -0500, James Bottomley wrote:
> Even though this fixes the bug that a create either not followed by a
> write or followed by a write that errored would leave a remnant file
> for the variable, the file will appear momentarily globally visible
> until the close of the fd deletes it.  This is safe because the
> normal filesystem operations will mediate any races; however, it is
> still possible for a directory listing at that instant between create
> and close contain a variable that doesn't exist in the EFI table.

Systemd doesn't like 0 length files appearing in efivarfs, even if only
momentarily, so I think this needs updating to prevent even momentary
instances of zero length files:

https://github.com/systemd/systemd/issues/34304

These occur for two reasons

   1. The system has hibernated and resumed and the dcache entries are
      now out of sync with the original variables
   2. between the create and a successful write of a variable being
      created in efivarfs

1. can only really be fixed by adding a hibernation hook to the
filesystem code, which would be a separate patch set (which I'll work
on after we get this upstream); but 2. can be fixed by ensuring that
all variables returned from .create aren't visible in the directory
listing until a successful write.

Since we need the file to be visible to lookups but not the directory,
the only two ways of doing this are either to mark the directory in a
way that libfs.c:dcache_readdir() won't see it ... I think this would
have to be marking it as a cursor (we'd remove the cursor mark on
successful write); or to implement our own .iterate_shared function and
hijack the actor to skip newly created files (this is similar to what
overlayfs does to merge directories) which would be identified as
having zero size.

Do the fs people have a preference? The cursor mark is simpler to
implement but depends on internal libfs.c magic. The actor hijack is at
least something that already exists, so would be less prone to breaking
due to internal changes.

Regards,

James
Christian Brauner Dec. 22, 2024, 10:12 a.m. UTC | #4
> Do the fs people have a preference? The cursor mark is simpler to
> implement but depends on internal libfs.c magic. The actor hijack is at
> least something that already exists, so would be less prone to breaking
> due to internal changes.

I think making this filesystem specific is better than plumbing this
into dcache_readdir().
James Bottomley Dec. 23, 2024, 7:44 p.m. UTC | #5
On Sun, 2024-12-22 at 11:12 +0100, Christian Brauner wrote:
> > Do the fs people have a preference? The cursor mark is simpler to
> > implement but depends on internal libfs.c magic. The actor hijack
> > is at least something that already exists, so would be less prone
> > to breaking due to internal changes.
> 
> I think making this filesystem specific is better than plumbing this
> into dcache_readdir().

Neither would require changes to libfs.c: the dcache_readdir already
does the ignore cursor behaviour; the code in efivarfs would just set
the cursor flag to take advantage of it.

However, on further consideration I think making the zero size entries
not show up in the directory listing doesn't really help anything: they
still have to be found on lookup (otherwise nothing mediates a same
variable create race) which means an open by name from userspace will
still get hold of one.  The good news is that after this change they
should only show up fleetingly instead of hanging around until the next
reboot, so the chances of anyone hitting a problem is much smaller.

Regards,

James
James Bottomley Dec. 23, 2024, 7:52 p.m. UTC | #6
On Wed, 2024-12-11 at 07:39 -0500, James Bottomley wrote:
> On Wed, 2024-12-11 at 12:16 +0100, Christian Brauner wrote:
> > On Tue, Dec 10, 2024 at 12:02:24PM -0500, James Bottomley wrote:
> [...]
> > > +static int efivarfs_file_release(struct inode *inode, struct
> > > file
> > > *file)
> > > +{
> > > +       if (i_size_read(inode) == 0) {
> > > +               drop_nlink(inode);
> > > +               d_delete(file->f_path.dentry);
> > > +               dput(file->f_path.dentry);
> > > +       }
> > 
> > Without wider context the dput() looks UAF-y as __fput() will do:
> > 
> > struct dentry *dentry = file->f_path.dentry;
> > if (file->f_op->release)
> >         file->f_op->release(inode, file);
> > dput(dentry);
> > 
> > Is there an extra reference on file->f_path.dentry taken somewhere?
> 
> Heh, well, this is why I cc'd fsdevel to make sure I got all the fs
> bits I used to be familiar with, but knowledge of which has
> atrophied, correct.
> 
> I think it's paired with the extra dget() just after d_instantiate()
> in fs/efivarfs/inode.c:efivarfs_create().  The reason being this is a
> pseudo-filesystem so all the dentries representing objects have to be
> born with a positive reference count to prevent them being reclaimed
> under memory pressure.

Actually on further testing, this did turn out to be a use after free.
Not because of the dput, but because file->release is called for every
closed filehandle, so if you open the file for creation more than once,
both closes will try to delete it and ... oops.

The way I thought of mediating this is to check d_hashed in the file-
>release to see if the file has already been deleted.  That also means
we need a d_hashed() check in write because we can't resurrect the now
deleted file.  And finally something needs to mediate the check and
remove or check and add, so I used the inode semaphore for that.  The
updated patch is below and now passes the concurrency tests.

Regards,

James

------8>8>8><8<8<8-------------
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Subject: [PATCH 6/6] efivarfs: fix error on write to new variable leaving remnants

Make variable cleanup go through the fops release mechanism and use
zero inode size as the indicator to delete the file.  Since all EFI
variables must have an initial u32 attribute, zero size occurs either
because the update deleted the variable or because an unsuccessful
write after create caused the size never to be set in the first place.

Even though this fixes the bug that a create either not followed by a
write or followed by a write that errored would leave a remnant file
for the variable, the file will appear momentarily globally visible
until the close of the fd deletes it.  This is safe because the normal
filesystem operations will mediate any races; however, it is still
possible for a directory listing at that instant between create and
close contain a variable that doesn't exist in the EFI table.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 fs/efivarfs/file.c | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 23c51d62f902..70a673e7fda3 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -36,28 +36,41 @@ static ssize_t efivarfs_file_write(struct file *file,
 	if (IS_ERR(data))
 		return PTR_ERR(data);
 
+	inode_lock(inode);
+	if (d_unhashed(file->f_path.dentry)) {
+		/*
+		 * file got removed; don't allow a set.  Caused by an
+		 * unsuccessful create or successful delete write
+		 * racing with us.
+		 */
+		bytes = -EIO;
+		goto out;
+	}
+
 	bytes = efivar_entry_set_get_size(var, attributes, &datasize,
 					  data, &set);
-	if (!set && bytes) {
+	if (!set) {
 		if (bytes == -ENOENT)
 			bytes = -EIO;
 		goto out;
 	}
 
 	if (bytes == -ENOENT) {
-		drop_nlink(inode);
-		d_delete(file->f_path.dentry);
-		dput(file->f_path.dentry);
+		/*
+		 * zero size signals to release that the write deleted
+		 * the variable
+		 */
+		i_size_write(inode, 0);
 	} else {
-		inode_lock(inode);
 		i_size_write(inode, datasize + sizeof(attributes));
 		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
-		inode_unlock(inode);
 	}
 
 	bytes = count;
 
 out:
+	inode_unlock(inode);
+
 	kfree(data);
 
 	return bytes;
@@ -106,8 +119,21 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
 	return size;
 }
 
+static int efivarfs_file_release(struct inode *inode, struct file *file)
+{
+	inode_lock(inode);
+	if (i_size_read(inode) == 0 && !d_unhashed(file->f_path.dentry)) {
+		drop_nlink(inode);
+		d_delete(file->f_path.dentry);
+		dput(file->f_path.dentry);
+	}
+	inode_unlock(inode);
+	return 0;
+}
+
 const struct file_operations efivarfs_file_operations = {
-	.open	= simple_open,
-	.read	= efivarfs_file_read,
-	.write	= efivarfs_file_write,
+	.open		= simple_open,
+	.read		= efivarfs_file_read,
+	.write		= efivarfs_file_write,
+	.release	= efivarfs_file_release,
 };
Al Viro Dec. 23, 2024, 8:05 p.m. UTC | #7
On Mon, Dec 23, 2024 at 02:52:12PM -0500, James Bottomley wrote:
>  
> +static int efivarfs_file_release(struct inode *inode, struct file *file)
> +{
> +	inode_lock(inode);
> +	if (i_size_read(inode) == 0 && !d_unhashed(file->f_path.dentry)) {
> +		drop_nlink(inode);
> +		d_delete(file->f_path.dentry);
> +		dput(file->f_path.dentry);
> +	}
> +	inode_unlock(inode);
> +	return 0;
> +}

This is wrong; so's existing logics for removal from write().  Think
what happens if you open the sucker, have something bound on top of
it and do that deleting write().

Let me look into that area...
James Bottomley Dec. 23, 2024, 9:39 p.m. UTC | #8
On Mon, 2024-12-23 at 20:05 +0000, Al Viro wrote:
> On Mon, Dec 23, 2024 at 02:52:12PM -0500, James Bottomley wrote:
> >  
> > +static int efivarfs_file_release(struct inode *inode, struct file
> > *file)
> > +{
> > +       inode_lock(inode);
> > +       if (i_size_read(inode) == 0 && !d_unhashed(file-
> > >f_path.dentry)) {
> > +               drop_nlink(inode);
> > +               d_delete(file->f_path.dentry);
> > +               dput(file->f_path.dentry);
> > +       }
> > +       inode_unlock(inode);
> > +       return 0;
> > +}
> 
> This is wrong; so's existing logics for removal from write().  Think
> what happens if you open the sucker, have something bound on top of
> it and do that deleting write().

Shouldn't the bind have taken a dentry reference? in which case we'll
just drop the dentry but it won't be the final put, so it will still
hang around.

> Let me look into that area...

Thanks; as you say, delete from write has been around for over a decade
in this filesystem.  We can defer the delete, but it has to happen
somewhere if a write causes an EFI variable to be removed.

Regards,

James
James Bottomley Dec. 23, 2024, 10:56 p.m. UTC | #9
On Mon, 2024-12-23 at 20:05 +0000, Al Viro wrote:
> On Mon, Dec 23, 2024 at 02:52:12PM -0500, James Bottomley wrote:
> >  
> > +static int efivarfs_file_release(struct inode *inode, struct file
> > *file)
> > +{
> > +       inode_lock(inode);
> > +       if (i_size_read(inode) == 0 && !d_unhashed(file-
> > >f_path.dentry)) {
> > +               drop_nlink(inode);
> > +               d_delete(file->f_path.dentry);
> > +               dput(file->f_path.dentry);
> > +       }
> > +       inode_unlock(inode);
> > +       return 0;
> > +}
> 
> This is wrong; so's existing logics for removal from write().  Think
> what happens if you open the sucker, have something bound on top of
> it and do that deleting write().
> 
> Let me look into that area...

I thought about this some more.  I could see a twisted container use
case where something like this might happen (expose some but not all
efi variables to the container).

So, help me understand the subtleties here.  If it's the target of a
bind mount, that's all OK, because you are allowed to delete the
target.  If something is bind mounted on to an efivarfs object, the
is_local_mountpoint() check in vfs_unlink would usually trip and
prevent deletion (so the subtree doesn't become unreachable).  If I
were to duplicate that, I think the best way would be simply to do a
d_put() in the file->release function and implement drop_nlink() in
d_prune (since last put will always call __dentry_kill)?

Regards,

James
Al Viro Dec. 23, 2024, 11:12 p.m. UTC | #10
On Mon, Dec 23, 2024 at 05:56:04PM -0500, James Bottomley wrote:
> > Let me look into that area...
> 
> I thought about this some more.  I could see a twisted container use
> case where something like this might happen (expose some but not all
> efi variables to the container).
> 
> So, help me understand the subtleties here.  If it's the target of a
> bind mount, that's all OK, because you are allowed to delete the
> target.  If something is bind mounted on to an efivarfs object, the
> is_local_mountpoint() check in vfs_unlink would usually trip and
> prevent deletion (so the subtree doesn't become unreachable).  If I
> were to duplicate that, I think the best way would be simply to do a
> d_put() in the file->release function and implement drop_nlink() in
> d_prune (since last put will always call __dentry_kill)?

Refcounting is not an issue.  At all.

Inability to find and evict the mount, OTOH, very much is.  And after your
blind d_delete() that's precisely what will happen.

You are steadily moving towards more and more convoluted crap, in places
where it really does not belong.

If anything, simple_recursive_removal() should be used for that, instead
of trying to open-code bizarre subsets of its functionality...
James Bottomley Dec. 24, 2024, 4:04 a.m. UTC | #11
On Mon, 2024-12-23 at 23:12 +0000, Al Viro wrote:
> On Mon, Dec 23, 2024 at 05:56:04PM -0500, James Bottomley wrote:
> > > Let me look into that area...
> > 
> > I thought about this some more.  I could see a twisted container
> > use case where something like this might happen (expose some but
> > not all efi variables to the container).
> > 
> > So, help me understand the subtleties here.  If it's the target of
> > a bind mount, that's all OK, because you are allowed to delete the
> > target.  If something is bind mounted on to an efivarfs object, the
> > is_local_mountpoint() check in vfs_unlink would usually trip and
> > prevent deletion (so the subtree doesn't become unreachable).  If I
> > were to duplicate that, I think the best way would be simply to do
> > a d_put() in the file->release function and implement drop_nlink()
> > in d_prune (since last put will always call __dentry_kill)?
> 
> Refcounting is not an issue.  At all.
> 
> Inability to find and evict the mount, OTOH, very much is.  And after
> your blind d_delete() that's precisely what will happen.
> 
> You are steadily moving towards more and more convoluted crap, in
> places where it really does not belong.
> 
> If anything, simple_recursive_removal() should be used for that,
> instead of trying to open-code bizarre subsets of its
> functionality...

OK, so like the below?

In my defence, simple_recursive_removal() isn't mentioned in
Documentation/filesystems and the function itself also has no
documentation, so even if I had stumbled across it in libfs.c the
recursive in the name would have lead me to believe it wasn't for
single dentry removal.

Regards,

James

---8>8>8><8<8<8---

From: James Bottomley <James.Bottomley@HansenPartnership.com>
Subject: [PATCH] efivarfs: fix error on write to new variable leaving remnants

Make variable cleanup go through the fops release mechanism and use
zero inode size as the indicator to delete the file.  Since all EFI
variables must have an initial u32 attribute, zero size occurs either
because the update deleted the variable or because an unsuccessful
write after create caused the size never to be set in the first place.

Even though this fixes the bug that a create either not followed by a
write or followed by a write that errored would leave a remnant file
for the variable, the file will appear momentarily globally visible
until the close of the fd deletes it.  This is safe because the normal
filesystem operations will mediate any races; however, it is still
possible for a directory listing at that instant between create and
close contain a variable that doesn't exist in the EFI table.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 fs/efivarfs/file.c | 40 +++++++++++++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 23c51d62f902..0e545c8be173 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -36,28 +36,41 @@ static ssize_t efivarfs_file_write(struct file *file,
 	if (IS_ERR(data))
 		return PTR_ERR(data);
 
+	inode_lock(inode);
+	if (d_unhashed(file->f_path.dentry)) {
+		/*
+		 * file got removed; don't allow a set.  Caused by an
+		 * unsuccessful create or successful delete write
+		 * racing with us.
+		 */
+		bytes = -EIO;
+		goto out;
+	}
+
 	bytes = efivar_entry_set_get_size(var, attributes, &datasize,
 					  data, &set);
-	if (!set && bytes) {
+	if (!set) {
 		if (bytes == -ENOENT)
 			bytes = -EIO;
 		goto out;
 	}
 
 	if (bytes == -ENOENT) {
-		drop_nlink(inode);
-		d_delete(file->f_path.dentry);
-		dput(file->f_path.dentry);
+		/*
+		 * zero size signals to release that the write deleted
+		 * the variable
+		 */
+		i_size_write(inode, 0);
 	} else {
-		inode_lock(inode);
 		i_size_write(inode, datasize + sizeof(attributes));
 		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
-		inode_unlock(inode);
 	}
 
 	bytes = count;
 
 out:
+	inode_unlock(inode);
+
 	kfree(data);
 
 	return bytes;
@@ -106,8 +119,17 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
 	return size;
 }
 
+static int efivarfs_file_release(struct inode *inode, struct file *file)
+{
+	if (i_size_read(inode) == 0)
+		simple_recursive_removal(file->f_path.dentry, NULL);
+
+	return 0;
+}
+
 const struct file_operations efivarfs_file_operations = {
-	.open	= simple_open,
-	.read	= efivarfs_file_read,
-	.write	= efivarfs_file_write,
+	.open		= simple_open,
+	.read		= efivarfs_file_read,
+	.write		= efivarfs_file_write,
+	.release	= efivarfs_file_release,
 };
Al Viro Dec. 24, 2024, 4:44 a.m. UTC | #12
On Mon, Dec 23, 2024 at 11:04:58PM -0500, James Bottomley wrote:

> +static int efivarfs_file_release(struct inode *inode, struct file *file)
> +{
> +	if (i_size_read(inode) == 0)
> +		simple_recursive_removal(file->f_path.dentry, NULL);
> +
> +	return 0;
> +}

What happens if you have

	fd = creat(name, 0700);
	fd2 = open(name, O_RDONLY);
	close(fd2);
	write(fd, "barf", 4);

or, better yet, if open()/close() pair happens in an unrelated thread
poking around?

I mean, having that logics in ->release() feels very awkward...

For that matter, what about
	fd = creat(name, 0700);
	fd2 = open(name, O_RDWR);
	close(fd);
	write(fd2, "barf", 4);

I'm not asking about the implementation; what behaviour do you want
to see in userland?
James Bottomley Dec. 24, 2024, 1:07 p.m. UTC | #13
On Tue, 2024-12-24 at 04:44 +0000, Al Viro wrote:
> On Mon, Dec 23, 2024 at 11:04:58PM -0500, James Bottomley wrote:
> 
> > +static int efivarfs_file_release(struct inode *inode, struct file
> > *file)
> > +{
> > +       if (i_size_read(inode) == 0)
> > +               simple_recursive_removal(file->f_path.dentry,
> > NULL);
> > +
> > +       return 0;
> > +}
> 
> What happens if you have
> 
>         fd = creat(name, 0700);
>         fd2 = open(name, O_RDONLY);
>         close(fd2);
>         write(fd, "barf", 4);
> 
> or, better yet, if open()/close() pair happens in an unrelated thread
> poking around?

According to my tests you get -EIO to the last write.  I could be glib
and point out that a write of "barf" would return EINVAL anyway, but
assuming it's correctly formatted, you'll get -EIO from the d_unhashed
check before the variable gets created.  If you want to check this
yourself, useful writes that will create a variable are:

echo 0700000054|xxd -r -p > name

And to delete it:

echo 07000000|xxd -r -p > name

You can check your above scenario from a shell with

{ sleep 10; echo 0700000054|xxd -r -p; } > name &
cat name  

> I mean, having that logics in ->release() feels very awkward...
> 
> For that matter, what about
>         fd = creat(name, 0700);
>         fd2 = open(name, O_RDWR);
>         close(fd);
>         write(fd2, "barf", 4);

Same thing: -EIO to last write.

> I'm not asking about the implementation; what behaviour do you want
> to see in userland?

Given the fact that very few things actually manipulate efi variables
and when they do they perform open/write/close in quick succession to
set or remove variables, I think the above behaviour is consistent with
the use and gets rid of the ghost files problem and won't cause any
additional issues.  On the other hand the most intuitive thing would be
to remove zero length files on last close, not first, so if you have a
thought on how to do that easily, I'm all ears.

Regards,

James
James Bottomley Dec. 24, 2024, 3:09 p.m. UTC | #14
On Tue, 2024-12-24 at 08:07 -0500, James Bottomley wrote:
[...]

> On the other hand the most intuitive thing would be to remove zero
> length files on last close, not first, so if you have a thought on
> how to do that easily, I'm all ears.

I could do this by adding an open_count to the i_private data struct
efivar_entry and reimplementing simple_open as an efivarfs specific
open that increments this count and decrementing it in ->release(). 
That's still somewhat adding "more convoluted crap", though ...

Regards,

James
James Bottomley Dec. 27, 2024, 2:52 p.m. UTC | #15
On Tue, 2024-12-24 at 10:09 -0500, James Bottomley wrote:
> On Tue, 2024-12-24 at 08:07 -0500, James Bottomley wrote:
> [...]
> 
> > On the other hand the most intuitive thing would be to remove zero
> > length files on last close, not first, so if you have a thought on
> > how to do that easily, I'm all ears.
> 
> I could do this by adding an open_count to the i_private data struct
> efivar_entry and reimplementing simple_open as an efivarfs specific
> open that increments this count and decrementing it in ->release(). 
> That's still somewhat adding "more convoluted crap", though ...

There being no other suggestions as to how the vfs might do this; this
is a sketch of the additional code needed to do it within efivarfs.  As
you can see, it's not actually that much.  If this is OK with everyone
I'll fold it in and post a v2.  Since all simple_open really does is
copy i_private to file->private_data, there's really not a lot of
duplication in the attached.

Regards,

James

---

diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 0e545c8be173..cf0179d84bc5 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -121,14 +121,34 @@ static ssize_t efivarfs_file_read(struct file
*file, char __user *userbuf,
 
 static int efivarfs_file_release(struct inode *inode, struct file
*file)
 {
-	if (i_size_read(inode) == 0)
+	bool release;
+	struct efivar_entry *var = inode->i_private;
+
+	inode_lock(inode);
+	release = (--var->open_count == 0 && i_size_read(inode) == 0);
+	inode_unlock(inode);
+
+	if (release)
 		simple_recursive_removal(file->f_path.dentry, NULL);
 
 	return 0;
 }
 
+static int efivarfs_file_open(struct inode *inode, struct file *file)
+{
+	struct efivar_entry *entry = inode->i_private;
+
+	file->private_data = entry;
+
+	inode_lock(inode);
+	entry->open_count++;
+	inode_unlock(inode);
+
+	return 0;
+}
+
 const struct file_operations efivarfs_file_operations = {
-	.open		= simple_open,
+	.open		= efivarfs_file_open,
 	.read		= efivarfs_file_read,
 	.write		= efivarfs_file_write,
 	.release	= efivarfs_file_release,
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index 18a600f80992..32b83f644798 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -26,6 +26,7 @@ struct efi_variable {
 
 struct efivar_entry {
 	struct efi_variable var;
+	unsigned long open_count;
 };
 
 int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long,
void *),
diff mbox series

Patch

diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 23c51d62f902..edf363f395f5 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -38,22 +38,24 @@  static ssize_t efivarfs_file_write(struct file *file,
 
 	bytes = efivar_entry_set_get_size(var, attributes, &datasize,
 					  data, &set);
-	if (!set && bytes) {
+	if (!set) {
 		if (bytes == -ENOENT)
 			bytes = -EIO;
 		goto out;
 	}
 
+	inode_lock(inode);
 	if (bytes == -ENOENT) {
-		drop_nlink(inode);
-		d_delete(file->f_path.dentry);
-		dput(file->f_path.dentry);
+		/*
+		 * zero size signals to release that the write deleted
+		 * the variable
+		 */
+		i_size_write(inode, 0);
 	} else {
-		inode_lock(inode);
 		i_size_write(inode, datasize + sizeof(attributes));
 		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
-		inode_unlock(inode);
 	}
+	inode_unlock(inode);
 
 	bytes = count;
 
@@ -106,8 +108,19 @@  static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
 	return size;
 }
 
+static int efivarfs_file_release(struct inode *inode, struct file *file)
+{
+	if (i_size_read(inode) == 0) {
+		drop_nlink(inode);
+		d_delete(file->f_path.dentry);
+		dput(file->f_path.dentry);
+	}
+	return 0;
+}
+
 const struct file_operations efivarfs_file_operations = {
-	.open	= simple_open,
-	.read	= efivarfs_file_read,
-	.write	= efivarfs_file_write,
+	.open		= simple_open,
+	.read		= efivarfs_file_read,
+	.write		= efivarfs_file_write,
+	.release	= efivarfs_file_release,
 };