diff mbox series

[4/5] lib/ioremap: Ensure phys_addr actually corresponds to a physical address

Message ID 1536747974-25875-5-git-send-email-will.deacon@arm.com
State Superseded
Headers show
Series Clean up huge vmap and ioremap code | expand

Commit Message

Will Deacon Sept. 12, 2018, 10:26 a.m. UTC
The current ioremap() code uses a phys_addr variable at each level of
page table, which is confusingly offset by subtracting the base virtual
address being mapped so that adding the current virtual address back on
when iterating through the page table entries gives back the corresponding
physical address.

This is fairly confusing and results in all users of phys_addr having to
add the current virtual address back on. Instead, this patch just updates
phys_addr when iterating over the page table entries, ensuring that it's
always up-to-date and doesn't require explicit offsetting.

Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>

---
 lib/ioremap.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

-- 
2.1.4

Comments

Sean Christopherson Sept. 12, 2018, 3:09 p.m. UTC | #1
On Wed, Sep 12, 2018 at 11:26:13AM +0100, Will Deacon wrote:
> The current ioremap() code uses a phys_addr variable at each level of

> page table, which is confusingly offset by subtracting the base virtual

> address being mapped so that adding the current virtual address back on

> when iterating through the page table entries gives back the corresponding

> physical address.

> 

> This is fairly confusing and results in all users of phys_addr having to

> add the current virtual address back on. Instead, this patch just updates

> phys_addr when iterating over the page table entries, ensuring that it's

> always up-to-date and doesn't require explicit offsetting.

> 

> Cc: Chintan Pandya <cpandya@codeaurora.org>

> Cc: Toshi Kani <toshi.kani@hpe.com>

> Cc: Thomas Gleixner <tglx@linutronix.de>

> Cc: Michal Hocko <mhocko@suse.com>

> Cc: Andrew Morton <akpm@linux-foundation.org>

> Signed-off-by: Will Deacon <will.deacon@arm.com>

> ---

>  lib/ioremap.c | 28 ++++++++++++----------------

>  1 file changed, 12 insertions(+), 16 deletions(-)

> 

> diff --git a/lib/ioremap.c b/lib/ioremap.c

> index 6c72764af19c..fc834a59c90c 100644

> --- a/lib/ioremap.c

> +++ b/lib/ioremap.c

> @@ -101,19 +101,18 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,

>  	pmd_t *pmd;

>  	unsigned long next;

>  

> -	phys_addr -= addr;

>  	pmd = pmd_alloc(&init_mm, pud, addr);

>  	if (!pmd)

>  		return -ENOMEM;

>  	do {

>  		next = pmd_addr_end(addr, end);

>  

> -		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr + addr, prot))

> +		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot))

>  			continue;

>  

> -		if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))

> +		if (ioremap_pte_range(pmd, addr, next, phys_addr, prot))

>  			return -ENOMEM;

> -	} while (pmd++, addr = next, addr != end);

> +	} while (pmd++, addr = next, phys_addr += PMD_SIZE, addr != end);


I think bumping phys_addr by PXX_SIZE is wrong if phys_addr and addr
start unaligned with respect to PXX_SIZE.  The addresses must be
PAGE_ALIGNED, which lets ioremap_pte_range() do a simple calculation,
but that doesn't hold true for the upper levels, i.e. phys_addr needs
to be adjusted using an algorithm similar to pxx_addr_end().

Using a 2mb page as an example (lower 32 bits only): 

pxx_size  = 0x00200000
pxx_mask  = 0xffe00000
addr      = 0x1000
end       = 0x00400000
phys_addr = 0x1000

Loop 1:
   addr = 0x1000
   phys = 0x1000

Loop 2:
   addr = 0x200000
   phys = 0x201000


>  	return 0;

>  }

>  

> @@ -142,19 +141,18 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,

>  	pud_t *pud;

>  	unsigned long next;

>  

> -	phys_addr -= addr;

>  	pud = pud_alloc(&init_mm, p4d, addr);

>  	if (!pud)

>  		return -ENOMEM;

>  	do {

>  		next = pud_addr_end(addr, end);

>  

> -		if (ioremap_try_huge_pud(pud, addr, next, phys_addr + addr, prot))

> +		if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot))

>  			continue;

>  

> -		if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))

> +		if (ioremap_pmd_range(pud, addr, next, phys_addr, prot))

>  			return -ENOMEM;

> -	} while (pud++, addr = next, addr != end);

> +	} while (pud++, addr = next, phys_addr += PUD_SIZE, addr != end);

>  	return 0;

>  }

>  

> @@ -164,7 +162,6 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,

>  	p4d_t *p4d;

>  	unsigned long next;

>  

> -	phys_addr -= addr;

>  	p4d = p4d_alloc(&init_mm, pgd, addr);

>  	if (!p4d)

>  		return -ENOMEM;

> @@ -173,14 +170,14 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,

>  

>  		if (ioremap_p4d_enabled() &&

>  		    ((next - addr) == P4D_SIZE) &&

> -		    IS_ALIGNED(phys_addr + addr, P4D_SIZE)) {

> -			if (p4d_set_huge(p4d, phys_addr + addr, prot))

> +		    IS_ALIGNED(phys_addr, P4D_SIZE)) {

> +			if (p4d_set_huge(p4d, phys_addr, prot))

>  				continue;

>  		}

>  

> -		if (ioremap_pud_range(p4d, addr, next, phys_addr + addr, prot))

> +		if (ioremap_pud_range(p4d, addr, next, phys_addr, prot))

>  			return -ENOMEM;

> -	} while (p4d++, addr = next, addr != end);

> +	} while (p4d++, addr = next, phys_addr += P4D_SIZE, addr != end);

>  	return 0;

>  }

>  

> @@ -196,14 +193,13 @@ int ioremap_page_range(unsigned long addr,

>  	BUG_ON(addr >= end);

>  

>  	start = addr;

> -	phys_addr -= addr;

>  	pgd = pgd_offset_k(addr);

>  	do {

>  		next = pgd_addr_end(addr, end);

> -		err = ioremap_p4d_range(pgd, addr, next, phys_addr+addr, prot);

> +		err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot);

>  		if (err)

>  			break;

> -	} while (pgd++, addr = next, addr != end);

> +	} while (pgd++, addr = next, phys_addr += PGDIR_SIZE, addr != end);

>  

>  	flush_cache_vmap(start, end);

>  

> -- 

> 2.1.4

>
Will Deacon Sept. 12, 2018, 4:39 p.m. UTC | #2
Hi Sean,

Thanks for looking at the patch.

On Wed, Sep 12, 2018 at 08:09:39AM -0700, Sean Christopherson wrote:
> On Wed, Sep 12, 2018 at 11:26:13AM +0100, Will Deacon wrote:

> > The current ioremap() code uses a phys_addr variable at each level of

> > page table, which is confusingly offset by subtracting the base virtual

> > address being mapped so that adding the current virtual address back on

> > when iterating through the page table entries gives back the corresponding

> > physical address.

> > 

> > This is fairly confusing and results in all users of phys_addr having to

> > add the current virtual address back on. Instead, this patch just updates

> > phys_addr when iterating over the page table entries, ensuring that it's

> > always up-to-date and doesn't require explicit offsetting.

> > 

> > Cc: Chintan Pandya <cpandya@codeaurora.org>

> > Cc: Toshi Kani <toshi.kani@hpe.com>

> > Cc: Thomas Gleixner <tglx@linutronix.de>

> > Cc: Michal Hocko <mhocko@suse.com>

> > Cc: Andrew Morton <akpm@linux-foundation.org>

> > Signed-off-by: Will Deacon <will.deacon@arm.com>

> > ---

> >  lib/ioremap.c | 28 ++++++++++++----------------

> >  1 file changed, 12 insertions(+), 16 deletions(-)

> > 

> > diff --git a/lib/ioremap.c b/lib/ioremap.c

> > index 6c72764af19c..fc834a59c90c 100644

> > --- a/lib/ioremap.c

> > +++ b/lib/ioremap.c

> > @@ -101,19 +101,18 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,

> >  	pmd_t *pmd;

> >  	unsigned long next;

> >  

> > -	phys_addr -= addr;

> >  	pmd = pmd_alloc(&init_mm, pud, addr);

> >  	if (!pmd)

> >  		return -ENOMEM;

> >  	do {

> >  		next = pmd_addr_end(addr, end);

> >  

> > -		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr + addr, prot))

> > +		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot))

> >  			continue;

> >  

> > -		if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))

> > +		if (ioremap_pte_range(pmd, addr, next, phys_addr, prot))

> >  			return -ENOMEM;

> > -	} while (pmd++, addr = next, addr != end);

> > +	} while (pmd++, addr = next, phys_addr += PMD_SIZE, addr != end);

> 

> I think bumping phys_addr by PXX_SIZE is wrong if phys_addr and addr

> start unaligned with respect to PXX_SIZE.  The addresses must be

> PAGE_ALIGNED, which lets ioremap_pte_range() do a simple calculation,

> but that doesn't hold true for the upper levels, i.e. phys_addr needs

> to be adjusted using an algorithm similar to pxx_addr_end().

> 

> Using a 2mb page as an example (lower 32 bits only): 

> 

> pxx_size  = 0x00200000

> pxx_mask  = 0xffe00000

> addr      = 0x1000

> end       = 0x00400000

> phys_addr = 0x1000

> 

> Loop 1:

>    addr = 0x1000

>    phys = 0x1000

> 

> Loop 2:

>    addr = 0x200000

>    phys = 0x201000


Yes, I think you're completely right, however I also don't think this
can happen with the current code (and I've failed to trigger it in my
testing). The virtual addresses allocated for VM_IOREMAP allocations
are aligned to the order of the allocation, which means that the virtual
address at the start of the mapping is aligned such that when we hit the
end of a pXd, we know we've mapped the previous PXD_SIZE bytes.

Having said that, this is clearly a change from the current code and I
haven't audited architectures other than arm64 (where IOREMAP_MAX_ORDER
corresponds to the maximum size of our huge mappings), so it would be
much better not to introduce this funny behaviour in a patch that aims
to reduce confusion in the first place!

Fixing this using the pxx_addr_end() macros is a bit strange, since we
don't have a physical end variable (nor do we need one), so perhaps
something like changing the while condition to be:

	do {
		...
	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);

would do the trick. What do you reckon?

Will
Sean Christopherson Sept. 12, 2018, 5:14 p.m. UTC | #3
On Wed, Sep 12, 2018 at 05:39:14PM +0100, Will Deacon wrote:
> Hi Sean,

> 

> Thanks for looking at the patch.

> 

> On Wed, Sep 12, 2018 at 08:09:39AM -0700, Sean Christopherson wrote:

> > On Wed, Sep 12, 2018 at 11:26:13AM +0100, Will Deacon wrote:

> > > The current ioremap() code uses a phys_addr variable at each level of

> > > page table, which is confusingly offset by subtracting the base virtual

> > > address being mapped so that adding the current virtual address back on

> > > when iterating through the page table entries gives back the corresponding

> > > physical address.

> > > 

> > > This is fairly confusing and results in all users of phys_addr having to

> > > add the current virtual address back on. Instead, this patch just updates

> > > phys_addr when iterating over the page table entries, ensuring that it's

> > > always up-to-date and doesn't require explicit offsetting.

> > > 

> > > Cc: Chintan Pandya <cpandya@codeaurora.org>

> > > Cc: Toshi Kani <toshi.kani@hpe.com>

> > > Cc: Thomas Gleixner <tglx@linutronix.de>

> > > Cc: Michal Hocko <mhocko@suse.com>

> > > Cc: Andrew Morton <akpm@linux-foundation.org>

> > > Signed-off-by: Will Deacon <will.deacon@arm.com>

> > > ---

> > >  lib/ioremap.c | 28 ++++++++++++----------------

> > >  1 file changed, 12 insertions(+), 16 deletions(-)

> > > 

> > > diff --git a/lib/ioremap.c b/lib/ioremap.c

> > > index 6c72764af19c..fc834a59c90c 100644

> > > --- a/lib/ioremap.c

> > > +++ b/lib/ioremap.c

> > > @@ -101,19 +101,18 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,

> > >  	pmd_t *pmd;

> > >  	unsigned long next;

> > >  

> > > -	phys_addr -= addr;

> > >  	pmd = pmd_alloc(&init_mm, pud, addr);

> > >  	if (!pmd)

> > >  		return -ENOMEM;

> > >  	do {

> > >  		next = pmd_addr_end(addr, end);

> > >  

> > > -		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr + addr, prot))

> > > +		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot))

> > >  			continue;

> > >  

> > > -		if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))

> > > +		if (ioremap_pte_range(pmd, addr, next, phys_addr, prot))

> > >  			return -ENOMEM;

> > > -	} while (pmd++, addr = next, addr != end);

> > > +	} while (pmd++, addr = next, phys_addr += PMD_SIZE, addr != end);

> > 

> > I think bumping phys_addr by PXX_SIZE is wrong if phys_addr and addr

> > start unaligned with respect to PXX_SIZE.  The addresses must be

> > PAGE_ALIGNED, which lets ioremap_pte_range() do a simple calculation,

> > but that doesn't hold true for the upper levels, i.e. phys_addr needs

> > to be adjusted using an algorithm similar to pxx_addr_end().

> > 

> > Using a 2mb page as an example (lower 32 bits only): 

> > 

> > pxx_size  = 0x00200000

> > pxx_mask  = 0xffe00000

> > addr      = 0x1000

> > end       = 0x00400000

> > phys_addr = 0x1000

> > 

> > Loop 1:

> >    addr = 0x1000

> >    phys = 0x1000

> > 

> > Loop 2:

> >    addr = 0x200000

> >    phys = 0x201000

> 

> Yes, I think you're completely right, however I also don't think this

> can happen with the current code (and I've failed to trigger it in my

> testing). The virtual addresses allocated for VM_IOREMAP allocations

> are aligned to the order of the allocation, which means that the virtual

> address at the start of the mapping is aligned such that when we hit the

> end of a pXd, we know we've mapped the previous PXD_SIZE bytes.

> 

> Having said that, this is clearly a change from the current code and I

> haven't audited architectures other than arm64 (where IOREMAP_MAX_ORDER

> corresponds to the maximum size of our huge mappings), so it would be

> much better not to introduce this funny behaviour in a patch that aims

> to reduce confusion in the first place!

> 

> Fixing this using the pxx_addr_end() macros is a bit strange, since we

> don't have a physical end variable (nor do we need one), so perhaps

> something like changing the while condition to be:

> 

> 	do {

> 		...

> 	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);

> 

> would do the trick. What do you reckon?


LGTM.  I like that there isn't a separate calculation for phys_addr's offset.
diff mbox series

Patch

diff --git a/lib/ioremap.c b/lib/ioremap.c
index 6c72764af19c..fc834a59c90c 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -101,19 +101,18 @@  static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 	pmd_t *pmd;
 	unsigned long next;
 
-	phys_addr -= addr;
 	pmd = pmd_alloc(&init_mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
 	do {
 		next = pmd_addr_end(addr, end);
 
-		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr + addr, prot))
+		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot))
 			continue;
 
-		if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
+		if (ioremap_pte_range(pmd, addr, next, phys_addr, prot))
 			return -ENOMEM;
-	} while (pmd++, addr = next, addr != end);
+	} while (pmd++, addr = next, phys_addr += PMD_SIZE, addr != end);
 	return 0;
 }
 
@@ -142,19 +141,18 @@  static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 	pud_t *pud;
 	unsigned long next;
 
-	phys_addr -= addr;
 	pud = pud_alloc(&init_mm, p4d, addr);
 	if (!pud)
 		return -ENOMEM;
 	do {
 		next = pud_addr_end(addr, end);
 
-		if (ioremap_try_huge_pud(pud, addr, next, phys_addr + addr, prot))
+		if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot))
 			continue;
 
-		if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))
+		if (ioremap_pmd_range(pud, addr, next, phys_addr, prot))
 			return -ENOMEM;
-	} while (pud++, addr = next, addr != end);
+	} while (pud++, addr = next, phys_addr += PUD_SIZE, addr != end);
 	return 0;
 }
 
@@ -164,7 +162,6 @@  static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
 	p4d_t *p4d;
 	unsigned long next;
 
-	phys_addr -= addr;
 	p4d = p4d_alloc(&init_mm, pgd, addr);
 	if (!p4d)
 		return -ENOMEM;
@@ -173,14 +170,14 @@  static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
 
 		if (ioremap_p4d_enabled() &&
 		    ((next - addr) == P4D_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, P4D_SIZE)) {
-			if (p4d_set_huge(p4d, phys_addr + addr, prot))
+		    IS_ALIGNED(phys_addr, P4D_SIZE)) {
+			if (p4d_set_huge(p4d, phys_addr, prot))
 				continue;
 		}
 
-		if (ioremap_pud_range(p4d, addr, next, phys_addr + addr, prot))
+		if (ioremap_pud_range(p4d, addr, next, phys_addr, prot))
 			return -ENOMEM;
-	} while (p4d++, addr = next, addr != end);
+	} while (p4d++, addr = next, phys_addr += P4D_SIZE, addr != end);
 	return 0;
 }
 
@@ -196,14 +193,13 @@  int ioremap_page_range(unsigned long addr,
 	BUG_ON(addr >= end);
 
 	start = addr;
-	phys_addr -= addr;
 	pgd = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = ioremap_p4d_range(pgd, addr, next, phys_addr+addr, prot);
+		err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot);
 		if (err)
 			break;
-	} while (pgd++, addr = next, addr != end);
+	} while (pgd++, addr = next, phys_addr += PGDIR_SIZE, addr != end);
 
 	flush_cache_vmap(start, end);