diff mbox series

[v1,2/5] mips: octeon: use mips_mach_early_init() to copy to L2 cache

Message ID 20200623095141.3310591-3-sr@denx.de
State New
Headers show
Series mips: Improve initial Octeon MIPS64 support | expand

Commit Message

Stefan Roese June 23, 2020, 9:51 a.m. UTC
This patch adds the code to copy itself from bootrom location to a
different location (TEXT_BASE) to the Octeon platform. Its used in
this case to copy the complete U-Boot image into L2 cache, which
greatly improves the bootup time - especially in regard to the
very long and complex DDR4 init code.

The Kconfig symbol CONFIG_MIPS_MACH_EARLY_INIT is enabled with this
patch for Octeon.

Signed-off-by: Stefan Roese <sr at denx.de>
---

 arch/mips/Kconfig                     |  1 +
 arch/mips/mach-octeon/lowlevel_init.S | 56 +++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

Comments

Daniel Schwierzeck June 29, 2020, 10:08 p.m. UTC | #1
> This patch adds the code to copy itself from bootrom location to a
> different location (TEXT_BASE) to the Octeon platform. Its used in
> this case to copy the complete U-Boot image into L2 cache, which
> greatly improves the bootup time - especially in regard to the
> very long and complex DDR4 init code.
> 
> The Kconfig symbol CONFIG_MIPS_MACH_EARLY_INIT is enabled with this
> patch for Octeon.
> 
> Signed-off-by: Stefan Roese <sr at denx.de>
> ---
> 
>  arch/mips/Kconfig                     |  1 +
>  arch/mips/mach-octeon/lowlevel_init.S | 56 +++++++++++++++++++++++++++
>  2 files changed, 57 insertions(+)
> 
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index 327fd4848a..bcf6f26457 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -114,6 +114,7 @@ config ARCH_OCTEON
>  	select DM
>  	select DM_SERIAL
>  	select MIPS_L2_CACHE
> +	select MIPS_MACH_EARLY_INIT
>  	select MIPS_TUNE_OCTEON3
>  	select ROM_EXCEPTION_VECTORS
>  	select SUPPORTS_BIG_ENDIAN
> diff --git a/arch/mips/mach-octeon/lowlevel_init.S b/arch/mips/mach-octeon/lowlevel_init.S
> index d9aab38cde..2d9257b1ed 100644
> --- a/arch/mips/mach-octeon/lowlevel_init.S
> +++ b/arch/mips/mach-octeon/lowlevel_init.S
> @@ -17,3 +17,59 @@ LEAF(lowlevel_init)
>  	jr	ra
>  	 nop
>  	END(lowlevel_init)
> +
> +LEAF(mips_mach_early_init)
> +
> +        move    s0, ra

indentation

> +
> +	bal	__dummy
> +	 nop
> +
> +__dummy:
> +	/* Get the actual address that we are running at */
> +	PTR_LA	a6, _start		/* Linked address of _start */
> +	PTR_LA	a7, __dummy
> +	dsubu	t1, a7, a6		/* offset of __dummy label from _start*/
> +	dsubu	t0, ra, t1		/* t0 now has actual address of _start*/

including _start in the calculation makes it a little bit hard to
understand. Wouldn't it be enough to just calculate the difference
between a7 and ra to get the relocation offset? You can use that offset
later to calculate the destination address based on _start.

> +
> +	PTR_LI	t1, CONFIG_SYS_TEXT_BASE

isn't that the same address as _start?

> +
> +	/* Calculate end address of copy loop */
> +	PTR_LI	s5, CONFIG_BOARD_SIZE_LIMIT

couldn't you use __image_copy_end to get the real binary size without
BSS? Or _end if you need to copy the relocation table as well.

> +	daddu	t2, s5, t0	/* t2 = end address */
> +	daddiu	t2, t2, 127
> +	ins	t2, zero, 0, 7	/* Round up to cache line for memcpy */
> +
> +	/* Copy ourself to the L2 cache from flash, 32 bytes at a time */
> +1:
> +	ld	a0, 0(t0)
> +	ld	a1, 8(t0)
> +	ld	a2, 16(t0)
> +	ld	a3, 24(t0)
> +	sd	a0, 0(t1)
> +	sd	a1, 8(t1)
> +	sd	a2, 16(t1)
> +	sd	a3, 24(t1)
> +	addiu	t0, 32
> +	bne	t0, t2, 1b
> +	addiu	t1, 32

the instruction in the delay slot should be indented by an extra space
character

> +
> +	sync
> +	synci	0(zero)
> +
> +	PTR_LA	t9, uboot_in_cache
> +	j	t9
> +	 nop

Why the extra jump? If you have the relocation offset as suggested
above, you could simply add that to s0 and do one jr instruction.

Also instead of synci you could use jr.hb to automatically add a
instruction hazard barrier during the jump (if that's implemented on
Octeon).


> +
> +uboot_in_cache:
> +
> +	/*
> +	 * Return to start.S now running from TEXT_BASE, which points
> +	 * to DRAM address space, which effectively is L2 cache now.
> +	 * This speeds up the init process extremely, especially the
> +	 * DDR init code.
> +	 */
> +	jr	s0
> +	 nop
> +
> +	END(mips_mach_early_init)
Stefan Roese June 30, 2020, 9:12 a.m. UTC | #2
On 30.06.20 00:08, Daniel Schwierzeck wrote:
> 
>> This patch adds the code to copy itself from bootrom location to a
>> different location (TEXT_BASE) to the Octeon platform. Its used in
>> this case to copy the complete U-Boot image into L2 cache, which
>> greatly improves the bootup time - especially in regard to the
>> very long and complex DDR4 init code.
>>
>> The Kconfig symbol CONFIG_MIPS_MACH_EARLY_INIT is enabled with this
>> patch for Octeon.
>>
>> Signed-off-by: Stefan Roese <sr at denx.de>
>> ---
>>
>>   arch/mips/Kconfig                     |  1 +
>>   arch/mips/mach-octeon/lowlevel_init.S | 56 +++++++++++++++++++++++++++
>>   2 files changed, 57 insertions(+)
>>
>> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
>> index 327fd4848a..bcf6f26457 100644
>> --- a/arch/mips/Kconfig
>> +++ b/arch/mips/Kconfig
>> @@ -114,6 +114,7 @@ config ARCH_OCTEON
>>   	select DM
>>   	select DM_SERIAL
>>   	select MIPS_L2_CACHE
>> +	select MIPS_MACH_EARLY_INIT
>>   	select MIPS_TUNE_OCTEON3
>>   	select ROM_EXCEPTION_VECTORS
>>   	select SUPPORTS_BIG_ENDIAN
>> diff --git a/arch/mips/mach-octeon/lowlevel_init.S b/arch/mips/mach-octeon/lowlevel_init.S
>> index d9aab38cde..2d9257b1ed 100644
>> --- a/arch/mips/mach-octeon/lowlevel_init.S
>> +++ b/arch/mips/mach-octeon/lowlevel_init.S
>> @@ -17,3 +17,59 @@ LEAF(lowlevel_init)
>>   	jr	ra
>>   	 nop
>>   	END(lowlevel_init)
>> +
>> +LEAF(mips_mach_early_init)
>> +
>> +        move    s0, ra
> 
> indentation

Thanks for spotting.

>> +
>> +	bal	__dummy
>> +	 nop
>> +
>> +__dummy:
>> +	/* Get the actual address that we are running at */
>> +	PTR_LA	a6, _start		/* Linked address of _start */
>> +	PTR_LA	a7, __dummy
>> +	dsubu	t1, a7, a6		/* offset of __dummy label from _start*/
>> +	dsubu	t0, ra, t1		/* t0 now has actual address of _start*/
> 
> including _start in the calculation makes it a little bit hard to
> understand. Wouldn't it be enough to just calculate the difference
> between a7 and ra to get the relocation offset?

Yes. Thanks for the suggestion. I'll update this part accordingly.

> You can use that offset
> later to calculate the destination address based on _start.

Yep.

>> +
>> +	PTR_LI	t1, CONFIG_SYS_TEXT_BASE
> 
> isn't that the same address as _start?

Yes.

>> +
>> +	/* Calculate end address of copy loop */
>> +	PTR_LI	s5, CONFIG_BOARD_SIZE_LIMIT
> 
> couldn't you use __image_copy_end to get the real binary size without
> BSS? Or _end if you need to copy the relocation table as well.

I tried this before. And even _end is not enough, as the appended DTB
also needs to be copied. I will use _end plus some additional size for
the DTB in the next version and will drop CONFIG_BOARD_SIZE_LIMIT.

>> +	daddu	t2, s5, t0	/* t2 = end address */
>> +	daddiu	t2, t2, 127
>> +	ins	t2, zero, 0, 7	/* Round up to cache line for memcpy */
>> +
>> +	/* Copy ourself to the L2 cache from flash, 32 bytes at a time */
>> +1:
>> +	ld	a0, 0(t0)
>> +	ld	a1, 8(t0)
>> +	ld	a2, 16(t0)
>> +	ld	a3, 24(t0)
>> +	sd	a0, 0(t1)
>> +	sd	a1, 8(t1)
>> +	sd	a2, 16(t1)
>> +	sd	a3, 24(t1)
>> +	addiu	t0, 32
>> +	bne	t0, t2, 1b
>> +	addiu	t1, 32
> 
> the instruction in the delay slot should be indented by an extra space
> character

Right, will change in v2.

>> +
>> +	sync
>> +	synci	0(zero)
>> +
>> +	PTR_LA	t9, uboot_in_cache
>> +	j	t9
>> +	 nop
> 
> Why the extra jump? If you have the relocation offset as suggested
> above, you could simply add that to s0 and do one jr instruction.

Good idea, thanks.

> Also instead of synci you could use jr.hb to automatically add a
> instruction hazard barrier during the jump (if that's implemented on
> Octeon).

I tried it and it works as you suggested. Will update on v2.

> 
>> +
>> +uboot_in_cache:
>> +
>> +	/*
>> +	 * Return to start.S now running from TEXT_BASE, which points
>> +	 * to DRAM address space, which effectively is L2 cache now.
>> +	 * This speeds up the init process extremely, especially the
>> +	 * DDR init code.
>> +	 */
>> +	jr	s0
>> +	 nop
>> +
>> +	END(mips_mach_early_init)

Thanks for all your suggestions. The resulting code now looks better
(easier to understand and smaller).

Thanks,
Stefan
diff mbox series

Patch

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 327fd4848a..bcf6f26457 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -114,6 +114,7 @@  config ARCH_OCTEON
 	select DM
 	select DM_SERIAL
 	select MIPS_L2_CACHE
+	select MIPS_MACH_EARLY_INIT
 	select MIPS_TUNE_OCTEON3
 	select ROM_EXCEPTION_VECTORS
 	select SUPPORTS_BIG_ENDIAN
diff --git a/arch/mips/mach-octeon/lowlevel_init.S b/arch/mips/mach-octeon/lowlevel_init.S
index d9aab38cde..2d9257b1ed 100644
--- a/arch/mips/mach-octeon/lowlevel_init.S
+++ b/arch/mips/mach-octeon/lowlevel_init.S
@@ -17,3 +17,59 @@  LEAF(lowlevel_init)
 	jr	ra
 	 nop
 	END(lowlevel_init)
+
+LEAF(mips_mach_early_init)
+
+        move    s0, ra
+
+	bal	__dummy
+	 nop
+
+__dummy:
+	/* Get the actual address that we are running at */
+	PTR_LA	a6, _start		/* Linked address of _start */
+	PTR_LA	a7, __dummy
+	dsubu	t1, a7, a6		/* offset of __dummy label from _start*/
+	dsubu	t0, ra, t1		/* t0 now has actual address of _start*/
+
+	PTR_LI	t1, CONFIG_SYS_TEXT_BASE
+
+	/* Calculate end address of copy loop */
+	PTR_LI	s5, CONFIG_BOARD_SIZE_LIMIT
+	daddu	t2, s5, t0	/* t2 = end address */
+	daddiu	t2, t2, 127
+	ins	t2, zero, 0, 7	/* Round up to cache line for memcpy */
+
+	/* Copy ourself to the L2 cache from flash, 32 bytes at a time */
+1:
+	ld	a0, 0(t0)
+	ld	a1, 8(t0)
+	ld	a2, 16(t0)
+	ld	a3, 24(t0)
+	sd	a0, 0(t1)
+	sd	a1, 8(t1)
+	sd	a2, 16(t1)
+	sd	a3, 24(t1)
+	addiu	t0, 32
+	bne	t0, t2, 1b
+	addiu	t1, 32
+
+	sync
+	synci	0(zero)
+
+	PTR_LA	t9, uboot_in_cache
+	j	t9
+	 nop
+
+uboot_in_cache:
+
+	/*
+	 * Return to start.S now running from TEXT_BASE, which points
+	 * to DRAM address space, which effectively is L2 cache now.
+	 * This speeds up the init process extremely, especially the
+	 * DDR init code.
+	 */
+	jr	s0
+	 nop
+
+	END(mips_mach_early_init)