diff mbox series

[v1,01/10] mips: octeon: Initial minimal support for the Marvell Octeon SoC

Message ID 20200502085944.13444-2-sr@denx.de
State New
Headers show
Series mips: Add initial Octeon MIPS64 base support | expand

Commit Message

Stefan Roese May 2, 2020, 8:59 a.m. UTC
From: Aaron Williams <awilliams at marvell.com>

This patch adds very basic support for the Octeon III SoCs. Only
CFI parallel NOR flash and UART are supported for now.

Please note that the basic Octeon port does not include the DDR3/4
initialization yet. This will be added in some follow-up patches
later. To still use U-Boot with this port, the L2 cache (4MiB on
Octeon III CN73xx) is used as RAM. This way, U-Boot can boot to the
prompt on such boards.

Signed-off-by: Aaron Williams <awilliams at marvell.com>
Signed-off-by: Stefan Roese <sr at denx.de>
---

 MAINTAINERS                                  |    6 +
 arch/Kconfig                                 |    1 +
 arch/mips/Kconfig                            |   49 +-
 arch/mips/Makefile                           |    7 +
 arch/mips/cpu/Makefile                       |    4 +-
 arch/mips/include/asm/arch-octeon/cavm-reg.h |   42 +
 arch/mips/include/asm/arch-octeon/clock.h    |   24 +
 arch/mips/mach-octeon/Kconfig                |   92 ++
 arch/mips/mach-octeon/Makefile               |   10 +
 arch/mips/mach-octeon/clock.c                |   22 +
 arch/mips/mach-octeon/cpu.c                  |   55 +
 arch/mips/mach-octeon/dram.c                 |   27 +
 arch/mips/mach-octeon/include/ioremap.h      |   30 +
 arch/mips/mach-octeon/start.S                | 1241 ++++++++++++++++++
 14 files changed, 1608 insertions(+), 2 deletions(-)
 create mode 100644 arch/mips/include/asm/arch-octeon/cavm-reg.h
 create mode 100644 arch/mips/include/asm/arch-octeon/clock.h
 create mode 100644 arch/mips/mach-octeon/Kconfig
 create mode 100644 arch/mips/mach-octeon/Makefile
 create mode 100644 arch/mips/mach-octeon/clock.c
 create mode 100644 arch/mips/mach-octeon/cpu.c
 create mode 100644 arch/mips/mach-octeon/dram.c
 create mode 100644 arch/mips/mach-octeon/include/ioremap.h
 create mode 100644 arch/mips/mach-octeon/start.S

Comments

Daniel Schwierzeck May 13, 2020, 12:49 p.m. UTC | #1
sorry for the delay ;)

Am 02.05.20 um 10:59 schrieb Stefan Roese:
> From: Aaron Williams <awilliams at marvell.com>
> 
> This patch adds very basic support for the Octeon III SoCs. Only
> CFI parallel NOR flash and UART are supported for now.
> 
> Please note that the basic Octeon port does not include the DDR3/4
> initialization yet. This will be added in some follow-up patches
> later. To still use U-Boot with this port, the L2 cache (4MiB on
> Octeon III CN73xx) is used as RAM. This way, U-Boot can boot to the
> prompt on such boards.

this patch should come after the common MIPS patches

> 
> Signed-off-by: Aaron Williams <awilliams at marvell.com>
> Signed-off-by: Stefan Roese <sr at denx.de>
> ---
> 
>  MAINTAINERS                                  |    6 +
>  arch/Kconfig                                 |    1 +
>  arch/mips/Kconfig                            |   49 +-
>  arch/mips/Makefile                           |    7 +
>  arch/mips/cpu/Makefile                       |    4 +-
>  arch/mips/include/asm/arch-octeon/cavm-reg.h |   42 +
>  arch/mips/include/asm/arch-octeon/clock.h    |   24 +
>  arch/mips/mach-octeon/Kconfig                |   92 ++
>  arch/mips/mach-octeon/Makefile               |   10 +
>  arch/mips/mach-octeon/clock.c                |   22 +
>  arch/mips/mach-octeon/cpu.c                  |   55 +
>  arch/mips/mach-octeon/dram.c                 |   27 +
>  arch/mips/mach-octeon/include/ioremap.h      |   30 +
>  arch/mips/mach-octeon/start.S                | 1241 ++++++++++++++++++
>  14 files changed, 1608 insertions(+), 2 deletions(-)
>  create mode 100644 arch/mips/include/asm/arch-octeon/cavm-reg.h
>  create mode 100644 arch/mips/include/asm/arch-octeon/clock.h
>  create mode 100644 arch/mips/mach-octeon/Kconfig
>  create mode 100644 arch/mips/mach-octeon/Makefile
>  create mode 100644 arch/mips/mach-octeon/clock.c
>  create mode 100644 arch/mips/mach-octeon/cpu.c
>  create mode 100644 arch/mips/mach-octeon/dram.c
>  create mode 100644 arch/mips/mach-octeon/include/ioremap.h
>  create mode 100644 arch/mips/mach-octeon/start.S
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 66f0b07263..29f2d7328c 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -749,6 +749,12 @@ M:	Ezequiel Garcia <ezequiel at collabora.com>
>  S:	Maintained
>  F:	arch/mips/mach-jz47xx/
>  
> +MIPS Octeon
> +M:	Aaron Williams <awilliams at marvell.com>
> +S:	Maintained
> +F:	arch/mips/mach-octeon/
> +F:	arch/mips/include/asm/arch-octeon/
> +
>  MMC
>  M:	Peng Fan <peng.fan at nxp.com>
>  S:	Maintained
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 91e049b322..1cd3e1dc0b 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -37,6 +37,7 @@ config MICROBLAZE
>  
>  config MIPS
>  	bool "MIPS architecture"
> +	select CREATE_ARCH_SYMLINK

You should not need that. The path arch/mips/mach-octeon/include/ is
automatically added to the include search paths, so move all files
from arch/mips/include/asm/arch-octeon/ to arch/mips/mach-octeon/include/.

>  	select HAVE_ARCH_IOREMAP
>  	select HAVE_PRIVATE_LIBGCC
>  	select SUPPORT_OF_CONTROL
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index 48e754cc46..3c7f3eb94f 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -106,6 +106,24 @@ config ARCH_JZ47XX
>  	select OF_CONTROL
>  	select DM
>  
> +config ARCH_OCTEON
> +	bool "Support Marvell Octeon CN7xxx platforms"
> +	select DISPLAY_CPUINFO
> +	select DMA_ADDR_T_64BIT
> +	select DM
> +	select DM_SERIAL
> +	select MIPS_CACHE_COHERENT
> +	select MIPS_INIT_STACK_IN_SRAM
> +	select MIPS_L2_CACHE
> +	select MIPS_TUNE_OCTEON3
> +	select ROM_EXCEPTION_VECTORS
> +	select SUPPORTS_BIG_ENDIAN
> +	select SUPPORTS_CPU_MIPS64_OCTEON
> +	select PHYS_64BIT
> +	select OF_CONTROL
> +	select OF_LIVE
> +	imply CMD_DM
> +
>  config MACH_PIC32
>  	bool "Support Microchip PIC32"
>  	select DM
> @@ -160,6 +178,7 @@ source "arch/mips/mach-bmips/Kconfig"
>  source "arch/mips/mach-jz47xx/Kconfig"
>  source "arch/mips/mach-pic32/Kconfig"
>  source "arch/mips/mach-mtmips/Kconfig"
> +source "arch/mips/mach-octeon/Kconfig"
>  
>  if MIPS
>  
> @@ -233,6 +252,14 @@ config CPU_MIPS64_R6
>  	  Choose this option to build a kernel for release 6 or later of the
>  	  MIPS64 architecture.
>  
> +config CPU_MIPS64_OCTEON
> +	bool "Marvell Octeon series of CPUs"
> +	depends on SUPPORTS_CPU_MIPS64_OCTEON
> +	select 64BIT
> +	help
> +	 Choose this option for Marvell Octeon CPUs.  These CPUs are between
> +	 MIPS64 R5 and R6 with other extensions.
> +
>  endchoice
>  
>  menu "General setup"
> @@ -261,7 +288,7 @@ config MIPS_CM_BASE
>  config MIPS_CACHE_INDEX_BASE
>  	hex "Index base address for cache initialisation"
>  	default 0x80000000 if CPU_MIPS32
> -	default 0xffffffff80000000 if CPU_MIPS64
> +	default 0xFFFFFFFFC0000000 if ARCH_OCTEON
>  	help
>  	  This is the base address for a memory block, which is used for
>  	  initialising the cache lines. This is also the base address of a memory
> @@ -342,6 +369,14 @@ config SPL_LOADER_SUPPORT
>  	help
>  	  Enable this option if you want to use SPL loaders without DM enabled.
>  
> +config MIPS_CACHE_COHERENT
> +	bool "Set if MIPS processor is cache coherent"
> +	help
> +	 Enable this if the MIPS architecture is cache coherent like the
> +	 Marvell Octeon series of SoCs.  When this is set, cache flushes
> +	 and invalidates only flush the write buffer since the hardware
> +	 maintains cache coherency.
> +
>  endmenu
>  
>  menu "OS boot interface"
> @@ -398,6 +433,9 @@ config SUPPORTS_CPU_MIPS64_R2
>  config SUPPORTS_CPU_MIPS64_R6
>  	bool
>  
> +config SUPPORTS_CPU_MIPS64_OCTEON
> +	bool
> +
>  config CPU_MIPS32
>  	bool
>  	default y if CPU_MIPS32_R1 || CPU_MIPS32_R2 || CPU_MIPS32_R6
> @@ -405,6 +443,7 @@ config CPU_MIPS32
>  config CPU_MIPS64
>  	bool
>  	default y if CPU_MIPS64_R1 || CPU_MIPS64_R2 || CPU_MIPS64_R6
> +	default y if CPU_MIPS64_OCTEON
>  
>  config MIPS_TUNE_4KC
>  	bool
> @@ -421,6 +460,9 @@ config MIPS_TUNE_34KC
>  config MIPS_TUNE_74KC
>  	bool
>  
> +config MIPS_TUNE_OCTEON3
> +	bool
> +
>  config 32BIT
>  	bool
>  
> @@ -453,6 +495,11 @@ config MIPS_SRAM_INIT
>  	  before it can be used. If enabled, a function mips_sram_init() will
>  	  be called just before setup_stack_gd.
>  
> +config DMA_ADDR_T_64BIT
> +	bool
> +	help
> +	 Select this to enable 64-bit DMA addressing
> +
>  config SYS_DCACHE_SIZE
>  	int
>  	default 0
> diff --git a/arch/mips/Makefile b/arch/mips/Makefile
> index af3f227436..fa1ba7855a 100644
> --- a/arch/mips/Makefile
> +++ b/arch/mips/Makefile
> @@ -1,6 +1,10 @@
>  # SPDX-License-Identifier: GPL-2.0+
>  
> +ifneq ($(CONFIG_ARCH_OCTEON),y)
>  head-y := arch/mips/cpu/start.o
> +else
> +head-y := arch/mips/mach-octeon/start.o
> +endif
>  
>  ifeq ($(CONFIG_SPL_BUILD),y)
>  ifneq ($(CONFIG_SPL_START_S_PATH),)
> @@ -17,6 +21,7 @@ machine-$(CONFIG_ARCH_JZ47XX) += jz47xx
>  machine-$(CONFIG_MACH_PIC32) += pic32
>  machine-$(CONFIG_ARCH_MTMIPS) += mtmips
>  machine-$(CONFIG_ARCH_MSCC) += mscc
> +machine-${CONFIG_ARCH_OCTEON} += octeon
>  
>  machdirs := $(patsubst %,arch/mips/mach-%/,$(machine-y))
>  libs-y += $(machdirs)
> @@ -30,6 +35,7 @@ arch-$(CONFIG_CPU_MIPS32_R6) += -march=mips32r6 -Wa,-mips32r6
>  arch-$(CONFIG_CPU_MIPS64_R1) += -march=mips64 -Wa,-mips64
>  arch-$(CONFIG_CPU_MIPS64_R2) += -march=mips64r2 -Wa,-mips64r2
>  arch-$(CONFIG_CPU_MIPS64_R6) += -march=mips64r6 -Wa,-mips64r6
> +arch-${CONFIG_CPU_MIPS64_OCTEON} += -march=octeon3
>  
>  # Allow extra optimization for specific CPUs/SoCs
>  tune-$(CONFIG_MIPS_TUNE_4KC) += -mtune=4kc
> @@ -37,6 +43,7 @@ tune-$(CONFIG_MIPS_TUNE_14KC) += -mtune=14kc
>  tune-$(CONFIG_MIPS_TUNE_24KC) += -mtune=24kc
>  tune-$(CONFIG_MIPS_TUNE_34KC) += -mtune=34kc
>  tune-$(CONFIG_MIPS_TUNE_74KC) += -mtune=74kc
> +tune-${CONFIG_MIPS_TUNE_OCTEON3} += -mtune=octeon3
>  
>  # Include default header files
>  cflags-y += -I$(srctree)/arch/mips/include/asm/mach-generic
> diff --git a/arch/mips/cpu/Makefile b/arch/mips/cpu/Makefile
> index 6df7bb4e48..732015d6f3 100644
> --- a/arch/mips/cpu/Makefile
> +++ b/arch/mips/cpu/Makefile
> @@ -1,6 +1,8 @@
>  # SPDX-License-Identifier: GPL-2.0+
>  
> -extra-y	= start.o
> +ifneq ($(CONFIG_ARCH_OCTEON),y)
> +extra-y = start.o
> +endif
>  
>  obj-y += time.o
>  obj-y += interrupts.o
> diff --git a/arch/mips/include/asm/arch-octeon/cavm-reg.h b/arch/mips/include/asm/arch-octeon/cavm-reg.h
> new file mode 100644
> index 0000000000..b961e54956
> --- /dev/null
> +++ b/arch/mips/include/asm/arch-octeon/cavm-reg.h
> @@ -0,0 +1,42 @@
> +/* SPDX-License-Identifier:    GPL-2.0 */
> +/*
> + * Copyright (C) 2020 Marvell International Ltd.
> + */
> +
> +#ifndef __CAVM_REG_H__
> +
> +/* Register offsets */
> +#define CAVM_CIU_FUSE			((u64 *)0x80010100000001a0)
> +#define CAVM_MIO_BOOT_REG_CFG0		((u64 *)0x8001180000000000)
> +#define CAVM_RST_BOOT			((u64 *)0x8001180006001600)
> +
> +/* Register structs */
> +
> +/**
> + * Register (RSL) rst_boot
> + *
> + * RST Boot Register
> + */
> +union cavm_rst_boot {
> +	u64 u;
> +	struct cavm_rst_boot_s {
> +		u64 chipkill                         : 1;
> +		u64 jtcsrdis                         : 1;
> +		u64 ejtagdis                         : 1;
> +		u64 romen                            : 1;
> +		u64 ckill_ppdis                      : 1;
> +		u64 jt_tstmode                       : 1;
> +		u64 vrm_err                          : 1;
> +		u64 reserved_37_56                   : 20;
> +		u64 c_mul                            : 7;
> +		u64 pnr_mul                          : 6;
> +		u64 reserved_21_23                   : 3;
> +		u64 lboot_oci                        : 3;
> +		u64 lboot_ext                        : 6;
> +		u64 lboot                            : 10;
> +		u64 rboot                            : 1;
> +		u64 rboot_pin                        : 1;
> +	} s;
> +};
> +
> +#endif /* __CAVM_REG_H__ */
> diff --git a/arch/mips/include/asm/arch-octeon/clock.h b/arch/mips/include/asm/arch-octeon/clock.h
> new file mode 100644
> index 0000000000..a844a222c9
> --- /dev/null
> +++ b/arch/mips/include/asm/arch-octeon/clock.h
> @@ -0,0 +1,24 @@
> +/* SPDX-License-Identifier:    GPL-2.0 */
> +/*
> + * Copyright (C) 2018, 2019 Marvell International Ltd.
> + *
> + * https://spdx.org/licenses
> + */
> +
> +#ifndef __CLOCK_H__
> +
> +/** System PLL reference clock */
> +#define PLL_REF_CLK                     50000000        /* 50 MHz */
> +#define NS_PER_REF_CLK_TICK             (1000000000 / PLL_REF_CLK)
> +
> +/**
> + * Returns the I/O clock speed in Hz
> + */
> +u64 octeon_get_io_clock(void);
> +
> +/**
> + * Returns the core clock speed in Hz
> + */
> +u64 octeon_get_core_clock(void);
> +
> +#endif /* __CLOCK_H__ */
> diff --git a/arch/mips/mach-octeon/Kconfig b/arch/mips/mach-octeon/Kconfig
> new file mode 100644
> index 0000000000..67fcb6058c
> --- /dev/null
> +++ b/arch/mips/mach-octeon/Kconfig
> @@ -0,0 +1,92 @@
> +menu "Octeon platforms"
> +	depends on ARCH_OCTEON
> +
> +config SYS_SOC
> +	string
> +	default "octeon"
> +
> +config OCTEON_CN7XXX
> +	bool "Octeon CN7XXX SoC"
> +
> +config OCTEON_CN70XX
> +	bool "Octeon CN70XX SoC"
> +	select OCTEON_CN7XXX
> +
> +config OCTEON_CN73XX
> +	bool "Octeon CN73XX SoC"
> +	select OCTEON_CN7XXX
> +
> +config OCTEON_CN78XX
> +	bool "Octeon CN78XX SoC"
> +	select OCTEON_CN7XXX
> +
> +choice
> +	prompt "Octeon MIPS family select"
> +
> +config SOC_OCTEON2
> +	bool "Octeon II family"
> +	help
> +	 This selects the Octeon II SoC family

this should be added later when needed

> +
> +config SOC_OCTEON3
> +	bool "Octeon III family"
> +	help
> +	 This selects the Octeon III SoC family CN70xx, CN73XX, CN78xx
> +	 and CNF75XX.
> +
> +endchoice
> +
> +config SYS_DCACHE_SIZE
> +	default 32768
> +
> +config SYS_DCACHE_LINE_SIZE
> +	default 128
> +
> +config SYS_ICACHE_SIZE
> +	default	79872
> +
> +config SYS_ICACHE_LINE_SIZE
> +	default 128
> +
> +config OCTEON_BIG_STACK_SIZE
> +	hex
> +	default 0x4000
> +	help
> +	 This enables a larger stack needed for Octeon 3 DRAM initialization.
> +	 If this is disabled then a part of the L1 cache will be reserved for
> +	 the stack, resulting in a smaller image.  If this is true then
> +	 a portion of the TEXT address space will be reserved for the stack.
> +	 Note that this requires that U-Boot MUST be able to fit entirely
> +	 within the L2 cache and cannot be executed from a parallel NOR flash.
> +	 The default size is 16KiB.
> +
> +config OCTEON_COPY_FROM_FLASH_TO_L2
> +	bool
> +	default y
> +	help
> +	 Set this for U-Boot to attempt to copy itself from flash memory into
> +	 the L2 cache.  This significantly improves the boot performance.
> +
> +config OCTEON_L2_MEMCPY_IN_CACHE
> +	bool
> +	default y
> +	help
> +	 If this is set then the memcpy code that is used to copy U-Boot from
> +	 the flash to the L2 cache is written to the L2 cache.  This
> +	 significantly speeds up the memcpy operation.
> +
> +config OCTEON_L2_UBOOT_ADDR
> +	hex
> +	default 0xffffffff81000000
> +	help
> +	 This specifies the address where U-Boot will be copied into the L2
> +	 cache.
> +
> +config OCTEON_L2_MEMCPY_ADDR
> +	hex
> +	default 0xffffffff81400000
> +	help
> +	 This specifies where U-Boot will place the memcpy routine used for
> +	 copying U-Boot from flash to L2 cache.
> +
> +endmenu
> diff --git a/arch/mips/mach-octeon/Makefile b/arch/mips/mach-octeon/Makefile
> new file mode 100644
> index 0000000000..a5fda682a7
> --- /dev/null
> +++ b/arch/mips/mach-octeon/Makefile
> @@ -0,0 +1,10 @@
> +# (C) Copyright 2019 Marvell, Inc.
> +#
> +# SPDX-License-Identifier:	GPL-2.0+
> +#
> +
> +extra-y = start.o
> +
> +obj-y += clock.o
> +obj-y += cpu.o
> +obj-y += dram.o
> diff --git a/arch/mips/mach-octeon/clock.c b/arch/mips/mach-octeon/clock.c
> new file mode 100644
> index 0000000000..6e32008641
> --- /dev/null
> +++ b/arch/mips/mach-octeon/clock.c
> @@ -0,0 +1,22 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2018, 2019 Marvell International Ltd.
> + */
> +
> +#include <common.h>
> +#include <asm/arch/clock.h>
> +
> +DECLARE_GLOBAL_DATA_PTR;
> +
> +int octeon_get_timer_freq(void)
> +{
> +	return gd->cpu_clk;
> +}
> +
> +/**
> + * Returns the I/O clock speed in Hz
> + */
> +u64 octeon_get_io_clock(void)
> +{
> +	return gd->bus_clk;
> +}
> diff --git a/arch/mips/mach-octeon/cpu.c b/arch/mips/mach-octeon/cpu.c
> new file mode 100644
> index 0000000000..a1373c6d56
> --- /dev/null
> +++ b/arch/mips/mach-octeon/cpu.c
> @@ -0,0 +1,55 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * Copyright (C) 2020 Marvell International Ltd.
> + */
> +
> +#include <common.h>
> +#include <linux/io.h>
> +#include <asm/arch/clock.h>
> +#include <asm/arch-octeon/cavm-reg.h>
> +
> +DECLARE_GLOBAL_DATA_PTR;
> +
> +static int get_clocks(void)
> +{
> +	const u64 ref_clock = PLL_REF_CLK;
> +	union cavm_rst_boot rst_boot;
> +
> +	rst_boot.u = ioread64(CAVM_RST_BOOT);
> +	gd->cpu_clk = ref_clock * rst_boot.s.c_mul;
> +	gd->bus_clk = ref_clock * rst_boot.s.pnr_mul;
> +
> +	debug("%s: cpu: %lu, bus: %lu\n", __func__, gd->cpu_clk, gd->bus_clk);
> +
> +	return 0;
> +}
> +
> +/* Early mach init code run from flash */
> +int mach_cpu_init(void)
> +{
> +	/* Remap boot-bus 0x1fc0.0000 -> 0x1f40.0000 */
> +	/* ToDo: Move this to an early running bus (bootbus) DM driver */
> +	clrsetbits_be64(CAVM_MIO_BOOT_REG_CFG0, 0xffff, 0x1f40);
> +
> +	/* Get clocks and store them in GD */
> +	get_clocks();
> +
> +	return 0;
> +}
> +
> +/**
> + * Returns number of cores
> + *
> + * @return	number of CPU cores for the specified node
> + */
> +static int cavm_octeon_num_cores(void)
> +{
> +	return fls64(ioread64(CAVM_CIU_FUSE) & 0xffffffffffff);
> +}
> +
> +int print_cpuinfo(void)
> +{
> +	printf("SoC:   Octeon CN73xx (%d cores)\n", cavm_octeon_num_cores());
> +
> +	return 0;
> +}
> diff --git a/arch/mips/mach-octeon/dram.c b/arch/mips/mach-octeon/dram.c
> new file mode 100644
> index 0000000000..c16a73e8e6
> --- /dev/null
> +++ b/arch/mips/mach-octeon/dram.c
> @@ -0,0 +1,27 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * Copyright (C) 2020 Marvell International Ltd.
> + */
> +
> +#include <common.h>
> +#include <dm.h>
> +#include <ram.h>
> +
> +DECLARE_GLOBAL_DATA_PTR;
> +
> +int dram_init(void)
> +{
> +	/*
> +	 * No DDR init yet -> run in L2 cache
> +	 */
> +	gd->ram_size = (2 << 20);
> +	gd->bd->bi_dram[0].size = gd->ram_size;
> +	gd->bd->bi_dram[1].size = 0;
> +
> +	return 0;
> +}
> +
> +ulong board_get_usable_ram_top(ulong total_size)
> +{
> +	return gd->ram_top;
> +}
> diff --git a/arch/mips/mach-octeon/include/ioremap.h b/arch/mips/mach-octeon/include/ioremap.h
> new file mode 100644
> index 0000000000..59b75008a2
> --- /dev/null
> +++ b/arch/mips/mach-octeon/include/ioremap.h
> @@ -0,0 +1,30 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __ASM_MACH_OCTEON_IOREMAP_H
> +#define __ASM_MACH_OCTEON_IOREMAP_H
> +
> +#include <linux/types.h>
> +
> +/*
> + * Allow physical addresses to be fixed up to help peripherals located
> + * outside the low 32-bit range -- generic pass-through version.
> + */
> +static inline phys_addr_t fixup_bigphys_addr(phys_addr_t phys_addr,
> +					     phys_addr_t size)
> +{
> +	return phys_addr;
> +}
> +
> +static inline void __iomem *plat_ioremap(phys_addr_t offset, unsigned long size,
> +					 unsigned long flags)
> +{
> +	return (void __iomem *)(XKPHYS | offset);
> +}
> +
> +static inline int plat_iounmap(const volatile void __iomem *addr)
> +{
> +	return 0;
> +}
> +
> +#define _page_cachable_default	_CACHE_CACHABLE_NONCOHERENT
> +
> +#endif /* __ASM_MACH_OCTEON_IOREMAP_H */
> diff --git a/arch/mips/mach-octeon/start.S b/arch/mips/mach-octeon/start.S
> new file mode 100644
> index 0000000000..acb967201a
> --- /dev/null
> +++ b/arch/mips/mach-octeon/start.S
> @@ -0,0 +1,1241 @@
> +/* SPDX-License-Identifier: GPL-2.0+ */
> +/*
> + *  Startup Code for OCTEON 64-bit CPU-core
> + *
> + *  Copyright (c) 2003	Wolfgang Denk <wd at denx.de>
> + *  Copyright 2004, 2005, 2010 - 2015 Cavium Inc..
> + */
> +
> +#include <asm-offsets.h>
> +#include <config.h>
> +#include <asm/regdef.h>
> +#include <asm/mipsregs.h>
> +#include <asm/asm.h>
> +
> +#define BOOT_VECTOR_NUM_WORDS		8
> +
> +#define OCTEON_BOOT_MOVEABLE_MAGIC_OFFSET	0x70
> +#define OCTEON_BOOT_VECTOR_MOVEABLE_OFFSET	0x78
> +
> +#define OCTEON_BOOT_MOVEABLE_MAGIC1_RAW	0xdb00110ad358eacd
> +#define OCTEON_BOOT_MOVEABLE_MAGIC1	OCTEON_BOOT_MOVEABLE_MAGIC1_RAW
> +
> +#define OCTEON_CIU_SOFT_RST		0x8001070000000740
> +
> +#define	OCTEON_L2C_WPAR_PP0		0x8001180080840000
> +#define OCTEON_MIO_BOOT_BASE		0x8001180000000000
> +#define OCTEON_MIO_BOOT_REG_CFG0_OFF	0x0000
> +#define OCTEON_MIO_BOOT_LOC_CFG0_OFF	0x0080
> +#define OCTEON_MIO_BOOT_LOC_ADR_OFF	0x0090
> +#define OCTEON_MIO_BOOT_LOC_DAT_OFF	0x0098
> +#define	OCTEON_MIO_RST_BOOT		0x8001180000001600
> +#define OCTEON_MIO_BOOT_REG_CFG0	0x8001180000000000
> +#define	OCTEON_MIO_BOOT_REG_TIM0	0x8001180000000040
> +#define OCTEON_MIO_BOOT_LOC_CFG0	0x8001180000000080
> +#define OCTEON_MIO_BOOT_LOC_ADR		0x8001180000000090
> +#define OCTEON_MIO_BOOT_LOC_DAT		0x8001180000000098
> +#define	OCTEON_MIO_FUSE_DAT3		0x8001180000001418
> +#define OCTEON_L2D_FUS3			0x80011800800007B8
> +#define	OCTEON_LMC0_DDR_PLL_CTL		0x8001180088000258
> +
> +#define OCTEON_RST			0x8001180006000000
> +#define OCTEON_RST_BOOT_OFFSET		0x1600
> +#define OCTEON_RST_SOFT_RST_OFFSET	0x1680
> +#define OCTEON_RST_COLD_DATAX_OFFSET(X)	(0x17C0 + (X) * 8)
> +#define OCTEON_RST_BOOT			0x8001180006001600
> +#define OCTEON_RST_SOFT_RST		0x8001180006001680
> +#define OCTEON_RST_COLD_DATAX(X)	(0x80011800060017C0 + (X) * 8)
> +
> +#define OCTEON_OCX_COM_NODE		0x8001180011000000
> +#define OCTEON_L2C_OCI_CTL		0x8001180080800020
> +#define OCTEON_L2C_TAD_CTL		0x8001180080800018
> +#define OCTEON_L2C_CTL			0x8001180080800000
> +
> +#define OCTEON_DBG_DATA			0x80011F00000001E8
> +#define OCTEON_PCI_READ_CMD_E		0x80011F0000001188
> +#define OCTEON_NPEI_DBG_DATA		0x80011F0000008510
> +#define OCTEON_CIU_WDOG(X)		(0x8001070000000500 + (X) * 8)
> +#define OCTEON_CIU_PP_POKE(X)		(0x8001070000000580 + (X) * 8)
> +#define OCTEON_CIU3_WDOG(X)		(0x8001010000020000 + (X) * 8)
> +#define OCTEON_CIU3_PP_POKE(X)		(0x8001010000030000 + (X) * 8)
> +#define OCTEON_OCX_COM_LINKX_CTL(X)	(0x8001180011000020 + (X) * 8)
> +#define OCTEON_SLI_CTL_STATUS		0x80011F0000028570
> +#define OCTEON_GSERX_SCRATCH(X)		(0x8001180090000020 + (X) * 0x1000000)
> +
> +/** PRID for CN56XX */
> +#define OCTEON_PRID_CN56XX		0x04
> +/** PRID for CN52XX */
> +#define OCTEON_PRID_CN52XX		0x07
> +/** PRID for CN63XX */
> +#define OCTEON_PRID_CN63XX		0x90
> +/** PRID for CN68XX */
> +#define OCTEON_PRID_CN68XX		0x91
> +/** PRID for CN66XX */
> +#define OCTEON_PRID_CN66XX		0x92
> +/** PRID for CN61XX */
> +#define OCTEON_PRID_CN61XX		0x93
> +/** PRID for CNF71XX */
> +#define OCTEON_PRID_CNF71XX		0x94
> +/** PRID for CN78XX */
> +#define OCTEON_PRID_CN78XX		0x95
> +/** PRID for CN70XX */
> +#define OCTEON_PRID_CN70XX		0x96
> +/** PRID for CN73XX */
> +#define OCTEON_PRID_CN73XX		0x97
> +/** PRID for CNF75XX */
> +#define OCTEON_PRID_CNF75XX		0x98
> +
> +/* func argument is used to create a  mark, must be unique */
> +#define GETOFFSET(reg, func)	\
> +	.balign	8;		\
> +	bal	func ##_mark;	\
> +	nop;			\
> +	.dword	.;		\
> +func ##_mark:			\
> +	ld	reg, 0(ra);	\
> +	dsubu	reg, ra, reg;
> +
> +#define JAL(func)		\
> +	.balign	8;		\
> +	bal	func ##_mark;	\
> +	 nop;			\
> +	.dword .;		\
> +func ##_mark:			\
> +	ld	t8, 0(ra);	\
> +	dsubu	t8, ra, t8;	\
> +	dla	t9, func;	\
> +	daddu	t9, t9, t8;	\
> +	jalr	t9;		\
> +	 nop;
> +
> +	.set	arch=octeon3
> +	.set	noreorder
> +
> +	.macro uhi_mips_exception
> +	move	k0, t9		# preserve t9 in k0
> +	move	k1, a0		# preserve a0 in k1
> +	li	t9, 15		# UHI exception operation
> +	li	a0, 0		# Use hard register context
> +	sdbbp	1		# Invoke UHI operation
> +	.endm
> +
> +	.macro setup_stack_gd
> +	li	t0, -16
> +	PTR_LI	t1, big_stack_start
> +	and	sp, t1, t0		# force 16 byte alignment
> +	PTR_SUBU \
> +		sp, sp, GD_SIZE		# reserve space for gd
> +	and	sp, sp, t0		# force 16 byte alignment
> +	move	k0, sp			# save gd pointer
> +#if CONFIG_VAL(SYS_MALLOC_F_LEN) && \
> +    !CONFIG_IS_ENABLED(INIT_STACK_WITHOUT_MALLOC_F)
> +	li	t2, CONFIG_VAL(SYS_MALLOC_F_LEN)
> +	PTR_SUBU \
> +		sp, sp, t2		# reserve space for early malloc
> +	and	sp, sp, t0		# force 16 byte alignment
> +#endif
> +	move	fp, sp
> +
> +	/* Clear gd */
> +	move	t0, k0
> +1:
> +	PTR_S	zero, 0(t0)
> +	PTR_ADDIU t0, PTRSIZE
> +	blt	t0, t1, 1b
> +	 nop
> +
> +#if CONFIG_VAL(SYS_MALLOC_F_LEN) && \
> +    !CONFIG_IS_ENABLED(INIT_STACK_WITHOUT_MALLOC_F)
> +	PTR_S	sp, GD_MALLOC_BASE(k0)	# gd->malloc_base offset
> +#endif
> +	.endm
> +
> +/* Saved register usage:
> + * s0:	not used
> + * s1:	not used
> + * s2:	Address U-Boot loaded into in L2 cache
> + * s3:	Start address
> + * s4:	flags
> + *		1:	booting from RAM
> + *		2:	executing out of cache
> + *		4:	booting from flash
> + * s5:	u-boot size (data end - _start)
> + * s6:	offset in flash.
> + * s7:	_start physical address
> + * s8:
> + */
> +
> +ENTRY(_start)
> +	/* U-Boot entry point */
> +	b	reset
> +
> +	/* The above jump instruction/nop are considered part of the
> +	 * bootloader_header_t structure but are not changed when the header is
> +	 * updated.
> +	 */
> +
> +	/* Leave room for bootloader_header_t header at start of binary.  This
> +	 * header is used to identify the board the bootloader is for, what
> +	 * address it is linked at, failsafe/normal, etc.  It also contains a
> +	 * CRC of the entire image.
> +	 */
> +
> +#if defined(CONFIG_ROM_EXCEPTION_VECTORS)
> +	/*
> +	 * Exception vector entry points. When running from ROM, an exception
> +	 * cannot be handled. Halt execution and transfer control to debugger,
> +	 * if one is attached.
> +	 */
> +	.org 0x200
> +	/* TLB refill, 32 bit task */
> +	uhi_mips_exception
> +
> +	.org 0x280
> +	/* XTLB refill, 64 bit task */
> +	uhi_mips_exception
> +
> +	.org 0x300
> +	/* Cache error exception */
> +	uhi_mips_exception
> +
> +	.org 0x380
> +	/* General exception */
> +	uhi_mips_exception
> +
> +	.org 0x400
> +	/* Catch interrupt exceptions */
> +	uhi_mips_exception
> +
> +	.org 0x480
> +	/* EJTAG debug exception */
> +1:	b	1b
> +	 nop
> +
> +	.org 0x500
> +#endif
> +
> +/* Reserve extra space so that when we use the boot bus local memory
> + * segment to remap the debug exception vector we don't overwrite
> + * anything useful
> + */
> +
> +/* Basic exception handler (dump registers) in all ASM.  When using the TLB for
> + * mapping u-boot C code, we can't branch to that C code for exception handling
> + * (TLB is disabled for some exceptions).
> + */
> +
> +/* RESET/start here */
> +	.balign	8
> +reset:
> +	nop
> +	synci	0(zero)
> +	mfc0	k0, CP0_STATUS
> +	ori	k0, 0x00E0		/* enable 64 bit mode for CSR access */
> +	mtc0	k0, CP0_STATUS
> +
> +	/* Save the address we're booting from, strip off low bits */
> +	bal	1f
> +	 nop
> +1:
> +	move	s3, ra
> +	dins	s3, zero, 0, 12
> +
> +	/* Disable boot bus moveable regions */
> +	PTR_LI	k0, OCTEON_MIO_BOOT_LOC_CFG0
> +	sd	zero, 0(k0)
> +	sd	zero, 8(k0)
> +
> +	/* Disable the watchdog timer
> +	 * First we check if we're running on CN78XX, CN73XX or CNF75XX to see
> +	 * if we use CIU3 or CIU.
> +	 */
> +	mfc0	t0, CP0_PRID
> +	ext	t0, t0, 8, 8
> +	/* Assume CIU */
> +	PTR_LI	t1, OCTEON_CIU_WDOG(0)
> +	PTR_LI	t2, OCTEON_CIU_PP_POKE(0)
> +	blt	t0, OCTEON_PRID_CN78XX, wd_use_ciu
> +	 nop
> +	beq	t0, OCTEON_PRID_CN70XX, wd_use_ciu
> +	 nop
> +	/* Use CIU3 */
> +	PTR_LI	t1, OCTEON_CIU3_WDOG(0)
> +	PTR_LI	t2, OCTEON_CIU3_PP_POKE(0)
> +wd_use_ciu:
> +	sd	zero, 0(t2)		/* Pet the dog */
> +	sd	zero, 0(t1)		/* Disable watchdog timer */
> +
> +	/* Errata: CN76XX has a node ID of 3. Change it to zero here.
> +	 * This needs to be done before we relocate to L2 as addresses change.
> +	 * For 76XX pass 1.X we need to zero out the OCX_COM_NODE[ID],
> +	 * L2C_OCI_CTL[GKSEGNODE] and CP0 of Root.CvmMemCtl2[KSEGNODE].
> +	 */
> +	mfc0	a4, CP0_PRID
> +	/* Check for 78xx pass 1.x processor ID */
> +	andi	a4, 0xffff
> +	blt	a4, (OCTEON_PRID_CN78XX << 8), 1f
> +	 nop
> +
> +	/* Zero out alternate package for now */
> +	dins	a4, zero, 6, 1
> +	bge	a4, ((OCTEON_PRID_CN78XX << 8) | 0x08), 1f
> +	 nop
> +
> +	/* 78xx or 76xx here, first check for bug #27141 */
> +	PTR_LI	a5, OCTEON_SLI_CTL_STATUS
> +	ld	a6, 0(a5)
> +	andi	a7, a4, 0xff
> +	andi	a6, a6, 0xff
> +
> +	beq	a6, a7, not_bug27141
> +	 nop
> +
> +	/* core 0 proc_id rev_id field does not match SLI_CTL_STATUS rev_id */
> +	/* We just hit bug #27141.  Need to reset the chip and try again */
> +
> +	PTR_LI	a4, OCTEON_RST_SOFT_RST
> +	ori	a5, zero, 0x1	/* set the reset bit */
> +
> +reset_78xx_27141:
> +	sync
> +	synci	0(zero)
> +	cache	9, 0(zero)
> +	sd	a5, 0(a4)
> +	wait
> +	b	reset_78xx_27141
> +	 nop
> +
> +not_bug27141:
> +	/* 76XX pass 1.x has the node number set to 3 */
> +	mfc0	a4, CP0_EBASE
> +	ext	a4, a4, 0, 10
> +	bne	a4, 0x180, 1f	/* Branch if not node 3 core 0 */
> +	 nop
> +
> +	/* Clear OCX_COM_NODE[ID] */
> +	PTR_LI	a5, OCTEON_OCX_COM_NODE
> +	ld	a4, 0(a5)
> +	dins	a4, zero, 0, 2
> +	sd	a4, 0(a5)
> +	ld	zero, 0(a5)
> +
> +	/* Clear L2C_OCI_CTL[GKSEGNODE] */
> +	PTR_LI	a5, OCTEON_L2C_OCI_CTL
> +	ld	a4, 0(a5)
> +	dins	a4, zero, 4, 2
> +	sd	a4, 0(a5)
> +	ld	zero, 0(a5)
> +
> +	/* Clear CP0 Root.CvmMemCtl2[KSEGNODE] */
> +	dmfc0	a4, CP0_CVMMEMCTL2
> +	dins	a4, zero, 12, 2
> +	dmtc0	a4, CP0_CVMMEMCTL2
> +
> +	/* Put the flash address in the start of the EBASE register to
> +	 * enable our exception handler but only for core 0.
> +	 */
> +	mfc0	a4, CP0_EBASE
> +	dext	a4, a4, 0, 10
> +	bnez	a4, no_flash
> +	/* OK in delay slot */
> +	dext	a6, a6, 0, 16		/* Get the base address in flash */
> +	sll	a6, a6, 16
> +	mtc0	a6, CP0_EBASE	/* Enable exceptions */
> +
> +no_flash:
> +	/* Zero out various registers */
> +	mtc0	zero, CP0_DEPC
> +	mtc0	zero, CP0_EPC
> +	mtc0	zero, CP0_CAUSE
> +	mfc0	a4, CP0_PRID
> +	ext	a4, a4, 8, 8
> +	mtc0	zero, CP0_DESAVE
> +
> +	/* The following are only available on Octeon 2 or later */
> +	mtc0	zero, CP0_KSCRATCH1
> +	mtc0	zero, CP0_KSCRATCH2
> +	mtc0	zero, CP0_KSCRATCH3
> +	mtc0	zero, CP0_USERLOCAL
> +
> +	/* Turn off ROMEN bit to disable ROM */
> +	PTR_LI	a1, OCTEON_MIO_RST_BOOT
> +	/* For OCTEON 3 we use RST_BOOT instead of MIO_RST_BOOT.
> +	 * The difference is bits 24-26 are 6 instead of 0 for the address.
> +	 */
> +	/* For Octeon 2 and CN70XX we can ignore the watchdog */
> +	blt	a4, OCTEON_PRID_CN78XX, watchdog_ok
> +	 nop
> +
> +	PTR_LI	a1, OCTEON_RST_BOOT
> +
> +	beq	a4, OCTEON_PRID_CN70XX, watchdog_ok
> +	 nop
> +
> +	ld	a2, 0(a1)
> +	/* There is a bug where some registers don't get properly reset when
> +	 * the watchdog timer causes a reset.  In this case we need to force
> +	 * a reset.
> +	 */
> +	bbit0	a2, 11, watchdog_ok	/* Skip if watchdog not hit */
> +	 dins	a2, zero, 2, 18	/* Don't clear LBOOT, LBOOT_EXT or LBOOT_OCI */
> +	/* Clear bit indicating reset due to watchdog */
> +	ori	a2, 1 << 11
> +	sd	a2, 0(a1)
> +
> +	/* Disable watchdog */
> +	PTR_LI	a1, OCTEON_CIU3_PP_POKE(0)
> +	sd	zero, 0(a1)
> +	PTR_LI	a1, OCTEON_CIU3_WDOG(0)
> +	sd	zero, 0(a1)
> +
> +	/* Record this in the GSER0_SCRATCH register in bit 11 */
> +	PTR_LI	a1, OCTEON_GSERX_SCRATCH(0)
> +	ld	a2, 0(a1)
> +	ori	a2, 1 << 11
> +	sd	a2, 0(a1)
> +
> +	PTR_LI	a1, OCTEON_RST_SOFT_RST
> +	li	a2, 1
> +	sd	a2, 0(a1)
> +	wait
> +
> +	/* We should never get here */
> +
> +watchdog_ok:
> +	ld	a2, 0(a1)
> +	/* Don't clear LBOOT/LBOOT_EXT or LBOOT_OCI */
> +	dins	a2, zero, 2, 18
> +	dins	a2, zero, 60, 1	/* Clear ROMEN bit */
> +	sd	a2, 0(a1)
> +
> +	/* Start of Octeon setup */
> +
> +	/* Check what core we are - if core 0, branch to init tlb
> +	 * loop in flash.  Otherwise, look up address of init tlb
> +	 * loop that was saved in the boot vector block.
> +	 */
> +	mfc0	a0, CP0_EBASE
> +	andi	a0, EBASE_CPUNUM		/* get core */
> +	beqz	a0, InitTLBStart_local
> +	 nop
> +
> +	break
> +	/* We should never get here - non-zero cores now go directly to
> +	 * tlb init from the boot stub in movable region.
> +	 */
> +
> +	.globl InitTLBStart
> +InitTLBStart:
> +InitTLBStart_local:
> +	/* If we don't have working memory yet configure a bunch of
> +	 * scratch memory, and set the stack pointer to the top
> +	 * of it.  This allows us to go to C code without having
> +	 * memory set up
> +	 *
> +	 * Warning: do not change SCRATCH_STACK_LINES as this can impact the
> +	 * transition from start.S to crti.asm. crti requires 590 bytes of
> +	 * stack space.
> +	 */
> +	cache	1,0(zero)	/* Clear Dcache so cvmseg works right */
> +#if CONFIG_OCTEON_BIG_STACK_SIZE
> +	rdhwr	v0, $0
> +	bnez	v0, 1f
> +	 nop
> +	PTR_LA	sp, big_stack_start - 16
> +	b	stack_clear_done
> +	 nop
> +1:
> +#endif
> +#define SCRATCH_STACK_LINES 0x36   /* MAX is 0x36 */
> +	dmfc0	v0, CP0_CVMMEMCTL
> +	dins	v0, zero, 0, 9
> +	/* setup SCRATCH_STACK_LINES scratch lines of scratch */
> +	ori	v0, 0x100 | SCRATCH_STACK_LINES
> +	dmtc0	v0, CP0_CVMMEMCTL
> +	/* set stack to top of scratch memory */
> +	li	sp, 0xffffffffffff8000 + (SCRATCH_STACK_LINES * 128)
> +	/* Clear scratch for CN63XX pass 2.0 errata Core-15169*/
> +	li	t0, 0xffffffffffff8000
> +clear_scratch:
> +	sd	zero, 0(t0)
> +	addiu	t0, 8
> +	bne	t0, sp, clear_scratch
> +	 nop
> +
> +	/* This code runs on all cores - core 0 from flash,
> +	 * the rest from DRAM.  When booting from PCI, non-zero cores
> +	 * come directly here from the boot vector - no earlier code in this
> +	 * file is executed.
> +	 */
> +
> +	/* Some generic initialization is done here as well, as we need this
> +	 * done on all cores even when booting from PCI
> +	 */
> +stack_clear_done:
> +	/* Clear watch registers. */
> +	mtc0	zero, CP0_WATCHLO
> +	mtc0	zero, CP0_WATCHHI
> +
> +	/* STATUS register */
> +	mfc0	k0, CP0_STATUS
> +	li	k1, ~ST0_IE
> +	and	k0, k1
> +	mtc0	k0, CP0_STATUS
> +
> +	/* CAUSE register */
> +	mtc0	zero, CP0_CAUSE
> +
> +	/* Init Timer */
> +	dmtc0	zero, CP0_COUNT
> +	dmtc0	zero, CP0_COMPARE
> +
> +
> +	mfc0	a5, CP0_STATUS
> +	li	v0, 0xE0		/* enable 64 bit mode for CSR access */
> +	or	v0, v0, a5
> +	mtc0	v0, CP0_STATUS
> +
> +
> +	dli	v0, 1 << 29  /* Enable large physical address support in TLB */
> +	mtc0	v0, CP0_PAGEGRAIN
> +
> +InitTLB:
> +	dmtc0	zero, CP0_ENTRYLO0
> +	dmtc0	zero, CP0_ENTRYLO1
> +	mtc0	zero, CP0_PAGEMASK
> +	dmtc0	zero, CP0_CONTEXT
> +	/* Use an offset into kseg0 so we won't conflict with Mips1 legacy
> +	 * TLB clearing
> +	 */
> +	PTR_LI	v0, 0xFFFFFFFF90000000
> +	mfc0	a0, CP0_CONFIG1
> +	srl	a0, a0, 25
> +	/* Check if config4 reg present */
> +	mfc0	a1, CP0_CONFIG3
> +	bbit0	a1, 31, 2f
> +	 and	a0, a0, 0x3F		/* a0 now has the max mmu entry index */
> +	mfc0	a1, CP0_CONFIG4
> +	bbit0	a1, 14, 2f		/* check config4[MMUExtDef] */
> +	 nop
> +	/* append config4[MMUSizeExt] to most significant bit of
> +	 * config1[MMUSize-1]
> +	 */
> +	ins	a0, a1, 6, 8
> +	and	a0, a0, 0x3fff	/* a0 now includes max entries for cn6xxx */
> +2:
> +	dmtc0	zero, CP0_XCONTEXT
> +	mtc0	zero, CP0_WIRED
> +
> +InitTLBloop:
> +	/* Initialise every TLB entry from index a0 down to 0.
> +	 * v0 is the candidate EntryHi value; it advances by 8 KiB (1 << 13)
> +	 * on every probe, whether or not the probe matched.  The EntryLo0,
> +	 * EntryLo1 and PageMask values written by tlbwi are the zeroes loaded
> +	 * at InitTLB.
> +	 */
> +	dmtc0	v0, CP0_ENTRYHI
> +	tlbp
> +	mfc0	v1, CP0_INDEX
> +	daddiu	v0, v0, 1<<13
> +	/* A non-negative Index is treated as "this EntryHi already matches an
> +	 * entry": go round again with the next candidate.
> +	 * NOTE(review): there is no explicit delay-slot instruction after this
> +	 * branch.  Assuming the .set noreorder from the top of the file is
> +	 * still in effect, the mtc0 below is the delay slot and therefore runs
> +	 * on the taken path too.  Presumably harmless because the next tlbp
> +	 * rewrites Index - confirm.
> +	 */
> +	bgez	v1, InitTLBloop
> +
> +	mtc0	a0, CP0_INDEX
> +	tlbwi
> +	/* Loop until entry 0 has been written; the decrement is in the delay
> +	 * slot, so a0 is tested before it is decremented.
> +	 */
> +	bnez	a0, InitTLBloop
> +	 daddiu	a0, -1
> +
> +	mthi	zero
> +	mtlo	zero
> +
> +	/* Set up status register: read-modify-write of CP0 Status via v0
> +	 * (a4 is used as scratch).
> +	 */
> +	mfc0	v0, CP0_STATUS
> +	/* Enable COP0 and COP2 access (Status bits 28 and 30) */
> +	li	a4, (1 << 28) | (1 << 30)
> +	or	v0, a4
> +
> +	/* Must leave BEV set here, as DRAM is not configured for core 0.
> +	 * Also, BEV must be 1 later on when the exception base address is set.
> +	 */
> +
> +	/* Mask all interrupts: this clears the whole of Status[15:0], so the
> +	 * low mode/enable bits are cleared as well and are re-established by
> +	 * the ori below.
> +	 */
> +	ins	v0, zero, 0, 16
> +	/* NOTE(review): the comment here used to read "Clear NMI (used to
> +	 * start cores other than core 0)", but nothing in this sequence
> +	 * touches bit 19, which the secondary-core path later in this file
> +	 * clears as the NMI bit.  Confirm whether an NMI clear is missing or
> +	 * the comment was simply stale.
> +	 *
> +	 * 0xE4 = 0xE0 (the same 64-bit enable bits set earlier in this file)
> +	 * plus 0x04, the bit the original comment credits with disabling
> +	 * interrupts (ERL in the MIPS64 Status layout - confirm).
> +	 */
> +	ori	v0, 0xE4		/* enable 64 bit, disable interrupts */
> +	mtc0	v0, CP0_STATUS
> +
> +	dli	v0,0xE000000F		/* enable all readhw locations */
> +	mtc0	v0, CP0_HWRENA
> +
> +	dmfc0	v0, CP0_CVMCTL
> +	ori	v0, 1<<14	/* enable fixup of unaligned mem access */
> +	dmtc0	v0, CP0_CVMCTL
> +
> +	/* Setup scratch memory.  This is also done in
> +	 * cvmx_user_app_init, and this code will be removed
> +	 * from the bootloader in the near future.
> +	 */
> +
> +	/* Set L2C_TAD_CTL[MAXLFB] = 0 (the register written below is
> +	 * OCTEON_L2C_TAD_CTL; the field cleared is bits 3:0).
> +	 * The branch skips this only for PRID values below CN73XX, so it is
> +	 * applied to CN73XX and to every model with a higher PRID (for
> +	 * example CNF75XX), not to CN73XX alone.
> +	 */
> +	mfc0	a4, CP0_PRID
> +	ext	a4, a4, 8, 8
> +	blt	a4, OCTEON_PRID_CN73XX, 72f
> +	nop
> +	PTR_LI	v0, OCTEON_L2C_TAD_CTL
> +	ld	t1, 0(v0)
> +	dins	t1, zero, 0, 4
> +	sd	t1, 0(v0)
> +	/* Read the register back and discard the value; presumably this
> +	 * forces the store to complete before continuing - confirm.
> +	 */
> +	ld	zero, 0(v0)
> +
> +72:
> +
> +	/* clear these to avoid immediate interrupt in noperf mode */
> +	dmtc0	zero, CP0_COMPARE	/* clear timer interrupt */
> +	dmtc0	zero, CP0_COUNT		/* clear timer interrupt */
> +	dmtc0	zero, CP0_PERF_CNT0	/* clear perfCnt0 */
> +	dmtc0	zero, CP0_PERF_CNT1	/* clear perfCnt1 */
> +	dmtc0	zero, CP0_PERF_CNT2
> +	dmtc0	zero, CP0_PERF_CNT3
> +
> +	/* If we're running on a node other than 0 then we need to set KSEGNODE
> +	 * to 0.  The nice thing with this code is that it also autodetects if
> +	 * we're running on a processor that supports CVMMEMCTL2 or not since
> +	 * only processors that have this will have a non-zero node ID.  Because
> +	 * of this there's no need to check if we're running on a 78XX.
> +	 */
> +	mfc0    t1, CP0_EBASE
> +	dext    t1, t1, 7, 3            /* Extract node number */
> +	beqz    t1, is_node0            /* If non-zero then we're not node 0 */
> +	 nop
> +	dmfc0   t1, CP0_CVMMEMCTL2
> +	dins    t1, zero, 12, 4
> +	dmtc0   t1, CP0_CVMMEMCTL2
> +is_node0:
> +
> +	/* Set up TLB mappings for u-boot code in flash. */
> +
> +	/* Use a bal to get the current PC into ra.  Since this bal is to
> +	 * the address immediately following the delay slot, the ra is
> +	 * the address of the label.  We then use this to get the actual
> +	 * address that we are executing from.
> +	 */
> +	bal	__dummy
> +	 nop
> +
> +__dummy:
> +	/* Get the actual address that we are running at */
> +	PTR_LA	a6, _start		/* Linked address of _start */
> +	PTR_LA	a7, __dummy
> +	dsubu	t0, a7, a6		/* offset of __dummy label from _start*/
> +	dsubu	a7, ra, t0		/* a7 now has actual address of _start*/
> +
> +	/* Save actual _start address in s7.  This is where we
> +	 * are executing from, as opposed to where the code is
> +	 * linked.
> +	 */
> +	move	s7, a7
> +	move	s4, zero
> +
> +	/* s7 has actual address of _start.  If this is
> +	 * on the boot bus, it will be between 0xBFC00000 and 0xBFFFFFFF.
> +	 * If it is on the boot bus, use 0xBFC00000 as the physical address
> +	 * for the TLB mapping, as we will be adjusting the boot bus
> +	 * to make this mapping correct.
> +	 * If we are running from DRAM (remote-boot), then we want to use the
> +	 * real address in DRAM.
> +	 */
> +
> +	/* Check to see if we are running from flash - we expect that to
> +	 * be 0xffffffffb0000000-0xffffffffbfffffff
> +	 * (0x10000000-0x1fffffff, unmapped/uncached)
> +	 */
> +	dli	t2, 0xffffffffb0000000
> +	dsubu	t2, s7
> +	slt	s4, s7, t2
> +	bltz	t2, uboot_in_flash
> +	 nop
> +
> +	/* If we're not core 0 then we don't care about cache */
> +	mfc0	t2, CP0_EBASE
> +	andi	t2, EBASE_CPUNUM
> +	bnez	t2, uboot_in_ram
> +	 nop
> +
> +	/* Find out if we're OCTEON I or OCTEON + which don't support running
> +	 * out of cache.  Every PRID below OCTEON_PRID_CN63XX (0x90) is
> +	 * treated as such a part.  s4 is set to 1 ("booting from RAM") before
> +	 * the branch so that it is already valid on the taken path.
> +	 */
> +	mfc0	t2, CP0_PRID
> +	ext	t2, t2, 8, 8		/* t2 = PRID[15:8], the model number */
> +	li	s4, 1
> +	blt	t2, OCTEON_PRID_CN63XX, uboot_in_ram
> +	 nop
> +
> +	/* U-Boot can be executing either in RAM or L2 cache.  Now we need to
> +	 * check if DRAM is initialized.  The way we do that is to look at
> +	 * the reset bit of the LMC0_DDR_PLL_CTL register (bit 7)
> +	 */
> +	PTR_LI	t2, OCTEON_LMC0_DDR_PLL_CTL
> +	ld	t2, 0(t2)
> +	bbit1	t2, 7, uboot_in_ram
> +	 nop
> +
> +	/* We must be executing out of cache */
> +	b	uboot_in_ram
> +	 li	s4, 2
> +
> +uboot_in_flash:
> +	/* Set s4 to 4 to indicate we're running in FLASH */
> +	li	s4, 4
> +
> +#if defined(CONFIG_OCTEON_DISABLE_L2_CACHE_INDEX_ALIASING)
> +	/* By default, L2C index aliasing is enabled.  In some cases it may
> +	 * need to be disabled.  The L2C index aliasing can only be disabled
> +	 * if U-Boot is running out of L2 cache and the L2 cache has not been
> +	 * used to store anything.
> +	 */
> +	PTR_LI	t1, OCTEON_L2C_CTL
> +	ld	t2, 0(t1)
> +	ori	t2, 1
> +	sd	t2, 0(t1)
> +#endif
> +
> +	/* Use BFC00000 as physical address for TLB mappings when booting
> +	 * from flash, as we will adjust the boot bus mappings to make this
> +	 * mapping correct.
> +	 */
> +	dli	a7, 0xFFFFFFFFBFC00000
> +	dsubu	s6, s7, a7  /* Save flash offset in s6 */
> +
> +#if defined(CONFIG_OCTEON_COPY_FROM_FLASH_TO_L2)
> +	/* For OCTEON II we check to see if the L2 cache is big enough to hold
> +	 * U-Boot.  If it is big enough then we copy ourself from flash to the
> +	 * L2 cache in order to speed up execution.
> +	 */
> +
> +	/* Check for OCTEON 2 */
> +	mfc0	t1, CP0_PRID
> +	ext	t1, t1, 8, 8
> +	/* Get number of L2 cache sets */
> +	beq	t1, OCTEON_PRID_CNF71XX, got_l2_sets	/* CNF71XX */
> +	 li	t2, 1 << 9
> +	beq	t1, OCTEON_PRID_CN78XX, got_l2_sets	/* CN78XX */
> +	 li	t2, 1 << 13
> +	beq	t1, OCTEON_PRID_CN70XX, got_l2_sets	/* CN70XX */
> +	 li	t2, 1 << 10
> +	beq	t1, OCTEON_PRID_CN73XX, got_l2_sets	/* CN73XX */
> +	 li	t2, 1 << 11
> +	beq	t1, OCTEON_PRID_CNF75XX, got_l2_sets	/* CNF75XX */
> +	 li	t2, 1 << 11
> +	b	l2_cache_too_small	/* Unknown OCTEON model */
> +	 nop
> +
> +got_l2_sets:
> +	/* Get number of associations */
> +	PTR_LI	t0, OCTEON_MIO_FUSE_DAT3
> +	ld	t0, 0(t0)
> +	dext	t0, t0, 32, 3
> +
> +	beq	t1, OCTEON_PRID_CN70XX, process_70xx_l2sets
> +	 nop
> +	/* 0 = 16-way, 1 = 12-way, 2 = 8-way, 3 = 4-way, 4-7 reserved */
> +	beqz	t0, got_l2_ways
> +	 li	t3, 16
> +	beq	t0, 1, got_l2_ways
> +	 li	t3, 12
> +	beq	t0, 2, got_l2_ways
> +	 li	t3, 8
> +	beq	t0, 3, got_l2_ways
> +	 li	t3, 4
> +	b	l2_cache_too_small
> +	 nop
> +
> +process_70xx_l2sets:
> +	/* For 70XX, the number of ways is defined as:
> +	 * 0 - full cache (4-way) 512K
> +	 * 1 - 3/4 ways (3-way) 384K
> +	 * 2 - 1/2 ways (2-way) 256K
> +	 * 3 - 1/4 ways (1-way) 128K
> +	 * 4-7 illegal (aliased to 0-3)
> +	 */
> +	andi	t0, 3
> +	beqz	t0, got_l2_ways
> +	 li	t3, 4
> +	beq	t0, 1, got_l2_ways
> +	 li	t3, 3
> +	beq	t0, 2, got_l2_ways
> +	 li	t3, 2
> +	li	t3, 1
> +
> +got_l2_ways:
> +	dmul	a1, t2, t3		/* Calculate cache size */
> +	dsll	a1, 7			/* Ways * Sets * cache line sz (128) */
> +	daddiu	a1, a1, -128		/* Adjust cache size for copy code */
> +
> +	/* Calculate size of U-Boot image */
> +	/*
> +	 * "uboot_end - _start" is not correct, as the image also
> +	 * includes the DTB appended to the end (OF_EMBED is deprecated).
> +	 * Lets use a defined max for now here.
> +	 */
> +	PTR_LI	s5, CONFIG_BOARD_SIZE_LIMIT
> +
> +	daddu	t2, s5, s7	/* t2 = end address */
> +	daddiu	t2, t2, 127
> +	ins	t2, zero, 0, 7	/* Round up to cache line for memcpy */
> +
> +	slt	t1, a1, s5	/* See if we're bigger than the L2 cache */
> +	bnez	t1, l2_cache_too_small
> +	 nop
> +	/* Address we plan to load at in the L2 cache */
> +	PTR_LI	t9, CONFIG_OCTEON_L2_UBOOT_ADDR
> +# ifdef CONFIG_OCTEON_L2_MEMCPY_IN_CACHE
> +	/* Enable all ways for PP0.  Authentik ROM may have disabled these */
> +	PTR_LI	a1, OCTEON_L2C_WPAR_PP0
> +	sd	zero, 0(a1)
> +
> +	/* Address to place our memcpy code */
> +	PTR_LI	a0, CONFIG_OCTEON_L2_MEMCPY_ADDR
> +	/* The following code writes a simple memcpy routine into the cache
> +	 * to copy ourself from flash into the L2 cache.  This makes the
> +	 * memcpy routine a lot faster since each instruction can potentially
> +	 * require four read cycles to flash over the boot bus.
> +	 */
> +	/* Zero cache line in the L2 cache */
> +	zcb	(a0)
> +	synci	0(zero)
> +	dli	a1, 0xdd840000dd850008	/* ld a0, 0(t0);  ld a1, 8(t0) */
> +	sd	a1, 0(a0)
> +	dli	a1, 0xdd860010dd870018	/* ld a2, 16(t0); ld a3, 24(t0) */
> +	sd	a1, 8(a0)
> +	dli	a1, 0xfda40000fda50008	/* sd a0, 0(t1);  sd a1, 8(t1) */
> +	sd	a1, 16(a0)
> +	dli	a1, 0xfda60010fda70018	/* sd a2, 16(t1); sd a3, 24(t1) */
> +	sd	a1, 24(a0)
> +	dli	a1, 0x258c0020158efff6	/* addiu t0, 32; bne t0, t2, -40 */
> +	sd	a1, 32(a0)
> +	dli	a1, 0x25ad002003e00008	/* addiu t1, 32; jr ra */
> +	sd	a1, 40(a0)
> +	sd	zero, 48(a0)		/* nop; nop */
> +
> +	/* Synchronize the caches */
> +	sync
> +	synci	0(zero)
> +
> +	move	t0, s7
> +	move	t1, t9
> +
> +	/* Do the memcpy operation in L2 cache to copy ourself from flash
> +	 * to the L2 cache.
> +	 */
> +	jalr	a0
> +	 nop
> +
> +# else
> +	/* Copy ourself to the L2 cache from flash, 32 bytes at a time.
> +	 * This is the straight-line form of the routine that the
> +	 * CONFIG_OCTEON_L2_MEMCPY_IN_CACHE variant hand-encodes into the
> +	 * cache.
> +	 *
> +	 * t0 = source (current location of _start, from s7)
> +	 * t1 = destination (CONFIG_OCTEON_L2_UBOOT_ADDR, from t9)
> +	 * t2 = end of source, rounded up to a cache line (computed above)
> +	 *
> +	 * t0 and t1 must be loaded here: at this point they still hold
> +	 * leftovers from the L2 size calculation, and the two moves that set
> +	 * them up otherwise exist only in the in-cache variant.
> +	 */
> +	move	t0, s7
> +	move	t1, t9
> +1:
> +	ld	a0, 0(t0)
> +	ld	a1, 8(t0)
> +	ld	a2, 16(t0)
> +	ld	a3, 24(t0)
> +	sd	a0, 0(t1)
> +	sd	a1, 8(t1)
> +	sd	a2, 16(t1)
> +	sd	a3, 24(t1)
> +	addiu	t0, 32
> +	bne	t0, t2, 1b
> +	 addiu	t1, 32		/* delay slot: advance destination */
> +# endif	/* CONFIG_OCTEON_L2_MEMCPY_IN_CACHE */
> +
> +	/* Adjust the start address of U-Boot and the global pointer */
> +	subu	t0, s7, t9	/* t0 = address difference */
> +	move	s7, t9		/* Update physical address */
> +	move	s2, t9
> +	sync
> +	synci	0(zero)
> +
> +	/* Now we branch to the L2 cache.  We first get our PC then adjust it
> +	 */
> +	bal	3f
> +	 nop
> +3:
> +	/* Don't add any instructions here! */
> +	subu	t9, ra, t0
> +	/* Give ourself 16 bytes */
> +	addiu	t9, 0x10
> +
> +	jal	t9		/* Branch to address in L2 cache */
> +
> +	 nop
> +	nop
> +	/* Add instructions after here */
> +
> +	move	a7, s7
> +
> +	b	uboot_in_ram
> +	 ori	s4, 2		/* Running out of L2 cache */
> +
> +l2_cache_too_small:	/* We go here if we can't copy ourself to L2 */
> +#endif /* CONFIG_OCTEON_COPY_FROM_FLASH_TO_L2 */
> +
> +	/* This code is only executed if booting from flash. */
> +	/*  For flash boot (_not_ RAM boot), we do a workaround for
> +	 * an LLM errata on CN38XX and CN58XX parts.
> +	 */
> +
> +uboot_in_ram:
> +	/* U-boot address is now in reg a7, and is 4 MByte aligned.
> +	 * (boot bus addressing has been adjusted to make this happen for flash,
> +	 * and for DRAM this alignment must be provided by the remote boot
> +	 * utility.
> +	 */
> +	/* See if we're in KSEG0 range, if so set EBASE register to handle
> +	 * exceptions.
> +	 */
> +	dli	a1, 0x20000000
> +	bge	a7, a1, 1f
> +	 nop
> +	/* Convert our physical address to KSEG0 */
> +	PTR_LI	a1, 0xffffffff80000000
> +	or	a1, a1, a7
> +	mtc0	a1, CP0_EBASE
> +1:
> +	/* U-boot now starts at 0xBFC00000.  Use a single 4 MByte TLB mapping
> +	 * to map u-boot.
> +	 */
> +	move	a0, a6		/* Virtual addr in a0 */
> +	dins	a0, zero, 0, 16	/* Zero out offset bits */
> +	move	a1, a7		/* Physical addr in a1 */
> +
> +	/* Now we need to remove the MIPS address space bits.  For this we
> +	 * need to determine if it is a 32 bit compatibility address or not.
> +	 */
> +
> +	/* 'lowest' address in compatibility space */
> +	PTR_LI	t0, 0xffffffff80000000
> +	dsubu	t0, t0, a1
> +	bltz	t0, compat_space
> +	 nop
> +
> +	/* We have a xkphys address, so strip off top bit */
> +	b	addr_fixup_done
> +	 dins	a1, zero, 63, 1
> +
> +compat_space:
> +	PTR_LI	a2, 0x1fffffff
> +	and	a1, a1, a2  /* Mask phy addr to remove address space bits */
> +
> +addr_fixup_done:
> +	/* Currently the u-boot image size is limited to 4 MBytes.  In order to
> +	 * support larger images the flash mapping will need to be changed to
> +	 * be able to access more than that before C code is run.  Until that
> +	 * is done, we just use a 4 MByte mapping for the secondary cores as
> +	 * well.
> +	 */
> +	/* page size (only support 4 Meg binary size for now for core 0)
> +	 * This limitation is due to the fact that the boot vector is
> +	 * 0xBFC00000 which only makes 4MB available.  Later more flash
> +	 * address space will be available after U-Boot has been copied to
> +	 * RAM.	 For now assume that it is in flash.
> +	 */
> +	li	a2, 2*1024*1024
> +
> +	mfc0	a4, CP0_EBASE
> +	andi	a4, EBASE_CPUNUM		/* get core */
> +	beqz	a4, core_0_tlb
> +	 nop
> +
> +	/* Now determine how big a mapping to use for secondary cores,
> +	 * which need to map all of u-boot + heap in DRAM
> +	 */
> +	/* Here we look at the alignment of the physical address,
> +	 * and use the largest page size possible.  In some cases
> +	 * this can result in an oversize mapping, but for secondary cores
> +	 * this mapping is very short lived.
> +	 */
> +
> +	/* Physical address in a1 */
> +	li	a2, 1
> +1:
> +	sll	a2, 1
> +	and	a5, a1, a2
> +	beqz	a5, 1b
> +	 nop
> +
> +	/* a2 now contains largest page size we can use */
> +core_0_tlb:
> +	JAL(single_tlb_setup)
> +
> +	/* Check if we're running from cache */
> +	bbit1	s4, 1, uboot_in_cache
> +	 nop
> +
> +	/* If we are already running from ram, we don't need to muck
> +	 * with boot bus mappings.
> +	 */
> +	PTR_LI	t2, 0xffffffffb0000000
> +	dsubu	t2, s7
> +	/* See if our starting address is lower than the boot bus */
> +	bgez	t2, uboot_in_ram2	/* If yes, booting from RAM */
> +	 nop
> +
> +uboot_in_cache:
> +#if CONFIG_OCTEON_BIG_STACK_SIZE
> +	/* The large stack is only for core 0.  For all other cores we need to
> +	 * use the L1 cache otherwise the other cores will stomp on top of each
> +	 * other unless even more space is reserved for the stack space for
> +	 * each core.  With potentially 96 cores this gets excessive.
> +	 */
> +	/* Read EBASE into a0, the register that is masked and tested below.
> +	 * Loading it into a different register would leave the core-number
> +	 * test operating on whatever single_tlb_setup left in a0.
> +	 */
> +	mfc0	a0, CP0_EBASE
> +	andi	a0, EBASE_CPUNUM	/* a0 = core number */
> +	bnez	a0, no_big_stack	/* only core 0 gets the big stack */
> +	 nop
> +	PTR_LA	sp, big_stack_start
> +	daddiu	sp, -16
> +
> +no_big_stack:
> +#endif
> +	/* We now have the TLB set up, so we need to remap the boot bus.
> +	 * This is tricky, as we are running from flash, and will be changing
> +	 * the addressing of the flash.
> +	 *
> +	 * a4 = OCTEON_MIO_BOOT_BASE for the register offsets used below;
> +	 * a5, a6 and a7 are reused as scratch, and GETOFFSET clobbers ra.
> +	 */
> +	/* Enable movable boot bus region 0, at address 0x10000000 */
> +	PTR_LI	a4, OCTEON_MIO_BOOT_BASE
> +	/* Bit 31 is the enable bit.  Decoding the base the same way the
> +	 * secondary-core path later in this file does (bits 26:3, shifted
> +	 * left by 7) gives 0x10000000, which is also the address jumped to
> +	 * below.
> +	 */
> +	dli	a5, 0x81000000	/* EN + base address 0x10000000 */
> +	sd	a5, OCTEON_MIO_BOOT_LOC_CFG0_OFF(a4)
> +
> +	/* Copy the code that remaps the boot bus into the movable region.
> +	 * NOTE(review): this first store writes a zero doubleword through
> +	 * LOC_DAT.  OCTEON_MIO_BOOT_LOC_ADR_OFF is defined in this file, so
> +	 * the intent may have been to reset the LOC_ADR write pointer
> +	 * instead - confirm against the hardware manual.
> +	 */
> +	sd	zero, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
> +
> +	/* a5 = run-time address of change_boot_mappings: its linked address
> +	 * (a6) plus the run-time minus link-time offset from GETOFFSET.
> +	 */
> +	PTR_LA	a6, change_boot_mappings
> +	GETOFFSET(a5, change_boot_mappings);
> +	daddu	a5, a5, a6
> +
> +	/* The code is 16 bytes (2 DWORDS); both are written through the same
> +	 * LOC_DAT register.
> +	 */
> +	ld	a7, 0(a5)
> +	sd	a7, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
> +	ld	a7, 8(a5)
> +	sd	a7, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
> +
> +	/* Read from an RML register to ensure that the previous writes have
> +	 * completed before we branch to the movable region.
> +	 */
> +	ld	zero, OCTEON_MIO_BOOT_LOC_CFG0_OFF(a4)
> +
> +	/* Compute value for boot bus configuration register */
> +	/* Read region 0 config so we can _modify_ the base address field */
> +	PTR_LI	a4, OCTEON_MIO_BOOT_REG_CFG0	/* region 0 config */
> +	ld	a0, 0(a4)
> +	dli	a4, 0xf0000000		/* Mask off bits we want to save */
> +	and	a4, a4, a0
> +	dli	a0, 0x0fff0000		/* Force size to max */
> +	or	a4, a4, a0
> +
> +	move	a5, s6
> +	/* Convert to 64k blocks, as used by boot bus config */
> +	srl	a5, 16
> +	li	a6, 0x1fc0	/* 'normal' boot bus base config value */
> +	subu	a6, a6, a5	/* Subtract offset */
> +	/* combine into register value to pass to boot bus routine */
> +	or	a0, a4, a6
> +
> +	/* Branch there */
> +	PTR_LA	a1, __mapped_continue_label
> +	PTR_LI	a2, OCTEON_MIO_BOOT_REG_CFG0
> +	/* If region 0 is not enabled we can skip it */
> +	ld	a4, 0(a2)
> +	bbit0	a4, 31, __mapped_continue_label
> +	 nop
> +	li	a4, 0x10000000
> +	j	a4
> +	 synci	0(zero)
> +
> +	/* We never get here, as we go directly to __mapped_continue_label */
> +	break
> +
> +
> +uboot_in_ram2:
> +
> +	/* Now jump to address in TLB mapped memory to continue execution */
> +	PTR_LA	a4, __mapped_continue_label
> +	synci	0(a4)
> +	j	a4
> +	 nop
> +
> +__mapped_continue_label:
> +	/* Check if we are core 0, if we are not then we need
> +	 * to vector to code in DRAM to do application setup, and
> +	 * skip the rest of the bootloader.  Only core 0 runs the bootloader
> +	 * and sets up the tables that the other cores will use for
> +	 * configuration.
> +	 */
> +	mfc0	a0, CP0_EBASE
> +	andi	a0, EBASE_CPUNUM   /* get core */
> +	/* if (__all_cores_are_equal==0 && core==0),
> +	 * then jump to execute BL on core 0; else 'go to next line'
> +	 * (core_0_cont1 is executed ONLY when k0=a0=0(core0_ID))
> +	 */
> +	lw	t0, __all_cores_are_equal
> +	beq	a0, t0, core_0_cont1
> +	 nop
> +
> +	/* other cores look up addr from dram */
> +        /* DRAM controller already set up by first core */
> +        li      a1, (BOOT_VECTOR_NUM_WORDS * 4)
> +        mul     a0, a0, a1
> +
> +        /* Now find out the boot vector base address from the moveable boot
> +         * bus region.
> +         */
> +
> +        /* Get the address of the boot bus moveable region */
> +        PTR_LI     t8, OCTEON_MIO_BOOT_BASE
> +        ld      t9, OCTEON_MIO_BOOT_LOC_CFG0_OFF(t8)
> +        /* Make sure it's enabled */
> +        bbit0   t9, 31, invalid_boot_vector
> +         dext   t9, t9, 3, 24
> +        dsll    t9, t9, 7
> +        /* Make address XKPHYS */
> +	li	t0, 1
> +	dins	t9, t0, 63, 1
> +
> +        ld      t0, OCTEON_BOOT_MOVEABLE_MAGIC_OFFSET(t9)
> +        dli     t1, OCTEON_BOOT_MOVEABLE_MAGIC1
> +        bne     t0, t1, invalid_boot_vector
> +         nop
> +
> +        /* Load base address of boot vector table */
> +        ld      t0, OCTEON_BOOT_VECTOR_MOVEABLE_OFFSET(t9)
> +        /* Add offset for core */
> +        daddu   a1, t0, a0
> +
> +	mfc0	v0, CP0_STATUS
> +	move	v1, v0
> +	ins	v1, zero, 19, 1		/* Clear NMI bit */
> +	mtc0	v1, CP0_STATUS
> +
> +        /* Get app start function address */
> +        lw      t9, 8(a1)
> +        beqz    t9, invalid_boot_vector
> +         nop
> +
> +        j       t9
> +         lw      k0, 12(a1)      /* Load global data (deprecated) */
> +
> +invalid_boot_vector:
> +        wait
> +        b       invalid_boot_vector
> +         nop
> +
> +__all_cores_are_equal:
> +	/* The following .word tell if 'all_cores_are_equal' or core0 is special
> +	 * By default (for the first execution) the core0 should be special,
> +	 * in order to behave like the old(existing not-modified) bootloader
> +	 * and run the bootloader on core 0 to follow the existing design.
> +	 * However after that we make 'all_cores_equal' which allows to run SE
> +	 * applications on core0 like on any other core. NOTE that value written
> +	 * to '__all_cores_are_equal' should not match any core ID.
> +	 */
> +	.word 	0
> +
> +core_0_cont1:
> +	li	t0, 0xffffffff
> +	sw	t0, __all_cores_are_equal
> +	/* From here on, only core 0 runs, other cores have branched
> +	 * away.
> +	 */
> +#ifdef CONFIG_MIPS_INIT_STACK_IN_SRAM
> +	/* Set up initial stack and global data */
> +	setup_stack_gd
> +# ifdef CONFIG_DEBUG_UART
> +	PTR_LA	t9, debug_uart_init
> +	jalr	t9
> +	 nop
> +# endif
> +#endif
> +	move	a0, zero		# a0 <-- boot_flags = 0
> +	PTR_LA	t9, board_init_f
> +
> +	jr	t9
> +	 move	ra, zero
> +	END(_start)
> +
> +	.balign	8
> +	.globl	single_tlb_setup
> +	.ent	single_tlb_setup
> +	/* Sets up a single TLB entry in the highest-numbered TLB slot.
> +	 * Virtual/physical addresses must be properly aligned.
> +	 *
> +	 * In:
> +	 * a0  Virtual address
> +	 * a1  Physical address
> +	 * a2  page (_not_ mapping) size; the entry holds two pages
> +	 *     (EntryLo0/EntryLo1), so the mapping covers 2 * a2 bytes
> +	 *
> +	 * Out:
> +	 * a0  advanced by the mapping size (2 * a2)
> +	 * a1  left in EntryLo format and advanced past both pages
> +	 *
> +	 * Clobbers a3, a4, a5, a6 and writes CP0 PageMask, Index, EntryLo0,
> +	 * EntryLo1 and EntryHi.  Returns with jr ra.
> +	 */
> +single_tlb_setup:
> +	/* Determine the number of TLB entries available, and
> +	 * use the top one.
> +	 */
> +	mfc0	a3, CP0_CONFIG1
> +	dext	a3, a3, 25, 6		/* a3 now has the max mmu entry index */
> +	mfc0	a5, CP0_CONFIG3		/* Check if config4 reg present */
> +	bbit0	a5, 31, single_tlb_setup_cont
> +	 nop
> +	mfc0	a5, CP0_CONFIG4
> +	bbit0	a5, 14, single_tlb_setup_cont	/* check config4[MMUExtDef] */
> +	 nop
> +	/* append config4[MMUSizeExt] to most significant bit of
> +	 * config1[MMUSize-1]
> +	 */
> +	dins	a3, a5, 6, 8
> +	and	a3, a3, 0x3fff	/* a3 now includes max entries for cn6xxx */
> +
> +single_tlb_setup_cont:
> +
> +	/* Format physical address for entry low: drop the low 12 address
> +	 * bits, then place the result at bit 6 and upwards.
> +	 */
> +	nop
> +	dsrl	a1, a1, 12
> +	dsll	a1, a1, 6
> +	ori	a1, a1, 0x7	/* set DVG bits */
> +
> +	/* a5 = mapping size (two pages), a6 = mapping size - 1 for PageMask,
> +	 * a4 = one page expressed as an EntryLo increment (page size >> 6,
> +	 * matching the >> 12 then << 6 applied to a1 above).
> +	 */
> +	move	a4, a2
> +	daddu	a5, a4, a4	/* mapping size */
> +	dsll	a6, a4, 1
> +	daddiu	a6, a6, -1	/* pagemask */
> +	dsrl	a4, a4, 6	/* adjust for adding with entrylo */
> +
> +	/* Now set up mapping */
> +	mtc0	a6, CP0_PAGEMASK
> +	mtc0	a3, CP0_INDEX
> +
> +	/* Even page, then odd page one page further on */
> +	dmtc0	a1, CP0_ENTRYLO0
> +	daddu	a1, a1, a4
> +
> +	dmtc0	a1, CP0_ENTRYLO1
> +	daddu	a1, a1, a4
> +
> +	dmtc0	a0, CP0_ENTRYHI
> +	daddu	a0, a0, a5
> +
> +	/* ehb sits between the CP0 writes above and the tlbwi that consumes
> +	 * them.
> +	 */
> +	ehb
> +	tlbwi
> +	jr  ra
> +	 nop
> +	.end   single_tlb_setup
> +
> +
> +/**
> + * This code is moved to a movable boot bus region,
> + * and it is responsible for changing the flash mappings and
> + * jumping to run from the TLB mapped address.
> + *
> + * The routine is never called in place: the boot code in this file copies
> + * exactly two doublewords (16 bytes) starting at this label into the movable
> + * region, so it must stay at four instructions and remain 8-byte aligned.
> + *
> + * @param a0	New address for boot bus region 0
> + * @param a1	Address to branch to afterwards
> + * @param a2	Address of MIO_BOOT_REG_CFG0
> + */
> +	.balign	8
> +change_boot_mappings:
> +	sd	a0, 0(a2)	/* write the new region 0 configuration */
> +	sync
> +	j a1	    /* Jump to new TLB mapped location */
> +	 synci	0(zero)
> +
> +/* If we need a large stack, allocate it here.
> + * The storage is emitted only when CONFIG_OCTEON_BIG_STACK_SIZE is non-zero.
> + * big_stack_end labels the lowest address and big_stack_start the highest;
> + * the users in this file set sp to big_stack_start - 16.
> + */
> +#if CONFIG_OCTEON_BIG_STACK_SIZE
> +	/* Allocate the stack here so it's in L2 cache or DRAM */
> +	.balign	16
> +big_stack_end:
> +	.skip	CONFIG_OCTEON_BIG_STACK_SIZE, 0
> +big_stack_start:
> +	.dword	0
> +#endif
>
Daniel Schwierzeck May 13, 2020, 11:43 p.m. UTC | #2
Am 02.05.20 um 10:59 schrieb Stefan Roese:
> From: Aaron Williams <awilliams at marvell.com>
> 
> This patch adds very basic support for the Octeon III SoCs. Only
> CFI parallel NOR flash and UART is supported for now.
> 
> Please note that the basic Octeon port does not include the DDR3/4
> initialization yet. This will be added in some follow-up patches
> later. To still use U-Boot on with this port, the L2 cache (4MiB on
> Octeon III CN73xx) is used as RAM. This way, U-Boot can boot to the
> prompt on such boards.
> 
> Signed-off-by: Aaron Williams <awilliams at marvell.com>
> Signed-off-by: Stefan Roese <sr at denx.de>
> ---
> 
>  MAINTAINERS                                  |    6 +
>  arch/Kconfig                                 |    1 +
>  arch/mips/Kconfig                            |   49 +-
>  arch/mips/Makefile                           |    7 +
>  arch/mips/cpu/Makefile                       |    4 +-
>  arch/mips/include/asm/arch-octeon/cavm-reg.h |   42 +
>  arch/mips/include/asm/arch-octeon/clock.h    |   24 +
>  arch/mips/mach-octeon/Kconfig                |   92 ++
>  arch/mips/mach-octeon/Makefile               |   10 +
>  arch/mips/mach-octeon/clock.c                |   22 +
>  arch/mips/mach-octeon/cpu.c                  |   55 +
>  arch/mips/mach-octeon/dram.c                 |   27 +
>  arch/mips/mach-octeon/include/ioremap.h      |   30 +
>  arch/mips/mach-octeon/start.S                | 1241 ++++++++++++++++++
>  14 files changed, 1608 insertions(+), 2 deletions(-)
>  create mode 100644 arch/mips/include/asm/arch-octeon/cavm-reg.h
>  create mode 100644 arch/mips/include/asm/arch-octeon/clock.h
>  create mode 100644 arch/mips/mach-octeon/Kconfig
>  create mode 100644 arch/mips/mach-octeon/Makefile
>  create mode 100644 arch/mips/mach-octeon/clock.c
>  create mode 100644 arch/mips/mach-octeon/cpu.c
>  create mode 100644 arch/mips/mach-octeon/dram.c
>  create mode 100644 arch/mips/mach-octeon/include/ioremap.h
>  create mode 100644 arch/mips/mach-octeon/start.S
> 

I couldn't completely understand the start.S. There is too much stuff in
it for an initial merge. But I don't see a hard reason against using the
generic start.S. So the first patch series should only implement the
bare minimum needed to boot from flash, init the boot CPU core, maybe
suspend all other cores and relocate to L2 cache.

I know the current start.S is not really suited yet but I'm working on a
refactoring to add some more hooks which a SoC/CPU can implement. Once
we have your initial patch series and the refactoring in mainline, it
should be possible to gradually add more Octeon stuff like memory init.

Basic idea for refactoring is something like this:

reset:
    - mips_cpu_early_init()       # custom early init, fix errata
    - init CP0 registers, Watch registers
    - mips_cache_disable()        # set K0 CCA to uncached
    - mips_cpu_core_init()        # per CPU core init
                                  # -> generic code issues wait instr.
                                  # -> custom code can do custom init
                                  #    or custom boot protocols
    - mips_cm_map()               # init CM if available
    - mips_cache_init()           # init caches, set K0 CCA to non-coh.
    - mips_sram_init()            # init SRAM, Scratch RAM if avail
    - setup initial stack and global_data
    - debug_uart_init()
    - mips_mem_init()             # init external memory, C env avail.
    - init malloc_f
    - board_init_f()

> +
> +#endif /* __ASM_MACH_OCTEON_IOREMAP_H */
> diff --git a/arch/mips/mach-octeon/start.S b/arch/mips/mach-octeon/start.S
> new file mode 100644
> index 0000000000..acb967201a
> --- /dev/null
> +++ b/arch/mips/mach-octeon/start.S
> @@ -0,0 +1,1241 @@
> +/* SPDX-License-Identifier: GPL-2.0+ */
> +/*
> + *  Startup Code for OCTEON 64-bit CPU-core
> + *
> + *  Copyright (c) 2003	Wolfgang Denk <wd at denx.de>
> + *  Copyright 2004, 2005, 2010 - 2015 Cavium Inc..
> + */
> +
> +#include <asm-offsets.h>
> +#include <config.h>
> +#include <asm/regdef.h>
> +#include <asm/mipsregs.h>
> +#include <asm/asm.h>
> +
> +#define BOOT_VECTOR_NUM_WORDS		8
> +
> +#define OCTEON_BOOT_MOVEABLE_MAGIC_OFFSET	0x70
> +#define OCTEON_BOOT_VECTOR_MOVEABLE_OFFSET	0x78
> +
> +#define OCTEON_BOOT_MOVEABLE_MAGIC1_RAW	0xdb00110ad358eacd
> +#define OCTEON_BOOT_MOVEABLE_MAGIC1	OCTEON_BOOT_MOVEABLE_MAGIC1_RAW
> +
> +#define OCTEON_CIU_SOFT_RST		0x8001070000000740
> +
> +#define	OCTEON_L2C_WPAR_PP0		0x8001180080840000
> +#define OCTEON_MIO_BOOT_BASE		0x8001180000000000
> +#define OCTEON_MIO_BOOT_REG_CFG0_OFF	0x0000
> +#define OCTEON_MIO_BOOT_LOC_CFG0_OFF	0x0080
> +#define OCTEON_MIO_BOOT_LOC_ADR_OFF	0x0090
> +#define OCTEON_MIO_BOOT_LOC_DAT_OFF	0x0098
> +#define	OCTEON_MIO_RST_BOOT		0x8001180000001600
> +#define OCTEON_MIO_BOOT_REG_CFG0	0x8001180000000000
> +#define	OCTEON_MIO_BOOT_REG_TIM0	0x8001180000000040
> +#define OCTEON_MIO_BOOT_LOC_CFG0	0x8001180000000080
> +#define OCTEON_MIO_BOOT_LOC_ADR		0x8001180000000090
> +#define OCTEON_MIO_BOOT_LOC_DAT		0x8001180000000098
> +#define	OCTEON_MIO_FUSE_DAT3		0x8001180000001418
> +#define OCTEON_L2D_FUS3			0x80011800800007B8
> +#define	OCTEON_LMC0_DDR_PLL_CTL		0x8001180088000258
> +
> +#define OCTEON_RST			0x8001180006000000
> +#define OCTEON_RST_BOOT_OFFSET		0x1600
> +#define OCTEON_RST_SOFT_RST_OFFSET	0x1680
> +#define OCTEON_RST_COLD_DATAX_OFFSET(X)	(0x17C0 + (X) * 8)
> +#define OCTEON_RST_BOOT			0x8001180006001600
> +#define OCTEON_RST_SOFT_RST		0x8001180006001680
> +#define OCTEON_RST_COLD_DATAX(X)	(0x80011800060017C0 + (X) * 8)
> +
> +#define OCTEON_OCX_COM_NODE		0x8001180011000000
> +#define OCTEON_L2C_OCI_CTL		0x8001180080800020
> +#define OCTEON_L2C_TAD_CTL		0x8001180080800018
> +#define OCTEON_L2C_CTL			0x8001180080800000
> +
> +#define OCTEON_DBG_DATA			0x80011F00000001E8
> +#define OCTEON_PCI_READ_CMD_E		0x80011F0000001188
> +#define OCTEON_NPEI_DBG_DATA		0x80011F0000008510
> +#define OCTEON_CIU_WDOG(X)		(0x8001070000000500 + (X) * 8)
> +#define OCTEON_CIU_PP_POKE(X)		(0x8001070000000580 + (X) * 8)
> +#define OCTEON_CIU3_WDOG(X)		(0x8001010000020000 + (X) * 8)
> +#define OCTEON_CIU3_PP_POKE(X)		(0x8001010000030000 + (X) * 8)
> +#define OCTEON_OCX_COM_LINKX_CTL(X)	(0x8001180011000020 + (X) * 8)
> +#define OCTEON_SLI_CTL_STATUS		0x80011F0000028570
> +#define OCTEON_GSERX_SCRATCH(X)		(0x8001180090000020 + (X) * 0x1000000)
> +
> +/** PRID for CN56XX */
> +#define OCTEON_PRID_CN56XX		0x04
> +/** PRID for CN52XX */
> +#define OCTEON_PRID_CN52XX		0x07
> +/** PRID for CN63XX */
> +#define OCTEON_PRID_CN63XX		0x90
> +/** PRID for CN68XX */
> +#define OCTEON_PRID_CN68XX		0x91
> +/** PRID for CN66XX */
> +#define OCTEON_PRID_CN66XX		0x92
> +/** PRID for CN61XX */
> +#define OCTEON_PRID_CN61XX		0x93
> +/** PRID for CNF71XX */
> +#define OCTEON_PRID_CNF71XX		0x94
> +/** PRID for CN78XX */
> +#define OCTEON_PRID_CN78XX		0x95
> +/** PRID for CN70XX */
> +#define OCTEON_PRID_CN70XX		0x96
> +/** PRID for CN73XX */
> +#define OCTEON_PRID_CN73XX		0x97
> +/** PRID for CNF75XX */
> +#define OCTEON_PRID_CNF75XX		0x98
> +
> +/*
> + * GETOFFSET(reg, func): set reg to (run-time address - assembled address)
> + * of the code currently executing.  The bal leaves the run-time address
> + * of the ".dword ." slot in ra, and that slot holds the location counter
> + * value at that point; reg receives the difference.  Clobbers ra.
> + * The func argument is used to create a <func>_mark label and must be
> + * unique.
> + */
> +#define GETOFFSET(reg, func)	\
> +	.balign	8;		\
> +	bal	func ##_mark;	\
> +	nop;			\
> +	.dword	.;		\
> +func ##_mark:			\
> +	ld	reg, 0(ra);	\
> +	dsubu	reg, ra, reg;
> +
> +/*
> + * JAL(func): call func at the location it is currently executing from.
> + * The difference between the run-time and assembled address of the
> + * ".dword ." slot is computed into t8, added to the address of func and
> + * the result is called through t9.  Clobbers t8, t9 and ra.
> + * A <func>_mark label is created, so func must not also be used with
> + * another macro that creates the same label.
> + */
> +#define JAL(func)		\
> +	.balign	8;		\
> +	bal	func ##_mark;	\
> +	 nop;			\
> +	.dword .;		\
> +func ##_mark:			\
> +	ld	t8, 0(ra);	\
> +	dsubu	t8, ra, t8;	\
> +	dla	t9, func;	\
> +	daddu	t9, t9, t8;	\
> +	jalr	t9;		\
> +	 nop;
> +
> +	.set	arch=octeon3
> +	.set	noreorder
> +
> +	/*
> +	 * Report an exception through a UHI operation: t9 and a0 are copied
> +	 * to k0 and k1, then loaded with the operation code and the context
> +	 * argument before sdbbp is issued.  Clobbers k0, k1, t9 and a0.
> +	 */
> +	.macro uhi_mips_exception
> +	move	k0, t9		# preserve t9 in k0
> +	move	k1, a0		# preserve a0 in k1
> +	li	t9, 15		# UHI exception operation
> +	li	a0, 0		# Use hard register context
> +	sdbbp	1		# Invoke UHI operation
> +	.endm
> +
> +	/*
> +	 * Set up the initial stack and the global data (gd) area directly
> +	 * below big_stack_start.
> +	 *
> +	 * Out:      k0 = gd pointer, sp = fp = initial stack pointer
> +	 * Clobbers: t0, t1 and, when early malloc space is reserved, t2
> +	 */
> +	.macro setup_stack_gd
> +	li	t0, -16
> +	/*
> +	 * big_stack_start is a symbol, not a constant, so its address is
> +	 * loaded with PTR_LA like every other user of it in this file.
> +	 */
> +	PTR_LA	t1, big_stack_start
> +	and	sp, t1, t0		# force 16 byte alignment
> +	PTR_SUBU \
> +		sp, sp, GD_SIZE		# reserve space for gd
> +	and	sp, sp, t0		# force 16 byte alignment
> +	move	k0, sp			# save gd pointer
> +#if CONFIG_VAL(SYS_MALLOC_F_LEN) && \
> +    !CONFIG_IS_ENABLED(INIT_STACK_WITHOUT_MALLOC_F)
> +	li	t2, CONFIG_VAL(SYS_MALLOC_F_LEN)
> +	PTR_SUBU \
> +		sp, sp, t2		# reserve space for early malloc
> +	and	sp, sp, t0		# force 16 byte alignment
> +#endif
> +	move	fp, sp
> +
> +	/* Clear gd: zero every pointer-sized word from k0 up to t1 */
> +	move	t0, k0
> +1:
> +	PTR_S	zero, 0(t0)
> +	PTR_ADDIU t0, PTRSIZE
> +	blt	t0, t1, 1b
> +	 nop
> +
> +#if CONFIG_VAL(SYS_MALLOC_F_LEN) && \
> +    !CONFIG_IS_ENABLED(INIT_STACK_WITHOUT_MALLOC_F)
> +	PTR_S	sp, GD_MALLOC_BASE(k0)	# gd->malloc_base offset
> +#endif
> +	.endm
> +
> +/* Saved register usage:
> + * s0:	not used
> + * s1:	not used
> + * s2:	Address U-Boot loaded into in L2 cache
> + * s3:	Start address
> + * s4:	flags
> + *		1:	booting from RAM
> + *		2:	executing out of cache
> + *		4:	booting from flash
> + * s5:	u-boot size (data end - _start)
> + * s6:	offset in flash.
> + * s7:	_start physical address
> + * s8:
> + */
> +
> +ENTRY(_start)
> +	/* U-Boot entry point */
> +	b	reset
> +
> +	/* The above jump instruction/nop are considered part of the
> +	 * bootloader_header_t structure but are not changed when the header is
> +	 * updated.
> +	 */
> +
> +	/* Leave room for bootloader_header_t header at start of binary.  This
> +	 * header is used to identify the board the bootloader is for, what
> +	 * address it is linked at, failsafe/normal, etc.  It also contains a
> +	 * CRC of the entire image.
> +	 */
> +
> +#if defined(CONFIG_ROM_EXCEPTION_VECTORS)
> +	/*
> +	 * Exception vector entry points. When running from ROM, an exception
> +	 * cannot be handled. Halt execution and transfer control to debugger,
> +	 * if one is attached.
> +	 */
> +	.org 0x200
> +	/* TLB refill, 32 bit task */
> +	uhi_mips_exception
> +
> +	.org 0x280
> +	/* XTLB refill, 64 bit task */
> +	uhi_mips_exception
> +
> +	.org 0x300
> +	/* Cache error exception */
> +	uhi_mips_exception
> +
> +	.org 0x380
> +	/* General exception */
> +	uhi_mips_exception
> +
> +	.org 0x400
> +	/* Catch interrupt exceptions */
> +	uhi_mips_exception
> +
> +	.org 0x480
> +	/* EJTAG debug exception */
> +1:	b	1b
> +	 nop
> +
> +	.org 0x500
> +#endif
> +
> +/* Reserve extra space so that when we use the boot bus local memory
> + * segment to remap the debug exception vector we don't overwrite
> + * anything useful
> + */
> +
> +/* Basic exception handler (dump registers) written entirely in ASM.  When
> + * using the TLB for mapping u-boot C code, we can't branch to that C code for
> + * exception handling (the TLB is disabled for some exceptions).
> + */
> +
> +/* RESET/start here */
> +	.balign	8
> +reset:
> +	nop
> +	synci	0(zero)
> +	mfc0	k0, CP0_STATUS
> +	ori	k0, 0x00E0		/* enable 64 bit mode for CSR access */
> +	mtc0	k0, CP0_STATUS
> +
> +	/* Save the address we're booting from, strip off low bits */
> +	bal	1f
> +	 nop
> +1:
> +	/* s3 = run-time address of the label above with bits 0..11 cleared */
> +	move	s3, ra
> +	dins	s3, zero, 0, 12
> +
> +	/* Disable boot bus moveable regions */
> +	PTR_LI	k0, OCTEON_MIO_BOOT_LOC_CFG0
> +	sd	zero, 0(k0)
> +	sd	zero, 8(k0)
> +
> +	/* Disable the watchdog timer
> +	 * First we check if we're running on CN78XX, CN73XX or CNF75XX to see
> +	 * if we use CIU3 or CIU.
> +	 */
> +	mfc0	t0, CP0_PRID
> +	ext	t0, t0, 8, 8		/* t0 = PRID bits 8..15 */
> +	/* Assume CIU */
> +	PTR_LI	t1, OCTEON_CIU_WDOG(0)
> +	PTR_LI	t2, OCTEON_CIU_PP_POKE(0)
> +	blt	t0, OCTEON_PRID_CN78XX, wd_use_ciu
> +	 nop
> +	/*
> +	 * OCTEON_PRID_CN70XX is numerically above OCTEON_PRID_CN78XX, so it
> +	 * is not caught by the blt above and is sent to the CIU path here.
> +	 */
> +	beq	t0, OCTEON_PRID_CN70XX, wd_use_ciu
> +	 nop
> +	/* Use CIU3 */
> +	PTR_LI	t1, OCTEON_CIU3_WDOG(0)
> +	PTR_LI	t2, OCTEON_CIU3_PP_POKE(0)
> +wd_use_ciu:
> +	/* t1 = watchdog register, t2 = poke register for the selected CIU */
> +	sd	zero, 0(t2)		/* Pet the dog */
> +	sd	zero, 0(t1)		/* Disable watchdog timer */
> +
> +	/* Errata: CN76XX has a node ID of 3. change it to zero here.
> +	 * This needs to be done before we relocate to L2 as addresses change
> +	 * For 76XX pass 1.X we need to zero out the OCX_COM_NODE[ID],
> +	 * L2C_OCI_CTL[GKSEGNODE] and CP0 of Root.CvmMemCtl2[KSEGNODE].
> +	 */
> +	mfc0	a4, CP0_PRID
> +	/* Check for 78xx pass 1.x processor ID */
> +	andi	a4, 0xffff
> +	blt	a4, (OCTEON_PRID_CN78XX << 8), 1f
> +	 nop
> +
> +	/* Zero out alternate package for now */
> +	dins	a4, zero, 6, 1
> +	bge	a4, ((OCTEON_PRID_CN78XX << 8) | 0x08), 1f
> +	 nop
> +
> +	/* 78xx or 76xx here, first check for bug #27141 */
> +	PTR_LI	a5, OCTEON_SLI_CTL_STATUS
> +	ld	a6, 0(a5)
> +	andi	a7, a4, 0xff
> +	andi	a6, a6, 0xff
> +
> +	beq	a6, a7, not_bug27141
> +	 nop
> +
> +	/* core 0 proc_id rev_id field does not match SLI_CTL_STATUS rev_id */
> +	/* We just hit bug #27141.  Need to reset the chip and try again */
> +
> +	PTR_LI	a4, OCTEON_RST_SOFT_RST
> +	ori	a5, zero, 0x1	/* set the reset bit */
> +
> +reset_78xx_27141:
> +	sync
> +	synci	0(zero)
> +	cache	9, 0(zero)
> +	sd	a5, 0(a4)
> +	wait
> +	b	reset_78xx_27141
> +	 nop
> +
> +not_bug27141:
> +	/* 76XX pass 1.x has the node number set to 3 */
> +	mfc0	a4, CP0_EBASE
> +	ext	a4, a4, 0, 10
> +	bne	a4, 0x180, 1f	/* Branch if not node 3 core 0 */
> +	 nop
> +
> +	/* Clear OCX_COM_NODE[ID] */
> +	PTR_LI	a5, OCTEON_OCX_COM_NODE
> +	ld	a4, 0(a5)
> +	dins	a4, zero, 0, 2
> +	sd	a4, 0(a5)
> +	ld	zero, 0(a5)	/* read back after the write */
> +
> +	/* Clear L2C_OCI_CTL[GKSEGNODE] */
> +	PTR_LI	a5, OCTEON_L2C_OCI_CTL
> +	ld	a4, 0(a5)
> +	dins	a4, zero, 4, 2
> +	sd	a4, 0(a5)
> +	ld	zero, 0(a5)	/* read back after the write */
> +
> +	/* Clear CP0 Root.CvmMemCtl2[KSEGNODE] */
> +	/* NOTE(review): the is_node0 code later in this file clears 4 bits
> +	 * at the same position (dins ..., 12, 4) - confirm the field width.
> +	 */
> +	dmfc0	a4, CP0_CVMMEMCTL2
> +	dins	a4, zero, 12, 2
> +	dmtc0	a4, CP0_CVMMEMCTL2
> +
> +	/*
> +	 * Target of the "1f" branches above (chip not affected by the
> +	 * errata).  Without a label here they would bind to the next "1:"
> +	 * much further down in this file and skip the initialization below.
> +	 */
> +1:
> +	/* Put the flash address in the start of the EBASE register to
> +	 * enable our exception handler but only for core 0.
> +	 */
> +	mfc0	a4, CP0_EBASE
> +	dext	a4, a4, 0, 10
> +	bnez	a4, no_flash
> +	/* OK in delay slot */
> +	/* NOTE(review): a6 is only ever loaded with the SLI_CTL_STATUS
> +	 * revision byte above (and not at all when the errata check is
> +	 * skipped), so it does not look like a flash address here; the
> +	 * boot address saved in s3 may have been intended - confirm.
> +	 */
> +	dext	a6, a6, 0, 16		/* Get the base address in flash */
> +	sll	a6, a6, 16
> +	mtc0	a6, CP0_EBASE	/* Enable exceptions */
> +
> +no_flash:
> +	/* Zero out various registers */
> +	mtc0	zero, CP0_DEPC
> +	mtc0	zero, CP0_EPC
> +	mtc0	zero, CP0_CAUSE
> +	mfc0	a4, CP0_PRID
> +	ext	a4, a4, 8, 8
> +	mtc0	zero, CP0_DESAVE
> +
> +	/* The following are only available on Octeon 2 or later */
> +	mtc0	zero, CP0_KSCRATCH1
> +	mtc0	zero, CP0_KSCRATCH2
> +	mtc0	zero, CP0_KSCRATCH3
> +	mtc0	zero, CP0_USERLOCAL
> +
> +	/* Turn off ROMEN bit to disable ROM */
> +	PTR_LI	a1, OCTEON_MIO_RST_BOOT
> +	/* For OCTEON 3 we use RST_BOOT instead of MIO_RST_BOOT.
> +	 * The difference is bits 24-26 are 6 instead of 0 for the address.
> +	 */
> +	/* For Octeon 2 and CN70XX we can ignore the watchdog */
> +	blt	a4, OCTEON_PRID_CN78XX, watchdog_ok
> +	 nop
> +
> +	PTR_LI	a1, OCTEON_RST_BOOT
> +
> +	beq	a4, OCTEON_PRID_CN70XX, watchdog_ok
> +	 nop
> +
> +	ld	a2, 0(a1)
> +	/* There is a bug where some registers don't get properly reset when
> +	 * the watchdog timer causes a reset.  In this case we need to force
> +	 * a reset.
> +	 */
> +	bbit0	a2, 11, watchdog_ok	/* Skip if watchdog not hit */
> +	 dins	a2, zero, 2, 18	/* Don't clear LBOOT, LBOOT_EXT or LBOOT_OCI */
> +	/* Clear bit indicating reset due to watchdog */
> +	ori	a2, 1 << 11
> +	sd	a2, 0(a1)
> +
> +	/* Disable watchdog */
> +	PTR_LI	a1, OCTEON_CIU3_PP_POKE(0)
> +	sd	zero, 0(a1)
> +	PTR_LI	a1, OCTEON_CIU3_WDOG(0)
> +	sd	zero, 0(a1)
> +
> +	/* Record this in the GSER0_SCRATCH register in bit 11 */
> +	PTR_LI	a1, OCTEON_GSERX_SCRATCH(0)
> +	ld	a2, 0(a1)
> +	ori	a2, 1 << 11
> +	sd	a2, 0(a1)
> +
> +	PTR_LI	a1, OCTEON_RST_SOFT_RST
> +	li	a2, 1
> +	sd	a2, 0(a1)
> +	wait
> +
> +	/* We should never get here */
> +
> +watchdog_ok:
> +	ld	a2, 0(a1)
> +	/* Don't clear LBOOT/LBOOT_EXT or LBOOT_OCI */
> +	dins	a2, zero, 2, 18
> +	dins	a2, zero, 60, 1	/* Clear ROMEN bit */
> +	sd	a2, 0(a1)
> +
> +	/* Start of Octeon setup */
> +
> +	/* Check what core we are - if core 0, branch to init tlb
> +	 * loop in flash.  Otherwise, look up address of init tlb
> +	 * loop that was saved in the boot vector block.
> +	 */
> +	mfc0	a0, CP0_EBASE
> +	andi	a0, EBASE_CPUNUM		/* get core */
> +	beqz	a0, InitTLBStart_local
> +	 nop
> +
> +	break
> +	/* We should never get here - non-zero cores now go directly to
> +	 * tlb init from the boot stub in movable region.
> +	 */
> +
> +	.globl InitTLBStart
> +InitTLBStart:
> +InitTLBStart_local:
> +	/* If we don't have working memory yet configure a bunch of
> +	 * scratch memory, and set the stack pointer to the top
> +	 * of it.  This allows us to go to C code without having
> +	 * memory set up
> +	 *
> +	 * Warning: do not change SCRATCH_STACK_LINES as this can impact the
> +	 * transition from start.S to crti.asm. crti requires 590 bytes of
> +	 * stack space.
> +	 */
> +	cache	1,0(zero)	/* Clear Dcache so cvmseg works right */
> +#if CONFIG_OCTEON_BIG_STACK_SIZE
> +	rdhwr	v0, $0
> +	bnez	v0, 1f
> +	 nop
> +	PTR_LA	sp, big_stack_start - 16
> +	b	stack_clear_done
> +	 nop
> +1:
> +#endif
> +#define SCRATCH_STACK_LINES 0x36   /* MAX is 0x36 */
> +	dmfc0	v0, CP0_CVMMEMCTL
> +	dins	v0, zero, 0, 9
> +	/* setup SCRATCH_STACK_LINES scratch lines of scratch */
> +	ori	v0, 0x100 | SCRATCH_STACK_LINES
> +	dmtc0	v0, CP0_CVMMEMCTL
> +	/* set stack to top of scratch memory */
> +	li	sp, 0xffffffffffff8000 + (SCRATCH_STACK_LINES * 128)
> +	/* Clear scratch for CN63XX pass 2.0 errata Core-15169*/
> +	li	t0, 0xffffffffffff8000
> +clear_scratch:
> +	sd	zero, 0(t0)
> +	addiu	t0, 8
> +	bne	t0, sp, clear_scratch
> +	 nop
> +
> +	/* This code run on all cores - core 0 from flash,
> +	 * the rest from DRAM.	When booting from PCI, non-zero cores
> +	 * come directly here from the boot vector - no earlier code in this
> +	 * file is executed.
> +	 */
> +
> +	/* Some generic initialization is done here as well, as we need this
> +	 * done on all cores even when booting from PCI
> +	 */
> +stack_clear_done:
> +	/* Clear watch registers. */
> +	mtc0	zero, CP0_WATCHLO
> +	mtc0	zero, CP0_WATCHHI
> +
> +	/* STATUS register */
> +	mfc0	k0, CP0_STATUS
> +	li	k1, ~ST0_IE
> +	and	k0, k1
> +	mtc0	k0, CP0_STATUS
> +
> +	/* CAUSE register */
> +	mtc0	zero, CP0_CAUSE
> +
> +	/* Init Timer */
> +	dmtc0	zero, CP0_COUNT
> +	dmtc0	zero, CP0_COMPARE
> +
> +
> +	mfc0	a5, CP0_STATUS
> +	li	v0, 0xE0		/* enable 64 bit mode for CSR access */
> +	or	v0, v0, a5
> +	mtc0	v0, CP0_STATUS
> +
> +
> +	dli	v0, 1 << 29  /* Enable large physical address support in TLB */
> +	mtc0	v0, CP0_PAGEGRAIN
> +
> +InitTLB:
> +	dmtc0	zero, CP0_ENTRYLO0
> +	dmtc0	zero, CP0_ENTRYLO1
> +	mtc0	zero, CP0_PAGEMASK
> +	dmtc0	zero, CP0_CONTEXT
> +	/* Use an offset into kseg0 so we won't conflict with Mips1 legacy
> +	 * TLB clearing
> +	 */
> +	PTR_LI	v0, 0xFFFFFFFF90000000
> +	mfc0	a0, CP0_CONFIG1
> +	srl	a0, a0, 25
> +	/* Check if config4 reg present */
> +	mfc0	a1, CP0_CONFIG3
> +	bbit0	a1, 31, 2f
> +	 and	a0, a0, 0x3F		/* a0 now has the max mmu entry index */
> +	mfc0	a1, CP0_CONFIG4
> +	bbit0	a1, 14, 2f		/* check config4[MMUExtDef] */
> +	 nop
> +	/* append config4[MMUSizeExt] to most significant bit of
> +	 * config1[MMUSize-1]
> +	 */
> +	ins	a0, a1, 6, 8
> +	and	a0, a0, 0x3fff	/* a0 now includes max entries for cn6xxx */
> +2:
> +	dmtc0	zero, CP0_XCONTEXT
> +	mtc0	zero, CP0_WIRED
> +
> +InitTLBloop:
> +	dmtc0	v0, CP0_ENTRYHI
> +	tlbp
> +	mfc0	v1, CP0_INDEX
> +	daddiu	v0, v0, 1<<13
> +	bgez	v1, InitTLBloop
> +
> +	mtc0	a0, CP0_INDEX
> +	tlbwi
> +	bnez	a0, InitTLBloop
> +	 daddiu	a0, -1
> +
> +	mthi	zero
> +	mtlo	zero
> +
> +	/* Set up status register */
> +	mfc0	v0, CP0_STATUS
> +	/* Enable COP0 and COP2 access */
> +	li	a4, (1 << 28) | (1 << 30)
> +	or	v0, a4
> +
> +	/* Must leave BEV set here, as DRAM is not configured for core 0.
> +	 * Also, BEV must be 1 later on when the exception base address is set.
> +	 */
> +
> +	/* Mask all interrupts */
> +	ins	v0, zero, 0, 16
> +	/* Clear NMI (used to start cores other than core 0) */
> +	ori	v0, 0xE4		/* enable 64 bit, disable interrupts */
> +	mtc0	v0, CP0_STATUS
> +
> +	dli	v0,0xE000000F		/* enable all readhw locations */
> +	mtc0	v0, CP0_HWRENA
> +
> +	dmfc0	v0, CP0_CVMCTL
> +	ori	v0, 1<<14	/* enable fixup of unaligned mem access */
> +	dmtc0	v0, CP0_CVMCTL
> +
> +	/* Setup scratch memory.  This is also done in
> +	 * cvmx_user_app_init, and this code will be removed
> +	 * from the bootloader in the near future.
> +	 */
> +
> +	/* Set L2C_TAD_CTL[MAXLFB] = 0 on CN73XX and later PRIDs */
> +	mfc0	a4, CP0_PRID
> +	ext	a4, a4, 8, 8
> +	blt	a4, OCTEON_PRID_CN73XX, 72f
> +	nop
> +	PTR_LI	v0, OCTEON_L2C_TAD_CTL
> +	ld	t1, 0(v0)
> +	dins	t1, zero, 0, 4
> +	sd	t1, 0(v0)
> +	ld	zero, 0(v0)
> +
> +72:
> +
> +	/* clear these to avoid immediate interrupt in noperf mode */
> +	dmtc0	zero, CP0_COMPARE	/* clear timer interrupt */
> +	dmtc0	zero, CP0_COUNT		/* clear timer interrupt */
> +	dmtc0	zero, CP0_PERF_CNT0	/* clear perfCnt0 */
> +	dmtc0	zero, CP0_PERF_CNT1	/* clear perfCnt1 */
> +	dmtc0	zero, CP0_PERF_CNT2
> +	dmtc0	zero, CP0_PERF_CNT3
> +
> +	/* If we're running on a node other than 0 then we need to set KSEGNODE
> +	 * to 0.  The nice thing with this code is that it also autodetects if
> +	 * we're running on a processor that supports CVMMEMCTL2 or not since
> +	 * only processors that have this will have a non-zero node ID.  Because
> +	 * of this there's no need to check if we're running on a 78XX.
> +	 */
> +	mfc0    t1, CP0_EBASE
> +	dext    t1, t1, 7, 3            /* Extract node number */
> +	beqz    t1, is_node0            /* If non-zero then we're not node 0 */
> +	 nop
> +	dmfc0   t1, CP0_CVMMEMCTL2
> +	dins    t1, zero, 12, 4
> +	dmtc0   t1, CP0_CVMMEMCTL2
> +is_node0:
> +
> +	/* Set up TLB mappings for u-boot code in flash. */
> +
> +	/* Use a bal to get the current PC into ra.  Since this bal is to
> +	 * the address immediately following the delay slot, the ra is
> +	 * the address of the label.  We then use this to get the actual
> +	 * address that we are executing from.
> +	 */
> +	bal	__dummy
> +	 nop
> +
> +__dummy:
> +	/* Get the actual address that we are running at */
> +	PTR_LA	a6, _start		/* Linked address of _start */
> +	PTR_LA	a7, __dummy
> +	dsubu	t0, a7, a6		/* offset of __dummy label from _start*/
> +	dsubu	a7, ra, t0		/* a7 now has actual address of _start*/
> +
> +	/* Save actual _start address in s7.  This is where we
> +	 * are executing from, as opposed to where the code is
> +	 * linked.
> +	 */
> +	move	s7, a7
> +	move	s4, zero
> +
> +	/* s7 has actual address of _start.  If this is
> +	 * on the boot bus, it will be between 0xBFC00000 and 0xBFFFFFFF.
> +	 * If it is on the boot bus, use 0xBFC00000 as the physical address
> +	 * for the TLB mapping, as we will be adjusting the boot bus
> +	 * to make this adjustment.
> +	 * If we are running from DRAM (remote-boot), then we want to use the
> +	 * real address in DRAM.
> +	 */
> +
> +	/* Check to see if we are running from flash - we expect that to
> +	 * be 0xffffffffb0000000-0xffffffffbfffffff
> +	 * (0x10000000-0x1fffffff, unmapped/uncached)
> +	 */
> +	dli	t2, 0xffffffffb0000000
> +	dsubu	t2, s7
> +	slt	s4, s7, t2
> +	bltz	t2, uboot_in_flash
> +	 nop
> +
> +	/* If we're not core 0 then we don't care about cache */
> +	mfc0	t2, CP0_EBASE
> +	andi	t2, EBASE_CPUNUM
> +	bnez	t2, uboot_in_ram
> +	 nop
> +
> +	/* Find out if we're OCTEON I or OCTEON + which don't support running
> +	 * out of cache.
> +	 */
> +	mfc0	t2, CP0_PRID
> +	ext	t2, t2, 8, 8
> +	li	s4, 1
> +	blt	t2, 0x90, uboot_in_ram
> +	 nop
> +
> +	/* U-Boot can be executing either in RAM or L2 cache.  Now we need to
> +	 * check if DRAM is initialized.  The way we do that is to look at
> +	 * the reset bit of the LMC0_DDR_PLL_CTL register (bit 7)
> +	 */
> +	PTR_LI	t2, OCTEON_LMC0_DDR_PLL_CTL
> +	ld	t2, 0(t2)
> +	bbit1	t2, 7, uboot_in_ram
> +	 nop
> +
> +	/* We must be executing out of cache */
> +	b	uboot_in_ram
> +	 li	s4, 2
> +
> +uboot_in_flash:
> +	/* Set s4 to 4 to indicate we're running in FLASH */
> +	li	s4, 4
> +
> +#if defined(CONFIG_OCTEON_DISABLE_L2_CACHE_INDEX_ALIASING)
> +	/* By default, L2C index aliasing is enabled.  In some cases it may
> +	 * need to be disabled.  The L2C index aliasing can only be disabled
> +	 * if U-Boot is running out of L2 cache and the L2 cache has not been
> +	 * used to store anything.
> +	 */
> +	PTR_LI	t1, OCTEON_L2C_CTL
> +	ld	t2, 0(t1)
> +	ori	t2, 1
> +	sd	t2, 0(t1)
> +#endif
> +
> +	/* Use BFC00000 as physical address for TLB mappings when booting
> +	 * from flash, as we will adjust the boot bus mappings to make this
> +	 * mapping correct.
> +	 */
> +	dli	a7, 0xFFFFFFFFBFC00000
> +	dsubu	s6, s7, a7  /* Save flash offset in s6 */
> +
> +#if defined(CONFIG_OCTEON_COPY_FROM_FLASH_TO_L2)
> +	/* For OCTEON II we check to see if the L2 cache is big enough to hold
> +	 * U-Boot.  If it is big enough then we copy ourself from flash to the
> +	 * L2 cache in order to speed up execution.
> +	 */
> +
> +	/* Check for OCTEON 2 */
> +	mfc0	t1, CP0_PRID
> +	ext	t1, t1, 8, 8
> +	/* Get number of L2 cache sets */
> +	beq	t1, OCTEON_PRID_CNF71XX, got_l2_sets	/* CNF71XX */
> +	 li	t2, 1 << 9
> +	beq	t1, OCTEON_PRID_CN78XX, got_l2_sets	/* CN78XX */
> +	 li	t2, 1 << 13
> +	beq	t1, OCTEON_PRID_CN70XX, got_l2_sets	/* CN70XX */
> +	 li	t2, 1 << 10
> +	beq	t1, OCTEON_PRID_CN73XX, got_l2_sets	/* CN73XX */
> +	 li	t2, 1 << 11
> +	beq	t1, OCTEON_PRID_CNF75XX, got_l2_sets	/* CNF75XX */
> +	 li	t2, 1 << 11
> +	b	l2_cache_too_small	/* Unknown OCTEON model */
> +	 nop
> +
> +got_l2_sets:
> +	/* Get number of associations */
> +	PTR_LI	t0, OCTEON_MIO_FUSE_DAT3
> +	ld	t0, 0(t0)
> +	dext	t0, t0, 32, 3
> +
> +	beq	t1, OCTEON_PRID_CN70XX, process_70xx_l2sets
> +	 nop
> +	/* 0 = 16-way, 1 = 12-way, 2 = 8-way, 3 = 4-way, 4-7 reserved */
> +	beqz	t0, got_l2_ways
> +	 li	t3, 16
> +	beq	t0, 1, got_l2_ways
> +	 li	t3, 12
> +	beq	t0, 2, got_l2_ways
> +	 li	t3, 8
> +	beq	t0, 3, got_l2_ways
> +	 li	t3, 4
> +	b	l2_cache_too_small
> +	 nop
> +
> +process_70xx_l2sets:
> +	/* For 70XX, the number of ways is defined as:
> +	 * 0 - full cache (4-way) 512K
> +	 * 1 - 3/4 ways (3-way) 384K
> +	 * 2 - 1/2 ways (2-way) 256K
> +	 * 3 - 1/4 ways (1-way) 128K
> +	 * 4-7 illegal (aliased to 0-3)
> +	 */
> +	andi	t0, 3
> +	beqz	t0, got_l2_ways
> +	 li	t3, 4
> +	beq	t0, 1, got_l2_ways
> +	 li	t3, 3
> +	beq	t0, 2, got_l2_ways
> +	 li	t3, 2
> +	li	t3, 1
> +
> +got_l2_ways:
> +	dmul	a1, t2, t3		/* Calculate cache size */
> +	dsll	a1, 7			/* Ways * Sets * cache line sz (128) */
> +	daddiu	a1, a1, -128		/* Adjust cache size for copy code */
> +
> +	/* Calculate size of U-Boot image */
> +	/*
> +	 * "uboot_end - _start" is not correct, as the image also
> +	 * includes the DTB appended to the end (OF_EMBED is deprecated).
> +	 * Lets use a defined max for now here.
> +	 */
> +	PTR_LI	s5, CONFIG_BOARD_SIZE_LIMIT
> +
> +	daddu	t2, s5, s7	/* t2 = end address */
> +	daddiu	t2, t2, 127
> +	ins	t2, zero, 0, 7	/* Round up to cache line for memcpy */
> +
> +	slt	t1, a1, s5	/* See if we're bigger than the L2 cache */
> +	bnez	t1, l2_cache_too_small
> +	 nop
> +	/* Address we plan to load at in the L2 cache */
> +	PTR_LI	t9, CONFIG_OCTEON_L2_UBOOT_ADDR
> +# ifdef CONFIG_OCTEON_L2_MEMCPY_IN_CACHE
> +	/* Enable all ways for PP0.  Authentik ROM may have disabled these */
> +	PTR_LI	a1, OCTEON_L2C_WPAR_PP0
> +	sd	zero, 0(a1)
> +
> +	/* Address to place our memcpy code */
> +	PTR_LI	a0, CONFIG_OCTEON_L2_MEMCPY_ADDR
> +	/* The following code writes a simple memcpy routine into the cache
> +	 * to copy ourself from flash into the L2 cache.  This makes the
> +	 * memcpy routine a lot faster since each instruction can potentially
> +	 * require four read cycles to flash over the boot bus.
> +	 */
> +	/* Zero cache line in the L2 cache */
> +	zcb	(a0)
> +	synci	0(zero)
> +	dli	a1, 0xdd840000dd850008	/* ld a0, 0(t0);  ld a1, 8(t0) */
> +	sd	a1, 0(a0)
> +	dli	a1, 0xdd860010dd870018	/* ld a2, 16(t0); ld a3, 24(t0) */
> +	sd	a1, 8(a0)
> +	dli	a1, 0xfda40000fda50008	/* sd a0, 0(t1);  sd a1, 8(t1) */
> +	sd	a1, 16(a0)
> +	dli	a1, 0xfda60010fda70018	/* sd a2, 16(t1); sd a3, 24(t1) */
> +	sd	a1, 24(a0)
> +	dli	a1, 0x258c0020158efff6	/* addiu t0, 32; bne t0, t2, -40 */
> +	sd	a1, 32(a0)
> +	dli	a1, 0x25ad002003e00008	/* addiu t1, 32; jr ra */
> +	sd	a1, 40(a0)
> +	sd	zero, 48(a0)		/* nop; nop */
> +
> +	/* Synchronize the caches */
> +	sync
> +	synci	0(zero)
> +
> +	/* t0 = source (where we run from), t1 = destination in L2,
> +	 * t2 = end of source (set up before this block)
> +	 */
> +	move	t0, s7
> +	move	t1, t9
> +
> +	/* Do the memcpy operation in L2 cache to copy ourself from flash
> +	 * to the L2 cache.
> +	 */
> +	jalr	a0
> +	 nop
> +
> +# else
> +	/* Copy ourself to the L2 cache from flash, 32 bytes at a time,
> +	 * with the copy loop itself executing from flash.
> +	 */
> +	/* t0 = source (where we run from), t1 = destination in L2,
> +	 * t2 = end of source (set up before this block).  t0 and t1 still
> +	 * hold unrelated values from the L2 size calculation at this point,
> +	 * so they must be loaded here.
> +	 */
> +	move	t0, s7
> +	move	t1, t9
> +1:
> +	ld	a0, 0(t0)
> +	ld	a1, 8(t0)
> +	ld	a2, 16(t0)
> +	ld	a3, 24(t0)
> +	sd	a0, 0(t1)
> +	sd	a1, 8(t1)
> +	sd	a2, 16(t1)
> +	sd	a3, 24(t1)
> +	addiu	t0, 32
> +	bne	t0, t2, 1b
> +	 addiu	t1, 32		/* delay slot: advance destination */
> +# endif	/* CONFIG_OCTEON_L2_MEMCPY_IN_CACHE */
> +
> +	/* Adjust the start address of U-Boot and the global pointer */
> +	subu	t0, s7, t9	/* t0 = address difference */
> +	move	s7, t9		/* Update physical address */
> +	move	s2, t9
> +	sync
> +	synci	0(zero)
> +
> +	/* Now we branch to the L2 cache.  We first get our PC then adjust it
> +	 */
> +	bal	3f
> +	 nop
> +3:
> +	/* Don't add any instructions here! */
> +	subu	t9, ra, t0
> +	/* Give ourself 16 bytes */
> +	addiu	t9, 0x10
> +
> +	jal	t9		/* Branch to address in L2 cache */
> +
> +	 nop
> +	nop
> +	/* Add instructions after here */
> +
> +	move	a7, s7
> +
> +	b	uboot_in_ram
> +	 ori	s4, 2		/* Running out of L2 cache */
> +
> +l2_cache_too_small:	/* We go here if we can't copy ourself to L2 */
> +#endif /* CONFIG_OCTEON_COPY_FROM_FLASH_TO_L2 */
> +
> +	/* This code is only executed if booting from flash. */
> +	/*  For flash boot (_not_ RAM boot), we do a workaround for
> +	 * an LLM errata on CN38XX and CN58XX parts.
> +	 */
> +
> +uboot_in_ram:
> +	/* U-boot address is now in reg a7, and is 4 MByte aligned.
> +	 * (boot bus addressing has been adjusted to make this happen for flash,
> +	 * and for DRAM this alignment must be provided by the remote boot
> +	 * utility.
> +	 */
> +	/* See if we're in KSEG0 range, if so set EBASE register to handle
> +	 * exceptions.
> +	 */
> +	dli	a1, 0x20000000
> +	bge	a7, a1, 1f
> +	 nop
> +	/* Convert our physical address to KSEG0 */
> +	PTR_LI	a1, 0xffffffff80000000
> +	or	a1, a1, a7
> +	mtc0	a1, CP0_EBASE
> +1:
> +	/* U-boot now starts at 0xBFC00000.  Use a single 4 MByte TLB mapping
> +	 * to map u-boot.
> +	 */
> +	move	a0, a6		/* Virtual addr in a0 */
> +	dins	a0, zero, 0, 16	/* Zero out offset bits */
> +	move	a1, a7		/* Physical addr in a1 */
> +
> +	/* Now we need to remove the MIPS address space bits.  For this we
> +	 * need to determine if it is a 32 bit compatibility address or not.
> +	 */
> +
> +	/* 'lowest' address in compatibility space */
> +	PTR_LI	t0, 0xffffffff80000000
> +	dsubu	t0, t0, a1
> +	bltz	t0, compat_space
> +	 nop
> +
> +	/* We have a xkphys address, so strip off top bit */
> +	b	addr_fixup_done
> +	 dins	a1, zero, 63, 1
> +
> +compat_space:
> +	PTR_LI	a2, 0x1fffffff
> +	and	a1, a1, a2  /* Mask phy addr to remove address space bits */
> +
> +addr_fixup_done:
> +	/* Currently the u-boot image size is limited to 4 MBytes.  In order to
> +	 * support larger images the flash mapping will need to be changed to
> +	 * be able to access more than that before C code is run.  Until that
> +	 * is done, we just use a 4 MByte mapping for the secondary cores as
> +	 * well.
> +	 */
> +	/* page size (only support 4 Meg binary size for now for core 0)
> +	 * This limitation is due to the fact that the boot vector is
> +	 * 0xBFC00000 which only makes 4MB available.  Later more flash
> +	 * address space will be available after U-Boot has been copied to
> +	 * RAM.	 For now assume that it is in flash.
> +	 */
> +	li	a2, 2*1024*1024
> +
> +	mfc0	a4, CP0_EBASE
> +	andi	a4, EBASE_CPUNUM		/* get core */
> +	beqz	a4, core_0_tlb
> +	 nop
> +
> +	/* Now determine how big a mapping to use for secondary cores,
> +	 * which need to map all of u-boot + heap in DRAM
> +	 */
> +	/* Here we look at the alignment of the physical address,
> +	 * and use the largest page size possible.  In some cases
> +	 * this can result in an oversize mapping, but for secondary cores
> +	 * this mapping is very short lived.
> +	 */
> +
> +	/* Physical address in a1 */
> +	li	a2, 1
> +1:
> +	sll	a2, 1
> +	and	a5, a1, a2
> +	beqz	a5, 1b
> +	 nop
> +
> +	/* a2 now contains largest page size we can use */
> +core_0_tlb:
> +	JAL(single_tlb_setup)
> +
> +	/* Check if we're running from cache */
> +	bbit1	s4, 1, uboot_in_cache
> +	 nop
> +
> +	/* If we are already running from ram, we don't need to muck
> +	 * with boot bus mappings.
> +	 */
> +	PTR_LI	t2, 0xffffffffb0000000
> +	dsubu	t2, s7
> +	/* See if our starting address is lower than the boot bus */
> +	bgez	t2, uboot_in_ram2	/* If yes, booting from RAM */
> +	 nop
> +
> +uboot_in_cache:
> +#if CONFIG_OCTEON_BIG_STACK_SIZE
> +	/* The large stack is only for core 0.  For all other cores we need to
> +	 * use the L1 cache otherwise the other cores will stomp on top of each
> +	 * other unless even more space is reserved for the stack space for
> +	 * each core.  With potentially 96 cores this gets excessive.
> +	 */
> +	/* Read EBASE into a0, the register that is masked and tested below;
> +	 * a0 otherwise still holds a value left over from single_tlb_setup.
> +	 */
> +	mfc0	a0, CP0_EBASE
> +	andi	a0, EBASE_CPUNUM		/* get core */
> +	bnez	a0, no_big_stack
> +	 nop
> +	PTR_LA	sp, big_stack_start
> +	daddiu	sp, -16
> +
> +no_big_stack:
> +#endif
> +	/* We now have the TLB set up, so we need to remap the boot bus.
> +	 * This is tricky, as we are running from flash, and will be changing
> +	 * the addressing of the flash.
> +	 */
> +	/* Enable movable boot bus region 0, at address 0x10000000 */
> +	PTR_LI	a4, OCTEON_MIO_BOOT_BASE
> +	dli	a5, 0x81000000	/* EN (bit 31) + base address 0x10000000 */
> +	sd	a5, OCTEON_MIO_BOOT_LOC_CFG0_OFF(a4)
> +
> +	/* Copy the code that remaps the boot bus into the movable region */
> +	/* NOTE(review): this stores a zero dword through the data register
> +	 * ahead of the two code dwords below; presumably the intent was to
> +	 * reset the region's address register instead - confirm.
> +	 */
> +	sd	zero, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
> +
> +	/* a5 = run-time address of change_boot_mappings */
> +	PTR_LA	a6, change_boot_mappings
> +	GETOFFSET(a5, change_boot_mappings);
> +	daddu	a5, a5, a6
> +
> +	/* The code is 16 bytes (2 DWORDS) */
> +	ld	a7, 0(a5)
> +	sd	a7, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
> +	ld	a7, 8(a5)
> +	sd	a7, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
> +
> +	/* Read from an RML register to ensure that the previous writes have
> +	 * completed before we branch to the movable region.
> +	 */
> +	ld	zero, OCTEON_MIO_BOOT_LOC_CFG0_OFF(a4)
> +
> +	/* Compute value for boot bus configuration register */
> +	/* Read region 0 config so we can _modify_ the base address field */
> +	PTR_LI	a4, OCTEON_MIO_BOOT_REG_CFG0	/* region 0 config */
> +	ld	a0, 0(a4)
> +	dli	a4, 0xf0000000		/* Mask off bits we want to save */
> +	and	a4, a4, a0
> +	dli	a0, 0x0fff0000		/* Force size to max */
> +	or	a4, a4, a0
> +
> +	move	a5, s6
> +	/* Convert to 64k blocks, as used by boot bus config */
> +	srl	a5, 16
> +	li	a6, 0x1fc0	/* 'normal' boot bus base config value */
> +	subu	a6, a6, a5	/* Subtract offset */
> +	/* combine into register value to pass to boot bus routine */
> +	or	a0, a4, a6
> +
> +	/* Branch there */
> +	PTR_LA	a1, __mapped_continue_label
> +	PTR_LI	a2, OCTEON_MIO_BOOT_REG_CFG0
> +	/* If region 0 is not enabled we can skip it */
> +	ld	a4, 0(a2)
> +	bbit0	a4, 31, __mapped_continue_label
> +	 nop
> +	li	a4, 0x10000000
> +	j	a4
> +	 synci	0(zero)
> +
> +	/* We never get here, as we go directly to __mapped_continue_label */
> +	break
> +
> +
> +uboot_in_ram2:
> +
> +	/* Now jump to address in TLB mapped memory to continue execution */
> +	PTR_LA	a4, __mapped_continue_label
> +	synci	0(a4)
> +	j	a4
> +	 nop
> +
> +__mapped_continue_label:
> +	/* Check if we are core 0, if we are not then we need
> +	 * to vector to code in DRAM to do application setup, and
> +	 * skip the rest of the bootloader.  Only core 0 runs the bootloader
> +	 * and sets up the tables that the other cores will use for
> +	 * configuration.
> +	 */
> +	mfc0	a0, CP0_EBASE
> +	andi	a0, EBASE_CPUNUM   /* get core */
> +	/* if (__all_cores_are_equal==0 && core==0),
> +	 * then jump to execute BL on core 0; else 'go to next line'
> +	 * (core_0_cont1 is executed ONLY when k0=a0=0(core0_ID))
> +	 */
> +	lw	t0, __all_cores_are_equal
> +	beq	a0, t0, core_0_cont1
> +	 nop
> +
> +	/* other cores look up addr from dram */
> +        /* DRAM controller already set up by first core */
> +        li      a1, (BOOT_VECTOR_NUM_WORDS * 4)
> +        mul     a0, a0, a1
> +
> +        /* Now find out the boot vector base address from the moveable boot
> +         * bus region.
> +         */
> +
> +        /* Get the address of the boot bus moveable region */
> +        PTR_LI     t8, OCTEON_MIO_BOOT_BASE
> +        ld      t9, OCTEON_MIO_BOOT_LOC_CFG0_OFF(t8)
> +        /* Make sure it's enabled */
> +        bbit0   t9, 31, invalid_boot_vector
> +         dext   t9, t9, 3, 24
> +        dsll    t9, t9, 7
> +        /* Make address XKPHYS */
> +	li	t0, 1
> +	dins	t9, t0, 63, 1
> +
> +        ld      t0, OCTEON_BOOT_MOVEABLE_MAGIC_OFFSET(t9)
> +        dli     t1, OCTEON_BOOT_MOVEABLE_MAGIC1
> +        bne     t0, t1, invalid_boot_vector
> +         nop
> +
> +        /* Load base address of boot vector table */
> +        ld      t0, OCTEON_BOOT_VECTOR_MOVEABLE_OFFSET(t9)
> +        /* Add offset for core */
> +        daddu   a1, t0, a0
> +
> +	mfc0	v0, CP0_STATUS
> +	move	v1, v0
> +	ins	v1, zero, 19, 1		/* Clear NMI bit */
> +	mtc0	v1, CP0_STATUS
> +
> +        /* Get app start function address */
> +        lw      t9, 8(a1)
> +        beqz    t9, invalid_boot_vector
> +         nop
> +
> +        j       t9
> +         lw      k0, 12(a1)      /* Load global data (deprecated) */
> +
> +invalid_boot_vector:
> +        wait
> +        b       invalid_boot_vector
> +         nop
> +
> +__all_cores_are_equal:
> +	/* The following .word tell if 'all_cores_are_equal' or core0 is special
> +	 * By default (for the first execution) the core0 should be special,
> +	 * in order to behave like the old(existing not-modified) bootloader
> +	 * and run the bootloader on core 0 to follow the existing design.
> +	 * However after that we make 'all_cores_equal' which allows to run SE
> +	 * applications on core0 like on any other core. NOTE that value written
> +	 * to '__all_cores_are_equal' should not match any core ID.
> +	 */
> +	.word 	0
> +
> +core_0_cont1:
> +	li	t0, 0xffffffff
> +	sw	t0, __all_cores_are_equal
> +	/* From here on, only core 0 runs, other cores have branched
> +	 * away.
> +	 */
> +#ifdef CONFIG_MIPS_INIT_STACK_IN_SRAM
> +	/* Set up initial stack and global data */
> +	setup_stack_gd
> +# ifdef CONFIG_DEBUG_UART
> +	PTR_LA	t9, debug_uart_init
> +	jalr	t9
> +	 nop
> +# endif
> +#endif
> +	move	a0, zero		# a0 <-- boot_flags = 0
> +	PTR_LA	t9, board_init_f
> +
> +	jr	t9
> +	 move	ra, zero
> +	END(_start)
> +
> +	.balign	8
> +	.globl	single_tlb_setup
> +	.ent	single_tlb_setup
> +	/* Sets up a single TLB entry.	Virtual/physical addresses
> +	 * must be properly aligned.
> +	 * a0  Virtual address
> +	 * a1  Physical address
> +	 * a2  page (_not_ mapping) size
> +	 *
> +	 * The entry is written at the highest TLB index and maps two
> +	 * consecutive pages of a2 bytes each (EntryLo0/EntryLo1).
> +	 *
> +	 * Clobbers a3, a4, a5, a6 and the CP0 Index, PageMask, EntryLo0,
> +	 * EntryLo1 and EntryHi registers.  a0 and a1 are modified: on return
> +	 * a0 has been advanced by the mapping size (2 * a2) and a1 holds an
> +	 * EntryLo-formatted value, not the physical address passed in.
> +	 */
> +single_tlb_setup:
> +	/* Determine the number of TLB entries available, and
> +	 * use the top one.
> +	 */
> +	mfc0	a3, CP0_CONFIG1
> +	dext	a3, a3, 25, 6		/* a3 now has the max mmu entry index */
> +	mfc0	a5, CP0_CONFIG3		/* Check if config4 reg present */
> +	bbit0	a5, 31, single_tlb_setup_cont
> +	 nop
> +	mfc0	a5, CP0_CONFIG4
> +	bbit0	a5, 14, single_tlb_setup_cont	/* check config4[MMUExtDef] */
> +	 nop
> +	/* append config4[MMUSizeExt] to most significant bit of
> +	 * config1[MMUSize-1]
> +	 */
> +	dins	a3, a5, 6, 8
> +	and	a3, a3, 0x3fff	/* a3 now includes max entries for cn6xxx */
> +
> +single_tlb_setup_cont:
> +
> +	/* Format physical address for entry low */
> +	nop
> +	dsrl	a1, a1, 12
> +	dsll	a1, a1, 6
> +	ori	a1, a1, 0x7	/* set DVG bits */
> +
> +	move	a4, a2
> +	daddu	a5, a4, a4	/* mapping size */
> +	dsll	a6, a4, 1
> +	daddiu	a6, a6, -1	/* pagemask */
> +	dsrl	a4, a4, 6	/* adjust for adding with entrylo */
> +
> +	/* Now set up mapping */
> +	mtc0	a6, CP0_PAGEMASK
> +	mtc0	a3, CP0_INDEX
> +
> +	/* Even page, then step a1 by one page for the odd page */
> +	dmtc0	a1, CP0_ENTRYLO0
> +	daddu	a1, a1, a4
> +
> +	dmtc0	a1, CP0_ENTRYLO1
> +	daddu	a1, a1, a4
> +
> +	dmtc0	a0, CP0_ENTRYHI
> +	daddu	a0, a0, a5
> +
> +	ehb
> +	tlbwi
> +	jr  ra
> +	 nop
> +	.end   single_tlb_setup
> +
> +
> +/**
> + * This code is moved to a movable boot bus region,
> + * and it is responsible for changing the flash mappings and
> + * jumping to run from the TLB mapped address.
> + *
> + * @param a0	New address for boot bus region 0
> + * @param a1	Address to branch to afterwards
> + * @param a2	Address of MIO_BOOT_REG_CFG0
> + */
> +	.balign	8
> +change_boot_mappings:
> +	sd	a0, 0(a2)	/* store a0 to the register addressed by a2 */
> +	sync			/* barrier after the register write */
> +	j a1	    /* Jump to new TLB mapped location */
> +	 synci	0(zero)		/* sits in the jump's delay slot */
> +
> +/* If we need a large stack, allocate it here. */
> +#if CONFIG_OCTEON_BIG_STACK_SIZE
> +	/* Allocate the stack here so it's in L2 cache or DRAM */
> +	.balign	16
> +big_stack_end:
> +	.skip	CONFIG_OCTEON_BIG_STACK_SIZE, 0
> +big_stack_start:
> +	.dword	0
> +#endif
>
Stefan Roese May 14, 2020, 7:50 a.m. UTC | #3
Hi Daniel,

On 13.05.20 14:49, Daniel Schwierzeck wrote:
> sorry for the delay ;)

NP. I know that it's sometimes not easy to find the time for this
maintainer / review job. ;)

> Am 02.05.20 um 10:59 schrieb Stefan Roese:
>> From: Aaron Williams <awilliams at marvell.com>
>>
>> This patch adds very basic support for the Octeon III SoCs. Only
>> CFI parallel NOR flash and UART is supported for now.
>>
>> Please note that the basic Octeon port does not include the DDR3/4
>> initialization yet. This will be added in some follow-up patches
>> later. To still use U-Boot with this port, the L2 cache (4MiB on
>> Octeon III CN73xx) is used as RAM. This way, U-Boot can boot to the
>> prompt on such boards.
> 
> this patch should come after the common MIPS patches

Okay, I'll re-arrange the sequence of patches in v2.

>>
>> Signed-off-by: Aaron Williams <awilliams at marvell.com>
>> Signed-off-by: Stefan Roese <sr at denx.de>
>> ---
>>
>>   MAINTAINERS                                  |    6 +
>>   arch/Kconfig                                 |    1 +
>>   arch/mips/Kconfig                            |   49 +-
>>   arch/mips/Makefile                           |    7 +
>>   arch/mips/cpu/Makefile                       |    4 +-
>>   arch/mips/include/asm/arch-octeon/cavm-reg.h |   42 +
>>   arch/mips/include/asm/arch-octeon/clock.h    |   24 +
>>   arch/mips/mach-octeon/Kconfig                |   92 ++
>>   arch/mips/mach-octeon/Makefile               |   10 +
>>   arch/mips/mach-octeon/clock.c                |   22 +
>>   arch/mips/mach-octeon/cpu.c                  |   55 +
>>   arch/mips/mach-octeon/dram.c                 |   27 +
>>   arch/mips/mach-octeon/include/ioremap.h      |   30 +
>>   arch/mips/mach-octeon/start.S                | 1241 ++++++++++++++++++
>>   14 files changed, 1608 insertions(+), 2 deletions(-)
>>   create mode 100644 arch/mips/include/asm/arch-octeon/cavm-reg.h
>>   create mode 100644 arch/mips/include/asm/arch-octeon/clock.h
>>   create mode 100644 arch/mips/mach-octeon/Kconfig
>>   create mode 100644 arch/mips/mach-octeon/Makefile
>>   create mode 100644 arch/mips/mach-octeon/clock.c
>>   create mode 100644 arch/mips/mach-octeon/cpu.c
>>   create mode 100644 arch/mips/mach-octeon/dram.c
>>   create mode 100644 arch/mips/mach-octeon/include/ioremap.h
>>   create mode 100644 arch/mips/mach-octeon/start.S
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 66f0b07263..29f2d7328c 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -749,6 +749,12 @@ M:	Ezequiel Garcia <ezequiel at collabora.com>
>>   S:	Maintained
>>   F:	arch/mips/mach-jz47xx/
>>   
>> +MIPS Octeon
>> +M:	Aaron Williams <awilliams at marvell.com>
>> +S:	Maintained
>> +F:	arch/mips/mach-octeon/
>> +F:	arch/mips/include/asm/arch-octeon/
>> +
>>   MMC
>>   M:	Peng Fan <peng.fan at nxp.com>
>>   S:	Maintained
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 91e049b322..1cd3e1dc0b 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -37,6 +37,7 @@ config MICROBLAZE
>>   
>>   config MIPS
>>   	bool "MIPS architecture"
>> +	select CREATE_ARCH_SYMLINK
> 
> you should not need that. The path arch/mips/mach-octeon/include/ will
> be automatically added to the include search paths. Thus move all files
> in arch/mips/include/asm/arch-octeon/ to arch/mips/mach-octeon/include/

Good idea.

>>   	select HAVE_ARCH_IOREMAP
>>   	select HAVE_PRIVATE_LIBGCC
>>   	select SUPPORT_OF_CONTROL
>> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
>> index 48e754cc46..3c7f3eb94f 100644
>> --- a/arch/mips/Kconfig
>> +++ b/arch/mips/Kconfig
>> @@ -106,6 +106,24 @@ config ARCH_JZ47XX
>>   	select OF_CONTROL
>>   	select DM
>>   
>> +config ARCH_OCTEON
>> +	bool "Support Marvell Octeon CN7xxx platforms"
>> +	select DISPLAY_CPUINFO
>> +	select DMA_ADDR_T_64BIT
>> +	select DM
>> +	select DM_SERIAL
>> +	select MIPS_CACHE_COHERENT
>> +	select MIPS_INIT_STACK_IN_SRAM
>> +	select MIPS_L2_CACHE
>> +	select MIPS_TUNE_OCTEON3
>> +	select ROM_EXCEPTION_VECTORS
>> +	select SUPPORTS_BIG_ENDIAN
>> +	select SUPPORTS_CPU_MIPS64_OCTEON
>> +	select PHYS_64BIT
>> +	select OF_CONTROL
>> +	select OF_LIVE
>> +	imply CMD_DM
>> +
>>   config MACH_PIC32
>>   	bool "Support Microchip PIC32"
>>   	select DM
>> @@ -160,6 +178,7 @@ source "arch/mips/mach-bmips/Kconfig"
>>   source "arch/mips/mach-jz47xx/Kconfig"
>>   source "arch/mips/mach-pic32/Kconfig"
>>   source "arch/mips/mach-mtmips/Kconfig"
>> +source "arch/mips/mach-octeon/Kconfig"
>>   
>>   if MIPS
>>   
>> @@ -233,6 +252,14 @@ config CPU_MIPS64_R6
>>   	  Choose this option to build a kernel for release 6 or later of the
>>   	  MIPS64 architecture.
>>   
>> +config CPU_MIPS64_OCTEON
>> +	bool "Marvell Octeon series of CPUs"
>> +	depends on SUPPORTS_CPU_MIPS64_OCTEON
>> +	select 64BIT
>> +	help
>> +	 Choose this option for Marvell Octeon CPUs.  These CPUs are between
>> +	 MIPS64 R5 and R6 with other extensions.
>> +
>>   endchoice
>>   
>>   menu "General setup"
>> @@ -261,7 +288,8 @@ config MIPS_CM_BASE
>>   config MIPS_CACHE_INDEX_BASE
>>   	hex "Index base address for cache initialisation"
>>   	default 0x80000000 if CPU_MIPS32
>> +	default 0xffffffffc0000000 if ARCH_OCTEON
>>   	default 0xffffffff80000000 if CPU_MIPS64
>>   	help
>>   	  This is the base address for a memory block, which is used for
>>   	  initialising the cache lines. This is also the base address of a memory
>> @@ -342,6 +369,14 @@ config SPL_LOADER_SUPPORT
>>   	help
>>   	  Enable this option if you want to use SPL loaders without DM enabled.
>>   
>> +config MIPS_CACHE_COHERENT
>> +	bool "Set if MIPS processor is cache coherent"
>> +	help
>> +	 Enable this if the MIPS architecture is cache coherent like the
>> +	 Marvell Octeon series of SoCs.  When this is set, cache flushes
>> +	 and invalidates only flush the write buffer since the hardware
>> +	 maintains cache coherency.
>> +
>>   endmenu
>>   
>>   menu "OS boot interface"
>> @@ -398,6 +433,9 @@ config SUPPORTS_CPU_MIPS64_R2
>>   config SUPPORTS_CPU_MIPS64_R6
>>   	bool
>>   
>> +config SUPPORTS_CPU_MIPS64_OCTEON
>> +	bool
>> +
>>   config CPU_MIPS32
>>   	bool
>>   	default y if CPU_MIPS32_R1 || CPU_MIPS32_R2 || CPU_MIPS32_R6
>> @@ -405,6 +443,7 @@ config CPU_MIPS32
>>   config CPU_MIPS64
>>   	bool
>>   	default y if CPU_MIPS64_R1 || CPU_MIPS64_R2 || CPU_MIPS64_R6
>> +	default y if CPU_MIPS64_OCTEON
>>   
>>   config MIPS_TUNE_4KC
>>   	bool
>> @@ -421,6 +460,9 @@ config MIPS_TUNE_34KC
>>   config MIPS_TUNE_74KC
>>   	bool
>>   
>> +config MIPS_TUNE_OCTEON3
>> +	bool
>> +
>>   config 32BIT
>>   	bool
>>   
>> @@ -453,6 +495,11 @@ config MIPS_SRAM_INIT
>>   	  before it can be used. If enabled, a function mips_sram_init() will
>>   	  be called just before setup_stack_gd.
>>   
>> +config DMA_ADDR_T_64BIT
>> +	bool
>> +	help
>> +	 Select this to enable 64-bit DMA addressing
>> +
>>   config SYS_DCACHE_SIZE
>>   	int
>>   	default 0
>> diff --git a/arch/mips/Makefile b/arch/mips/Makefile
>> index af3f227436..fa1ba7855a 100644
>> --- a/arch/mips/Makefile
>> +++ b/arch/mips/Makefile
>> @@ -1,6 +1,10 @@
>>   # SPDX-License-Identifier: GPL-2.0+
>>   
>> +ifneq ($(CONFIG_ARCH_OCTEON),y)
>>   head-y := arch/mips/cpu/start.o
>> +else
>> +head-y := arch/mips/mach-octeon/start.o
>> +endif
>>   
>>   ifeq ($(CONFIG_SPL_BUILD),y)
>>   ifneq ($(CONFIG_SPL_START_S_PATH),)
>> @@ -17,6 +21,7 @@ machine-$(CONFIG_ARCH_JZ47XX) += jz47xx
>>   machine-$(CONFIG_MACH_PIC32) += pic32
>>   machine-$(CONFIG_ARCH_MTMIPS) += mtmips
>>   machine-$(CONFIG_ARCH_MSCC) += mscc
>> +machine-${CONFIG_ARCH_OCTEON} += octeon
>>   
>>   machdirs := $(patsubst %,arch/mips/mach-%/,$(machine-y))
>>   libs-y += $(machdirs)
>> @@ -30,6 +35,7 @@ arch-$(CONFIG_CPU_MIPS32_R6) += -march=mips32r6 -Wa,-mips32r6
>>   arch-$(CONFIG_CPU_MIPS64_R1) += -march=mips64 -Wa,-mips64
>>   arch-$(CONFIG_CPU_MIPS64_R2) += -march=mips64r2 -Wa,-mips64r2
>>   arch-$(CONFIG_CPU_MIPS64_R6) += -march=mips64r6 -Wa,-mips64r6
>> +arch-${CONFIG_CPU_MIPS64_OCTEON} += -march=octeon3
>>   
>>   # Allow extra optimization for specific CPUs/SoCs
>>   tune-$(CONFIG_MIPS_TUNE_4KC) += -mtune=4kc
>> @@ -37,6 +43,7 @@ tune-$(CONFIG_MIPS_TUNE_14KC) += -mtune=14kc
>>   tune-$(CONFIG_MIPS_TUNE_24KC) += -mtune=24kc
>>   tune-$(CONFIG_MIPS_TUNE_34KC) += -mtune=34kc
>>   tune-$(CONFIG_MIPS_TUNE_74KC) += -mtune=74kc
>> +tune-${CONFIG_MIPS_TUNE_OCTEON3} += -mtune=octeon3
>>   
>>   # Include default header files
>>   cflags-y += -I$(srctree)/arch/mips/include/asm/mach-generic
>> diff --git a/arch/mips/cpu/Makefile b/arch/mips/cpu/Makefile
>> index 6df7bb4e48..732015d6f3 100644
>> --- a/arch/mips/cpu/Makefile
>> +++ b/arch/mips/cpu/Makefile
>> @@ -1,6 +1,8 @@
>>   # SPDX-License-Identifier: GPL-2.0+
>>   
>> -extra-y	= start.o
>> +ifneq ($(CONFIG_ARCH_OCTEON),y)
>> +extra-y = start.o
>> +endif
>>   
>>   obj-y += time.o
>>   obj-y += interrupts.o
>> diff --git a/arch/mips/include/asm/arch-octeon/cavm-reg.h b/arch/mips/include/asm/arch-octeon/cavm-reg.h
>> new file mode 100644
>> index 0000000000..b961e54956
>> --- /dev/null
>> +++ b/arch/mips/include/asm/arch-octeon/cavm-reg.h
>> @@ -0,0 +1,42 @@
>> +/* SPDX-License-Identifier:    GPL-2.0 */
>> +/*
>> + * Copyright (C) 2020 Marvell International Ltd.
>> + */
>> +
>> +#ifndef __CAVM_REG_H__
>> +#define __CAVM_REG_H__
>> +
>> +/* Register addresses, expressed as u64 pointers */
>> +#define CAVM_CIU_FUSE			((u64 *)0x80010100000001a0)
>> +#define CAVM_MIO_BOOT_REG_CFG0		((u64 *)0x8001180000000000)
>> +#define CAVM_RST_BOOT			((u64 *)0x8001180006001600)
>> +
>> +/* Register structs */
>> +/**
>> + * Register (RSL) rst_boot
>> + *
>> + * RST Boot Register
>> + */
>> +union cavm_rst_boot {
>> +	u64 u;	/* whole register as one 64-bit value */
>> +	struct cavm_rst_boot_s {	/* fields listed from bit 63 down to bit 0 */
>> +		u64 chipkill                         : 1;
>> +		u64 jtcsrdis                         : 1;
>> +		u64 ejtagdis                         : 1;
>> +		u64 romen                            : 1;
>> +		u64 ckill_ppdis                      : 1;
>> +		u64 jt_tstmode                       : 1;
>> +		u64 vrm_err                          : 1;
>> +		u64 reserved_37_56                   : 20;
>> +		u64 c_mul                            : 7;	/* multiplies PLL_REF_CLK to give gd->cpu_clk */
>> +		u64 pnr_mul                          : 6;	/* multiplies PLL_REF_CLK to give gd->bus_clk */
>> +		u64 reserved_21_23                   : 3;
>> +		u64 lboot_oci                        : 3;
>> +		u64 lboot_ext                        : 6;
>> +		u64 lboot                            : 10;
>> +		u64 rboot                            : 1;
>> +		u64 rboot_pin                        : 1;
>> +	} s;
>> +};
>> +
>> +#endif /* __CAVM_REG_H__ */
>> diff --git a/arch/mips/include/asm/arch-octeon/clock.h b/arch/mips/include/asm/arch-octeon/clock.h
>> new file mode 100644
>> index 0000000000..a844a222c9
>> --- /dev/null
>> +++ b/arch/mips/include/asm/arch-octeon/clock.h
>> @@ -0,0 +1,24 @@
>> +/* SPDX-License-Identifier:    GPL-2.0 */
>> +/*
>> + * Copyright (C) 2018, 2019 Marvell International Ltd.
>> + *
>> + * https://spdx.org/licenses
>> + */
>> +
>> +#ifndef __CLOCK_H__
>> +#define __CLOCK_H__
>> +/** System PLL reference clock */
>> +#define PLL_REF_CLK                     50000000        /* 50 MHz */
>> +#define NS_PER_REF_CLK_TICK             (1000000000 / PLL_REF_CLK)	/* 20 ns */
>> +
>> +/**
>> + * Returns the I/O clock speed in Hz
>> + */
>> +u64 octeon_get_io_clock(void);
>> +
>> +/**
>> + * Returns the core clock speed in Hz (NOTE(review): no definition in clock.c of this patch)
>> + */
>> +u64 octeon_get_core_clock(void);
>> +
>> +#endif /* __CLOCK_H__ */
>> diff --git a/arch/mips/mach-octeon/Kconfig b/arch/mips/mach-octeon/Kconfig
>> new file mode 100644
>> index 0000000000..67fcb6058c
>> --- /dev/null
>> +++ b/arch/mips/mach-octeon/Kconfig
>> @@ -0,0 +1,92 @@
>> +menu "Octeon platforms"
>> +	depends on ARCH_OCTEON
>> +
>> +config SYS_SOC
>> +	string
>> +	default "octeon"
>> +
>> +config OCTEON_CN7XXX
>> +	bool "Octeon CN7XXX SoC"
>> +
>> +config OCTEON_CN70XX
>> +	bool "Octeon CN70XX SoC"
>> +	select OCTEON_CN7XXX
>> +
>> +config OCTEON_CN73XX
>> +	bool "Octeon CN73XX SoC"
>> +	select OCTEON_CN7XXX
>> +
>> +config OCTEON_CN78XX
>> +	bool "Octeon CN78XX SoC"
>> +	select OCTEON_CN7XXX
>> +
>> +choice
>> +	prompt "Octeon MIPS family select"
>> +
>> +config SOC_OCTEON2
>> +	bool "Octeon II family"
>> +	help
>> +	 This selects the Octeon II SoC family
> 
> this should be added later when needed

Yes, makes sense.

>> +
>> +config SOC_OCTEON3
>> +	bool "Octeon III family"
>> +	help
>> +	 This selects the Octeon III SoC family CN70xx, CN73XX, CN78xx
>> +	 and CNF75XX.
>> +
>> +endchoice
>> +
>> +config SYS_DCACHE_SIZE
>> +	default 32768
>> +
>> +config SYS_DCACHE_LINE_SIZE
>> +	default 128
>> +
>> +config SYS_ICACHE_SIZE
>> +	default	79872
>> +
>> +config SYS_ICACHE_LINE_SIZE
>> +	default 128
>> +
>> +config OCTEON_BIG_STACK_SIZE
>> +	hex
>> +	default 0x4000
>> +	help
>> +	 This enables a larger stack needed for Octeon 3 DRAM initialization.
>> +	 If this is disabled then a part of the L1 cache will be reserved for
>> +	 the stack, resulting in a smaller image.  If this is true then
>> +	 a portion of the TEXT address space will be reserved for the stack.
>> +	 Note that this requires that U-Boot MUST be able to fit entirely
>> +	 within the L2 cache and cannot be executed from a parallel NOR flash.
>> +	 The default size is 16KiB.
>> +
>> +config OCTEON_COPY_FROM_FLASH_TO_L2
>> +	bool
>> +	default y
>> +	help
>> +	 Set this for U-Boot to attempt to copy itself from flash memory into
>> +	 the L2 cache.  This significantly improves the boot performance.
>> +
>> +config OCTEON_L2_MEMCPY_IN_CACHE
>> +	bool
>> +	default y
>> +	help
>> +	 If this is set then the memcpy code that is used to copy U-Boot from
>> +	 the flash to the L2 cache is written to the L2 cache.  This
>> +	 significantly speeds up the memcpy operation.
>> +
>> +config OCTEON_L2_UBOOT_ADDR
>> +	hex
>> +	default 0xffffffff81000000
>> +	help
>> +	 This specifies the address where U-Boot will be copied into the L2
>> +	 cache.
>> +
>> +config OCTEON_L2_MEMCPY_ADDR
>> +	hex
>> +	default 0xffffffff81400000
>> +	help
>> +	 This specifies where U-Boot will place the memcpy routine used for
>> +	 copying U-Boot from flash to L2 cache.
>> +
>> +endmenu
>> diff --git a/arch/mips/mach-octeon/Makefile b/arch/mips/mach-octeon/Makefile
>> new file mode 100644
>> index 0000000000..a5fda682a7
>> --- /dev/null
>> +++ b/arch/mips/mach-octeon/Makefile
>> @@ -0,0 +1,10 @@
>> +# (C) Copyright 2019 Marvell, Inc.
>> +#
>> +# SPDX-License-Identifier:	GPL-2.0+
>> +#
>> +
>> +extra-y = start.o
>> +
>> +obj-y += clock.o
>> +obj-y += cpu.o
>> +obj-y += dram.o
>> diff --git a/arch/mips/mach-octeon/clock.c b/arch/mips/mach-octeon/clock.c
>> new file mode 100644
>> index 0000000000..6e32008641
>> --- /dev/null
>> +++ b/arch/mips/mach-octeon/clock.c
>> @@ -0,0 +1,22 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Copyright (C) 2018, 2019 Marvell International Ltd.
>> + */
>> +
>> +#include <common.h>
>> +#include <asm/arch/clock.h>
>> +
>> +DECLARE_GLOBAL_DATA_PTR;
>> +
>> +int octeon_get_timer_freq(void)	/* timer frequency, taken from gd->cpu_clk */
>> +{
>> +	return gd->cpu_clk;	/* NOTE(review): cpu.c prints cpu_clk with %lu; returned as int here - confirm range */
>> +}
>> +
>> +/**
>> + * Returns the I/O clock speed in Hz, as stored in gd->bus_clk
>> + */
>> +u64 octeon_get_io_clock(void)
>> +{
>> +	return gd->bus_clk;
>> +}
>> diff --git a/arch/mips/mach-octeon/cpu.c b/arch/mips/mach-octeon/cpu.c
>> new file mode 100644
>> index 0000000000..a1373c6d56
>> --- /dev/null
>> +++ b/arch/mips/mach-octeon/cpu.c
>> @@ -0,0 +1,55 @@
>> +// SPDX-License-Identifier: GPL-2.0+
>> +/*
>> + * Copyright (C) 2020 Marvell International Ltd.
>> + */
>> +
>> +#include <common.h>
>> +#include <linux/io.h>
>> +#include <asm/arch/clock.h>
>> +#include <asm/arch-octeon/cavm-reg.h>
>> +
>> +DECLARE_GLOBAL_DATA_PTR;
>> +
>> +static int get_clocks(void)	/* fills gd->cpu_clk and gd->bus_clk; always returns 0 */
>> +{
>> +	const u64 ref_clock = PLL_REF_CLK;
>> +	union cavm_rst_boot rst_boot;
>> +
>> +	rst_boot.u = ioread64(CAVM_RST_BOOT);	/* read the RST_BOOT register once */
>> +	gd->cpu_clk = ref_clock * rst_boot.s.c_mul;	/* reference clock * c_mul */
>> +	gd->bus_clk = ref_clock * rst_boot.s.pnr_mul;	/* reference clock * pnr_mul */
>> +
>> +	debug("%s: cpu: %lu, bus: %lu\n", __func__, gd->cpu_clk, gd->bus_clk);
>> +
>> +	return 0;
>> +}
>> +
>> +/* Early mach init code run from flash: remaps the boot bus and records the clocks; always returns 0 */
>> +int mach_cpu_init(void)
>> +{
>> +	/* Remap boot-bus 0x1fc0.0000 -> 0x1f40.0000 */
>> +	/* ToDo: Move this to an early running bus (bootbus) DM driver */
>> +	clrsetbits_be64(CAVM_MIO_BOOT_REG_CFG0, 0xffff, 0x1f40);	/* presumably clears the low 16 bits, then sets 0x1f40 */
>> +
>> +	/* Get clocks and store them in GD */
>> +	get_clocks();	/* return value not checked */
>> +
>> +	return 0;
>> +}
>> +
>> +/**
>> + * Returns number of cores, derived from the CIU_FUSE register
>> + *
>> + * @return	fls64() of the low 48 bits of CIU_FUSE (no node argument is taken)
>> + */
>> +static int cavm_octeon_num_cores(void)
>> +{
>> +	return fls64(ioread64(CAVM_CIU_FUSE) & 0xffffffffffff);	/* mask keeps bits 47:0 */
>> +}
>> +
>> +int print_cpuinfo(void)	/* prints the SoC line; always returns 0 */
>> +{
>> +	printf("SoC:   Octeon CN73xx (%d cores)\n", cavm_octeon_num_cores());	/* model name is a fixed string */
>> +
>> +	return 0;
>> +}
>> diff --git a/arch/mips/mach-octeon/dram.c b/arch/mips/mach-octeon/dram.c
>> new file mode 100644
>> index 0000000000..c16a73e8e6
>> --- /dev/null
>> +++ b/arch/mips/mach-octeon/dram.c
>> @@ -0,0 +1,27 @@
>> +// SPDX-License-Identifier: GPL-2.0+
>> +/*
>> + * Copyright (C) 2020 Marvell International Ltd.
>> + */
>> +
>> +#include <common.h>
>> +#include <dm.h>
>> +#include <ram.h>
>> +
>> +DECLARE_GLOBAL_DATA_PTR;
>> +
>> +int dram_init(void)	/* reports a fixed RAM size; always returns 0 */
>> +{
>> +	/*
>> +	 * No DDR init yet -> run in L2 cache
>> +	 */
>> +	gd->ram_size = (2 << 20);	/* 2 MiB */
>> +	gd->bd->bi_dram[0].size = gd->ram_size;	/* bank 0 gets the whole size */
>> +	gd->bd->bi_dram[1].size = 0;	/* bank 1 is empty */
>> +
>> +	return 0;
>> +}
>> +
>> +ulong board_get_usable_ram_top(ulong total_size)	/* total_size is ignored */
>> +{
>> +	return gd->ram_top;	/* returned unchanged */
>> +}
>> diff --git a/arch/mips/mach-octeon/include/ioremap.h b/arch/mips/mach-octeon/include/ioremap.h
>> new file mode 100644
>> index 0000000000..59b75008a2
>> --- /dev/null
>> +++ b/arch/mips/mach-octeon/include/ioremap.h
>> @@ -0,0 +1,30 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef __ASM_MACH_OCTEON_IOREMAP_H
>> +#define __ASM_MACH_OCTEON_IOREMAP_H
>> +
>> +#include <linux/types.h>
>> +
>> +/*
>> + * Allow physical addresses to be fixed up to help peripherals located
>> + * outside the low 32-bit range -- this version returns phys_addr unchanged.
>> + */
>> +static inline phys_addr_t fixup_bigphys_addr(phys_addr_t phys_addr,
>> +					     phys_addr_t size)
>> +{
>> +	return phys_addr;	/* size is unused */
>> +}
>> +
>> +static inline void __iomem *plat_ioremap(phys_addr_t offset, unsigned long size,
>> +					 unsigned long flags)
>> +{
>> +	return (void __iomem *)(XKPHYS | offset);	/* size and flags are unused; result is XKPHYS ORed with offset */
>> +}
>> +
>> +static inline int plat_iounmap(const volatile void __iomem *addr)
>> +{
>> +	return 0;	/* addr is unused; nothing is undone here */
>> +}
>> +
>> +#define _page_cachable_default	_CACHE_CACHABLE_NONCOHERENT
>> +
>> +#endif /* __ASM_MACH_OCTEON_IOREMAP_H */
>> diff --git a/arch/mips/mach-octeon/start.S b/arch/mips/mach-octeon/start.S
>> new file mode 100644
>> index 0000000000..acb967201a
>> --- /dev/null
>> +++ b/arch/mips/mach-octeon/start.S
>> @@ -0,0 +1,1241 @@
>> +/* SPDX-License-Identifier: GPL-2.0+ */
>> +/*
>> + *  Startup Code for OCTEON 64-bit CPU-core
>> + *
>> + *  Copyright (c) 2003	Wolfgang Denk <wd at denx.de>
>> + *  Copyright 2004, 2005, 2010 - 2015 Cavium Inc..
>> + */
>> +
>> +#include <asm-offsets.h>
>> +#include <config.h>
>> +#include <asm/regdef.h>
>> +#include <asm/mipsregs.h>
>> +#include <asm/asm.h>
>> +
>> +#define BOOT_VECTOR_NUM_WORDS		8
>> +
>> +#define OCTEON_BOOT_MOVEABLE_MAGIC_OFFSET	0x70
>> +#define OCTEON_BOOT_VECTOR_MOVEABLE_OFFSET	0x78
>> +
>> +#define OCTEON_BOOT_MOVEABLE_MAGIC1_RAW	0xdb00110ad358eacd
>> +#define OCTEON_BOOT_MOVEABLE_MAGIC1	OCTEON_BOOT_MOVEABLE_MAGIC1_RAW
>> +
>> +#define OCTEON_CIU_SOFT_RST		0x8001070000000740
>> +
>> +#define	OCTEON_L2C_WPAR_PP0		0x8001180080840000
>> +#define OCTEON_MIO_BOOT_BASE		0x8001180000000000
>> +#define OCTEON_MIO_BOOT_REG_CFG0_OFF	0x0000
>> +#define OCTEON_MIO_BOOT_LOC_CFG0_OFF	0x0080
>> +#define OCTEON_MIO_BOOT_LOC_ADR_OFF	0x0090
>> +#define OCTEON_MIO_BOOT_LOC_DAT_OFF	0x0098
>> +#define	OCTEON_MIO_RST_BOOT		0x8001180000001600
>> +#define OCTEON_MIO_BOOT_REG_CFG0	0x8001180000000000
>> +#define	OCTEON_MIO_BOOT_REG_TIM0	0x8001180000000040
>> +#define OCTEON_MIO_BOOT_LOC_CFG0	0x8001180000000080
>> +#define OCTEON_MIO_BOOT_LOC_ADR		0x8001180000000090
>> +#define OCTEON_MIO_BOOT_LOC_DAT		0x8001180000000098
>> +#define	OCTEON_MIO_FUSE_DAT3		0x8001180000001418
>> +#define OCTEON_L2D_FUS3			0x80011800800007B8
>> +#define	OCTEON_LMC0_DDR_PLL_CTL		0x8001180088000258
>> +
>> +#define OCTEON_RST			0x8001180006000000
>> +#define OCTEON_RST_BOOT_OFFSET		0x1600
>> +#define OCTEON_RST_SOFT_RST_OFFSET	0x1680
>> +#define OCTEON_RST_COLD_DATAX_OFFSET(X)	(0x17C0 + (X) * 8)
>> +#define OCTEON_RST_BOOT			0x8001180006001600
>> +#define OCTEON_RST_SOFT_RST		0x8001180006001680
>> +#define OCTEON_RST_COLD_DATAX(X)	(0x80011800060017C0 + (X) * 8)
>> +
>> +#define OCTEON_OCX_COM_NODE		0x8001180011000000
>> +#define OCTEON_L2C_OCI_CTL		0x8001180080800020
>> +#define OCTEON_L2C_TAD_CTL		0x8001180080800018
>> +#define OCTEON_L2C_CTL			0x8001180080800000
>> +
>> +#define OCTEON_DBG_DATA			0x80011F00000001E8
>> +#define OCTEON_PCI_READ_CMD_E		0x80011F0000001188
>> +#define OCTEON_NPEI_DBG_DATA		0x80011F0000008510
>> +#define OCTEON_CIU_WDOG(X)		(0x8001070000000500 + (X) * 8)
>> +#define OCTEON_CIU_PP_POKE(X)		(0x8001070000000580 + (X) * 8)
>> +#define OCTEON_CIU3_WDOG(X)		(0x8001010000020000 + (X) * 8)
>> +#define OCTEON_CIU3_PP_POKE(X)		(0x8001010000030000 + (X) * 8)
>> +#define OCTEON_OCX_COM_LINKX_CTL(X)	(0x8001180011000020 + (X) * 8)
>> +#define OCTEON_SLI_CTL_STATUS		0x80011F0000028570
>> +#define OCTEON_GSERX_SCRATCH(X)		(0x8001180090000020 + (X) * 0x1000000)
>> +
>> +/** PRID for CN56XX */
>> +#define OCTEON_PRID_CN56XX		0x04
>> +/** PRID for CN52XX */
>> +#define OCTEON_PRID_CN52XX		0x07
>> +/** PRID for CN63XX */
>> +#define OCTEON_PRID_CN63XX		0x90
>> +/** PRID for CN68XX */
>> +#define OCTEON_PRID_CN68XX		0x91
>> +/** PRID for CN66XX */
>> +#define OCTEON_PRID_CN66XX		0x92
>> +/** PRID for CN61XX */
>> +#define OCTEON_PRID_CN61XX		0x93
>> +/** PRID for CNF71XX */
>> +#define OCTEON_PRID_CNF71XX		0x94
>> +/** PRID for CN78XX */
>> +#define OCTEON_PRID_CN78XX		0x95
>> +/** PRID for CN70XX */
>> +#define OCTEON_PRID_CN70XX		0x96
>> +/** PRID for CN73XX */
>> +#define OCTEON_PRID_CN73XX		0x97
>> +/** PRID for CNF75XX */
>> +#define OCTEON_PRID_CNF75XX		0x98
>> +
>> +/* reg = (run-time - link-time) address of this spot; func names the mark and must be unique; clobbers ra */
>> +#define GETOFFSET(reg, func)	\
>> +	.balign	8;		\
>> +	bal	func ##_mark;	/* ra = address of the .dword below */ \
>> +	nop;			\
>> +	.dword	.;		/* holds its own link-time address */ \
>> +func ##_mark:			\
>> +	ld	reg, 0(ra);	\
>> +	dsubu	reg, ra, reg;
>> +
>> +#define JAL(func)		/* call func at its link address plus the current load offset; clobbers t8, t9, ra */ \
>> +	.balign	8;		\
>> +	bal	func ##_mark;	/* ra = address of the .dword below */ \
>> +	 nop;			\
>> +	.dword .;		/* holds its own link-time address */ \
>> +func ##_mark:			\
>> +	ld	t8, 0(ra);	\
>> +	dsubu	t8, ra, t8;	/* t8 = run-time minus link-time address */ \
>> +	dla	t9, func;	\
>> +	daddu	t9, t9, t8;	/* t9 = func adjusted by that offset */ \
>> +	jalr	t9;		\
>> +	 nop;
>> +
>> +	.set	arch=octeon3
>> +	.set	noreorder
>> +
>> +	.macro uhi_mips_exception	/* overwrites k0, k1, t9 and a0 */
>> +	move	k0, t9		# preserve t9 in k0
>> +	move	k1, a0		# preserve a0 in k1
>> +	li	t9, 15		# UHI exception operation
>> +	li	a0, 0		# Use hard register context
>> +	sdbbp	1		# Invoke UHI operation
>> +	.endm
>> +
>> +	.macro setup_stack_gd	/* sets sp, fp and k0 (gd pointer); overwrites t0-t2 */
>> +	li	t0, -16
>> +	PTR_LI	t1, big_stack_start
>> +	and	sp, t1, t0		# force 16 byte alignment
>> +	PTR_SUBU \
>> +		sp, sp, GD_SIZE		# reserve space for gd
>> +	and	sp, sp, t0		# force 16 byte alignment
>> +	move	k0, sp			# save gd pointer
>> +#if CONFIG_VAL(SYS_MALLOC_F_LEN) && \
>> +    !CONFIG_IS_ENABLED(INIT_STACK_WITHOUT_MALLOC_F)
>> +	li	t2, CONFIG_VAL(SYS_MALLOC_F_LEN)
>> +	PTR_SUBU \
>> +		sp, sp, t2		# reserve space for early malloc
>> +	and	sp, sp, t0		# force 16 byte alignment
>> +#endif
>> +	move	fp, sp
>> +
>> +	/* Clear gd: zero every pointer-sized word from k0 up to t1 (big_stack_start) */
>> +	move	t0, k0
>> +1:
>> +	PTR_S	zero, 0(t0)
>> +	PTR_ADDIU t0, PTRSIZE
>> +	blt	t0, t1, 1b		# loop while t0 < big_stack_start
>> +	 nop
>> +
>> +#if CONFIG_VAL(SYS_MALLOC_F_LEN) && \
>> +    !CONFIG_IS_ENABLED(INIT_STACK_WITHOUT_MALLOC_F)
>> +	PTR_S	sp, GD_MALLOC_BASE(k0)	# gd->malloc_base offset
>> +#endif
>> +	.endm
>> +
>> +/* Saved register usage:
>> + * s0:	not used
>> + * s1:	not used
>> + * s2:	Address U-Boot loaded into in L2 cache
>> + * s3:	Start address
>> + * s4:	flags
>> + *		1:	booting from RAM
>> + *		2:	executing out of cache
>> + *		4:	booting from flash
>> + * s5:	u-boot size (data end - _start)
>> + * s6:	offset in flash.
>> + * s7:	_start physical address
>> + * s8:
>> + */
>> +
>> +ENTRY(_start)
>> +	/* U-Boot entry point */
>> +	b	reset
>> +
>> +	/* The above jump instruction/nop are considered part of the
>> +	 * bootloader_header_t structure but are not changed when the header is
>> +	 * updated.
>> +	 */
>> +
>> +	/* Leave room for bootloader_header_t header at start of binary.  This
>> +	 * header is used to identify the board the bootloader is for, what
>> +	 * address it is linked at, failsafe/normal, etc.  It also contains a
>> +	 * CRC of the entire image.
>> +	 */
>> +
>> +#if defined(CONFIG_ROM_EXCEPTION_VECTORS)
>> +	/*
>> +	 * Exception vector entry points. When running from ROM, an exception
>> +	 * cannot be handled. Halt execution and transfer control to debugger,
>> +	 * if one is attached.
>> +	 */
>> +	.org 0x200
>> +	/* TLB refill, 32 bit task */
>> +	uhi_mips_exception
>> +
>> +	.org 0x280
>> +	/* XTLB refill, 64 bit task */
>> +	uhi_mips_exception
>> +
>> +	.org 0x300
>> +	/* Cache error exception */
>> +	uhi_mips_exception
>> +
>> +	.org 0x380
>> +	/* General exception */
>> +	uhi_mips_exception
>> +
>> +	.org 0x400
>> +	/* Catch interrupt exceptions */
>> +	uhi_mips_exception
>> +
>> +	.org 0x480
>> +	/* EJTAG debug exception */
>> +1:	b	1b
>> +	 nop
>> +
>> +	.org 0x500
>> +#endif
>> +
>> +/* Reserve extra space so that when we use the boot bus local memory
>> + * segment to remap the debug exception vector we don't overwrite
>> + * anything useful
>> + */
>> +
>> +/* Basic exception handler (dump registers) in all ASM.	 When using the TLB for
>> + * mapping u-boot C code, we can't branch to that C code for exception handling
>> + * (TLB is disabled for some exceptions).
>> + */
>> +
>> +/* RESET/start here */
>> +	.balign	8
>> +reset:
>> +	nop
>> +	synci	0(zero)
>> +	mfc0	k0, CP0_STATUS
>> +	ori	k0, 0x00E0		/* enable 64 bit mode for CSR access */
>> +	mtc0	k0, CP0_STATUS
>> +
>> +	/* Save the address we're booting from, strip off low bits */
>> +	bal	1f
>> +	 nop
>> +1:
>> +	move	s3, ra
>> +	dins	s3, zero, 0, 12
>> +
>> +	/* Disable boot bus moveable regions */
>> +	PTR_LI	k0, OCTEON_MIO_BOOT_LOC_CFG0
>> +	sd	zero, 0(k0)
>> +	sd	zero, 8(k0)
>> +
>> +	/* Disable the watchdog timer
>> +	 * First we check if we're running on CN78XX, CN73XX or CNF75XX to see
>> +	 * if we use CIU3 or CIU.
>> +	 */
>> +	mfc0	t0, CP0_PRID
>> +	ext	t0, t0, 8, 8
>> +	/* Assume CIU */
>> +	PTR_LI	t1, OCTEON_CIU_WDOG(0)
>> +	PTR_LI	t2, OCTEON_CIU_PP_POKE(0)
>> +	blt	t0, OCTEON_PRID_CN78XX, wd_use_ciu
>> +	 nop
>> +	beq	t0, OCTEON_PRID_CN70XX, wd_use_ciu
>> +	 nop
>> +	/* Use CIU3 */
>> +	PTR_LI	t1, OCTEON_CIU3_WDOG(0)
>> +	PTR_LI	t2, OCTEON_CIU3_PP_POKE(0)
>> +wd_use_ciu:
>> +	sd	zero, 0(t2)		/* Pet the dog */
>> +	sd	zero, 0(t1)		/* Disable watchdog timer */
>> +
>> +	/* Errata: CN76XX has a node ID of 3. change it to zero here.
>> +	 * This needs to be done before we relocate to L2 as addresses change
>> +	 * For 76XX pass 1.X we need to zero out the OCX_COM_NODE[ID],
>> +	 * L2C_OCI_CTL[GKSEGNODE] and CP0 of Root.CvmMemCtl2[KSEGNODE].
>> +	 */
>> +	mfc0	a4, CP0_PRID
>> +	/* Check for 78xx pass 1.x processor ID */
>> +	andi	a4, 0xffff
>> +	blt	a4, (OCTEON_PRID_CN78XX << 8), 1f
>> +	 nop
>> +
>> +	/* Zero out alternate package for now */
>> +	dins	a4, zero, 6, 1
>> +	bge	a4, ((OCTEON_PRID_CN78XX << 8) | 0x08), 1f
>> +	 nop
>> +
>> +	/* 78xx or 76xx here, first check for bug #27141 */
>> +	PTR_LI	a5, OCTEON_SLI_CTL_STATUS
>> +	ld	a6, 0(a5)
>> +	andi	a7, a4, 0xff
>> +	andi	a6, a6, 0xff
>> +
>> +	beq	a6, a7, not_bug27141
>> +	 nop
>> +
>> +	/* core 0 proc_id rev_id field does not match SLI_CTL_STATUS rev_id */
>> +	/* We just hit bug #27141.  Need to reset the chip and try again */
>> +
>> +	PTR_LI	a4, OCTEON_RST_SOFT_RST
>> +	ori	a5, zero, 0x1	/* set the reset bit */
>> +
>> +reset_78xx_27141:
>> +	sync
>> +	synci	0(zero)
>> +	cache	9, 0(zero)
>> +	sd	a5, 0(a4)
>> +	wait
>> +	b	reset_78xx_27141
>> +	 nop
>> +
>> +not_bug27141:
>> +	/* 76XX pass 1.x has the node number set to 3 */
>> +	mfc0	a4, CP0_EBASE
>> +	ext	a4, a4, 0, 10
>> +	bne	a4, 0x180, 1f	/* Branch if not node 3 core 0 */
>> +	 nop
>> +
>> +	/* Clear OCX_COM_NODE[ID] */
>> +	PTR_LI	a5, OCTEON_OCX_COM_NODE
>> +	ld	a4, 0(a5)
>> +	dins	a4, zero, 0, 2
>> +	sd	a4, 0(a5)
>> +	ld	zero, 0(a5)
>> +
>> +	/* Clear L2C_OCI_CTL[GKSEGNODE] */
>> +	PTR_LI	a5, OCTEON_L2C_OCI_CTL
>> +	ld	a4, 0(a5)
>> +	dins	a4, zero, 4, 2
>> +	sd	a4, 0(a5)
>> +	ld	zero, 0(a5)
>> +
>> +	/* Clear CP0 Root.CvmMemCtl2[KSEGNODE] */
>> +	dmfc0	a4, CP0_CVMMEMCTL2
>> +	dins	a4, zero, 12, 2
>> +	dmtc0	a4, CP0_CVMMEMCTL2
>> +
>> +	/* Put the flash address in the start of the EBASE register to
>> +	 * enable our exception handler but only for core 0.
>> +	 */
>> +	mfc0	a4, CP0_EBASE
>> +	dext	a4, a4, 0, 10
>> +	bnez	a4, no_flash
>> +	/* OK in delay slot */
>> +	dext	a6, a6, 0, 16		/* Get the base address in flash */
>> +	sll	a6, a6, 16
>> +	mtc0	a6, CP0_EBASE	/* Enable exceptions */
>> +
>> +no_flash:
>> +	/* Zero out various registers */
>> +	mtc0	zero, CP0_DEPC
>> +	mtc0	zero, CP0_EPC
>> +	mtc0	zero, CP0_CAUSE
>> +	mfc0	a4, CP0_PRID
>> +	ext	a4, a4, 8, 8
>> +	mtc0	zero, CP0_DESAVE
>> +
>> +	/* The following are only available on Octeon 2 or later */
>> +	mtc0	zero, CP0_KSCRATCH1
>> +	mtc0	zero, CP0_KSCRATCH2
>> +	mtc0	zero, CP0_KSCRATCH3
>> +	mtc0	zero, CP0_USERLOCAL
>> +
>> +	/* Turn off ROMEN bit to disable ROM */
>> +	PTR_LI	a1, OCTEON_MIO_RST_BOOT
>> +	/* For OCTEON 3 we use RST_BOOT instead of MIO_RST_BOOT.
>> +	 * The difference is bits 24-26 are 6 instead of 0 for the address.
>> +	 */
>> +	/* For Octeon 2 and CN70XX we can ignore the watchdog */
>> +	blt	a4, OCTEON_PRID_CN78XX, watchdog_ok
>> +	 nop
>> +
>> +	PTR_LI	a1, OCTEON_RST_BOOT
>> +
>> +	beq	a4, OCTEON_PRID_CN70XX, watchdog_ok
>> +	 nop
>> +
>> +	ld	a2, 0(a1)
>> +	/* There is a bug where some registers don't get properly reset when
>> +	 * the watchdog timer causes a reset.  In this case we need to force
>> +	 * a reset.
>> +	 */
>> +	bbit0	a2, 11, watchdog_ok	/* Skip if watchdog not hit */
>> +	 dins	a2, zero, 2, 18	/* Don't clear LBOOT, LBOOT_EXT or LBOOT_OCI */
>> +	/* Clear bit indicating reset due to watchdog */
>> +	ori	a2, 1 << 11
>> +	sd	a2, 0(a1)
>> +
>> +	/* Disable watchdog */
>> +	PTR_LI	a1, OCTEON_CIU3_PP_POKE(0)
>> +	sd	zero, 0(a1)
>> +	PTR_LI	a1, OCTEON_CIU3_WDOG(0)
>> +	sd	zero, 0(a1)
>> +
>> +	/* Record this in the GSER0_SCRATCH register in bit 11 */
>> +	PTR_LI	a1, OCTEON_GSERX_SCRATCH(0)
>> +	ld	a2, 0(a1)
>> +	ori	a2, 1 << 11
>> +	sd	a2, 0(a1)
>> +
>> +	PTR_LI	a1, OCTEON_RST_SOFT_RST
>> +	li	a2, 1
>> +	sd	a2, 0(a1)
>> +	wait
>> +
>> +	/* We should never get here */
>> +
>> +watchdog_ok:
>> +	ld	a2, 0(a1)
>> +	/* Don't clear LBOOT/LBOOT_EXT or LBOOT_OCI */
>> +	dins	a2, zero, 2, 18
>> +	dins	a2, zero, 60, 1	/* Clear ROMEN bit */
>> +	sd	a2, 0(a1)
>> +
>> +	/* Start of Octeon setup */
>> +
>> +	/* Check what core we are - if core 0, branch to init tlb
>> +	 * loop in flash.  Otherwise, look up address of init tlb
>> +	 * loop that was saved in the boot vector block.
>> +	 */
>> +	mfc0	a0, CP0_EBASE
>> +	andi	a0, EBASE_CPUNUM		/* get core */
>> +	beqz	a0, InitTLBStart_local
>> +	 nop
>> +
>> +	break
>> +	/* We should never get here - non-zero cores now go directly to
>> +	 * tlb init from the boot stub in movable region.
>> +	 */
>> +
>> +	.globl InitTLBStart
>> +InitTLBStart:
>> +InitTLBStart_local:
>> +	/* If we don't have working memory yet configure a bunch of
>> +	 * scratch memory, and set the stack pointer to the top
>> +	 * of it.  This allows us to go to C code without having
>> +	 * memory set up
>> +	 *
>> +	 * Warning: do not change SCRATCH_STACK_LINES as this can impact the
>> +	 * transition from start.S to crti.asm. crti requires 590 bytes of
>> +	 * stack space.
>> +	 */
>> +	cache	1,0(zero)	/* Clear Dcache so cvmseg works right */
>> +#if CONFIG_OCTEON_BIG_STACK_SIZE
>> +	rdhwr	v0, $0
>> +	bnez	v0, 1f
>> +	 nop
>> +	PTR_LA	sp, big_stack_start - 16
>> +	b	stack_clear_done
>> +	 nop
>> +1:
>> +#endif
>> +#define SCRATCH_STACK_LINES 0x36   /* MAX is 0x36 */
>> +	dmfc0	v0, CP0_CVMMEMCTL
>> +	dins	v0, zero, 0, 9
>> +	/* setup SCRATCH_STACK_LINES scratch lines of scratch */
>> +	ori	v0, 0x100 | SCRATCH_STACK_LINES
>> +	dmtc0	v0, CP0_CVMMEMCTL
>> +	/* set stack to top of scratch memory */
>> +	li	sp, 0xffffffffffff8000 + (SCRATCH_STACK_LINES * 128)
>> +	/* Clear scratch for CN63XX pass 2.0 errata Core-15169*/
>> +	li	t0, 0xffffffffffff8000
>> +clear_scratch:
>> +	sd	zero, 0(t0)
>> +	addiu	t0, 8
>> +	bne	t0, sp, clear_scratch
>> +	 nop
>> +
>> +	/* This code runs on all cores - core 0 from flash,
>> +	 * the rest from DRAM.	When booting from PCI, non-zero cores
>> +	 * come directly here from the boot vector - no earlier code in this
>> +	 * file is executed.
>> +	 */
>> +
>> +	/* Some generic initialization is done here as well, as we need this
>> +	 * done on all cores even when booting from PCI
>> +	 */
>> +stack_clear_done:
>> +	/* Clear watch registers. */
>> +	mtc0	zero, CP0_WATCHLO
>> +	mtc0	zero, CP0_WATCHHI
>> +
>> +	/* STATUS register */
>> +	mfc0	k0, CP0_STATUS
>> +	li	k1, ~ST0_IE
>> +	and	k0, k1
>> +	mtc0	k0, CP0_STATUS
>> +
>> +	/* CAUSE register */
>> +	mtc0	zero, CP0_CAUSE
>> +
>> +	/* Init Timer */
>> +	dmtc0	zero, CP0_COUNT
>> +	dmtc0	zero, CP0_COMPARE
>> +
>> +
>> +	mfc0	a5, CP0_STATUS
>> +	li	v0, 0xE0		/* enable 64 bit mode for CSR access */
>> +	or	v0, v0, a5
>> +	mtc0	v0, CP0_STATUS
>> +
>> +
>> +	dli	v0, 1 << 29  /* Enable large physical address support in TLB */
>> +	mtc0	v0, CP0_PAGEGRAIN
>> +
>> +InitTLB:
>> +	dmtc0	zero, CP0_ENTRYLO0
>> +	dmtc0	zero, CP0_ENTRYLO1
>> +	mtc0	zero, CP0_PAGEMASK
>> +	dmtc0	zero, CP0_CONTEXT
>> +	/* Use an offset into kseg0 so we won't conflict with Mips1 legacy
>> +	 * TLB clearing
>> +	 */
>> +	PTR_LI	v0, 0xFFFFFFFF90000000
>> +	mfc0	a0, CP0_CONFIG1
>> +	srl	a0, a0, 25
>> +	/* Check if config4 reg present */
>> +	mfc0	a1, CP0_CONFIG3
>> +	bbit0	a1, 31, 2f
>> +	 and	a0, a0, 0x3F		/* a0 now has the max mmu entry index */
>> +	mfc0	a1, CP0_CONFIG4
>> +	bbit0	a1, 14, 2f		/* check config4[MMUExtDef] */
>> +	 nop
>> +	/* append config4[MMUSizeExt] to most significant bit of
>> +	 * config1[MMUSize-1]
>> +	 */
>> +	ins	a0, a1, 6, 8
>> +	and	a0, a0, 0x3fff	/* a0 now includes max entries for cn6xxx */
>> +2:
>> +	dmtc0	zero, CP0_XCONTEXT
>> +	mtc0	zero, CP0_WIRED
>> +
>> +InitTLBloop:
>> +	dmtc0	v0, CP0_ENTRYHI
>> +	tlbp
>> +	mfc0	v1, CP0_INDEX
>> +	daddiu	v0, v0, 1<<13
>> +	bgez	v1, InitTLBloop
>> +
>> +	mtc0	a0, CP0_INDEX
>> +	tlbwi
>> +	bnez	a0, InitTLBloop
>> +	 daddiu	a0, -1
>> +
>> +	mthi	zero
>> +	mtlo	zero
>> +
>> +	/* Set up status register */
>> +	mfc0	v0, CP0_STATUS
>> +	/* Enable COP0 and COP2 access */
>> +	li	a4, (1 << 28) | (1 << 30)
>> +	or	v0, a4
>> +
>> +	/* Must leave BEV set here, as DRAM is not configured for core 0.
>> +	 * Also, BEV must be 1 later on when the exception base address is set.
>> +	 */
>> +
>> +	/* Mask all interrupts */
>> +	ins	v0, zero, 0, 16
>> +	/* Clear NMI (used to start cores other than core 0) */
>> +	ori	v0, 0xE4		/* enable 64 bit, disable interrupts */
>> +	mtc0	v0, CP0_STATUS
>> +
>> +	dli	v0,0xE000000F		/* enable all readhw locations */
>> +	mtc0	v0, CP0_HWRENA
>> +
>> +	dmfc0	v0, CP0_CVMCTL
>> +	ori	v0, 1<<14	/* enable fixup of unaligned mem access */
>> +	dmtc0	v0, CP0_CVMCTL
>> +
>> +	/* Setup scratch memory.  This is also done in
>> +	 * cvmx_user_app_init, and this code will be removed
>> +	 * from the bootloader in the near future.
>> +	 */
>> +
>> +	/* Set L2C_TAD_CTL[MAXLFB] = 0 on CN73XX */
>> +	mfc0	a4, CP0_PRID
>> +	ext	a4, a4, 8, 8
>> +	blt	a4, OCTEON_PRID_CN73XX, 72f
>> +	nop
>> +	PTR_LI	v0, OCTEON_L2C_TAD_CTL
>> +	ld	t1, 0(v0)
>> +	dins	t1, zero, 0, 4
>> +	sd	t1, 0(v0)
>> +	ld	zero, 0(v0)
>> +
>> +72:
>> +
>> +	/* clear these to avoid immediate interrupt in noperf mode */
>> +	dmtc0	zero, CP0_COMPARE	/* clear timer interrupt */
>> +	dmtc0	zero, CP0_COUNT		/* clear timer interrupt */
>> +	dmtc0	zero, CP0_PERF_CNT0	/* clear perfCnt0 */
>> +	dmtc0	zero, CP0_PERF_CNT1	/* clear perfCnt1 */
>> +	dmtc0	zero, CP0_PERF_CNT2
>> +	dmtc0	zero, CP0_PERF_CNT3
>> +
>> +	/* If we're running on a node other than 0 then we need to set KSEGNODE
>> +	 * to 0.  The nice thing with this code is that it also autodetects if
>> +	 * we're running on a processor that supports CVMMEMCTL2 or not since
>> +	 * only processors that have this will have a non-zero node ID.  Because
>> +	 * of this there's no need to check if we're running on a 78XX.
>> +	 */
>> +	mfc0    t1, CP0_EBASE
>> +	dext    t1, t1, 7, 3            /* Extract node number */
>> +	beqz    t1, is_node0            /* If non-zero then we're not node 0 */
>> +	 nop
>> +	dmfc0   t1, CP0_CVMMEMCTL2
>> +	dins    t1, zero, 12, 4
>> +	dmtc0   t1, CP0_CVMMEMCTL2
>> +is_node0:
>> +
>> +	/* Set up TLB mappings for u-boot code in flash. */
>> +
>> +	/* Use a bal to get the current PC into ra.  Since this bal is to
>> +	 * the address immediately following the delay slot, the ra is
>> +	 * the address of the label.  We then use this to get the actual
>> +	 * address that we are executing from.
>> +	 */
>> +	bal	__dummy
>> +	 nop
>> +
>> +__dummy:
>> +	/* Get the actual address that we are running at */
>> +	PTR_LA	a6, _start		/* Linked address of _start */
>> +	PTR_LA	a7, __dummy
>> +	dsubu	t0, a7, a6		/* offset of __dummy label from _start*/
>> +	dsubu	a7, ra, t0		/* a7 now has actual address of _start*/
>> +
>> +	/* Save actual _start address in s7.  This is where we
>> +	 * are executing from, as opposed to where the code is
>> +	 * linked.
>> +	 */
>> +	move	s7, a7
>> +	move	s4, zero
>> +
>> +	/* s7 has actual address of _start.  If this is
>> +	 * on the boot bus, it will be between 0xBFC00000 and 0xBFFFFFFF.
>> +	 * If it is on the boot bus, use 0xBFC00000 as the physical address
>> +	 * for the TLB mapping, as we will be adjusting the boot bus
>> +	 * to make this adjustment.
>> +	 * If we are running from DRAM (remote-boot), then we want to use the
>> +	 * real address in DRAM.
>> +	 */
>> +
>> +	/* Check to see if we are running from flash - we expect that to
>> +	 * be 0xffffffffb0000000-0xffffffffbfffffff
>> +	 * (0x10000000-0x1fffffff, unmapped/uncached)
>> +	 */
>> +	dli	t2, 0xffffffffb0000000
>> +	dsubu	t2, s7
>> +	slt	s4, s7, t2
>> +	bltz	t2, uboot_in_flash
>> +	 nop
>> +
>> +	/* If we're not core 0 then we don't care about cache */
>> +	mfc0	t2, CP0_EBASE
>> +	andi	t2, EBASE_CPUNUM
>> +	bnez	t2, uboot_in_ram
>> +	 nop
>> +
>> +	/* Find out if we're OCTEON I or OCTEON + which don't support running
>> +	 * out of cache.
>> +	 */
>> +	mfc0	t2, CP0_PRID
>> +	ext	t2, t2, 8, 8
>> +	li	s4, 1
>> +	blt	t2, 0x90, uboot_in_ram
>> +	 nop
>> +
>> +	/* U-Boot can be executing either in RAM or L2 cache.  Now we need to
>> +	 * check if DRAM is initialized.  The way we do that is to look at
>> +	 * the reset bit of the LMC0_DDR_PLL_CTL register (bit 7)
>> +	 */
>> +	PTR_LI	t2, OCTEON_LMC0_DDR_PLL_CTL
>> +	ld	t2, 0(t2)
>> +	bbit1	t2, 7, uboot_in_ram
>> +	 nop
>> +
>> +	/* We must be executing out of cache */
>> +	b	uboot_in_ram
>> +	 li	s4, 2
>> +
>> +uboot_in_flash:
>> +	/* Set s4 to 4 to indicate we're running in FLASH */
>> +	li	s4, 4
>> +
>> +#if defined(CONFIG_OCTEON_DISABLE_L2_CACHE_INDEX_ALIASING)
>> +	/* By default, L2C index aliasing is enabled.  In some cases it may
>> +	 * need to be disabled.  The L2C index aliasing can only be disabled
>> +	 * if U-Boot is running out of L2 cache and the L2 cache has not been
>> +	 * used to store anything.
>> +	 */
>> +	PTR_LI	t1, OCTEON_L2C_CTL
>> +	ld	t2, 0(t1)
>> +	ori	t2, 1
>> +	sd	t2, 0(t1)
>> +#endif
>> +
>> +	/* Use BFC00000 as physical address for TLB mappings when booting
>> +	 * from flash, as we will adjust the boot bus mappings to make this
>> +	 * mapping correct.
>> +	 */
>> +	dli	a7, 0xFFFFFFFFBFC00000
>> +	dsubu	s6, s7, a7  /* Save flash offset in s6 */
>> +
>> +#if defined(CONFIG_OCTEON_COPY_FROM_FLASH_TO_L2)
>> +	/* For OCTEON II we check to see if the L2 cache is big enough to hold
>> +	 * U-Boot.  If it is big enough then we copy ourself from flash to the
>> +	 * L2 cache in order to speed up execution.
>> +	 */
>> +
>> +	/* Check for OCTEON 2 */
>> +	mfc0	t1, CP0_PRID
>> +	ext	t1, t1, 8, 8
>> +	/* Get number of L2 cache sets */
>> +	beq	t1, OCTEON_PRID_CNF71XX, got_l2_sets	/* CNF71XX */
>> +	 li	t2, 1 << 9
>> +	beq	t1, OCTEON_PRID_CN78XX, got_l2_sets	/* CN78XX */
>> +	 li	t2, 1 << 13
>> +	beq	t1, OCTEON_PRID_CN70XX, got_l2_sets	/* CN70XX */
>> +	 li	t2, 1 << 10
>> +	beq	t1, OCTEON_PRID_CN73XX, got_l2_sets	/* CN73XX */
>> +	 li	t2, 1 << 11
>> +	beq	t1, OCTEON_PRID_CNF75XX, got_l2_sets	/* CNF75XX */
>> +	 li	t2, 1 << 11
>> +	b	l2_cache_too_small	/* Unknown OCTEON model */
>> +	 nop
>> +
>> +got_l2_sets:
>> +	/* Get number of associations */
>> +	PTR_LI	t0, OCTEON_MIO_FUSE_DAT3
>> +	ld	t0, 0(t0)
>> +	dext	t0, t0, 32, 3
>> +
>> +	beq	t1, OCTEON_PRID_CN70XX, process_70xx_l2sets
>> +	 nop
>> +	/* 0 = 16-way, 1 = 12-way, 2 = 8-way, 3 = 4-way, 4-7 reserved */
>> +	beqz	t0, got_l2_ways
>> +	 li	t3, 16
>> +	beq	t0, 1, got_l2_ways
>> +	 li	t3, 12
>> +	beq	t0, 2, got_l2_ways
>> +	 li	t3, 8
>> +	beq	t0, 3, got_l2_ways
>> +	 li	t3, 4
>> +	b	l2_cache_too_small
>> +	 nop
>> +
>> +process_70xx_l2sets:
>> +	/* For 70XX, the number of ways is defined as:
>> +	 * 0 - full cache (4-way) 512K
>> +	 * 1 - 3/4 ways (3-way) 384K
>> +	 * 2 - 1/2 ways (2-way) 256K
>> +	 * 3 - 1/4 ways (1-way) 128K
>> +	 * 4-7 illegal (aliased to 0-3)
>> +	 */
>> +	andi	t0, 3
>> +	beqz	t0, got_l2_ways
>> +	 li	t3, 4
>> +	beq	t0, 1, got_l2_ways
>> +	 li	t3, 3
>> +	beq	t0, 2, got_l2_ways
>> +	 li	t3, 2
>> +	li	t3, 1
>> +
>> +got_l2_ways:
>> +	dmul	a1, t2, t3		/* Calculate cache size */
>> +	dsll	a1, 7			/* Ways * Sets * cache line sz (128) */
>> +	daddiu	a1, a1, -128		/* Adjust cache size for copy code */
>> +
>> +	/* Calculate size of U-Boot image */
>> +	/*
>> +	 * "uboot_end - _start" is not correct, as the image also
>> +	 * includes the DTB appended to the end (OF_EMBED is deprecated).
>> +	 * Lets use a defined max for now here.
>> +	 */
>> +	PTR_LI	s5, CONFIG_BOARD_SIZE_LIMIT
>> +
>> +	daddu	t2, s5, s7	/* t2 = end address */
>> +	daddiu	t2, t2, 127
>> +	ins	t2, zero, 0, 7	/* Round up to cache line for memcpy */
>> +
>> +	slt	t1, a1, s5	/* See if we're bigger than the L2 cache */
>> +	bnez	t1, l2_cache_too_small
>> +	 nop
>> +	/* Address we plan to load at in the L2 cache */
>> +	PTR_LI	t9, CONFIG_OCTEON_L2_UBOOT_ADDR
>> +# ifdef CONFIG_OCTEON_L2_MEMCPY_IN_CACHE
>> +	/* Enable all ways for PP0.  Authentik ROM may have disabled these */
>> +	PTR_LI	a1, OCTEON_L2C_WPAR_PP0
>> +	sd	zero, 0(a1)
>> +
>> +	/* Address to place our memcpy code */
>> +	PTR_LI	a0, CONFIG_OCTEON_L2_MEMCPY_ADDR
>> +	/* The following code writes a simple memcpy routine into the cache
>> +	 * to copy ourself from flash into the L2 cache.  This makes the
>> +	 * memcpy routine a lot faster since each instruction can potentially
>> +	 * require four read cycles to flash over the boot bus.
>> +	 */
>> +	/* Zero cache line in the L2 cache */
>> +	zcb	(a0)
>> +	synci	0(zero)
>> +	dli	a1, 0xdd840000dd850008	/* ld a0, 0(t0);  ld a1, 8(t0) */
>> +	sd	a1, 0(a0)
>> +	dli	a1, 0xdd860010dd870018	/* ld a2, 16(t0); ld a3, 24(t0) */
>> +	sd	a1, 8(a0)
>> +	dli	a1, 0xfda40000fda50008	/* sd a0, 0(t1);  sd a1, 8(t1) */
>> +	sd	a1, 16(a0)
>> +	dli	a1, 0xfda60010fda70018	/* sd a2, 16(t1); sd a3, 24(t1) */
>> +	sd	a1, 24(a0)
>> +	dli	a1, 0x258c0020158efff6	/* addiu t0, 32; bne t0, t2, -40 */
>> +	sd	a1, 32(a0)
>> +	dli	a1, 0x25ad002003e00008	/* addiu t1, 32; jr ra */
>> +	sd	a1, 40(a0)
>> +	sd	zero, 48(a0)		/* nop; nop */
>> +
>> +	/* Synchronize the caches */
>> +	sync
>> +	synci	0(zero)
>> +
>> +	move	t0, s7
>> +	move	t1, t9
>> +
>> +	/* Do the memcpy operation in L2 cache to copy ourself from flash
>> +	 * to the L2 cache.
>> +	 */
>> +	jalr	a0
>> +	 nop
>> +
>> +# else
>> +	/* Fallback used when CONFIG_OCTEON_L2_MEMCPY_IN_CACHE is not set:
>> +	 * copy ourself from flash to the L2 cache with an inline loop,
>> +	 * 32 bytes per iteration.
>> +	 *
>> +	 *   t0 = source cursor, starts at s7 (address we are running from)
>> +	 *   t1 = destination cursor, starts at t9 (L2 load address)
>> +	 *   t2 = end of the source image, rounded up to a cache line
>> +	 *
>> +	 * The cursors have to be loaded here: on entry t0/t1 still hold
>> +	 * leftovers of the L2 size check above (fuse field / slt result),
>> +	 * not addresses.
>> +	 */
>> +	move	t0, s7
>> +	move	t1, t9
>> +1:
>> +	ld	a0, 0(t0)
>> +	ld	a1, 8(t0)
>> +	ld	a2, 16(t0)
>> +	ld	a3, 24(t0)
>> +	sd	a0, 0(t1)
>> +	sd	a1, 8(t1)
>> +	sd	a2, 16(t1)
>> +	sd	a3, 24(t1)
>> +	addiu	t0, 32
>> +	bne	t0, t2, 1b
>> +	 addiu	t1, 32		/* advance destination along with source */
>> +# endif	/* CONFIG_OCTEON_L2_MEMCPY_IN_CACHE */
>> +
>> +	/* Adjust the start address of U-Boot and the global pointer */
>> +	subu	t0, s7, t9	/* t0 = address difference */
>> +	move	s7, t9		/* Update physical address */
>> +	move	s2, t9
>> +	sync
>> +	synci	0(zero)
>> +
>> +	/* Now we branch to the L2 cache.  We first get our PC then adjust it
>> +	 */
>> +	bal	3f
>> +	 nop
>> +3:
>> +	/* Don't add any instructions here! */
>> +	subu	t9, ra, t0
>> +	/* Give ourself 16 bytes */
>> +	addiu	t9, 0x10
>> +
>> +	jal	t9		/* Branch to address in L2 cache */
>> +
>> +	 nop
>> +	nop
>> +	/* Add instructions after here */
>> +
>> +	move	a7, s7
>> +
>> +	b	uboot_in_ram
>> +	 ori	s4, 2		/* Running out of L2 cache */
>> +
>> +l2_cache_too_small:	/* We go here if we can't copy ourself to L2 */
>> +#endif /* CONFIG_OCTEON_COPY_FROM_FLASH_TO_L2 */
>> +
>> +	/* This code is only executed if booting from flash. */
>> +	/*  For flash boot (_not_ RAM boot), we do a workaround for
>> +	 * an LLM errata on CN38XX and CN58XX parts.
>> +	 */
>> +
>> +uboot_in_ram:
>> +	/* U-boot address is now in reg a7, and is 4 MByte aligned.
>> +	 * (boot bus addressing has been adjusted to make this happen for flash,
>> +	 * and for DRAM this alignment must be provided by the remote boot
>> +	 * utility.
>> +	 */
>> +	/* See if we're in KSEG0 range, if so set EBASE register to handle
>> +	 * exceptions.
>> +	 */
>> +	dli	a1, 0x20000000
>> +	bge	a7, a1, 1f
>> +	 nop
>> +	/* Convert our physical address to KSEG0 */
>> +	PTR_LI	a1, 0xffffffff80000000
>> +	or	a1, a1, a7
>> +	mtc0	a1, CP0_EBASE
>> +1:
>> +	/* U-boot now starts at 0xBFC00000.  Use a single 4 MByte TLB mapping
>> +	 * to map u-boot.
>> +	 */
>> +	move	a0, a6		/* Virtual addr in a0 */
>> +	dins	a0, zero, 0, 16	/* Zero out offset bits */
>> +	move	a1, a7		/* Physical addr in a1 */
>> +
>> +	/* Now we need to remove the MIPS address space bits.  For this we
>> +	 * need to determine if it is a 32 bit compatibility address or not.
>> +	 */
>> +
>> +	/* 'lowest' address in compatibility space */
>> +	PTR_LI	t0, 0xffffffff80000000
>> +	dsubu	t0, t0, a1
>> +	bltz	t0, compat_space
>> +	 nop
>> +
>> +	/* We have a xkphys address, so strip off top bit */
>> +	b	addr_fixup_done
>> +	 dins	a1, zero, 63, 1
>> +
>> +compat_space:
>> +	PTR_LI	a2, 0x1fffffff
>> +	and	a1, a1, a2  /* Mask phy addr to remove address space bits */
>> +
>> +addr_fixup_done:
>> +	/* Currently the u-boot image size is limited to 4 MBytes.  In order to
>> +	 * support larger images the flash mapping will need to be changed to
>> +	 * be able to access more than that before C code is run.  Until that
>> +	 * is done, we just use a 4 MByte mapping for the secondary cores as
>> +	 * well.
>> +	 */
>> +	/* page size (only support 4 Meg binary size for now for core 0)
>> +	 * This limitation is due to the fact that the boot vector is
>> +	 * 0xBFC00000 which only makes 4MB available.  Later more flash
>> +	 * address space will be available after U-Boot has been copied to
>> +	 * RAM.	 For now assume that it is in flash.
>> +	 */
>> +	li	a2, 2*1024*1024
>> +
>> +	mfc0	a4, CP0_EBASE
>> +	andi	a4, EBASE_CPUNUM		/* get core */
>> +	beqz	a4, core_0_tlb
>> +	 nop
>> +
>> +	/* Now determine how big a mapping to use for secondary cores,
>> +	 * which need to map all of u-boot + heap in DRAM
>> +	 */
>> +	/* Here we look at the alignment of the physical address,
>> +	 * and use the largest page size possible.  In some cases
>> +	 * this can result in an oversize mapping, but for secondary cores
>> +	 * this mapping is very short lived.
>> +	 */
>> +
>> +	/* Physical address in a1 */
>> +	li	a2, 1
>> +1:
>> +	sll	a2, 1
>> +	and	a5, a1, a2
>> +	beqz	a5, 1b
>> +	 nop
>> +
>> +	/* a2 now contains largest page size we can use */
>> +core_0_tlb:
>> +	JAL(single_tlb_setup)
>> +
>> +	/* Check if we're running from cache */
>> +	bbit1	s4, 1, uboot_in_cache
>> +	 nop
>> +
>> +	/* If we are already running from ram, we don't need to muck
>> +	 * with boot bus mappings.
>> +	 */
>> +	PTR_LI	t2, 0xffffffffb0000000
>> +	dsubu	t2, s7
>> +	/* See if our starting address is lower than the boot bus */
>> +	bgez	t2, uboot_in_ram2	/* If yes, booting from RAM */
>> +	 nop
>> +
>> +uboot_in_cache:
>> +#if CONFIG_OCTEON_BIG_STACK_SIZE
>> +	/* The large stack is only for core 0.  For all other cores we need to
>> +	 * use the L1 cache otherwise the other cores will stomp on top of each
>> +	 * other unless even more space is reserved for the stack space for
>> +	 * each core.  With potentially 96 cores this gets excessive.
>> +	 */
>> +	/* Read EBASE into the same register that is masked and tested;
>> +	 * a0 is only a scratch register here and is reloaded further down
>> +	 * before its next use.
>> +	 */
>> +	mfc0	a0, CP0_EBASE
>> +	andi	a0, EBASE_CPUNUM	/* a0 = core number */
>> +	bnez	a0, no_big_stack	/* only core 0 gets the big stack */
>> +	 nop
>> +	PTR_LA	sp, big_stack_start
>> +	daddiu	sp, -16
>> +
>> +no_big_stack:
>> +#endif
>> +	/* We now have the TLB set up, so we need to remap the boot bus.
>> +	 * This is tricky, as we are running from flash, and will be changing
>> +	 * the addressing of the flash.  The remap itself is therefore done by
>> +	 * a small routine (change_boot_mappings) that is first copied into
>> +	 * the movable boot bus region and executed from there.
>> +	 */
>> +	/* Enable movable boot bus region 0, at address 0x10000000 */
>> +	PTR_LI	a4, OCTEON_MIO_BOOT_BASE
>> +	dli	a5, 0x81000000	/* EN (bit 31) + base address 0x10000000 */
>> +	sd	a5, OCTEON_MIO_BOOT_LOC_CFG0_OFF(a4)
>> +
>> +	/* Copy the code that remaps the boot bus into the movable region.
>> +	 * NOTE(review): this first store goes to LOC_DAT, not LOC_ADR.  If
>> +	 * the intent is to reset the region write address to 0 before the
>> +	 * copy, OCTEON_MIO_BOOT_LOC_ADR_OFF looks like the intended offset -
>> +	 * confirm against the hardware manual.
>> +	 */
>> +	sd	zero, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
>> +
>> +	/* a5 = address to read change_boot_mappings from */
>> +	PTR_LA	a6, change_boot_mappings
>> +	GETOFFSET(a5, change_boot_mappings);
>> +	daddu	a5, a5, a6
>> +
>> +	/* The code is 16 bytes (2 DWORDS); both are written through the
>> +	 * same LOC_DAT data register.
>> +	 */
>> +	ld	a7, 0(a5)
>> +	sd	a7, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
>> +	ld	a7, 8(a5)
>> +	sd	a7, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
>> +
>> +	/* Read from an RML register to ensure that the previous writes have
>> +	 * completed before we branch to the movable region.
>> +	 */
>> +	ld	zero, OCTEON_MIO_BOOT_LOC_CFG0_OFF(a4)
>> +
>> +	/* Compute value for boot bus configuration register */
>> +	/* Read region 0 config so we can _modify_ the base address field */
>> +	PTR_LI	a4, OCTEON_MIO_BOOT_REG_CFG0	/* region 0 config */
>> +	ld	a0, 0(a4)
>> +	dli	a4, 0xf0000000		/* Mask off bits we want to save */
>> +	and	a4, a4, a0
>> +	dli	a0, 0x0fff0000		/* Force size to max */
>> +	or	a4, a4, a0
>> +
>> +	move	a5, s6
>> +	/* Convert to 64k blocks, as used by boot bus config */
>> +	srl	a5, 16
>> +	li	a6, 0x1fc0	/* 'normal' boot bus base config value */
>> +	subu	a6, a6, a5	/* Subtract offset */
>> +	/* combine into register value to pass to boot bus routine */
>> +	or	a0, a4, a6
>> +
>> +	/* Branch there */
>> +	PTR_LA	a1, __mapped_continue_label
>> +	PTR_LI	a2, OCTEON_MIO_BOOT_REG_CFG0
>> +	/* If region 0 is not enabled we can skip it */
>> +	ld	a4, 0(a2)
>> +	bbit0	a4, 31, __mapped_continue_label
>> +	 nop
>> +	li	a4, 0x10000000
>> +	j	a4
>> +	 synci	0(zero)
>> +
>> +	/* We never get here, as we go directly to __mapped_continue_label */
>> +	break
>> +
>> +
>> +uboot_in_ram2:
>> +
>> +	/* Now jump to address in TLB mapped memory to continue execution */
>> +	PTR_LA	a4, __mapped_continue_label
>> +	synci	0(a4)
>> +	j	a4
>> +	 nop
>> +
>> +__mapped_continue_label:
>> +	/* Check if we are core 0, if we are not then we need
>> +	 * to vector to code in DRAM to do application setup, and
>> +	 * skip the rest of the bootloader.  Only core 0 runs the bootloader
>> +	 * and sets up the tables that the other cores will use for
>> +	 * configuration.
>> +	 */
>> +	mfc0	a0, CP0_EBASE
>> +	andi	a0, EBASE_CPUNUM   /* get core */
>> +	/* if (__all_cores_are_equal==0 && core==0),
>> +	 * then jump to execute BL on core 0; else 'go to next line'
>> +	 * (core_0_cont1 is executed ONLY when k0=a0=0(core0_ID))
>> +	 */
>> +	lw	t0, __all_cores_are_equal
>> +	beq	a0, t0, core_0_cont1
>> +	 nop
>> +
>> +	/* other cores look up addr from dram */
>> +        /* DRAM controller already set up by first core */
>> +        li      a1, (BOOT_VECTOR_NUM_WORDS * 4)
>> +        mul     a0, a0, a1
>> +
>> +        /* Now find out the boot vector base address from the moveable boot
>> +         * bus region.
>> +         */
>> +
>> +        /* Get the address of the boot bus moveable region */
>> +        PTR_LI     t8, OCTEON_MIO_BOOT_BASE
>> +        ld      t9, OCTEON_MIO_BOOT_LOC_CFG0_OFF(t8)
>> +        /* Make sure it's enabled */
>> +        bbit0   t9, 31, invalid_boot_vector
>> +         dext   t9, t9, 3, 24
>> +        dsll    t9, t9, 7
>> +        /* Make address XKPHYS */
>> +	li	t0, 1
>> +	dins	t9, t0, 63, 1
>> +
>> +        ld      t0, OCTEON_BOOT_MOVEABLE_MAGIC_OFFSET(t9)
>> +        dli     t1, OCTEON_BOOT_MOVEABLE_MAGIC1
>> +        bne     t0, t1, invalid_boot_vector
>> +         nop
>> +
>> +        /* Load base address of boot vector table */
>> +        ld      t0, OCTEON_BOOT_VECTOR_MOVEABLE_OFFSET(t9)
>> +        /* Add offset for core */
>> +        daddu   a1, t0, a0
>> +
>> +	mfc0	v0, CP0_STATUS
>> +	move	v1, v0
>> +	ins	v1, zero, 19, 1		/* Clear NMI bit */
>> +	mtc0	v1, CP0_STATUS
>> +
>> +        /* Get app start function address */
>> +        lw      t9, 8(a1)
>> +        beqz    t9, invalid_boot_vector
>> +         nop
>> +
>> +        j       t9
>> +         lw      k0, 12(a1)      /* Load global data (deprecated) */
>> +
>> +invalid_boot_vector:
>> +        wait
>> +        b       invalid_boot_vector
>> +         nop
>> +
>> +__all_cores_are_equal:
>> +	/* The following .word tell if 'all_cores_are_equal' or core0 is special
>> +	 * By default (for the first execution) the core0 should be special,
>> +	 * in order to behave like the old(existing not-modified) bootloader
>> +	 * and run the bootloader on core 0 to follow the existing design.
>> +	 * However after that we make 'all_cores_equal' which allows to run SE
>> +	 * applications on core0 like on any other core. NOTE that value written
>> +	 * to '__all_cores_are_equal' should not match any core ID.
>> +	 */
>> +	.word 	0
>> +
>> +core_0_cont1:
>> +	li	t0, 0xffffffff
>> +	sw	t0, __all_cores_are_equal
>> +	/* From here on, only core 0 runs, other cores have branched
>> +	 * away.
>> +	 */
>> +#ifdef CONFIG_MIPS_INIT_STACK_IN_SRAM
>> +	/* Set up initial stack and global data */
>> +	setup_stack_gd
>> +# ifdef CONFIG_DEBUG_UART
>> +	PTR_LA	t9, debug_uart_init
>> +	jalr	t9
>> +	 nop
>> +# endif
>> +#endif
>> +	move	a0, zero		# a0 <-- boot_flags = 0
>> +	PTR_LA	t9, board_init_f
>> +
>> +	jr	t9
>> +	 move	ra, zero
>> +	END(_start)
>> +
>> +	.balign	8
>> +	.globl	single_tlb_setup
>> +	.ent	single_tlb_setup
>> +	/* Sets up a single TLB entry (one even/odd page pair) in the
>> +	 * highest-numbered TLB slot.  Virtual/physical addresses must be
>> +	 * properly aligned.
>> +	 *
>> +	 * In:
>> +	 *   a0  Virtual address
>> +	 *   a1  Physical address
>> +	 *   a2  page (_not_ mapping) size; the mapping covers 2 * a2 bytes
>> +	 *
>> +	 * Out:
>> +	 *   a0  Virtual address advanced by the mapping size (2 * a2)
>> +	 *   a1  EntryLo-formatted physical address advanced by two pages
>> +	 *
>> +	 * Clobbers: a3, a4, a5, a6 and the CP0 PageMask, Index, EntryLo0,
>> +	 * EntryLo1 and EntryHi registers.
>> +	 */
>> +single_tlb_setup:
>> +	/* Determine the number of TLB entries available, and
>> +	 * use the top one.
>> +	 */
>> +	mfc0	a3, CP0_CONFIG1
>> +	dext	a3, a3, 25, 6		/* a3 now has the max mmu entry index */
>> +	mfc0	a5, CP0_CONFIG3		/* Check if config4 reg present */
>> +	bbit0	a5, 31, single_tlb_setup_cont
>> +	 nop
>> +	mfc0	a5, CP0_CONFIG4
>> +	bbit0	a5, 14, single_tlb_setup_cont	/* check config4[MMUExtDef] */
>> +	 nop
>> +	/* append config4[MMUSizeExt] to most significant bit of
>> +	 * config1[MMUSize-1]: the low 8 bits of config4 become bits 6..13
>> +	 * of the index
>> +	 */
>> +	dins	a3, a5, 6, 8
>> +	and	a3, a3, 0x3fff	/* a3 now includes max entries for cn6xxx */
>> +
>> +single_tlb_setup_cont:
>> +
>> +	/* Format physical address for entry low: drop the 12 low address
>> +	 * bits and place the remainder starting at bit 6
>> +	 */
>> +	nop
>> +	dsrl	a1, a1, 12
>> +	dsll	a1, a1, 6
>> +	ori	a1, a1, 0x7	/* set DVG bits (low three bits) */
>> +
>> +	move	a4, a2
>> +	daddu	a5, a4, a4	/* mapping size = 2 * page size */
>> +	dsll	a6, a4, 1
>> +	daddiu	a6, a6, -1	/* pagemask = 2 * page size - 1 */
>> +	dsrl	a4, a4, 6	/* page size in entrylo units (>> 12, << 6) */
>> +
>> +	/* Now set up mapping */
>> +	mtc0	a6, CP0_PAGEMASK
>> +	mtc0	a3, CP0_INDEX
>> +
>> +	/* Even page */
>> +	dmtc0	a1, CP0_ENTRYLO0
>> +	daddu	a1, a1, a4
>> +
>> +	/* Odd page, physically contiguous with the even page */
>> +	dmtc0	a1, CP0_ENTRYLO1
>> +	daddu	a1, a1, a4
>> +
>> +	dmtc0	a0, CP0_ENTRYHI
>> +	daddu	a0, a0, a5
>> +
>> +	/* Hazard barrier so the CP0 writes above are seen by tlbwi */
>> +	ehb
>> +	tlbwi
>> +	jr  ra
>> +	 nop
>> +	.end   single_tlb_setup
>> +
>> +
>> +/**
>> + * This code is moved to a movable boot bus region,
>> + * and it is responsible for changing the flash mappings and
>> + * jumping to run from the TLB mapped address.
>> + *
>> + * The routine is four instructions (16 bytes); the code that copies it
>> + * into the movable region transfers exactly two doublewords, so it must
>> + * not grow without updating that copy.
>> + *
>> + * @param a0	New value to store to the register at a2 (boot bus
>> + *		region 0 configuration, including the new base address)
>> + * @param a1	Address to branch to afterwards
>> + * @param a2	Address of MIO_BOOT_REG_CFG0
>> + */
>> +	.balign	8
>> +change_boot_mappings:
>> +	sd	a0, 0(a2)
>> +	sync
>> +	j a1	    /* Jump to new TLB mapped location */
>> +	 synci	0(zero)
>> +
>> +/* If we need a large stack, allocate it here. */
>> +#if CONFIG_OCTEON_BIG_STACK_SIZE
>> +	/* Allocate the stack here so it's in L2 cache or DRAM.
>> +	 *
>> +	 * big_stack_end labels the lowest address of the area and
>> +	 * big_stack_start the highest; the startup code sets sp to
>> +	 * big_stack_start - 16.
>> +	 */
>> +	.balign	16
>> +big_stack_end:
>> +	.skip	CONFIG_OCTEON_BIG_STACK_SIZE, 0
>> +big_stack_start:
>> +	.dword	0
>> +#endif
>>
> 


Viele Grüße,
Stefan
Stefan Roese May 14, 2020, 9:19 a.m. UTC | #4
On 14.05.20 01:43, Daniel Schwierzeck wrote:
> 
> 
> Am 02.05.20 um 10:59 schrieb Stefan Roese:
>> From: Aaron Williams <awilliams at marvell.com>
>>
>> This patch adds very basic support for the Octeon III SoCs. Only
>> CFI parallel NOR flash and UART are supported for now.
>>
>> Please note that the basic Octeon port does not include the DDR3/4
>> initialization yet. This will be added in some follow-up patches
>> later. To still use U-Boot with this port, the L2 cache (4MiB on
>> Octeon III CN73xx) is used as RAM. This way, U-Boot can boot to the
>> prompt on such boards.
>>
>> Signed-off-by: Aaron Williams <awilliams at marvell.com>
>> Signed-off-by: Stefan Roese <sr at denx.de>
>> ---
>>
>>   MAINTAINERS                                  |    6 +
>>   arch/Kconfig                                 |    1 +
>>   arch/mips/Kconfig                            |   49 +-
>>   arch/mips/Makefile                           |    7 +
>>   arch/mips/cpu/Makefile                       |    4 +-
>>   arch/mips/include/asm/arch-octeon/cavm-reg.h |   42 +
>>   arch/mips/include/asm/arch-octeon/clock.h    |   24 +
>>   arch/mips/mach-octeon/Kconfig                |   92 ++
>>   arch/mips/mach-octeon/Makefile               |   10 +
>>   arch/mips/mach-octeon/clock.c                |   22 +
>>   arch/mips/mach-octeon/cpu.c                  |   55 +
>>   arch/mips/mach-octeon/dram.c                 |   27 +
>>   arch/mips/mach-octeon/include/ioremap.h      |   30 +
>>   arch/mips/mach-octeon/start.S                | 1241 ++++++++++++++++++
>>   14 files changed, 1608 insertions(+), 2 deletions(-)
>>   create mode 100644 arch/mips/include/asm/arch-octeon/cavm-reg.h
>>   create mode 100644 arch/mips/include/asm/arch-octeon/clock.h
>>   create mode 100644 arch/mips/mach-octeon/Kconfig
>>   create mode 100644 arch/mips/mach-octeon/Makefile
>>   create mode 100644 arch/mips/mach-octeon/clock.c
>>   create mode 100644 arch/mips/mach-octeon/cpu.c
>>   create mode 100644 arch/mips/mach-octeon/dram.c
>>   create mode 100644 arch/mips/mach-octeon/include/ioremap.h
>>   create mode 100644 arch/mips/mach-octeon/start.S
>>
> 
> I couldn't completely understand the start.S. There is too much stuff in
> it for an initial merge. But I don't see a hard reason against using the
> generic start.S. So the first patch series should only implement the
> bare minimum needed to boot from flash, init the boot CPU core, maybe
> suspend all other cores and relocate to L2 cache.

I already worked on using the common start.S with minimal custom
additions for Octeon. This will be included in v2 of the base Octeon
patchset.

> I know the current start.S is not really suited yet but I'm working on a
> refactoring to add some more hooks which a SoC/CPU can implement. Once
> we have your initial patch series and the refactoring in mainline, it
> should be possible to gradually add more Octeon stuff like memory init.
> 
> Basic idea for refactoring is something like this:
> 
> reset:
>      - mips_cpu_early_init()       # custom early init, fix errata
>      - init CP0 registers, Watch registers
>      - mips_cache_disable()        # set K0 CCA to uncached
>      - mips_cpu_core_init()        # per CPU core init
>                                    # -> generic code issues wait instr.
>                                    # -> custom code can do custom init
>                                    #    or custom boot protocols
>      - mips_cm_map()               # init CM if available
>      - mips_cache_init()           # init caches, set K0 CCA to non-coh.
>      - mips_sram_init()            # init SRAM, Scratch RAM if avail
>      - setup initial stack and global_data
>      - debug_uart_init()
>      - mips_mem_init()             # init external memory, C env avail.
>      - init malloc_f
>      - board_init_f()

Thanks Daniel, this sounds like a very good approach. I'll send v2 later
today (as it's already finished). We can then work on how to integrate
it, either by using the currently available functions like
mips_sram_init(), or by extending start.S (and the Octeon custom code)
with some other, newly introduced functions.

Thanks,
Stefan

>> +
>> +#endif /* __ASM_MACH_OCTEON_IOREMAP_H */
>> diff --git a/arch/mips/mach-octeon/start.S b/arch/mips/mach-octeon/start.S
>> new file mode 100644
>> index 0000000000..acb967201a
>> --- /dev/null
>> +++ b/arch/mips/mach-octeon/start.S
>> @@ -0,0 +1,1241 @@
>> +/* SPDX-License-Identifier: GPL-2.0+ */
>> +/*
>> + *  Startup Code for OCTEON 64-bit CPU-core
>> + *
>> + *  Copyright (c) 2003	Wolfgang Denk <wd at denx.de>
>> + *  Copyright 2004, 2005, 2010 - 2015 Cavium Inc..
>> + */
>> +
>> +#include <asm-offsets.h>
>> +#include <config.h>
>> +#include <asm/regdef.h>
>> +#include <asm/mipsregs.h>
>> +#include <asm/asm.h>
>> +
>> +#define BOOT_VECTOR_NUM_WORDS		8
>> +
>> +#define OCTEON_BOOT_MOVEABLE_MAGIC_OFFSET	0x70
>> +#define OCTEON_BOOT_VECTOR_MOVEABLE_OFFSET	0x78
>> +
>> +#define OCTEON_BOOT_MOVEABLE_MAGIC1_RAW	0xdb00110ad358eacd
>> +#define OCTEON_BOOT_MOVEABLE_MAGIC1	OCTEON_BOOT_MOVEABLE_MAGIC1_RAW
>> +
>> +#define OCTEON_CIU_SOFT_RST		0x8001070000000740
>> +
>> +#define	OCTEON_L2C_WPAR_PP0		0x8001180080840000
>> +#define OCTEON_MIO_BOOT_BASE		0x8001180000000000
>> +#define OCTEON_MIO_BOOT_REG_CFG0_OFF	0x0000
>> +#define OCTEON_MIO_BOOT_LOC_CFG0_OFF	0x0080
>> +#define OCTEON_MIO_BOOT_LOC_ADR_OFF	0x0090
>> +#define OCTEON_MIO_BOOT_LOC_DAT_OFF	0x0098
>> +#define	OCTEON_MIO_RST_BOOT		0x8001180000001600
>> +#define OCTEON_MIO_BOOT_REG_CFG0	0x8001180000000000
>> +#define	OCTEON_MIO_BOOT_REG_TIM0	0x8001180000000040
>> +#define OCTEON_MIO_BOOT_LOC_CFG0	0x8001180000000080
>> +#define OCTEON_MIO_BOOT_LOC_ADR		0x8001180000000090
>> +#define OCTEON_MIO_BOOT_LOC_DAT		0x8001180000000098
>> +#define	OCTEON_MIO_FUSE_DAT3		0x8001180000001418
>> +#define OCTEON_L2D_FUS3			0x80011800800007B8
>> +#define	OCTEON_LMC0_DDR_PLL_CTL		0x8001180088000258
>> +
>> +#define OCTEON_RST			0x8001180006000000
>> +#define OCTEON_RST_BOOT_OFFSET		0x1600
>> +#define OCTEON_RST_SOFT_RST_OFFSET	0x1680
>> +#define OCTEON_RST_COLD_DATAX_OFFSET(X)	(0x17C0 + (X) * 8)
>> +#define OCTEON_RST_BOOT			0x8001180006001600
>> +#define OCTEON_RST_SOFT_RST		0x8001180006001680
>> +#define OCTEON_RST_COLD_DATAX(X)	(0x80011800060017C0 + (X) * 8)
>> +
>> +#define OCTEON_OCX_COM_NODE		0x8001180011000000
>> +#define OCTEON_L2C_OCI_CTL		0x8001180080800020
>> +#define OCTEON_L2C_TAD_CTL		0x8001180080800018
>> +#define OCTEON_L2C_CTL			0x8001180080800000
>> +
>> +#define OCTEON_DBG_DATA			0x80011F00000001E8
>> +#define OCTEON_PCI_READ_CMD_E		0x80011F0000001188
>> +#define OCTEON_NPEI_DBG_DATA		0x80011F0000008510
>> +#define OCTEON_CIU_WDOG(X)		(0x8001070000000500 + (X) * 8)
>> +#define OCTEON_CIU_PP_POKE(X)		(0x8001070000000580 + (X) * 8)
>> +#define OCTEON_CIU3_WDOG(X)		(0x8001010000020000 + (X) * 8)
>> +#define OCTEON_CIU3_PP_POKE(X)		(0x8001010000030000 + (X) * 8)
>> +#define OCTEON_OCX_COM_LINKX_CTL(X)	(0x8001180011000020 + (X) * 8)
>> +#define OCTEON_SLI_CTL_STATUS		0x80011F0000028570
>> +#define OCTEON_GSERX_SCRATCH(X)		(0x8001180090000020 + (X) * 0x1000000)
>> +
>> +/** PRID for CN56XX */
>> +#define OCTEON_PRID_CN56XX		0x04
>> +/** PRID for CN52XX */
>> +#define OCTEON_PRID_CN52XX		0x07
>> +/** PRID for CN63XX */
>> +#define OCTEON_PRID_CN63XX		0x90
>> +/** PRID for CN68XX */
>> +#define OCTEON_PRID_CN68XX		0x91
>> +/** PRID for CN66XX */
>> +#define OCTEON_PRID_CN66XX		0x92
>> +/** PRID for CN61XX */
>> +#define OCTEON_PRID_CN61XX		0x93
>> +/** PRID for CNF71XX */
>> +#define OCTEON_PRID_CNF71XX		0x94
>> +/** PRID for CN78XX */
>> +#define OCTEON_PRID_CN78XX		0x95
>> +/** PRID for CN70XX */
>> +#define OCTEON_PRID_CN70XX		0x96
>> +/** PRID for CN73XX */
>> +#define OCTEON_PRID_CN73XX		0x97
>> +/** PRID for CNF75XX */
>> +#define OCTEON_PRID_CNF75XX		0x98
>> +
>> +/*
>> + * GETOFFSET(reg, func) - compute the relocation offset of the running code.
>> + *
>> + * The bal leaves in ra the run-time address of the ".dword ." word that
>> + * follows its delay slot; that word holds the link-time address of the
>> + * same location.  reg is set to (run-time address - link-time address).
>> + *
>> + * The func argument is only used to create the func_mark label, so it must
>> + * be unique (JAL() builds its label the same way).  Clobbers ra.
>> + */
>> +#define GETOFFSET(reg, func)	\
>> +	.balign	8;		\
>> +	bal	func ##_mark;	\
>> +	nop;			\
>> +	.dword	.;		\
>> +func ##_mark:			\
>> +	ld	reg, 0(ra);	\
>> +	dsubu	reg, ra, reg;
>> +
>> +/*
>> + * JAL(func) - call func relative to where the code is currently running.
>> + *
>> + * t8 receives (run-time address - link-time address), obtained from the
>> + * ".dword ." word whose run-time address the bal leaves in ra.  The
>> + * link-time address of func is loaded into t9, adjusted by t8, and called
>> + * with jalr.
>> + *
>> + * func is also used to create the func_mark label, so each function can be
>> + * called through this macro only once per file.  Clobbers t8, t9 and ra.
>> + */
>> +#define JAL(func)		\
>> +	.balign	8;		\
>> +	bal	func ##_mark;	\
>> +	 nop;			\
>> +	.dword .;		\
>> +func ##_mark:			\
>> +	ld	t8, 0(ra);	\
>> +	dsubu	t8, ra, t8;	\
>> +	dla	t9, func;	\
>> +	daddu	t9, t9, t8;	\
>> +	jalr	t9;		\
>> +	 nop;
>> +
>> +	.set	arch=octeon3
>> +	.set	noreorder
>> +
>> +	/*
>> +	 * uhi_mips_exception - body used for the ROM exception vectors.
>> +	 *
>> +	 * Saves t9 and a0 in k0/k1, loads the UHI "exception" operation
>> +	 * number and its argument, then executes sdbbp 1 to invoke the
>> +	 * UHI operation.
>> +	 * Clobbers k0, k1, t9 and a0.
>> +	 */
>> +	.macro uhi_mips_exception
>> +	move	k0, t9		# preserve t9 in k0
>> +	move	k1, a0		# preserve a0 in k1
>> +	li	t9, 15		# UHI exception operation
>> +	li	a0, 0		# Use hard register context
>> +	sdbbp	1		# Invoke UHI operation
>> +	.endm
>> +
>> +	/*
>> +	 * setup_stack_gd - carve global data (and, if configured, the early
>> +	 * malloc area) out of the memory just below big_stack_start.
>> +	 *
>> +	 * On exit:
>> +	 *   k0 = gd pointer (16-byte aligned, GD_SIZE bytes reserved)
>> +	 *   sp = fp = initial stack pointer, below gd and the optional
>> +	 *        SYS_MALLOC_F_LEN area
>> +	 *   memory from k0 up to big_stack_start is zeroed
>> +	 *   gd->malloc_base = sp when the early malloc area is reserved
>> +	 *
>> +	 * Clobbers t0, t1 and (when early malloc is enabled) t2.
>> +	 */
>> +	.macro setup_stack_gd
>> +	li	t0, -16
>> +	PTR_LI	t1, big_stack_start
>> +	and	sp, t1, t0		# force 16 byte alignment
>> +	PTR_SUBU \
>> +		sp, sp, GD_SIZE		# reserve space for gd
>> +	and	sp, sp, t0		# force 16 byte alignment
>> +	move	k0, sp			# save gd pointer
>> +#if CONFIG_VAL(SYS_MALLOC_F_LEN) && \
>> +    !CONFIG_IS_ENABLED(INIT_STACK_WITHOUT_MALLOC_F)
>> +	li	t2, CONFIG_VAL(SYS_MALLOC_F_LEN)
>> +	PTR_SUBU \
>> +		sp, sp, t2		# reserve space for early malloc
>> +	and	sp, sp, t0		# force 16 byte alignment
>> +#endif
>> +	move	fp, sp
>> +
>> +	/* Clear gd: zero one pointer-sized word at a time from k0 up to t1 */
>> +	move	t0, k0
>> +1:
>> +	PTR_S	zero, 0(t0)
>> +	PTR_ADDIU t0, PTRSIZE
>> +	blt	t0, t1, 1b
>> +	 nop
>> +
>> +#if CONFIG_VAL(SYS_MALLOC_F_LEN) && \
>> +    !CONFIG_IS_ENABLED(INIT_STACK_WITHOUT_MALLOC_F)
>> +	PTR_S	sp, GD_MALLOC_BASE(k0)	# gd->malloc_base offset
>> +#endif
>> +	.endm
>> +
>> +/* Saved register usage:
>> + * s0:	not used
>> + * s1:	not used
>> + * s2:	Address U-Boot loaded into in L2 cache
>> + * s3:	Start address
>> + * s4:	flags
>> + *		1:	booting from RAM
>> + *		2:	executing out of cache
>> + *		4:	booting from flash
>> + * s5:	u-boot size (data end - _start)
>> + * s6:	offset in flash.
>> + * s7:	_start physical address
>> + * s8:
>> + */
>> +
>> +ENTRY(_start)
>> +	/* U-Boot entry point */
>> +	b	reset
>> +
>> +	/* The above jump instruction/nop are considered part of the
>> +	 * bootloader_header_t structure but are not changed when the header is
>> +	 * updated.
>> +	 */
>> +
>> +	/* Leave room for bootloader_header_t header at start of binary.  This
>> +	 * header is used to identify the board the bootloader is for, what
>> +	 * address it is linked at, failsafe/normal, etc.  It also contains a
>> +	 * CRC of the entire image.
>> +	 */
>> +
>> +#if defined(CONFIG_ROM_EXCEPTION_VECTORS)
>> +	/*
>> +	 * Exception vector entry points. When running from ROM, an exception
>> +	 * cannot be handled. Halt execution and transfer control to debugger,
>> +	 * if one is attached.
>> +	 */
>> +	.org 0x200
>> +	/* TLB refill, 32 bit task */
>> +	uhi_mips_exception
>> +
>> +	.org 0x280
>> +	/* XTLB refill, 64 bit task */
>> +	uhi_mips_exception
>> +
>> +	.org 0x300
>> +	/* Cache error exception */
>> +	uhi_mips_exception
>> +
>> +	.org 0x380
>> +	/* General exception */
>> +	uhi_mips_exception
>> +
>> +	.org 0x400
>> +	/* Catch interrupt exceptions */
>> +	uhi_mips_exception
>> +
>> +	.org 0x480
>> +	/* EJTAG debug exception */
>> +1:	b	1b
>> +	 nop
>> +
>> +	.org 0x500
>> +#endif
>> +
>> +/* Reserve extra space so that when we use the boot bus local memory
>> + * segment to remap the debug exception vector we don't overwrite
>> + * anything useful
>> + */
>> +
>> +/* Basic exception handler (dump registers) in all ASM.	 When using the TLB for
>> + * mapping u-boot C code, we can't branch to that C code for exception handling
>> + * (TLB is disabled for some exceptions).
>> + */
>> +
>> +/* RESET/start here */
>> +	.balign	8
>> +reset:
>> +	nop
>> +	synci	0(zero)
>> +	mfc0	k0, CP0_STATUS
>> +	ori	k0, 0x00E0		/* enable 64 bit mode for CSR access */
>> +	mtc0	k0, CP0_STATUS
>> +
>> +	/* Save the address we're booting from, strip off low bits */
>> +	bal	1f
>> +	 nop
>> +1:
>> +	move	s3, ra
>> +	dins	s3, zero, 0, 12
>> +
>> +	/* Disable boot bus moveable regions */
>> +	PTR_LI	k0, OCTEON_MIO_BOOT_LOC_CFG0
>> +	sd	zero, 0(k0)
>> +	sd	zero, 8(k0)
>> +
>> +	/* Disable the watchdog timer
>> +	 * First we check if we're running on CN78XX, CN73XX or CNF75XX to see
>> +	 * if we use CIU3 or CIU.
>> +	 */
>> +	mfc0	t0, CP0_PRID
>> +	ext	t0, t0, 8, 8
>> +	/* Assume CIU */
>> +	PTR_LI	t1, OCTEON_CIU_WDOG(0)
>> +	PTR_LI	t2, OCTEON_CIU_PP_POKE(0)
>> +	blt	t0, OCTEON_PRID_CN78XX, wd_use_ciu
>> +	 nop
>> +	beq	t0, OCTEON_PRID_CN70XX, wd_use_ciu
>> +	 nop
>> +	/* Use CIU3 */
>> +	PTR_LI	t1, OCTEON_CIU3_WDOG(0)
>> +	PTR_LI	t2, OCTEON_CIU3_PP_POKE(0)
>> +wd_use_ciu:
>> +	sd	zero, 0(t2)		/* Pet the dog */
>> +	sd	zero, 0(t1)		/* Disable watchdog timer */
>> +
>> +	/* Errata: CN76XX has a node ID of 3. change it to zero here.
>> +	 * This needs to be done before we relocate to L2 as addresses change
>> +	 * For 76XX pass 1.X we need to zero out the OCX_COM_NODE[ID],
>> +	 * L2C_OCI_CTL[GKSEGNODE] and CP0 of Root.CvmMemCtl2[KSEGNODE].
>> +	 */
>> +	mfc0	a4, CP0_PRID
>> +	/* Check for 78xx pass 1.x processor ID */
>> +	andi	a4, 0xffff
>> +	blt	a4, (OCTEON_PRID_CN78XX << 8), 1f
>> +	 nop
>> +
>> +	/* Zero out alternate package for now */
>> +	dins	a4, zero, 6, 1
>> +	bge	a4, ((OCTEON_PRID_CN78XX << 8) | 0x08), 1f
>> +	 nop
>> +
>> +	/* 78xx or 76xx here, first check for bug #27141 */
>> +	PTR_LI	a5, OCTEON_SLI_CTL_STATUS
>> +	ld	a6, 0(a5)
>> +	andi	a7, a4, 0xff
>> +	andi	a6, a6, 0xff
>> +
>> +	beq	a6, a7, not_bug27141
>> +	 nop
>> +
>> +	/* core 0 proc_id rev_id field does not match SLI_CTL_STATUS rev_id */
>> +	/* We just hit bug #27141.  Need to reset the chip and try again */
>> +
>> +	PTR_LI	a4, OCTEON_RST_SOFT_RST
>> +	ori	a5, zero, 0x1	/* set the reset bit */
>> +
>> +reset_78xx_27141:
>> +	sync
>> +	synci	0(zero)
>> +	cache	9, 0(zero)
>> +	sd	a5, 0(a4)
>> +	wait
>> +	b	reset_78xx_27141
>> +	 nop
>> +
>> +not_bug27141:
>> +	/* 76XX pass 1.x has the node number set to 3 */
>> +	mfc0	a4, CP0_EBASE
>> +	ext	a4, a4, 0, 10
>> +	bne	a4, 0x180, 1f	/* Branch if not node 3 core 0 */
>> +	 nop
>> +
>> +	/* Clear OCX_COM_NODE[ID] */
>> +	PTR_LI	a5, OCTEON_OCX_COM_NODE
>> +	ld	a4, 0(a5)
>> +	dins	a4, zero, 0, 2
>> +	sd	a4, 0(a5)
>> +	ld	zero, 0(a5)
>> +
>> +	/* Clear L2C_OCI_CTL[GKSEGNODE] */
>> +	PTR_LI	a5, OCTEON_L2C_OCI_CTL
>> +	ld	a4, 0(a5)
>> +	dins	a4, zero, 4, 2
>> +	sd	a4, 0(a5)
>> +	ld	zero, 0(a5)
>> +
>> +	/* Clear CP0 Root.CvmMemCtl2[KSEGNODE] */
>> +	dmfc0	a4, CP0_CVMMEMCTL2
>> +	dins	a4, zero, 12, 2
>> +	dmtc0	a4, CP0_CVMMEMCTL2
>> +
>> +	/* Put the flash address in the start of the EBASE register to
>> +	 * enable our exception handler but only for core 0.
>> +	 */
>> +	mfc0	a4, CP0_EBASE
>> +	dext	a4, a4, 0, 10
>> +	bnez	a4, no_flash
>> +	/* OK in delay slot */
>> +	dext	a6, a6, 0, 16		/* Get the base address in flash */
>> +	sll	a6, a6, 16
>> +	mtc0	a6, CP0_EBASE	/* Enable exceptions */
>> +
>> +no_flash:
>> +	/* Zero out various registers */
>> +	mtc0	zero, CP0_DEPC
>> +	mtc0	zero, CP0_EPC
>> +	mtc0	zero, CP0_CAUSE
>> +	mfc0	a4, CP0_PRID
>> +	ext	a4, a4, 8, 8
>> +	mtc0	zero, CP0_DESAVE
>> +
>> +	/* The following are only available on Octeon 2 or later */
>> +	mtc0	zero, CP0_KSCRATCH1
>> +	mtc0	zero, CP0_KSCRATCH2
>> +	mtc0	zero, CP0_KSCRATCH3
>> +	mtc0	zero, CP0_USERLOCAL
>> +
>> +	/* Turn off ROMEN bit to disable ROM */
>> +	PTR_LI	a1, OCTEON_MIO_RST_BOOT
>> +	/* For OCTEON 3 we use RST_BOOT instead of MIO_RST_BOOT.
>> +	 * The difference is bits 24-26 are 6 instead of 0 for the address.
>> +	 */
>> +	/* For Octeon 2 and CN70XX we can ignore the watchdog */
>> +	blt	a4, OCTEON_PRID_CN78XX, watchdog_ok
>> +	 nop
>> +
>> +	PTR_LI	a1, OCTEON_RST_BOOT
>> +
>> +	beq	a4, OCTEON_PRID_CN70XX, watchdog_ok
>> +	 nop
>> +
>> +	ld	a2, 0(a1)
>> +	/* There is a bug where some registers don't get properly reset when
>> +	 * the watchdog timer causes a reset.  In this case we need to force
>> +	 * a reset.
>> +	 */
>> +	bbit0	a2, 11, watchdog_ok	/* Skip if watchdog not hit */
>> +	 dins	a2, zero, 2, 18	/* Don't clear LBOOT, LBOOT_EXT or LBOOT_OCI */
>> +	/* Clear bit indicating reset due to watchdog */
>> +	ori	a2, 1 << 11
>> +	sd	a2, 0(a1)
>> +
>> +	/* Disable watchdog */
>> +	PTR_LI	a1, OCTEON_CIU3_PP_POKE(0)
>> +	sd	zero, 0(a1)
>> +	PTR_LI	a1, OCTEON_CIU3_WDOG(0)
>> +	sd	zero, 0(a1)
>> +
>> +	/* Record this in the GSER0_SCRATCH register in bit 11 */
>> +	PTR_LI	a1, OCTEON_GSERX_SCRATCH(0)
>> +	ld	a2, 0(a1)
>> +	ori	a2, 1 << 11
>> +	sd	a2, 0(a1)
>> +
>> +	PTR_LI	a1, OCTEON_RST_SOFT_RST
>> +	li	a2, 1
>> +	sd	a2, 0(a1)
>> +	wait
>> +
>> +	/* We should never get here */
>> +
>> +watchdog_ok:
>> +	ld	a2, 0(a1)
>> +	/* Don't clear LBOOT/LBOOT_EXT or LBOOT_OCI */
>> +	dins	a2, zero, 2, 18
>> +	dins	a2, zero, 60, 1	/* Clear ROMEN bit */
>> +	sd	a2, 0(a1)
>> +
>> +	/* Start of Octeon setup */
>> +
>> +	/* Check what core we are - if core 0, branch to init tlb
>> +	 * loop in flash.  Otherwise, look up address of init tlb
>> +	 * loop that was saved in the boot vector block.
>> +	 */
>> +	mfc0	a0, CP0_EBASE
>> +	andi	a0, EBASE_CPUNUM		/* get core */
>> +	beqz	a0, InitTLBStart_local
>> +	 nop
>> +
>> +	break
>> +	/* We should never get here - non-zero cores now go directly to
>> +	 * tlb init from the boot stub in movable region.
>> +	 */
>> +
>> +	.globl InitTLBStart
>> +InitTLBStart:
>> +InitTLBStart_local:
>> +	/* If we don't have working memory yet configure a bunch of
>> +	 * scratch memory, and set the stack pointer to the top
>> +	 * of it.  This allows us to go to C code without having
>> +	 * memory set up
>> +	 *
>> +	 * Warning: do not change SCRATCH_STACK_LINES as this can impact the
>> +	 * transition from start.S to crti.asm. crti requires 590 bytes of
>> +	 * stack space.
>> +	 */
>> +	cache	1,0(zero)	/* Clear Dcache so cvmseg works right */
>> +#if CONFIG_OCTEON_BIG_STACK_SIZE
>> +	rdhwr	v0, $0
>> +	bnez	v0, 1f
>> +	 nop
>> +	PTR_LA	sp, big_stack_start - 16
>> +	b	stack_clear_done
>> +	 nop
>> +1:
>> +#endif
>> +#define SCRATCH_STACK_LINES 0x36   /* MAX is 0x36 */
>> +	dmfc0	v0, CP0_CVMMEMCTL
>> +	dins	v0, zero, 0, 9
>> +	/* setup SCRATCH_STACK_LINES scratch lines of scratch */
>> +	ori	v0, 0x100 | SCRATCH_STACK_LINES
>> +	dmtc0	v0, CP0_CVMMEMCTL
>> +	/* set stack to top of scratch memory */
>> +	li	sp, 0xffffffffffff8000 + (SCRATCH_STACK_LINES * 128)
>> +	/* Clear scratch for CN63XX pass 2.0 errata Core-15169*/
>> +	li	t0, 0xffffffffffff8000
>> +clear_scratch:
>> +	sd	zero, 0(t0)
>> +	addiu	t0, 8
>> +	bne	t0, sp, clear_scratch
>> +	 nop
>> +
>> +	/* This code runs on all cores - core 0 from flash,
>> +	 * the rest from DRAM.	When booting from PCI, non-zero cores
>> +	 * come directly here from the boot vector - no earlier code in this
>> +	 * file is executed.
>> +	 */
>> +
>> +	/* Some generic initialization is done here as well, as we need this
>> +	 * done on all cores even when booting from PCI
>> +	 */
>> +stack_clear_done:
>> +	/* Clear watch registers. */
>> +	mtc0	zero, CP0_WATCHLO
>> +	mtc0	zero, CP0_WATCHHI
>> +
>> +	/* STATUS register */
>> +	mfc0	k0, CP0_STATUS
>> +	li	k1, ~ST0_IE
>> +	and	k0, k1
>> +	mtc0	k0, CP0_STATUS
>> +
>> +	/* CAUSE register */
>> +	mtc0	zero, CP0_CAUSE
>> +
>> +	/* Init Timer */
>> +	dmtc0	zero, CP0_COUNT
>> +	dmtc0	zero, CP0_COMPARE
>> +
>> +
>> +	mfc0	a5, CP0_STATUS
>> +	li	v0, 0xE0		/* enable 64 bit mode for CSR access */
>> +	or	v0, v0, a5
>> +	mtc0	v0, CP0_STATUS
>> +
>> +
>> +	dli	v0, 1 << 29  /* Enable large physical address support in TLB */
>> +	mtc0	v0, CP0_PAGEGRAIN
>> +
>> +InitTLB:
>> +	dmtc0	zero, CP0_ENTRYLO0
>> +	dmtc0	zero, CP0_ENTRYLO1
>> +	mtc0	zero, CP0_PAGEMASK
>> +	dmtc0	zero, CP0_CONTEXT
>> +	/* Use an offset into kseg0 so we won't conflict with Mips1 legacy
>> +	 * TLB clearing
>> +	 */
>> +	PTR_LI	v0, 0xFFFFFFFF90000000
>> +	mfc0	a0, CP0_CONFIG1
>> +	srl	a0, a0, 25
>> +	/* Check if config4 reg present */
>> +	mfc0	a1, CP0_CONFIG3
>> +	bbit0	a1, 31, 2f
>> +	 and	a0, a0, 0x3F		/* a0 now has the max mmu entry index */
>> +	mfc0	a1, CP0_CONFIG4
>> +	bbit0	a1, 14, 2f		/* check config4[MMUExtDef] */
>> +	 nop
>> +	/* append config4[MMUSizeExt] to most significant bit of
>> +	 * config1[MMUSize-1]
>> +	 */
>> +	ins	a0, a1, 6, 8
>> +	and	a0, a0, 0x3fff	/* a0 now includes max entries for cn6xxx */
>> +2:
>> +	dmtc0	zero, CP0_XCONTEXT
>> +	mtc0	zero, CP0_WIRED
>> +
>> +InitTLBloop:
>> +	dmtc0	v0, CP0_ENTRYHI
>> +	tlbp
>> +	mfc0	v1, CP0_INDEX
>> +	daddiu	v0, v0, 1<<13
>> +	bgez	v1, InitTLBloop
>> +
>> +	mtc0	a0, CP0_INDEX
>> +	tlbwi
>> +	bnez	a0, InitTLBloop
>> +	 daddiu	a0, -1
>> +
>> +	mthi	zero
>> +	mtlo	zero
>> +
>> +	/* Set up status register */
>> +	mfc0	v0, CP0_STATUS
>> +	/* Enable COP0 and COP2 access */
>> +	li	a4, (1 << 28) | (1 << 30)
>> +	or	v0, a4
>> +
>> +	/* Must leave BEV set here, as DRAM is not configured for core 0.
>> +	 * Also, BEV must be 1 later on when the exception base address is set.
>> +	 */
>> +
>> +	/* Mask all interrupts */
>> +	ins	v0, zero, 0, 16
>> +	/* Clear NMI (used to start cores other than core 0) */
>> +	ori	v0, 0xE4		/* enable 64 bit, disable interrupts */
>> +	mtc0	v0, CP0_STATUS
>> +
>> +	dli	v0,0xE000000F		/* enable all readhw locations */
>> +	mtc0	v0, CP0_HWRENA
>> +
>> +	dmfc0	v0, CP0_CVMCTL
>> +	ori	v0, 1<<14	/* enable fixup of unaligned mem access */
>> +	dmtc0	v0, CP0_CVMCTL
>> +
>> +	/* Setup scratch memory.  This is also done in
>> +	 * cvmx_user_app_init, and this code will be removed
>> +	 * from the bootloader in the near future.
>> +	 */
>> +
>> +	/* Set L2C_TAD_CTL[MAXLFB] = 0 on CN73XX and newer (PRID >= CN73XX) */
>> +	mfc0	a4, CP0_PRID
>> +	ext	a4, a4, 8, 8
>> +	blt	a4, OCTEON_PRID_CN73XX, 72f
>> +	nop
>> +	PTR_LI	v0, OCTEON_L2C_TAD_CTL
>> +	ld	t1, 0(v0)
>> +	dins	t1, zero, 0, 4
>> +	sd	t1, 0(v0)
>> +	ld	zero, 0(v0)
>> +
>> +72:
>> +
>> +	/* clear these to avoid immediate interrupt in noperf mode */
>> +	dmtc0	zero, CP0_COMPARE	/* clear timer interrupt */
>> +	dmtc0	zero, CP0_COUNT		/* clear timer interrupt */
>> +	dmtc0	zero, CP0_PERF_CNT0	/* clear perfCnt0 */
>> +	dmtc0	zero, CP0_PERF_CNT1	/* clear perfCnt1 */
>> +	dmtc0	zero, CP0_PERF_CNT2
>> +	dmtc0	zero, CP0_PERF_CNT3
>> +
>> +	/* If we're running on a node other than 0 then we need to set KSEGNODE
>> +	 * to 0.  The nice thing with this code is that it also autodetects if
>> +	 * we're running on a processor that supports CVMMEMCTL2 or not since
>> +	 * only processors that have this will have a non-zero node ID.  Because
>> +	 * of this there's no need to check if we're running on a 78XX.
>> +	 */
>> +	mfc0    t1, CP0_EBASE
>> +	dext    t1, t1, 7, 3            /* Extract node number */
>> +	beqz    t1, is_node0            /* If non-zero then we're not node 0 */
>> +	 nop
>> +	dmfc0   t1, CP0_CVMMEMCTL2
>> +	dins    t1, zero, 12, 4
>> +	dmtc0   t1, CP0_CVMMEMCTL2
>> +is_node0:
>> +
>> +	/* Set up TLB mappings for u-boot code in flash. */
>> +
>> +	/* Use a bal to get the current PC into ra.  Since this bal is to
>> +	 * the address immediately following the delay slot, the ra is
>> +	 * the address of the label.  We then use this to get the actual
>> +	 * address that we are executing from.
>> +	 */
>> +	bal	__dummy
>> +	 nop
>> +
>> +__dummy:
>> +	/* Get the actual address that we are running at */
>> +	PTR_LA	a6, _start		/* Linked address of _start */
>> +	PTR_LA	a7, __dummy
>> +	dsubu	t0, a7, a6		/* offset of __dummy label from _start*/
>> +	dsubu	a7, ra, t0		/* a7 now has actual address of _start*/
>> +
>> +	/* Save actual _start address in s7.  This is where we
>> +	 * are executing from, as opposed to where the code is
>> +	 * linked.
>> +	 */
>> +	move	s7, a7
>> +	move	s4, zero
>> +
>> +	/* s7 has actual address of _start.  If this is
>> +	 * on the boot bus, it will be between 0xBFC00000 and 0xBFFFFFFF.
>> +	 * If it is on the boot bus, use 0xBFC00000 as the physical address
>> +	 * for the TLB mapping, as we will be adjusting the boot bus
>> +	 * to make this adjustment.
>> +	 * If we are running from DRAM (remote-boot), then we want to use the
>> +	 * real address in DRAM.
>> +	 */
>> +
>> +	/* Check to see if we are running from flash - we expect that to
>> +	 * be 0xffffffffb0000000-0xffffffffbfffffff
>> +	 * (0x10000000-0x1fffffff, unmapped/uncached)
>> +	 */
>> +	dli	t2, 0xffffffffb0000000
>> +	dsubu	t2, s7
>> +	slt	s4, s7, t2
>> +	bltz	t2, uboot_in_flash
>> +	 nop
>> +
>> +	/* If we're not core 0 then we don't care about cache */
>> +	mfc0	t2, CP0_EBASE
>> +	andi	t2, EBASE_CPUNUM
>> +	bnez	t2, uboot_in_ram
>> +	 nop
>> +
>> +	/* Find out if we're OCTEON I or OCTEON + which don't support running
>> +	 * out of cache.
>> +	 */
>> +	mfc0	t2, CP0_PRID
>> +	ext	t2, t2, 8, 8
>> +	li	s4, 1
>> +	blt	t2, 0x90, uboot_in_ram
>> +	 nop
>> +
>> +	/* U-Boot can be executing either in RAM or L2 cache.  Now we need to
>> +	 * check if DRAM is initialized.  The way we do that is to look at
>> +	 * the reset bit of the LMC0_DDR_PLL_CTL register (bit 7)
>> +	 */
>> +	PTR_LI	t2, OCTEON_LMC0_DDR_PLL_CTL
>> +	ld	t2, 0(t2)
>> +	bbit1	t2, 7, uboot_in_ram
>> +	 nop
>> +
>> +	/* We must be executing out of cache */
>> +	b	uboot_in_ram
>> +	 li	s4, 2
>> +
>> +uboot_in_flash:
>> +	/* Set s4 to 4 to indicate we're running in FLASH */
>> +	li	s4, 4
>> +
>> +#if defined(CONFIG_OCTEON_DISABLE_L2_CACHE_INDEX_ALIASING)
>> +	/* By default, L2C index aliasing is enabled.  In some cases it may
>> +	 * need to be disabled.  The L2C index aliasing can only be disabled
>> +	 * if U-Boot is running out of L2 cache and the L2 cache has not been
>> +	 * used to store anything.
>> +	 */
>> +	PTR_LI	t1, OCTEON_L2C_CTL
>> +	ld	t2, 0(t1)
>> +	ori	t2, 1
>> +	sd	t2, 0(t1)
>> +#endif
>> +
>> +	/* Use BFC00000 as physical address for TLB mappings when booting
>> +	 * from flash, as we will adjust the boot bus mappings to make this
>> +	 * mapping correct.
>> +	 */
>> +	dli	a7, 0xFFFFFFFFBFC00000
>> +	dsubu	s6, s7, a7  /* Save flash offset in s6 */
>> +
>> +#if defined(CONFIG_OCTEON_COPY_FROM_FLASH_TO_L2)
>> +	/* For OCTEON II we check to see if the L2 cache is big enough to hold
>> +	 * U-Boot.  If it is big enough then we copy ourself from flash to the
>> +	 * L2 cache in order to speed up execution.
>> +	 */
>> +
>> +	/* Check for OCTEON 2 */
>> +	mfc0	t1, CP0_PRID
>> +	ext	t1, t1, 8, 8
>> +	/* Get number of L2 cache sets */
>> +	beq	t1, OCTEON_PRID_CNF71XX, got_l2_sets	/* CNF71XX */
>> +	 li	t2, 1 << 9
>> +	beq	t1, OCTEON_PRID_CN78XX, got_l2_sets	/* CN78XX */
>> +	 li	t2, 1 << 13
>> +	beq	t1, OCTEON_PRID_CN70XX, got_l2_sets	/* CN70XX */
>> +	 li	t2, 1 << 10
>> +	beq	t1, OCTEON_PRID_CN73XX, got_l2_sets	/* CN73XX */
>> +	 li	t2, 1 << 11
>> +	beq	t1, OCTEON_PRID_CNF75XX, got_l2_sets	/* CNF75XX */
>> +	 li	t2, 1 << 11
>> +	b	l2_cache_too_small	/* Unknown OCTEON model */
>> +	 nop
>> +
>> +got_l2_sets:
>> +	/* Get number of associations */
>> +	PTR_LI	t0, OCTEON_MIO_FUSE_DAT3
>> +	ld	t0, 0(t0)
>> +	dext	t0, t0, 32, 3
>> +
>> +	beq	t1, OCTEON_PRID_CN70XX, process_70xx_l2sets
>> +	 nop
>> +	/* 0 = 16-way, 1 = 12-way, 2 = 8-way, 3 = 4-way, 4-7 reserved */
>> +	beqz	t0, got_l2_ways
>> +	 li	t3, 16
>> +	beq	t0, 1, got_l2_ways
>> +	 li	t3, 12
>> +	beq	t0, 2, got_l2_ways
>> +	 li	t3, 8
>> +	beq	t0, 3, got_l2_ways
>> +	 li	t3, 4
>> +	b	l2_cache_too_small
>> +	 nop
>> +
>> +process_70xx_l2sets:
>> +	/* For 70XX, the number of ways is defined as:
>> +	 * 0 - full cache (4-way) 512K
>> +	 * 1 - 3/4 ways (3-way) 384K
>> +	 * 2 - 1/2 ways (2-way) 256K
>> +	 * 3 - 1/4 ways (1-way) 128K
>> +	 * 4-7 illegal (aliased to 0-3)
>> +	 */
>> +	andi	t0, 3
>> +	beqz	t0, got_l2_ways
>> +	 li	t3, 4
>> +	beq	t0, 1, got_l2_ways
>> +	 li	t3, 3
>> +	beq	t0, 2, got_l2_ways
>> +	 li	t3, 2
>> +	li	t3, 1
>> +
>> +got_l2_ways:
>> +	dmul	a1, t2, t3		/* Calculate cache size */
>> +	dsll	a1, 7			/* Ways * Sets * cache line sz (128) */
>> +	daddiu	a1, a1, -128		/* Adjust cache size for copy code */
>> +
>> +	/* Calculate size of U-Boot image */
>> +	/*
>> +	 * "uboot_end - _start" is not correct, as the image also
>> +	 * includes the DTB appended to the end (OF_EMBED is deprecated).
>> +	 * Lets use a defined max for now here.
>> +	 */
>> +	PTR_LI	s5, CONFIG_BOARD_SIZE_LIMIT
>> +
>> +	daddu	t2, s5, s7	/* t2 = end address */
>> +	daddiu	t2, t2, 127
>> +	ins	t2, zero, 0, 7	/* Round up to cache line for memcpy */
>> +
>> +	slt	t1, a1, s5	/* See if we're bigger than the L2 cache */
>> +	bnez	t1, l2_cache_too_small
>> +	 nop
>> +	/* Address we plan to load at in the L2 cache */
>> +	PTR_LI	t9, CONFIG_OCTEON_L2_UBOOT_ADDR
>> +# ifdef CONFIG_OCTEON_L2_MEMCPY_IN_CACHE
>> +	/* Enable all ways for PP0.  Authentik ROM may have disabled these */
>> +	PTR_LI	a1, OCTEON_L2C_WPAR_PP0
>> +	sd	zero, 0(a1)
>> +
>> +	/* Address to place our memcpy code */
>> +	PTR_LI	a0, CONFIG_OCTEON_L2_MEMCPY_ADDR
>> +	/* The following code writes a simple memcpy routine into the cache
>> +	 * to copy ourself from flash into the L2 cache.  This makes the
>> +	 * memcpy routine a lot faster since each instruction can potentially
>> +	 * require four read cycles to flash over the boot bus.
>> +	 */
>> +	/* Zero cache line in the L2 cache */
>> +	zcb	(a0)
>> +	synci	0(zero)
>> +	dli	a1, 0xdd840000dd850008	/* ld a0, 0(t0);  ld a1, 8(t0) */
>> +	sd	a1, 0(a0)
>> +	dli	a1, 0xdd860010dd870018	/* ld a2, 16(t0); ld a3, 24(t0) */
>> +	sd	a1, 8(a0)
>> +	dli	a1, 0xfda40000fda50008	/* sd a0, 0(t1);  sd a1, 8(t1) */
>> +	sd	a1, 16(a0)
>> +	dli	a1, 0xfda60010fda70018	/* sd a2, 16(t1); sd a3, 24(t1) */
>> +	sd	a1, 24(a0)
>> +	dli	a1, 0x258c0020158efff6	/* addiu t0, 32; bne t0, t2, -40 */
>> +	sd	a1, 32(a0)
>> +	dli	a1, 0x25ad002003e00008	/* addiu t1, 32; jr ra */
>> +	sd	a1, 40(a0)
>> +	sd	zero, 48(a0)		/* nop; nop */
>> +
>> +	/* Synchronize the caches */
>> +	sync
>> +	synci	0(zero)
>> +
>> +	move	t0, s7
>> +	move	t1, t9
>> +
>> +	/* Do the memcpy operation in L2 cache to copy ourself from flash
>> +	 * to the L2 cache.
>> +	 */
>> +	jalr	a0
>> +	 nop
>> +
>> +# else
>> +	/* Copy ourself from flash to the L2 cache, 32 bytes at a time, with
>> +	 * the copy loop executing in place (not from the L2 cache).
>> +	 *
>> +	 * t0 = source (s7, the address we are running from), t1 = destination
>> +	 * (t9, the L2 load address), t2 = cache-line-rounded end of the source.
>> +	 * The pointers must be loaded here: on entry t0 and t1 still hold
>> +	 * scratch values left over from the L2 size checks above.
>> +	 */
>> +	move	t0, s7
>> +	move	t1, t9
>> +1:
>> +	ld	a0, 0(t0)
>> +	ld	a1, 8(t0)
>> +	ld	a2, 16(t0)
>> +	ld	a3, 24(t0)
>> +	sd	a0, 0(t1)
>> +	sd	a1, 8(t1)
>> +	sd	a2, 16(t1)
>> +	sd	a3, 24(t1)
>> +	addiu	t0, 32
>> +	bne	t0, t2, 1b
>> +	 addiu	t1, 32		/* delay slot: advance destination */
>> +# endif	/* CONFIG_OCTEON_L2_MEMCPY_IN_CACHE */
>> +
>> +	/* Adjust the start address of U-Boot and the global pointer */
>> +	subu	t0, s7, t9	/* t0 = address difference */
>> +	move	s7, t9		/* Update physical address */
>> +	move	s2, t9
>> +	sync
>> +	synci	0(zero)
>> +
>> +	/* Now we branch to the L2 cache.  We first get our PC then adjust it
>> +	 */
>> +	bal	3f
>> +	 nop
>> +3:
>> +	/* Don't add any instructions here! */
>> +	subu	t9, ra, t0
>> +	/* Give ourself 16 bytes */
>> +	addiu	t9, 0x10
>> +
>> +	jal	t9		/* Branch to address in L2 cache */
>> +
>> +	 nop
>> +	nop
>> +	/* Add instructions after here */
>> +
>> +	move	a7, s7
>> +
>> +	b	uboot_in_ram
>> +	 ori	s4, 2		/* Running out of L2 cache */
>> +
>> +l2_cache_too_small:	/* We go here if we can't copy ourself to L2 */
>> +#endif /* CONFIG_OCTEON_COPY_FROM_FLASH_TO_L2 */
>> +
>> +	/* This code is only executed if booting from flash. */
>> +	/*  For flash boot (_not_ RAM boot), we do a workaround for
>> +	 * an LLM errata on CN38XX and CN58XX parts.
>> +	 */
>> +
>> +uboot_in_ram:
>> +	/* U-boot address is now in reg a7, and is 4 MByte aligned.
>> +	 * (boot bus addressing has been adjusted to make this happen for flash,
>> +	 * and for DRAM this alignment must be provided by the remote boot
>> +	 * utility).
>> +	 */
>> +	/* See if we're in KSEG0 range, if so set EBASE register to handle
>> +	 * exceptions.
>> +	 */
>> +	dli	a1, 0x20000000
>> +	bge	a7, a1, 1f
>> +	 nop
>> +	/* Convert our physical address to KSEG0 */
>> +	PTR_LI	a1, 0xffffffff80000000
>> +	or	a1, a1, a7
>> +	mtc0	a1, CP0_EBASE
>> +1:
>> +	/* U-boot now starts at 0xBFC00000.  Use a single 4 MByte TLB mapping
>> +	 * to map u-boot.
>> +	 */
>> +	move	a0, a6		/* Virtual addr in a0 */
>> +	dins	a0, zero, 0, 16	/* Zero out offset bits */
>> +	move	a1, a7		/* Physical addr in a1 */
>> +
>> +	/* Now we need to remove the MIPS address space bits.  For this we
>> +	 * need to determine if it is a 32 bit compatibility address or not.
>> +	 */
>> +
>> +	/* 'lowest' address in compatibility space */
>> +	PTR_LI	t0, 0xffffffff80000000
>> +	dsubu	t0, t0, a1
>> +	bltz	t0, compat_space
>> +	 nop
>> +
>> +	/* We have a xkphys address, so strip off top bit */
>> +	b	addr_fixup_done
>> +	 dins	a1, zero, 63, 1
>> +
>> +compat_space:
>> +	PTR_LI	a2, 0x1fffffff
>> +	and	a1, a1, a2  /* Mask phy addr to remove address space bits */
>> +
>> +addr_fixup_done:
>> +	/* Currently the u-boot image size is limited to 4 MBytes.  In order to
>> +	 * support larger images the flash mapping will need to be changed to
>> +	 * be able to access more than that before C code is run.  Until that
>> +	 * is done, we just use a 4 MByte mapping for the secondary cores as
>> +	 * well.
>> +	 */
>> +	/* page size (only support 4 Meg binary size for now for core 0)
>> +	 * This limitation is due to the fact that the boot vector is
>> +	 * 0xBFC00000 which only makes 4MB available.  Later more flash
>> +	 * address space will be available after U-Boot has been copied to
>> +	 * RAM.	 For now assume that it is in flash.
>> +	 */
>> +	li	a2, 2*1024*1024
>> +
>> +	mfc0	a4, CP0_EBASE
>> +	andi	a4, EBASE_CPUNUM		/* get core */
>> +	beqz	a4, core_0_tlb
>> +	 nop
>> +
>> +	/* Now determine how big a mapping to use for secondary cores,
>> +	 * which need to map all of u-boot + heap in DRAM
>> +	 */
>> +	/* Here we look at the alignment of the physical address,
>> +	 * and use the largest page size possible.  In some cases
>> +	 * this can result in an oversize mapping, but for secondary cores
>> +	 * this mapping is very short lived.
>> +	 */
>> +
>> +	/* Physical address in a1 */
>> +	li	a2, 1
>> +1:
>> +	sll	a2, 1
>> +	and	a5, a1, a2
>> +	beqz	a5, 1b
>> +	 nop
>> +
>> +	/* a2 now contains largest page size we can use */
>> +core_0_tlb:
>> +	JAL(single_tlb_setup)
>> +
>> +	/* Check if we're running from cache */
>> +	bbit1	s4, 1, uboot_in_cache
>> +	 nop
>> +
>> +	/* If we are already running from ram, we don't need to muck
>> +	 * with boot bus mappings.
>> +	 */
>> +	PTR_LI	t2, 0xffffffffb0000000
>> +	dsubu	t2, s7
>> +	/* See if our starting address is lower than the boot bus */
>> +	bgez	t2, uboot_in_ram2	/* If yes, booting from RAM */
>> +	 nop
>> +
>> +uboot_in_cache:
>> +#if CONFIG_OCTEON_BIG_STACK_SIZE
>> +	/* The large stack is only for core 0.  For all other cores we need to
>> +	 * use the L1 cache otherwise the other cores will stomp on top of each
>> +	 * other unless even more space is reserved for the stack space for
>> +	 * each core.  With potentially 96 cores this gets excessive.
>> +	 */
>> +	mfc0	v0, CP0_EBASE
>> +	/* Mask the EBASE value just read (v0), not whatever a0 held before */
>> +	andi	a0, v0, EBASE_CPUNUM	/* a0 = core number */
>> +	bnez	a0, no_big_stack
>> +	 nop
>> +	PTR_LA	sp, big_stack_start
>> +	daddiu	sp, -16
>> +
>> +no_big_stack:
>> +#endif
>> +	/* We now have the TLB set up, so we need to remap the boot bus.
>> +	 * This is tricky, as we are running from flash, and will be changing
>> +	 * the addressing of the flash.
>> +	 */
>> +	/* Enable movable boot bus region 0, at address 0x10000000 */
>> +	PTR_LI	a4, OCTEON_MIO_BOOT_BASE
>> +	dli	a5, 0x81000000	/* EN + base address 0x10000000 */
>> +	sd	a5, OCTEON_MIO_BOOT_LOC_CFG0_OFF(a4)
>> +
>> +	/* Copy the code that remaps the boot bus into the movable region */
>> +	sd	zero, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
>> +
>> +	PTR_LA	a6, change_boot_mappings
>> +	GETOFFSET(a5, change_boot_mappings);
>> +	daddu	a5, a5, a6
>> +
>> +	/* The code is 16 bytes (2 DWORDS) */
>> +	ld	a7, 0(a5)
>> +	sd	a7, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
>> +	ld	a7, 8(a5)
>> +	sd	a7, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
>> +
>> +	/* Read from an RML register to ensure that the previous writes have
>> +	 * completed before we branch to the movable region.
>> +	 */
>> +	ld	zero, OCTEON_MIO_BOOT_LOC_CFG0_OFF(a4)
>> +
>> +	/* Compute value for boot bus configuration register */
>> +	/* Read region 0 config so we can _modify_ the base address field */
>> +	PTR_LI	a4, OCTEON_MIO_BOOT_REG_CFG0	/* region 0 config */
>> +	ld	a0, 0(a4)
>> +	dli	a4, 0xf0000000		/* Mask off bits we want to save */
>> +	and	a4, a4, a0
>> +	dli	a0, 0x0fff0000		/* Force size to max */
>> +	or	a4, a4, a0
>> +
>> +	move	a5, s6
>> +	/* Convert to 64k blocks, as used by boot bus config */
>> +	srl	a5, 16
>> +	li	a6, 0x1fc0	/* 'normal' boot bus base config value */
>> +	subu	a6, a6, a5	/* Subtract offset */
>> +	/* combine into register value to pass to boot bus routine */
>> +	or	a0, a4, a6
>> +
>> +	/* Branch there */
>> +	PTR_LA	a1, __mapped_continue_label
>> +	PTR_LI	a2, OCTEON_MIO_BOOT_REG_CFG0
>> +	/* If region 0 is not enabled we can skip it */
>> +	ld	a4, 0(a2)
>> +	bbit0	a4, 31, __mapped_continue_label
>> +	 nop
>> +	li	a4, 0x10000000
>> +	j	a4
>> +	 synci	0(zero)
>> +
>> +	/* We never get here, as we go directly to __mapped_continue_label */
>> +	break
>> +
>> +
>> +uboot_in_ram2:
>> +
>> +	/* Now jump to address in TLB mapped memory to continue execution */
>> +	PTR_LA	a4, __mapped_continue_label
>> +	synci	0(a4)
>> +	j	a4
>> +	 nop
>> +
>> +__mapped_continue_label:
>> +	/* Check if we are core 0, if we are not then we need
>> +	 * to vector to code in DRAM to do application setup, and
>> +	 * skip the rest of the bootloader.  Only core 0 runs the bootloader
>> +	 * and sets up the tables that the other cores will use for
>> +	 * configuration.
>> +	 */
>> +	mfc0	a0, CP0_EBASE
>> +	andi	a0, EBASE_CPUNUM   /* get core */
>> +	/* if (core == __all_cores_are_equal), i.e. core 0 while that word
>> +	 * still holds 0, branch to core_0_cont1 to run the bootloader;
>> +	 * otherwise fall through to the secondary-core path below.
>> +	 */
>> +	lw	t0, __all_cores_are_equal
>> +	beq	a0, t0, core_0_cont1
>> +	 nop
>> +
>> +	/* other cores look up addr from dram */
>> +        /* DRAM controller already set up by first core */
>> +        li      a1, (BOOT_VECTOR_NUM_WORDS * 4)
>> +        mul     a0, a0, a1
>> +
>> +        /* Now find out the boot vector base address from the moveable boot
>> +         * bus region.
>> +         */
>> +
>> +        /* Get the address of the boot bus moveable region */
>> +        PTR_LI     t8, OCTEON_MIO_BOOT_BASE
>> +        ld      t9, OCTEON_MIO_BOOT_LOC_CFG0_OFF(t8)
>> +        /* Make sure it's enabled */
>> +        bbit0   t9, 31, invalid_boot_vector
>> +         dext   t9, t9, 3, 24
>> +        dsll    t9, t9, 7
>> +        /* Make address XKPHYS */
>> +	li	t0, 1
>> +	dins	t9, t0, 63, 1
>> +
>> +        ld      t0, OCTEON_BOOT_MOVEABLE_MAGIC_OFFSET(t9)
>> +        dli     t1, OCTEON_BOOT_MOVEABLE_MAGIC1
>> +        bne     t0, t1, invalid_boot_vector
>> +         nop
>> +
>> +        /* Load base address of boot vector table */
>> +        ld      t0, OCTEON_BOOT_VECTOR_MOVEABLE_OFFSET(t9)
>> +        /* Add offset for core */
>> +        daddu   a1, t0, a0
>> +
>> +	mfc0	v0, CP0_STATUS
>> +	move	v1, v0
>> +	ins	v1, zero, 19, 1		/* Clear NMI bit */
>> +	mtc0	v1, CP0_STATUS
>> +
>> +        /* Get app start function address */
>> +        lw      t9, 8(a1)
>> +        beqz    t9, invalid_boot_vector
>> +         nop
>> +
>> +        j       t9
>> +         lw      k0, 12(a1)      /* Load global data (deprecated) */
>> +
>> +invalid_boot_vector:
>> +        wait
>> +        b       invalid_boot_vector
>> +         nop
>> +
>> +__all_cores_are_equal:
>> +	/* The following .word tells whether 'all_cores_are_equal' or core 0 is
>> +	 * special. By default (for the first execution) core 0 should be special,
>> +	 * in order to behave like the old (existing, unmodified) bootloader
>> +	 * and run the bootloader on core 0 to follow the existing design.
>> +	 * However, after that we make 'all_cores_equal', which allows SE
>> +	 * applications to run on core 0 like on any other core. NOTE that the
>> +	 * value written to '__all_cores_are_equal' should not match any core ID.
>> +	 */
>> +	.word 	0
>> +
>> +core_0_cont1:
>> +	li	t0, 0xffffffff
>> +	sw	t0, __all_cores_are_equal
>> +	/* From here on, only core 0 runs, other cores have branched
>> +	 * away.
>> +	 */
>> +#ifdef CONFIG_MIPS_INIT_STACK_IN_SRAM
>> +	/* Set up initial stack and global data */
>> +	setup_stack_gd
>> +# ifdef CONFIG_DEBUG_UART
>> +	PTR_LA	t9, debug_uart_init
>> +	jalr	t9
>> +	 nop
>> +# endif
>> +#endif
>> +	move	a0, zero		# a0 <-- boot_flags = 0
>> +	PTR_LA	t9, board_init_f
>> +
>> +	jr	t9
>> +	 move	ra, zero
>> +	END(_start)
>> +
>> +	.balign	8
>> +	.globl	single_tlb_setup
>> +	.ent	single_tlb_setup
>> +	/* Sets up a single TLB entry (EntryLo0/EntryLo1 page pair) in the
>> +	 * highest-numbered TLB slot.  Virtual/physical addresses
>> +	 * must be properly aligned.
>> +	 * a0  Virtual address
>> +	 * a1  Physical address
>> +	 * a2  page (_not_ mapping) size
>> +	 *
>> +	 * Side effects visible in the code below:
>> +	 *  - a3, a4, a5 and a6 are used as scratch registers
>> +	 *  - on return a0 has been advanced by the mapping size (2 * a2)
>> +	 *  - on return a1 holds the EntryLo-formatted value advanced past
>> +	 *    both pages that were just mapped
>> +	 *  - CP0 PageMask, Index, EntryLo0, EntryLo1 and EntryHi are written
>> +	 */
>> +single_tlb_setup:
>> +	/* Determine the number of TLB entries available, and
>> +	 * use the top one.
>> +	 */
>> +	mfc0	a3, CP0_CONFIG1
>> +	dext	a3, a3, 25, 6		/* a3 now has the max mmu entry index */
>> +	mfc0	a5, CP0_CONFIG3		/* Check if config4 reg present */
>> +	bbit0	a5, 31, single_tlb_setup_cont	/* bit 31 clear: skip the config4 lookup */
>> +	 nop
>> +	mfc0	a5, CP0_CONFIG4
>> +	bbit0	a5, 14, single_tlb_setup_cont	/* check config4[MMUExtDef] */
>> +	 nop
>> +	/* append config4[MMUSizeExt] to most significant bit of
>> +	 * config1[MMUSize-1]
>> +	 */
>> +	dins	a3, a5, 6, 8	/* copy a5[7:0] into a3[13:6] */
>> +	and	a3, a3, 0x3fff	/* a3 now includes max entries for cn6xxx */
>> +
>> +single_tlb_setup_cont:
>> +
>> +	/* Format physical address for entry low */
>> +	nop
>> +	dsrl	a1, a1, 12	/* drop the low 12 address bits ... */
>> +	dsll	a1, a1, 6	/* ... and place the remainder at bit 6 upwards */
>> +	ori	a1, a1, 0x7	/* set DVG bits */
>> +
>> +	move	a4, a2
>> +	daddu	a5, a4, a4	/* mapping size */
>> +	dsll	a6, a4, 1
>> +	daddiu	a6, a6, -1	/* pagemask */
>> +	dsrl	a4, a4, 6	/* adjust for adding with entrylo */
>> +
>> +	/* Now set up mapping */
>> +	mtc0	a6, CP0_PAGEMASK
>> +	mtc0	a3, CP0_INDEX	/* select the TLB slot computed above */
>> +
>> +	dmtc0	a1, CP0_ENTRYLO0
>> +	daddu	a1, a1, a4	/* step to the second page of the pair */
>> +
>> +	dmtc0	a1, CP0_ENTRYLO1
>> +	daddu	a1, a1, a4	/* leave a1 past both pages */
>> +
>> +	dmtc0	a0, CP0_ENTRYHI
>> +	daddu	a0, a0, a5	/* leave a0 past this mapping */
>> +
>> +	ehb			/* hazard barrier before the TLB write */
>> +	tlbwi			/* write the entry selected by CP0 Index */
>> +	jr  ra
>> +	 nop
>> +	.end   single_tlb_setup
>> +
>> +
>> +/**
>> + * This code is moved to a movable boot bus region,
>> + * and it is responsible for changing the flash mappings and
>> + * jumping to run from the TLB mapped address.
>> + *
>> + * The routine is four instructions (16 bytes) long and 8-byte aligned.
>> + * The code that installs it copies exactly two 64-bit words, so the
>> + * size must not change without updating that copy as well.
>> + *
>> + * @param a0	New MIO_BOOT_REG_CFG0 value (new address for boot bus region 0)
>> + * @param a1	Address to branch to afterwards
>> + * @param a2	Address of MIO_BOOT_REG_CFG0
>> + */
>> +	.balign	8
>> +change_boot_mappings:
>> +	sd	a0, 0(a2)	/* write the new region 0 configuration */
>> +	sync
>> +	j a1	    /* Jump to new TLB mapped location */
>> +	 synci	0(zero)
>> +
>> +/* If we need a large stack, allocate it here. */
>> +#if CONFIG_OCTEON_BIG_STACK_SIZE
>> +	/* Allocate the stack here so it's in L2 cache or DRAM */
>> +	.balign	16
>> +big_stack_end:
>> +	.skip	CONFIG_OCTEON_BIG_STACK_SIZE, 0
>> +big_stack_start:
>> +	.dword	0
>> +#endif
>>
> 


Viele Grüße,
Stefan
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 66f0b07263..29f2d7328c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -749,6 +749,12 @@  M:	Ezequiel Garcia <ezequiel at collabora.com>
 S:	Maintained
 F:	arch/mips/mach-jz47xx/
 
+MIPS Octeon
+M:	Aaron Williams <awilliams at marvell.com>
+S:	Maintained
+F:	arch/mips/mach-octeon/
+F:	arch/mips/include/asm/arch-octeon/
+
 MMC
 M:	Peng Fan <peng.fan at nxp.com>
 S:	Maintained
diff --git a/arch/Kconfig b/arch/Kconfig
index 91e049b322..1cd3e1dc0b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -37,6 +37,7 @@  config MICROBLAZE
 
 config MIPS
 	bool "MIPS architecture"
+	select CREATE_ARCH_SYMLINK
 	select HAVE_ARCH_IOREMAP
 	select HAVE_PRIVATE_LIBGCC
 	select SUPPORT_OF_CONTROL
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 48e754cc46..3c7f3eb94f 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -106,6 +106,24 @@  config ARCH_JZ47XX
 	select OF_CONTROL
 	select DM
 
+config ARCH_OCTEON
+	bool "Support Marvell Octeon CN7xxx platforms"
+	select DISPLAY_CPUINFO
+	select DMA_ADDR_T_64BIT
+	select DM
+	select DM_SERIAL
+	select MIPS_CACHE_COHERENT
+	select MIPS_INIT_STACK_IN_SRAM
+	select MIPS_L2_CACHE
+	select MIPS_TUNE_OCTEON3
+	select ROM_EXCEPTION_VECTORS
+	select SUPPORTS_BIG_ENDIAN
+	select SUPPORTS_CPU_MIPS64_OCTEON
+	select PHYS_64BIT
+	select OF_CONTROL
+	select OF_LIVE
+	imply CMD_DM
+
 config MACH_PIC32
 	bool "Support Microchip PIC32"
 	select DM
@@ -160,6 +178,7 @@  source "arch/mips/mach-bmips/Kconfig"
 source "arch/mips/mach-jz47xx/Kconfig"
 source "arch/mips/mach-pic32/Kconfig"
 source "arch/mips/mach-mtmips/Kconfig"
+source "arch/mips/mach-octeon/Kconfig"
 
 if MIPS
 
@@ -233,6 +252,14 @@  config CPU_MIPS64_R6
 	  Choose this option to build a kernel for release 6 or later of the
 	  MIPS64 architecture.
 
+config CPU_MIPS64_OCTEON
+	bool "Marvell Octeon series of CPUs"
+	depends on SUPPORTS_CPU_MIPS64_OCTEON
+	select 64BIT
+	help
+	 Choose this option for Marvell Octeon CPUs.  These CPUs are between
+	 MIPS64 R5 and R6 with other extensions.
+
 endchoice
 
 menu "General setup"
@@ -261,7 +288,9 @@  config MIPS_CM_BASE
 config MIPS_CACHE_INDEX_BASE
 	hex "Index base address for cache initialisation"
 	default 0x80000000 if CPU_MIPS32
+	# Keep this ahead of the CPU_MIPS64 default: Octeon also sets CPU_MIPS64
+	default 0xFFFFFFFFC0000000 if ARCH_OCTEON
 	default 0xffffffff80000000 if CPU_MIPS64
 	help
 	  This is the base address for a memory block, which is used for
 	  initialising the cache lines. This is also the base address of a memory
@@ -342,6 +369,14 @@  config SPL_LOADER_SUPPORT
 	help
 	  Enable this option if you want to use SPL loaders without DM enabled.
 
+config MIPS_CACHE_COHERENT
+	bool "Set if MIPS processor is cache coherent"
+	help
+	 Enable this if the MIPS architecture is cache coherent like the
+	 Marvell Octeon series of SoCs.  When this is set, cache flushes
+	 and invalidates only flush the write buffer since the hardware
+	 maintains cache coherency.
+
 endmenu
 
 menu "OS boot interface"
@@ -398,6 +433,9 @@  config SUPPORTS_CPU_MIPS64_R2
 config SUPPORTS_CPU_MIPS64_R6
 	bool
 
+config SUPPORTS_CPU_MIPS64_OCTEON
+	bool
+
 config CPU_MIPS32
 	bool
 	default y if CPU_MIPS32_R1 || CPU_MIPS32_R2 || CPU_MIPS32_R6
@@ -405,6 +443,7 @@  config CPU_MIPS32
 config CPU_MIPS64
 	bool
 	default y if CPU_MIPS64_R1 || CPU_MIPS64_R2 || CPU_MIPS64_R6
+	default y if CPU_MIPS64_OCTEON
 
 config MIPS_TUNE_4KC
 	bool
@@ -421,6 +460,9 @@  config MIPS_TUNE_34KC
 config MIPS_TUNE_74KC
 	bool
 
+config MIPS_TUNE_OCTEON3
+	bool
+
 config 32BIT
 	bool
 
@@ -453,6 +495,11 @@  config MIPS_SRAM_INIT
 	  before it can be used. If enabled, a function mips_sram_init() will
 	  be called just before setup_stack_gd.
 
+config DMA_ADDR_T_64BIT
+	bool
+	help
+	 Select this to enable 64-bit DMA addressing
+
 config SYS_DCACHE_SIZE
 	int
 	default 0
diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index af3f227436..fa1ba7855a 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -1,6 +1,10 @@ 
 # SPDX-License-Identifier: GPL-2.0+
 
+ifneq ($(CONFIG_ARCH_OCTEON),y)
 head-y := arch/mips/cpu/start.o
+else
+head-y := arch/mips/mach-octeon/start.o
+endif
 
 ifeq ($(CONFIG_SPL_BUILD),y)
 ifneq ($(CONFIG_SPL_START_S_PATH),)
@@ -17,6 +21,7 @@  machine-$(CONFIG_ARCH_JZ47XX) += jz47xx
 machine-$(CONFIG_MACH_PIC32) += pic32
 machine-$(CONFIG_ARCH_MTMIPS) += mtmips
 machine-$(CONFIG_ARCH_MSCC) += mscc
+machine-${CONFIG_ARCH_OCTEON} += octeon
 
 machdirs := $(patsubst %,arch/mips/mach-%/,$(machine-y))
 libs-y += $(machdirs)
@@ -30,6 +35,7 @@  arch-$(CONFIG_CPU_MIPS32_R6) += -march=mips32r6 -Wa,-mips32r6
 arch-$(CONFIG_CPU_MIPS64_R1) += -march=mips64 -Wa,-mips64
 arch-$(CONFIG_CPU_MIPS64_R2) += -march=mips64r2 -Wa,-mips64r2
 arch-$(CONFIG_CPU_MIPS64_R6) += -march=mips64r6 -Wa,-mips64r6
+arch-${CONFIG_CPU_MIPS64_OCTEON} += -march=octeon3
 
 # Allow extra optimization for specific CPUs/SoCs
 tune-$(CONFIG_MIPS_TUNE_4KC) += -mtune=4kc
@@ -37,6 +43,7 @@  tune-$(CONFIG_MIPS_TUNE_14KC) += -mtune=14kc
 tune-$(CONFIG_MIPS_TUNE_24KC) += -mtune=24kc
 tune-$(CONFIG_MIPS_TUNE_34KC) += -mtune=34kc
 tune-$(CONFIG_MIPS_TUNE_74KC) += -mtune=74kc
+tune-${CONFIG_MIPS_TUNE_OCTEON3} += -mtune=octeon3
 
 # Include default header files
 cflags-y += -I$(srctree)/arch/mips/include/asm/mach-generic
diff --git a/arch/mips/cpu/Makefile b/arch/mips/cpu/Makefile
index 6df7bb4e48..732015d6f3 100644
--- a/arch/mips/cpu/Makefile
+++ b/arch/mips/cpu/Makefile
@@ -1,6 +1,8 @@ 
 # SPDX-License-Identifier: GPL-2.0+
 
-extra-y	= start.o
+ifneq ($(CONFIG_ARCH_OCTEON),y)
+extra-y = start.o
+endif
 
 obj-y += time.o
 obj-y += interrupts.o
diff --git a/arch/mips/include/asm/arch-octeon/cavm-reg.h b/arch/mips/include/asm/arch-octeon/cavm-reg.h
new file mode 100644
index 0000000000..b961e54956
--- /dev/null
+++ b/arch/mips/include/asm/arch-octeon/cavm-reg.h
@@ -0,0 +1,47 @@ 
+/* SPDX-License-Identifier:    GPL-2.0 */
+/*
+ * Copyright (C) 2020 Marvell International Ltd.
+ */
+
+#ifndef __CAVM_REG_H__
+#define __CAVM_REG_H__
+
+/* Register addresses (cast to u64 pointers for the 64-bit I/O accessors) */
+#define CAVM_CIU_FUSE			((u64 *)0x80010100000001a0)
+#define CAVM_MIO_BOOT_REG_CFG0		((u64 *)0x8001180000000000)
+#define CAVM_RST_BOOT			((u64 *)0x8001180006001600)
+
+/* Register structs */
+
+/**
+ * Register (RSL) rst_boot
+ *
+ * RST Boot Register
+ *
+ * The bit-field widths below sum to 64, matching the raw value in @u.
+ * NOTE(review): the field order presumably relies on big-endian bit-field
+ * allocation (first member in the most significant bit) - confirm.
+ */
+union cavm_rst_boot {
+	u64 u;
+	struct cavm_rst_boot_s {
+		u64 chipkill                         : 1;
+		u64 jtcsrdis                         : 1;
+		u64 ejtagdis                         : 1;
+		u64 romen                            : 1;
+		u64 ckill_ppdis                      : 1;
+		u64 jt_tstmode                       : 1;
+		u64 vrm_err                          : 1;
+		u64 reserved_37_56                   : 20;
+		u64 c_mul                            : 7;
+		u64 pnr_mul                          : 6;
+		u64 reserved_21_23                   : 3;
+		u64 lboot_oci                        : 3;
+		u64 lboot_ext                        : 6;
+		u64 lboot                            : 10;
+		u64 rboot                            : 1;
+		u64 rboot_pin                        : 1;
+	} s;
+};
+
+#endif /* __CAVM_REG_H__ */
diff --git a/arch/mips/include/asm/arch-octeon/clock.h b/arch/mips/include/asm/arch-octeon/clock.h
new file mode 100644
index 0000000000..a844a222c9
--- /dev/null
+++ b/arch/mips/include/asm/arch-octeon/clock.h
@@ -0,0 +1,28 @@ 
+/* SPDX-License-Identifier:    GPL-2.0 */
+/*
+ * Copyright (C) 2018, 2019 Marvell International Ltd.
+ *
+ * https://spdx.org/licenses
+ */
+
+#ifndef __CLOCK_H__
+#define __CLOCK_H__
+
+/** System PLL reference clock */
+#define PLL_REF_CLK                     50000000        /* 50 MHz */
+/** Duration of one reference clock tick in nanoseconds */
+#define NS_PER_REF_CLK_TICK             (1000000000 / PLL_REF_CLK)
+
+/**
+ * Returns the I/O clock speed in Hz
+ */
+u64 octeon_get_io_clock(void);
+
+/**
+ * Returns the core clock speed in Hz
+ *
+ * NOTE(review): clock.c in this patch defines no function of this name - confirm
+ */
+u64 octeon_get_core_clock(void);
+
+#endif /* __CLOCK_H__ */
diff --git a/arch/mips/mach-octeon/Kconfig b/arch/mips/mach-octeon/Kconfig
new file mode 100644
index 0000000000..67fcb6058c
--- /dev/null
+++ b/arch/mips/mach-octeon/Kconfig
@@ -0,0 +1,92 @@ 
+menu "Octeon platforms"
+	depends on ARCH_OCTEON
+
+config SYS_SOC
+	string
+	default "octeon"
+
+config OCTEON_CN7XXX
+	bool "Octeon CN7XXX SoC"
+
+config OCTEON_CN70XX
+	bool "Octeon CN70XX SoC"
+	select OCTEON_CN7XXX
+
+config OCTEON_CN73XX
+	bool "Octeon CN73XX SoC"
+	select OCTEON_CN7XXX
+
+config OCTEON_CN78XX
+	bool "Octeon CN78XX SoC"
+	select OCTEON_CN7XXX
+
+choice
+	prompt "Octeon MIPS family select"
+
+config SOC_OCTEON2
+	bool "Octeon II family"
+	help
+	 This selects the Octeon II SoC family
+
+config SOC_OCTEON3
+	bool "Octeon III family"
+	help
+	 This selects the Octeon III SoC family CN70xx, CN73XX, CN78xx
+	 and CNF75XX.
+
+endchoice
+
+config SYS_DCACHE_SIZE
+	default 32768
+
+config SYS_DCACHE_LINE_SIZE
+	default 128
+
+config SYS_ICACHE_SIZE
+	default	79872
+
+config SYS_ICACHE_LINE_SIZE
+	default 128
+
+config OCTEON_BIG_STACK_SIZE
+	hex
+	default 0x4000
+	help
+	 This enables a larger stack needed for Octeon 3 DRAM initialization.
+	 If this is disabled then a part of the L1 cache will be reserved for
+	 the stack, resulting in a smaller image.  If this  is true then
+	 a portion of the TEXT address space will be reserved for the stack.
+	 Note that this requires that U-Boot MUST be able to fit entirely
+	 within the L2 cache and cannot be executed from a parallel NOR flash.
+	 The default size is 16KiB.
+
+config OCTEON_COPY_FROM_FLASH_TO_L2
+	bool
+	default y
+	help
+	 Set this for U-Boot to attempt to copy itself from flash memory into
+	 the L2 cache.  This significantly improves the boot performance.
+
+config OCTEON_L2_MEMCPY_IN_CACHE
+	bool
+	default y
+	help
+	 If this is set then the memcpy code that is used to copy U-Boot from
+	 the flash to the L2 cache is written to the L2 cache.  This
+	 significantly speeds up the memcpy operation.
+
+config OCTEON_L2_UBOOT_ADDR
+	hex
+	default 0xffffffff81000000
+	help
+	 This specifies the address where U-Boot will be copied into the L2
+	 cache.
+
+config OCTEON_L2_MEMCPY_ADDR
+	hex
+	default 0xffffffff81400000
+	help
+	 This specifies where U-Boot will place the memcpy routine used for
+	 copying U-Boot from flash to L2 cache.
+
+endmenu
diff --git a/arch/mips/mach-octeon/Makefile b/arch/mips/mach-octeon/Makefile
new file mode 100644
index 0000000000..a5fda682a7
--- /dev/null
+++ b/arch/mips/mach-octeon/Makefile
@@ -0,0 +1,10 @@ 
+# (C) Copyright 2019 Marvell, Inc.
+#
+# SPDX-License-Identifier:	GPL-2.0+
+#
+
+extra-y = start.o
+
+obj-y += clock.o
+obj-y += cpu.o
+obj-y += dram.o
diff --git a/arch/mips/mach-octeon/clock.c b/arch/mips/mach-octeon/clock.c
new file mode 100644
index 0000000000..6e32008641
--- /dev/null
+++ b/arch/mips/mach-octeon/clock.c
@@ -0,0 +1,22 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018, 2019 Marvell International Ltd.
+ */
+
+#include <common.h>
+#include <asm/arch/clock.h>
+
+DECLARE_GLOBAL_DATA_PTR;
+
+int octeon_get_timer_freq(void)
+{
+	return gd->cpu_clk;
+}
+
+/**
+ * Returns the I/O clock speed in Hz
+ */
+u64 octeon_get_io_clock(void)
+{
+	return gd->bus_clk;
+}
diff --git a/arch/mips/mach-octeon/cpu.c b/arch/mips/mach-octeon/cpu.c
new file mode 100644
index 0000000000..a1373c6d56
--- /dev/null
+++ b/arch/mips/mach-octeon/cpu.c
@@ -0,0 +1,55 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2020 Marvell International Ltd.
+ */
+
+#include <common.h>
+#include <linux/io.h>
+#include <asm/arch/clock.h>
+#include <asm/arch-octeon/cavm-reg.h>
+
+DECLARE_GLOBAL_DATA_PTR;
+
+static int get_clocks(void)
+{
+	const u64 ref_clock = PLL_REF_CLK;
+	union cavm_rst_boot rst_boot;
+
+	rst_boot.u = ioread64(CAVM_RST_BOOT);
+	gd->cpu_clk = ref_clock * rst_boot.s.c_mul;
+	gd->bus_clk = ref_clock * rst_boot.s.pnr_mul;
+
+	debug("%s: cpu: %lu, bus: %lu\n", __func__, gd->cpu_clk, gd->bus_clk);
+
+	return 0;
+}
+
+/* Early mach init code run from flash */
+int mach_cpu_init(void)
+{
+	/* Remap boot-bus 0x1fc0.0000 -> 0x1f40.0000 */
+	/* ToDo: Move this to an early running bus (bootbus) DM driver */
+	clrsetbits_be64(CAVM_MIO_BOOT_REG_CFG0, 0xffff, 0x1f40);
+
+	/* Get clocks and store them in GD */
+	get_clocks();
+
+	return 0;
+}
+
+/**
+ * Returns number of cores, computed as fls64() of the low 48 CIU_FUSE bits
+ *
+ * @return	core count; NOTE(review): assumes the set fuse bits are contiguous
+ */
+static int cavm_octeon_num_cores(void)
+{
+	return fls64(ioread64(CAVM_CIU_FUSE) & 0xffffffffffff);
+}
+
+int print_cpuinfo(void)
+{
+	printf("SoC:   Octeon CN73xx (%d cores)\n", cavm_octeon_num_cores());
+
+	return 0;
+}
diff --git a/arch/mips/mach-octeon/dram.c b/arch/mips/mach-octeon/dram.c
new file mode 100644
index 0000000000..c16a73e8e6
--- /dev/null
+++ b/arch/mips/mach-octeon/dram.c
@@ -0,0 +1,27 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2020 Marvell International Ltd.
+ */
+
+#include <common.h>
+#include <dm.h>
+#include <ram.h>
+
+DECLARE_GLOBAL_DATA_PTR;
+
+int dram_init(void)
+{
+	/*
+	 * No DDR init yet -> run in L2 cache
+	 */
+	gd->ram_size = (2 << 20);
+	gd->bd->bi_dram[0].size = gd->ram_size;
+	gd->bd->bi_dram[1].size = 0;
+
+	return 0;
+}
+
+ulong board_get_usable_ram_top(ulong total_size)
+{
+	return gd->ram_top;
+}
diff --git a/arch/mips/mach-octeon/include/ioremap.h b/arch/mips/mach-octeon/include/ioremap.h
new file mode 100644
index 0000000000..59b75008a2
--- /dev/null
+++ b/arch/mips/mach-octeon/include/ioremap.h
@@ -0,0 +1,30 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_MACH_OCTEON_IOREMAP_H
+#define __ASM_MACH_OCTEON_IOREMAP_H
+
+#include <linux/types.h>
+
+/*
+ * Allow physical addresses to be fixed up to help peripherals located
+ * outside the low 32-bit range -- generic pass-through version.
+ */
+static inline phys_addr_t fixup_bigphys_addr(phys_addr_t phys_addr,
+					     phys_addr_t size)
+{
+	return phys_addr;
+}
+
+static inline void __iomem *plat_ioremap(phys_addr_t offset, unsigned long size,
+					 unsigned long flags)
+{
+	return (void __iomem *)(XKPHYS | offset);
+}
+
+static inline int plat_iounmap(const volatile void __iomem *addr)
+{
+	return 0;
+}
+
+#define _page_cachable_default	_CACHE_CACHABLE_NONCOHERENT
+
+#endif /* __ASM_MACH_OCTEON_IOREMAP_H */
diff --git a/arch/mips/mach-octeon/start.S b/arch/mips/mach-octeon/start.S
new file mode 100644
index 0000000000..acb967201a
--- /dev/null
+++ b/arch/mips/mach-octeon/start.S
@@ -0,0 +1,1241 @@ 
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ *  Startup Code for OCTEON 64-bit CPU-core
+ *
+ *  Copyright (c) 2003	Wolfgang Denk <wd at denx.de>
+ *  Copyright 2004, 2005, 2010 - 2015 Cavium Inc..
+ */
+
+#include <asm-offsets.h>
+#include <config.h>
+#include <asm/regdef.h>
+#include <asm/mipsregs.h>
+#include <asm/asm.h>
+
+#define BOOT_VECTOR_NUM_WORDS		8
+
+#define OCTEON_BOOT_MOVEABLE_MAGIC_OFFSET	0x70
+#define OCTEON_BOOT_VECTOR_MOVEABLE_OFFSET	0x78
+
+#define OCTEON_BOOT_MOVEABLE_MAGIC1_RAW	0xdb00110ad358eacd
+#define OCTEON_BOOT_MOVEABLE_MAGIC1	OCTEON_BOOT_MOVEABLE_MAGIC1_RAW
+
+#define OCTEON_CIU_SOFT_RST		0x8001070000000740
+
+#define	OCTEON_L2C_WPAR_PP0		0x8001180080840000
+#define OCTEON_MIO_BOOT_BASE		0x8001180000000000
+#define OCTEON_MIO_BOOT_REG_CFG0_OFF	0x0000
+#define OCTEON_MIO_BOOT_LOC_CFG0_OFF	0x0080
+#define OCTEON_MIO_BOOT_LOC_ADR_OFF	0x0090
+#define OCTEON_MIO_BOOT_LOC_DAT_OFF	0x0098
+#define	OCTEON_MIO_RST_BOOT		0x8001180000001600
+#define OCTEON_MIO_BOOT_REG_CFG0	0x8001180000000000
+#define	OCTEON_MIO_BOOT_REG_TIM0	0x8001180000000040
+#define OCTEON_MIO_BOOT_LOC_CFG0	0x8001180000000080
+#define OCTEON_MIO_BOOT_LOC_ADR		0x8001180000000090
+#define OCTEON_MIO_BOOT_LOC_DAT		0x8001180000000098
+#define	OCTEON_MIO_FUSE_DAT3		0x8001180000001418
+#define OCTEON_L2D_FUS3			0x80011800800007B8
+#define	OCTEON_LMC0_DDR_PLL_CTL		0x8001180088000258
+
+#define OCTEON_RST			0x8001180006000000
+#define OCTEON_RST_BOOT_OFFSET		0x1600
+#define OCTEON_RST_SOFT_RST_OFFSET	0x1680
+#define OCTEON_RST_COLD_DATAX_OFFSET(X)	(0x17C0 + (X) * 8)
+#define OCTEON_RST_BOOT			0x8001180006001600
+#define OCTEON_RST_SOFT_RST		0x8001180006001680
+#define OCTEON_RST_COLD_DATAX(X)	(0x80011800060017C0 + (X) * 8)
+
+#define OCTEON_OCX_COM_NODE		0x8001180011000000
+#define OCTEON_L2C_OCI_CTL		0x8001180080800020
+#define OCTEON_L2C_TAD_CTL		0x8001180080800018
+#define OCTEON_L2C_CTL			0x8001180080800000
+
+#define OCTEON_DBG_DATA			0x80011F00000001E8
+#define OCTEON_PCI_READ_CMD_E		0x80011F0000001188
+#define OCTEON_NPEI_DBG_DATA		0x80011F0000008510
+#define OCTEON_CIU_WDOG(X)		(0x8001070000000500 + (X) * 8)
+#define OCTEON_CIU_PP_POKE(X)		(0x8001070000000580 + (X) * 8)
+#define OCTEON_CIU3_WDOG(X)		(0x8001010000020000 + (X) * 8)
+#define OCTEON_CIU3_PP_POKE(X)		(0x8001010000030000 + (X) * 8)
+#define OCTEON_OCX_COM_LINKX_CTL(X)	(0x8001180011000020 + (X) * 8)
+#define OCTEON_SLI_CTL_STATUS		0x80011F0000028570
+#define OCTEON_GSERX_SCRATCH(X)		(0x8001180090000020 + (X) * 0x1000000)
+
+/** PRID for CN56XX */
+#define OCTEON_PRID_CN56XX		0x04
+/** PRID for CN52XX */
+#define OCTEON_PRID_CN52XX		0x07
+/** PRID for CN63XX */
+#define OCTEON_PRID_CN63XX		0x90
+/** PRID for CN68XX */
+#define OCTEON_PRID_CN68XX		0x91
+/** PRID for CN66XX */
+#define OCTEON_PRID_CN66XX		0x92
+/** PRID for CN61XX */
+#define OCTEON_PRID_CN61XX		0x93
+/** PRID for CNF71XX */
+#define OCTEON_PRID_CNF71XX		0x94
+/** PRID for CN78XX */
+#define OCTEON_PRID_CN78XX		0x95
+/** PRID for CN70XX */
+#define OCTEON_PRID_CN70XX		0x96
+/** PRID for CN73XX */
+#define OCTEON_PRID_CN73XX		0x97
+/** PRID for CNF75XX */
+#define OCTEON_PRID_CNF75XX		0x98
+
+/* func argument is used to create a mark, must be unique */
+#define GETOFFSET(reg, func)	\
+	.balign	8;		\
+	bal	func ##_mark;	\
+	nop;			\
+	.dword	.;		\
+func ##_mark:			\
+	ld	reg, 0(ra);	\
+	dsubu	reg, ra, reg;
+
+#define JAL(func)		\
+	.balign	8;		\
+	bal	func ##_mark;	\
+	 nop;			\
+	.dword .;		\
+func ##_mark:			\
+	ld	t8, 0(ra);	\
+	dsubu	t8, ra, t8;	\
+	dla	t9, func;	\
+	daddu	t9, t9, t8;	\
+	jalr	t9;		\
+	 nop;
+
+	.set	arch=octeon3
+	.set	noreorder
+
+	.macro uhi_mips_exception
+	move	k0, t9		# preserve t9 in k0
+	move	k1, a0		# preserve a0 in k1
+	li	t9, 15		# UHI exception operation
+	li	a0, 0		# Use hard register context
+	sdbbp	1		# Invoke UHI operation
+	.endm
+
+	.macro setup_stack_gd
+	li	t0, -16
+	PTR_LI	t1, big_stack_start
+	and	sp, t1, t0		# force 16 byte alignment
+	PTR_SUBU \
+		sp, sp, GD_SIZE		# reserve space for gd
+	and	sp, sp, t0		# force 16 byte alignment
+	move	k0, sp			# save gd pointer
+#if CONFIG_VAL(SYS_MALLOC_F_LEN) && \
+    !CONFIG_IS_ENABLED(INIT_STACK_WITHOUT_MALLOC_F)
+	li	t2, CONFIG_VAL(SYS_MALLOC_F_LEN)
+	PTR_SUBU \
+		sp, sp, t2		# reserve space for early malloc
+	and	sp, sp, t0		# force 16 byte alignment
+#endif
+	move	fp, sp
+
+	/* Clear gd */
+	move	t0, k0
+1:
+	PTR_S	zero, 0(t0)
+	PTR_ADDIU t0, PTRSIZE
+	blt	t0, t1, 1b
+	 nop
+
+#if CONFIG_VAL(SYS_MALLOC_F_LEN) && \
+    !CONFIG_IS_ENABLED(INIT_STACK_WITHOUT_MALLOC_F)
+	PTR_S	sp, GD_MALLOC_BASE(k0)	# gd->malloc_base offset
+#endif
+	.endm
+
+/* Saved register usage:
+ * s0:	not used
+ * s1:	not used
+ * s2:	Address U-Boot loaded into in L2 cache
+ * s3:	Start address
+ * s4:	flags
+ *		1:	booting from RAM
+ *		2:	executing out of cache
+ *		4:	booting from flash
+ * s5:	u-boot size (data end - _start)
+ * s6:	offset in flash.
+ * s7:	_start physical address
+ * s8:
+ */
+
+ENTRY(_start)
+	/* U-Boot entry point */
+	b	reset
+
+	/* The above jump instruction/nop are considered part of the
+	 * bootloader_header_t structure but are not changed when the header is
+	 * updated.
+	 */
+
+	/* Leave room for bootloader_header_t header at start of binary.  This
+	 * header is used to identify the board the bootloader is for, what
+	 * address it is linked at, failsafe/normal, etc.  It also contains a
+	 * CRC of the entire image.
+	 */
+
+#if defined(CONFIG_ROM_EXCEPTION_VECTORS)
+	/*
+	 * Exception vector entry points. When running from ROM, an exception
+	 * cannot be handled. Halt execution and transfer control to debugger,
+	 * if one is attached.
+	 */
+	.org 0x200
+	/* TLB refill, 32 bit task */
+	uhi_mips_exception
+
+	.org 0x280
+	/* XTLB refill, 64 bit task */
+	uhi_mips_exception
+
+	.org 0x300
+	/* Cache error exception */
+	uhi_mips_exception
+
+	.org 0x380
+	/* General exception */
+	uhi_mips_exception
+
+	.org 0x400
+	/* Catch interrupt exceptions */
+	uhi_mips_exception
+
+	.org 0x480
+	/* EJTAG debug exception */
+1:	b	1b
+	 nop
+
+	.org 0x500
+#endif
+
+/* Reserve extra space so that when we use the boot bus local memory
+ * segment to remap the debug exception vector we don't overwrite
+ * anything useful
+ */
+
+/* Basic exception handler (dump registers) in all ASM.  When using the TLB for
+ * mapping u-boot C code, we can't branch to that C code for exception handling
+ * (TLB is disabled for some exceptions).
+ */
+
+/* RESET/start here */
+	.balign	8
+reset:
+	nop
+	synci	0(zero)
+	mfc0	k0, CP0_STATUS
+	ori	k0, 0x00E0		/* enable 64 bit mode for CSR access */
+	mtc0	k0, CP0_STATUS
+
+	/* Save the address we're booting from, strip off low bits */
+	bal	1f
+	 nop
+1:
+	move	s3, ra
+	dins	s3, zero, 0, 12
+
+	/* Disable boot bus moveable regions */
+	PTR_LI	k0, OCTEON_MIO_BOOT_LOC_CFG0
+	sd	zero, 0(k0)
+	sd	zero, 8(k0)
+
+	/* Disable the watchdog timer
+	 * First we check if we're running on CN78XX, CN73XX or CNF75XX to see
+	 * if we use CIU3 or CIU.
+	 */
+	mfc0	t0, CP0_PRID
+	ext	t0, t0, 8, 8
+	/* Assume CIU */
+	PTR_LI	t1, OCTEON_CIU_WDOG(0)
+	PTR_LI	t2, OCTEON_CIU_PP_POKE(0)
+	blt	t0, OCTEON_PRID_CN78XX, wd_use_ciu
+	 nop
+	beq	t0, OCTEON_PRID_CN70XX, wd_use_ciu
+	 nop
+	/* Use CIU3 */
+	PTR_LI	t1, OCTEON_CIU3_WDOG(0)
+	PTR_LI	t2, OCTEON_CIU3_PP_POKE(0)
+wd_use_ciu:
+	sd	zero, 0(t2)		/* Pet the dog */
+	sd	zero, 0(t1)		/* Disable watchdog timer */
+
+	/* Errata: CN76XX has a node ID of 3. change it to zero here.
+	 * This needs to be done before we relocate to L2 as addresses change
+	 * For 76XX pass 1.X we need to zero out the OCX_COM_NODE[ID],
+	 * L2C_OCI_CTL[GKSEGNODE] and CP0 of Root.CvmMemCtl2[KSEGNODE].
+	 */
+	mfc0	a4, CP0_PRID
+	/* Check for 78xx pass 1.x processor ID */
+	andi	a4, 0xffff
+	blt	a4, (OCTEON_PRID_CN78XX << 8), 1f
+	 nop
+
+	/* Zero out alternate package for now */
+	dins	a4, zero, 6, 1
+	bge	a4, ((OCTEON_PRID_CN78XX << 8) | 0x08), 1f
+	 nop
+
+	/* 78xx or 76xx here, first check for bug #27141 */
+	PTR_LI	a5, OCTEON_SLI_CTL_STATUS
+	ld	a6, 0(a5)
+	andi	a7, a4, 0xff
+	andi	a6, a6, 0xff
+
+	beq	a6, a7, not_bug27141
+	 nop
+
+	/* core 0 proc_id rev_id field does not match SLI_CTL_STATUS rev_id */
+	/* We just hit bug #27141.  Need to reset the chip and try again */
+
+	PTR_LI	a4, OCTEON_RST_SOFT_RST
+	ori	a5, zero, 0x1	/* set the reset bit */
+
+reset_78xx_27141:
+	sync
+	synci	0(zero)
+	cache	9, 0(zero)
+	sd	a5, 0(a4)
+	wait
+	b	reset_78xx_27141
+	 nop
+
+not_bug27141:
+	/* 76XX pass 1.x has the node number set to 3 */
+	mfc0	a4, CP0_EBASE
+	ext	a4, a4, 0, 10
+	bne	a4, 0x180, 1f	/* Branch if not node 3 core 0 */
+	 nop
+
+	/* Clear OCX_COM_NODE[ID] */
+	PTR_LI	a5, OCTEON_OCX_COM_NODE
+	ld	a4, 0(a5)
+	dins	a4, zero, 0, 2
+	sd	a4, 0(a5)
+	ld	zero, 0(a5)
+
+	/* Clear L2C_OCI_CTL[GKSEGNODE] */
+	PTR_LI	a5, OCTEON_L2C_OCI_CTL
+	ld	a4, 0(a5)
+	dins	a4, zero, 4, 2
+	sd	a4, 0(a5)
+	ld	zero, 0(a5)
+
+	/* Clear CP0 Root.CvmMemCtl2[KSEGNODE] */
+	dmfc0	a4, CP0_CVMMEMCTL2
+	dins	a4, zero, 12, 2
+	dmtc0	a4, CP0_CVMMEMCTL2
+
+	/* Put the flash address in the start of the EBASE register to
+	 * enable our exception handler but only for core 0.
+	 */
+	mfc0	a4, CP0_EBASE
+	dext	a4, a4, 0, 10
+	bnez	a4, no_flash
+	/* OK in delay slot */
+	dext	a6, a6, 0, 16		/* Get the base address in flash */
+	sll	a6, a6, 16
+	mtc0	a6, CP0_EBASE	/* Enable exceptions */
+
+no_flash:
+	/* Zero out various registers */
+	mtc0	zero, CP0_DEPC
+	mtc0	zero, CP0_EPC
+	mtc0	zero, CP0_CAUSE
+	mfc0	a4, CP0_PRID
+	ext	a4, a4, 8, 8
+	mtc0	zero, CP0_DESAVE
+
+	/* The following are only available on Octeon 2 or later */
+	mtc0	zero, CP0_KSCRATCH1
+	mtc0	zero, CP0_KSCRATCH2
+	mtc0	zero, CP0_KSCRATCH3
+	mtc0	zero, CP0_USERLOCAL
+
+	/* Turn off ROMEN bit to disable ROM */
+	PTR_LI	a1, OCTEON_MIO_RST_BOOT
+	/* For OCTEON 3 we use RST_BOOT instead of MIO_RST_BOOT.
+	 * The difference is bits 24-26 are 6 instead of 0 for the address.
+	 */
+	/* For Octeon 2 and CN70XX we can ignore the watchdog */
+	blt	a4, OCTEON_PRID_CN78XX, watchdog_ok
+	 nop
+
+	PTR_LI	a1, OCTEON_RST_BOOT
+
+	beq	a4, OCTEON_PRID_CN70XX, watchdog_ok
+	 nop
+
+	ld	a2, 0(a1)
+	/* There is a bug where some registers don't get properly reset when
+	 * the watchdog timer causes a reset.  In this case we need to force
+	 * a reset.
+	 */
+	bbit0	a2, 11, watchdog_ok	/* Skip if watchdog not hit */
+	 dins	a2, zero, 2, 18	/* Don't clear LBOOT, LBOOT_EXT or LBOOT_OCI */
+	/* Clear bit indicating reset due to watchdog */
+	ori	a2, 1 << 11
+	sd	a2, 0(a1)
+
+	/* Disable watchdog */
+	PTR_LI	a1, OCTEON_CIU3_PP_POKE(0)
+	sd	zero, 0(a1)
+	PTR_LI	a1, OCTEON_CIU3_WDOG(0)
+	sd	zero, 0(a1)
+
+	/* Record this in the GSER0_SCRATCH register in bit 11 */
+	PTR_LI	a1, OCTEON_GSERX_SCRATCH(0)
+	ld	a2, 0(a1)
+	ori	a2, 1 << 11
+	sd	a2, 0(a1)
+
+	PTR_LI	a1, OCTEON_RST_SOFT_RST
+	li	a2, 1
+	sd	a2, 0(a1)
+	wait
+
+	/* We should never get here */
+
+watchdog_ok:
+	ld	a2, 0(a1)
+	/* Don't clear LBOOT/LBOOT_EXT or LBOOT_OCI */
+	dins	a2, zero, 2, 18
+	dins	a2, zero, 60, 1	/* Clear ROMEN bit */
+	sd	a2, 0(a1)
+
+	/* Start of Octeon setup */
+
+	/* Check what core we are - if core 0, branch to init tlb
+	 * loop in flash.  Otherwise, look up address of init tlb
+	 * loop that was saved in the boot vector block.
+	 */
+	mfc0	a0, CP0_EBASE
+	andi	a0, EBASE_CPUNUM		/* get core */
+	beqz	a0, InitTLBStart_local
+	 nop
+
+	break
+	/* We should never get here - non-zero cores now go directly to
+	 * tlb init from the boot stub in movable region.
+	 */
+
+	.globl InitTLBStart
+InitTLBStart:
+InitTLBStart_local:
+	/* If we don't have working memory yet configure a bunch of
+	 * scratch memory, and set the stack pointer to the top
+	 * of it.  This allows us to go to C code without having
+	 * memory set up
+	 *
+	 * Warning: do not change SCRATCH_STACK_LINES as this can impact the
+	 * transition from start.S to crti.asm. crti requires 590 bytes of
+	 * stack space.
+	 */
+	cache	1,0(zero)	/* Clear Dcache so cvmseg works right */
+#if CONFIG_OCTEON_BIG_STACK_SIZE
+	rdhwr	v0, $0
+	bnez	v0, 1f
+	 nop
+	PTR_LA	sp, big_stack_start - 16
+	b	stack_clear_done
+	 nop
+1:
+#endif
+#define SCRATCH_STACK_LINES 0x36   /* MAX is 0x36 */
+	dmfc0	v0, CP0_CVMMEMCTL
+	dins	v0, zero, 0, 9
+	/* setup SCRATCH_STACK_LINES scratch lines of scratch */
+	ori	v0, 0x100 | SCRATCH_STACK_LINES
+	dmtc0	v0, CP0_CVMMEMCTL
+	/* set stack to top of scratch memory */
+	li	sp, 0xffffffffffff8000 + (SCRATCH_STACK_LINES * 128)
+	/* Clear scratch for CN63XX pass 2.0 errata Core-15169*/
+	li	t0, 0xffffffffffff8000
+clear_scratch:
+	sd	zero, 0(t0)
+	addiu	t0, 8
+	bne	t0, sp, clear_scratch
+	 nop
+
+	/* This code runs on all cores - core 0 from flash,
+	 * the rest from DRAM.	When booting from PCI, non-zero cores
+	 * come directly here from the boot vector - no earlier code in this
+	 * file is executed.
+	 */
+
+	/* Some generic initialization is done here as well, as we need this
+	 * done on all cores even when booting from PCI
+	 */
+stack_clear_done:
+	/* Clear watch registers. */
+	mtc0	zero, CP0_WATCHLO
+	mtc0	zero, CP0_WATCHHI
+
+	/* STATUS register */
+	mfc0	k0, CP0_STATUS
+	li	k1, ~ST0_IE
+	and	k0, k1
+	mtc0	k0, CP0_STATUS
+
+	/* CAUSE register */
+	mtc0	zero, CP0_CAUSE
+
+	/* Init Timer */
+	dmtc0	zero, CP0_COUNT
+	dmtc0	zero, CP0_COMPARE
+
+
+	mfc0	a5, CP0_STATUS
+	li	v0, 0xE0		/* enable 64 bit mode for CSR access */
+	or	v0, v0, a5
+	mtc0	v0, CP0_STATUS
+
+
+	dli	v0, 1 << 29  /* Enable large physical address support in TLB */
+	mtc0	v0, CP0_PAGEGRAIN
+
+InitTLB:
+	dmtc0	zero, CP0_ENTRYLO0
+	dmtc0	zero, CP0_ENTRYLO1
+	mtc0	zero, CP0_PAGEMASK
+	dmtc0	zero, CP0_CONTEXT
+	/* Use an offset into kseg0 so we won't conflict with Mips1 legacy
+	 * TLB clearing
+	 */
+	PTR_LI	v0, 0xFFFFFFFF90000000
+	mfc0	a0, CP0_CONFIG1
+	srl	a0, a0, 25
+	/* Check if config4 reg present */
+	mfc0	a1, CP0_CONFIG3
+	bbit0	a1, 31, 2f
+	 and	a0, a0, 0x3F		/* a0 now has the max mmu entry index */
+	mfc0	a1, CP0_CONFIG4
+	bbit0	a1, 14, 2f		/* check config4[MMUExtDef] */
+	 nop
+	/* append config4[MMUSizeExt] to most significant bit of
+	 * config1[MMUSize-1]
+	 */
+	ins	a0, a1, 6, 8
+	and	a0, a0, 0x3fff	/* a0 now includes max entries for cn6xxx */
+2:
+	dmtc0	zero, CP0_XCONTEXT
+	mtc0	zero, CP0_WIRED
+
+InitTLBloop:
+	dmtc0	v0, CP0_ENTRYHI
+	tlbp
+	mfc0	v1, CP0_INDEX
+	daddiu	v0, v0, 1<<13
+	bgez	v1, InitTLBloop
+
+	mtc0	a0, CP0_INDEX
+	tlbwi
+	bnez	a0, InitTLBloop
+	 daddiu	a0, -1
+
+	mthi	zero
+	mtlo	zero
+
+	/* Set up status register */
+	mfc0	v0, CP0_STATUS
+	/* Enable COP0 and COP2 access */
+	li	a4, (1 << 28) | (1 << 30)
+	or	v0, a4
+
+	/* Must leave BEV set here, as DRAM is not configured for core 0.
+	 * Also, BEV must be 1 later on when the exception base address is set.
+	 */
+
+	/* Mask all interrupts */
+	ins	v0, zero, 0, 16
+	/* Clear NMI (used to start cores other than core 0) */
+	ori	v0, 0xE4		/* enable 64 bit, disable interrupts */
+	mtc0	v0, CP0_STATUS
+
+	dli	v0,0xE000000F		/* enable all readhw locations */
+	mtc0	v0, CP0_HWRENA
+
+	dmfc0	v0, CP0_CVMCTL
+	ori	v0, 1<<14	/* enable fixup of unaligned mem access */
+	dmtc0	v0, CP0_CVMCTL
+
+	/* Setup scratch memory.  This is also done in
+	 * cvmx_user_app_init, and this code will be removed
+	 * from the bootloader in the near future.
+	 */
+
+	/* Set L2C_TAD_CTL[MAXLFB] = 0 (skipped when PRID is below CN73XX) */
+	mfc0	a4, CP0_PRID
+	ext	a4, a4, 8, 8
+	blt	a4, OCTEON_PRID_CN73XX, 72f
+	nop
+	PTR_LI	v0, OCTEON_L2C_TAD_CTL
+	ld	t1, 0(v0)
+	dins	t1, zero, 0, 4
+	sd	t1, 0(v0)
+	ld	zero, 0(v0)
+
+72:
+
+	/* clear these to avoid immediate interrupt in noperf mode */
+	dmtc0	zero, CP0_COMPARE	/* clear timer interrupt */
+	dmtc0	zero, CP0_COUNT		/* clear timer interrupt */
+	dmtc0	zero, CP0_PERF_CNT0	/* clear perfCnt0 */
+	dmtc0	zero, CP0_PERF_CNT1	/* clear perfCnt1 */
+	dmtc0	zero, CP0_PERF_CNT2
+	dmtc0	zero, CP0_PERF_CNT3
+
+	/* If we're running on a node other than 0 then we need to set KSEGNODE
+	 * to 0.  The nice thing with this code is that it also autodetects if
+	 * we're running on a processor that supports CVMMEMCTL2 or not since
+	 * only processors that have this will have a non-zero node ID.  Because
+	 * of this there's no need to check if we're running on a 78XX.
+	 */
+	mfc0    t1, CP0_EBASE
+	dext    t1, t1, 7, 3            /* Extract node number */
+	beqz    t1, is_node0            /* If non-zero then we're not node 0 */
+	 nop
+	dmfc0   t1, CP0_CVMMEMCTL2
+	dins    t1, zero, 12, 4
+	dmtc0   t1, CP0_CVMMEMCTL2
+is_node0:
+
+	/* Set up TLB mappings for u-boot code in flash. */
+
+	/* Use a bal to get the current PC into ra.  Since this bal is to
+	 * the address immediately following the delay slot, the ra is
+	 * the address of the label.  We then use this to get the actual
+	 * address that we are executing from.
+	 */
+	bal	__dummy
+	 nop
+
+__dummy:
+	/* Get the actual address that we are running at */
+	PTR_LA	a6, _start		/* Linked address of _start */
+	PTR_LA	a7, __dummy
+	dsubu	t0, a7, a6		/* offset of __dummy label from _start*/
+	dsubu	a7, ra, t0		/* a7 now has actual address of _start*/
+
+	/* Save actual _start address in s7.  This is where we
+	 * are executing from, as opposed to where the code is
+	 * linked.
+	 */
+	move	s7, a7
+	move	s4, zero
+
+	/* s7 has actual address of _start.  If this is
+	 * on the boot bus, it will be between 0xBFC00000 and 0xBFFFFFFF.
+	 * If it is on the boot bus, use 0xBFC00000 as the physical address
+	 * for the TLB mapping, as we will be adjusting the boot bus
+	 * to make this adjustment.
+	 * If we are running from DRAM (remote-boot), then we want to use the
+	 * real address in DRAM.
+	 */
+
+	/* Check to see if we are running from flash - we expect that to
+	 * be 0xffffffffb0000000-0xffffffffbfffffff
+	 * (0x10000000-0x1fffffff, unmapped/uncached)
+	 */
+	dli	t2, 0xffffffffb0000000
+	dsubu	t2, s7
+	slt	s4, s7, t2
+	bltz	t2, uboot_in_flash
+	 nop
+
+	/* If we're not core 0 then we don't care about cache */
+	mfc0	t2, CP0_EBASE
+	andi	t2, EBASE_CPUNUM
+	bnez	t2, uboot_in_ram
+	 nop
+
+	/* Find out if we're OCTEON I or OCTEON + which don't support running
+	 * out of cache.
+	 */
+	mfc0	t2, CP0_PRID
+	ext	t2, t2, 8, 8
+	li	s4, 1
+	blt	t2, 0x90, uboot_in_ram
+	 nop
+
+	/* U-Boot can be executing either in RAM or L2 cache.  Now we need to
+	 * check if DRAM is initialized.  The way we do that is to look at
+	 * the reset bit of the LMC0_DDR_PLL_CTL register (bit 7)
+	 */
+	PTR_LI	t2, OCTEON_LMC0_DDR_PLL_CTL
+	ld	t2, 0(t2)
+	bbit1	t2, 7, uboot_in_ram
+	 nop
+
+	/* We must be executing out of cache */
+	b	uboot_in_ram
+	 li	s4, 2
+
+uboot_in_flash:
+	/* Set s4 to 4 to indicate we're running in FLASH */
+	li	s4, 4
+
+#if defined(CONFIG_OCTEON_DISABLE_L2_CACHE_INDEX_ALIASING)
+	/* By default, L2C index aliasing is enabled.  In some cases it may
+	 * need to be disabled.  The L2C index aliasing can only be disabled
+	 * if U-Boot is running out of L2 cache and the L2 cache has not been
+	 * used to store anything.
+	 */
+	PTR_LI	t1, OCTEON_L2C_CTL
+	ld	t2, 0(t1)
+	ori	t2, 1
+	sd	t2, 0(t1)
+#endif
+
+	/* Use BFC00000 as physical address for TLB mappings when booting
+	 * from flash, as we will adjust the boot bus mappings to make this
+	 * mapping correct.
+	 */
+	dli	a7, 0xFFFFFFFFBFC00000
+	dsubu	s6, s7, a7  /* Save flash offset in s6 */
+
+#if defined(CONFIG_OCTEON_COPY_FROM_FLASH_TO_L2)
+	/* For OCTEON II we check to see if the L2 cache is big enough to hold
+	 * U-Boot.  If it is big enough then we copy ourself from flash to the
+	 * L2 cache in order to speed up execution.
+	 */
+
+	/* Check for OCTEON 2 */
+	mfc0	t1, CP0_PRID
+	ext	t1, t1, 8, 8
+	/* Get number of L2 cache sets */
+	beq	t1, OCTEON_PRID_CNF71XX, got_l2_sets	/* CNF71XX */
+	 li	t2, 1 << 9
+	beq	t1, OCTEON_PRID_CN78XX, got_l2_sets	/* CN78XX */
+	 li	t2, 1 << 13
+	beq	t1, OCTEON_PRID_CN70XX, got_l2_sets	/* CN70XX */
+	 li	t2, 1 << 10
+	beq	t1, OCTEON_PRID_CN73XX, got_l2_sets	/* CN73XX */
+	 li	t2, 1 << 11
+	beq	t1, OCTEON_PRID_CNF75XX, got_l2_sets	/* CNF75XX */
+	 li	t2, 1 << 11
+	b	l2_cache_too_small	/* Unknown OCTEON model */
+	 nop
+
+got_l2_sets:
+	/* Get number of associations */
+	PTR_LI	t0, OCTEON_MIO_FUSE_DAT3
+	ld	t0, 0(t0)
+	dext	t0, t0, 32, 3
+
+	beq	t1, OCTEON_PRID_CN70XX, process_70xx_l2sets
+	 nop
+	/* 0 = 16-way, 1 = 12-way, 2 = 8-way, 3 = 4-way, 4-7 reserved */
+	beqz	t0, got_l2_ways
+	 li	t3, 16
+	beq	t0, 1, got_l2_ways
+	 li	t3, 12
+	beq	t0, 2, got_l2_ways
+	 li	t3, 8
+	beq	t0, 3, got_l2_ways
+	 li	t3, 4
+	b	l2_cache_too_small
+	 nop
+
+process_70xx_l2sets:
+	/* For 70XX, the number of ways is defined as:
+	 * 0 - full cache (4-way) 512K
+	 * 1 - 3/4 ways (3-way) 384K
+	 * 2 - 1/2 ways (2-way) 256K
+	 * 3 - 1/4 ways (1-way) 128K
+	 * 4-7 illegal (aliased to 0-3)
+	 */
+	andi	t0, 3
+	beqz	t0, got_l2_ways
+	 li	t3, 4
+	beq	t0, 1, got_l2_ways
+	 li	t3, 3
+	beq	t0, 2, got_l2_ways
+	 li	t3, 2
+	li	t3, 1
+
+got_l2_ways:
+	dmul	a1, t2, t3		/* Calculate cache size */
+	dsll	a1, 7			/* Ways * Sets * cache line sz (128) */
+	daddiu	a1, a1, -128		/* Adjust cache size for copy code */
+
+	/* Calculate size of U-Boot image */
+	/*
+	 * "uboot_end - _start" is not correct, as the image also
+	 * includes the DTB appended to the end (OF_EMBED is deprecated).
+	 * Lets use a defined max for now here.
+	 */
+	PTR_LI	s5, CONFIG_BOARD_SIZE_LIMIT
+
+	daddu	t2, s5, s7	/* t2 = end address */
+	daddiu	t2, t2, 127
+	ins	t2, zero, 0, 7	/* Round up to cache line for memcpy */
+
+	slt	t1, a1, s5	/* See if we're bigger than the L2 cache */
+	bnez	t1, l2_cache_too_small
+	 nop
+	/* Address we plan to load at in the L2 cache */
+	PTR_LI	t9, CONFIG_OCTEON_L2_UBOOT_ADDR
+# ifdef CONFIG_OCTEON_L2_MEMCPY_IN_CACHE
+	/* Enable all ways for PP0.  Authentik ROM may have disabled these */
+	PTR_LI	a1, OCTEON_L2C_WPAR_PP0
+	sd	zero, 0(a1)
+
+	/* Address to place our memcpy code */
+	PTR_LI	a0, CONFIG_OCTEON_L2_MEMCPY_ADDR
+	/* The following code writes a simple memcpy routine into the cache
+	 * to copy ourself from flash into the L2 cache.  This makes the
+	 * memcpy routine a lot faster since each instruction can potentially
+	 * require four read cycles to flash over the boot bus.
+	 */
+	/* Zero cache line in the L2 cache */
+	zcb	(a0)
+	synci	0(zero)
+	dli	a1, 0xdd840000dd850008	/* ld a0, 0(t0);  ld a1, 8(t0) */
+	sd	a1, 0(a0)
+	dli	a1, 0xdd860010dd870018	/* ld a2, 16(t0); ld a3, 24(t0) */
+	sd	a1, 8(a0)
+	dli	a1, 0xfda40000fda50008	/* sd a0, 0(t1);  sd a1, 8(t1) */
+	sd	a1, 16(a0)
+	dli	a1, 0xfda60010fda70018	/* sd a2, 16(t1); sd a3, 24(t1) */
+	sd	a1, 24(a0)
+	dli	a1, 0x258c0020158efff6	/* addiu t0, 32; bne t0, t2, -40 */
+	sd	a1, 32(a0)
+	dli	a1, 0x25ad002003e00008	/* addiu t1, 32; jr ra */
+	sd	a1, 40(a0)
+	sd	zero, 48(a0)		/* nop; nop */
+
+	/* Synchronize the caches */
+	sync
+	synci	0(zero)
+
+	move	t0, s7
+	move	t1, t9
+
+	/* Do the memcpy operation in L2 cache to copy ourself from flash
+	 * to the L2 cache.
+	 */
+	jalr	a0
+	 nop
+
+# else
+	/* Copy ourself to the L2 cache from flash, 32 bytes at a time */
+	/* This code is now written to the L2 cache using the code above */
+1:
+	ld	a0, 0(t0)
+	ld	a1, 8(t0)
+	ld	a2, 16(t0)
+	ld	a3, 24(t0)
+	sd	a0, 0(t1)
+	sd	a1, 8(t1)
+	sd	a2, 16(t1)
+	sd	a3, 24(t1)
+	addiu	t0, 32
+	bne	t0, t2, 1b
+	addiu	t1, 32
+# endif	/* CONFIG_OCTEON_L2_MEMCPY_IN_CACHE */
+
+	/* Adjust the start address of U-Boot and the global pointer */
+	subu	t0, s7, t9	/* t0 = address difference */
+	move	s7, t9		/* Update physical address */
+	move	s2, t9
+	sync
+	synci	0(zero)
+
+	/* Now we branch to the L2 cache.  We first get our PC then adjust it
+	 */
+	bal	3f
+	 nop
+3:
+	/* Don't add any instructions here! */
+	subu	t9, ra, t0
+	/* Give ourself 16 bytes */
+	addiu	t9, 0x10
+
+	jal	t9		/* Branch to address in L2 cache */
+
+	 nop
+	nop
+	/* Add instructions after here */
+
+	move	a7, s7
+
+	b	uboot_in_ram
+	 ori	s4, 2		/* Running out of L2 cache */
+
+l2_cache_too_small:	/* We go here if we can't copy ourself to L2 */
+#endif /* CONFIG_OCTEON_COPY_FROM_FLASH_TO_L2 */
+
+	/* This code is only executed if booting from flash. */
+	/*  For flash boot (_not_ RAM boot), we do a workaround for
+	 * an LLM errata on CN38XX and CN58XX parts.
+	 */
+
+uboot_in_ram:
+	/* U-boot address is now in reg a7, and is 4 MByte aligned.
+	 * (boot bus addressing has been adjusted to make this happen for flash,
+	 * and for DRAM this alignment must be provided by the remote boot
+	 * utility).
+	 */
+	/* See if we're in KSEG0 range, if so set EBASE register to handle
+	 * exceptions.
+	 */
+	dli	a1, 0x20000000
+	bge	a7, a1, 1f
+	 nop
+	/* Convert our physical address to KSEG0 */
+	PTR_LI	a1, 0xffffffff80000000
+	or	a1, a1, a7
+	mtc0	a1, CP0_EBASE
+1:
+	/* U-boot now starts at 0xBFC00000.  Use a single 4 MByte TLB mapping
+	 * to map u-boot.
+	 */
+	move	a0, a6		/* Virtual addr in a0 */
+	dins	a0, zero, 0, 16	/* Zero out offset bits */
+	move	a1, a7		/* Physical addr in a1 */
+
+	/* Now we need to remove the MIPS address space bits.  For this we
+	 * need to determine if it is a 32 bit compatibility address or not.
+	 */
+
+	/* 'lowest' address in compatibility space */
+	PTR_LI	t0, 0xffffffff80000000
+	dsubu	t0, t0, a1
+	bltz	t0, compat_space
+	 nop
+
+	/* We have a xkphys address, so strip off top bit */
+	b	addr_fixup_done
+	 dins	a1, zero, 63, 1
+
+compat_space:
+	PTR_LI	a2, 0x1fffffff
+	and	a1, a1, a2  /* Mask phy addr to remove address space bits */
+
+addr_fixup_done:
+	/* Currently the u-boot image size is limited to 4 MBytes.  In order to
+	 * support larger images the flash mapping will need to be changed to
+	 * be able to access more than that before C code is run.  Until that
+	 * is done, we just use a 4 MByte mapping for the secondary cores as
+	 * well.
+	 */
+	/* page size (only support 4 Meg binary size for now for core 0)
+	 * This limitation is due to the fact that the boot vector is
+	 * 0xBFC00000 which only makes 4MB available.  Later more flash
+	 * address space will be available after U-Boot has been copied to
+	 * RAM.	 For now assume that it is in flash.
+	 */
+	li	a2, 2*1024*1024
+
+	mfc0	a4, CP0_EBASE
+	andi	a4, EBASE_CPUNUM		/* get core */
+	beqz	a4, core_0_tlb
+	 nop
+
+	/* Now determine how big a mapping to use for secondary cores,
+	 * which need to map all of u-boot + heap in DRAM
+	 */
+	/* Here we look at the alignment of the physical address,
+	 * and use the largest page size possible.  In some cases
+	 * this can result in an oversize mapping, but for secondary cores
+	 * this mapping is very short lived.
+	 */
+
+	/* Physical address in a1 */
+	li	a2, 1
+1:
+	sll	a2, 1
+	and	a5, a1, a2
+	beqz	a5, 1b
+	 nop
+
+	/* a2 now contains largest page size we can use */
+core_0_tlb:
+	JAL(single_tlb_setup)
+
+	/* Check if we're running from cache */
+	bbit1	s4, 1, uboot_in_cache
+	 nop
+
+	/* If we are already running from ram, we don't need to muck
+	 * with boot bus mappings.
+	 */
+	PTR_LI	t2, 0xffffffffb0000000
+	dsubu	t2, s7
+	/* See if our starting address is lower than the boot bus */
+	bgez	t2, uboot_in_ram2	/* If yes, booting from RAM */
+	 nop
+
+uboot_in_cache:
+#if CONFIG_OCTEON_BIG_STACK_SIZE
+	/* The large stack is only for core 0.  For all other cores we need to
+	 * use the L1 cache otherwise the other cores will stomp on top of each
+	 * other unless even more space is reserved for the stack space for
+	 * each core.  With potentially 96 cores this gets excessive.
+	 */
+	mfc0	v0, CP0_EBASE
+	/* Extract the core number from the EBASE value just read.  The mask
+	 * must be applied to v0; masking a0 in place would test whatever
+	 * stale value a0 happened to hold instead of the core number.
+	 */
+	andi	a0, v0, EBASE_CPUNUM
+	bnez	a0, no_big_stack	/* Only core 0 switches stacks */
+	 nop
+	/* Point sp 16 bytes below the top of the big stack */
+	PTR_LA	sp, big_stack_start
+	daddiu	sp, -16
+
+no_big_stack:
+#endif
+	/* We now have the TLB set up, so we need to remap the boot bus.
+	 * This is tricky, as we are running from flash, and will be changing
+	 * the addressing of the flash.
+	 */
+	/* Enable movable boot bus region 0, at address 0x10000000 */
+	PTR_LI	a4, OCTEON_MIO_BOOT_BASE
+	dli	a5, 0x81000000	/* EN + base address 0x10000000 */
+	sd	a5, OCTEON_MIO_BOOT_LOC_CFG0_OFF(a4)
+
+	/* Copy the code that remaps the boot bus into the movable region */
+	sd	zero, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
+
+	PTR_LA	a6, change_boot_mappings
+	GETOFFSET(a5, change_boot_mappings);
+	daddu	a5, a5, a6
+
+	/* The code is 16 bytes (2 DWORDS) */
+	ld	a7, 0(a5)
+	sd	a7, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
+	ld	a7, 8(a5)
+	sd	a7, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
+
+	/* Read from an RML register to ensure that the previous writes have
+	 * completed before we branch to the movable region.
+	 */
+	ld	zero, OCTEON_MIO_BOOT_LOC_CFG0_OFF(a4)
+
+	/* Compute value for boot bus configuration register */
+	/* Read region 0 config so we can _modify_ the base address field */
+	PTR_LI	a4, OCTEON_MIO_BOOT_REG_CFG0	/* region 0 config */
+	ld	a0, 0(a4)
+	dli	a4, 0xf0000000		/* Mask off bits we want to save */
+	and	a4, a4, a0
+	dli	a0, 0x0fff0000		/* Force size to max */
+	or	a4, a4, a0
+
+	move	a5, s6
+	/* Convert to 64k blocks, as used by boot bus config */
+	srl	a5, 16
+	li	a6, 0x1fc0	/* 'normal' boot bus base config value */
+	subu	a6, a6, a5	/* Subtract offset */
+	/* combine into register value to pass to boot bus routine */
+	or	a0, a4, a6
+
+	/* Branch there */
+	PTR_LA	a1, __mapped_continue_label
+	PTR_LI	a2, OCTEON_MIO_BOOT_REG_CFG0
+	/* If region 0 is not enabled we can skip it */
+	ld	a4, 0(a2)
+	bbit0	a4, 31, __mapped_continue_label
+	 nop
+	li	a4, 0x10000000
+	j	a4
+	 synci	0(zero)
+
+	/* We never get here, as we go directly to __mapped_continue_label */
+	break
+
+
+uboot_in_ram2:
+
+	/* Now jump to address in TLB mapped memory to continue execution */
+	PTR_LA	a4, __mapped_continue_label
+	synci	0(a4)
+	j	a4
+	 nop
+
+__mapped_continue_label:
+	/* Check if we are core 0, if we are not then we need
+	 * to vector to code in DRAM to do application setup, and
+	 * skip the rest of the bootloader.  Only core 0 runs the bootloader
+	 * and sets up the tables that the other cores will use for
+	 * configuration.
+	 */
+	mfc0	a0, CP0_EBASE
+	andi	a0, EBASE_CPUNUM   /* get core */
+	/* if (__all_cores_are_equal==0 && core==0),
+	 * then jump to execute BL on core 0; else 'go to next line'
+	 * (core_0_cont1 is executed ONLY when k0=a0=0(core0_ID))
+	 */
+	lw	t0, __all_cores_are_equal
+	beq	a0, t0, core_0_cont1
+	 nop
+
+	/* other cores look up addr from dram */
+        /* DRAM controller already set up by first core */
+        li      a1, (BOOT_VECTOR_NUM_WORDS * 4)
+        mul     a0, a0, a1
+
+        /* Now find out the boot vector base address from the moveable boot
+         * bus region.
+         */
+
+        /* Get the address of the boot bus moveable region */
+        PTR_LI     t8, OCTEON_MIO_BOOT_BASE
+        ld      t9, OCTEON_MIO_BOOT_LOC_CFG0_OFF(t8)
+        /* Make sure it's enabled */
+        bbit0   t9, 31, invalid_boot_vector
+         dext   t9, t9, 3, 24
+        dsll    t9, t9, 7
+        /* Make address XKPHYS */
+	li	t0, 1
+	dins	t9, t0, 63, 1
+
+        ld      t0, OCTEON_BOOT_MOVEABLE_MAGIC_OFFSET(t9)
+        dli     t1, OCTEON_BOOT_MOVEABLE_MAGIC1
+        bne     t0, t1, invalid_boot_vector
+         nop
+
+        /* Load base address of boot vector table */
+        ld      t0, OCTEON_BOOT_VECTOR_MOVEABLE_OFFSET(t9)
+        /* Add offset for core */
+        daddu   a1, t0, a0
+
+	mfc0	v0, CP0_STATUS
+	move	v1, v0
+	ins	v1, zero, 19, 1		/* Clear NMI bit */
+	mtc0	v1, CP0_STATUS
+
+        /* Get app start function address */
+        lw      t9, 8(a1)
+        beqz    t9, invalid_boot_vector
+         nop
+
+        j       t9
+         lw      k0, 12(a1)      /* Load global data (deprecated) */
+
+invalid_boot_vector:
+        wait
+        b       invalid_boot_vector
+         nop
+
+__all_cores_are_equal:
+	/* The following .word tells if 'all_cores_are_equal' or core0 is special
+	 * By default (for the first execution) the core0 should be special,
+	 * in order to behave like the old(existing not-modified) bootloader
+	 * and run the bootloader on core 0 to follow the existing design.
+	 * However after that we make 'all_cores_equal' which allows to run SE
+	 * applications on core0 like on any other core. NOTE that value written
+	 * to '__all_cores_are_equal' should not match any core ID.
+	 */
+	.word 	0
+
+core_0_cont1:
+	li	t0, 0xffffffff
+	sw	t0, __all_cores_are_equal
+	/* From here on, only core 0 runs, other cores have branched
+	 * away.
+	 */
+#ifdef CONFIG_MIPS_INIT_STACK_IN_SRAM
+	/* Set up initial stack and global data */
+	setup_stack_gd
+# ifdef CONFIG_DEBUG_UART
+	PTR_LA	t9, debug_uart_init
+	jalr	t9
+	 nop
+# endif
+#endif
+	move	a0, zero		# a0 <-- boot_flags = 0
+	PTR_LA	t9, board_init_f
+
+	jr	t9
+	 move	ra, zero
+	END(_start)
+
+	.balign	8
+	.globl	single_tlb_setup
+	.ent	single_tlb_setup
+	/* Sets up a single TLB entry (one EntryLo0/EntryLo1 page pair) in
+	 * the highest-numbered TLB slot.  Virtual/physical addresses
+	 * must be properly aligned.
+	 *
+	 * Inputs:
+	 * a0  Virtual address
+	 * a1  Physical address
+	 * a2  page (_not_ mapping) size; the mapping covers two pages
+	 *
+	 * On return:
+	 * a0  input a0 advanced by the mapping size (2 * a2)
+	 * a1  EntryLo-formatted value advanced past both pages
+	 *
+	 * Clobbers a3, a4, a5 and a6.  Overwrites CP0 PageMask, Index,
+	 * EntryLo0, EntryLo1 and EntryHi.
+	 */
+single_tlb_setup:
+	/* Determine the number of TLB entries available, and
+	 * use the top one.
+	 */
+	mfc0	a3, CP0_CONFIG1
+	dext	a3, a3, 25, 6		/* a3 now has the max mmu entry index */
+	mfc0	a5, CP0_CONFIG3		/* Check if config4 reg present */
+	bbit0	a5, 31, single_tlb_setup_cont
+	 nop
+	mfc0	a5, CP0_CONFIG4
+	bbit0	a5, 14, single_tlb_setup_cont	/* check config4[MMUExtDef] */
+	 nop
+	/* append config4[MMUSizeExt] to most significant bit of
+	 * config1[MMUSize-1]
+	 */
+	dins	a3, a5, 6, 8
+	and	a3, a3, 0x3fff	/* a3 now includes max entries for cn6xxx */
+
+single_tlb_setup_cont:
+
+	/* Format physical address for entry low */
+	nop
+	dsrl	a1, a1, 12	/* drop the low 12 address bits */
+	dsll	a1, a1, 6	/* move the page frame number up to bit 6 */
+	ori	a1, a1, 0x7	/* set DVG bits */
+
+	/* Derive the values needed from the page size in a2 */
+	move	a4, a2
+	daddu	a5, a4, a4	/* mapping size */
+	dsll	a6, a4, 1
+	daddiu	a6, a6, -1	/* pagemask */
+	dsrl	a4, a4, 6	/* adjust for adding with entrylo */
+
+	/* Now set up mapping */
+	mtc0	a6, CP0_PAGEMASK
+	mtc0	a3, CP0_INDEX
+
+	/* First page of the pair */
+	dmtc0	a1, CP0_ENTRYLO0
+	daddu	a1, a1, a4
+
+	/* Second page of the pair, one page further on */
+	dmtc0	a1, CP0_ENTRYLO1
+	daddu	a1, a1, a4	/* leave a1 pointing past this mapping */
+
+	dmtc0	a0, CP0_ENTRYHI
+	daddu	a0, a0, a5	/* leave a0 pointing past this mapping */
+
+	ehb			/* barrier between the CP0 writes and tlbwi */
+	tlbwi			/* write the entry selected by Index */
+	jr  ra
+	 nop
+	.end   single_tlb_setup
+
+
+/**
+ * This code is moved to a movable boot bus region,
+ * and it is responsible for changing the flash mappings and
+ * jumping to run from the TLB mapped address.
+ *
+ * The startup code copies this routine as exactly two doublewords
+ * (16 bytes, i.e. the four instructions below), so it must not grow.
+ * It does not return to its caller; it leaves by jumping to a1.
+ *
+ * @param a0	New value to store to the boot bus region 0 config register
+ * @param a1	Address to branch to afterwards
+ * @param a2	Address of MIO_BOOT_REG_CFG0
+ */
+	.balign	8
+change_boot_mappings:
+	sd	a0, 0(a2)	/* Store the new region 0 configuration */
+	sync
+	j a1	    /* Jump to new TLB mapped location */
+	 synci	0(zero)		/* Executes in the jump's delay slot */
+
+/* If we need a large stack, allocate it here.
+ *
+ * big_stack_end labels the lowest address of the area and
+ * big_stack_start the highest; the startup code sets sp to
+ * big_stack_start - 16.
+ */
+#if CONFIG_OCTEON_BIG_STACK_SIZE
+	/* Allocate the stack here so it's in L2 cache or DRAM */
+	.balign	16
+big_stack_end:
+	/* CONFIG_OCTEON_BIG_STACK_SIZE bytes of zero-filled stack space */
+	.skip	CONFIG_OCTEON_BIG_STACK_SIZE, 0
+big_stack_start:
+	/* One zeroed doubleword placed at the big_stack_start label */
+	.dword	0
+#endif