@@ -44,9 +44,11 @@ config X86_RUN_64BIT_NO_SPL
bool "64-bit"
select X86_64
help
- Build U-Boot as a 64-bit binary without SPL. As U-Boot enters
- in 64-bit mode, the assumption is that the silicon is fully
- initialized (MP, page tables, etc.).
+ Build U-Boot as a single 64-bit binary without SPL. If the
+ board selects X86_RESET_VECTOR, the binary includes 16-bit
+ and 32-bit startup code that transitions to 64-bit mode
+	  before running any C code. Otherwise U-Boot starts directly
+	  in 64-bit mode and assumes the silicon (MP, page tables,
+	  etc.) is already fully initialized (e.g. when launched from
+	  coreboot).
endchoice
@@ -2,7 +2,11 @@
ifeq ($(CONFIG_EFI_APP),)
ifdef CONFIG_$(PHASE_)X86_64
+ifeq ($(CONFIG_$(PHASE_)X86_16BIT_INIT),y)
+head-y := arch/x86/cpu/start_from_32.o
+else
head-y := arch/x86/cpu/start64.o
+endif
else
ifeq ($(CONFIG_$(PHASE_)X86_16BIT_INIT),y)
head-y := arch/x86/cpu/start.o
@@ -7,7 +7,11 @@
# Daniel Engström, Omicron Ceti AB, daniel@omicron.se.
ifeq ($(CONFIG_$(PHASE_)X86_64),y)
+ifeq ($(CONFIG_$(PHASE_)X86_16BIT_INIT),y)
+extra-y = start_from_32.o
+else
extra-y = start64.o
+endif
else
ifeq ($(CONFIG_$(PHASE_)X86_16BIT_INIT),y)
extra-y = start.o
new file mode 100644
@@ -0,0 +1,248 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * 64-bit x86 Startup Code with integrated 32-bit init
+ *
+ * The entry point _start is .code32 and is called from start16.S
+ * after the 16-to-32-bit transition. It sets up identity-mapped page
+ * tables and transitions to 64-bit mode before calling into the
+ * normal board_init_f() flow.
+ *
+ * The 32-bit section uses position-independent code (call/pop for the
+ * instruction pointer) because the 64-bit binary is linked as PIE.
+ *
+ * Copyright 2026 Canonical Ltd
+ * Written by Simon Glass <simon.glass@canonical.com>
+ */
+
+#include <config.h>
+#include <asm/msr-index.h>
+#include <asm/processor.h>
+#include <asm/processor-flags.h>
+
+/*
+ * Page-table base address - must be 4KB aligned and below 4GB.
+ * Uses 24KB total: PML4 (4KB) + PDPT (4KB) + 4 PD tables (4KB each)
+ */
+#define PT_BASE 0x80000
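+/* PT_BASE..PT_BASE + 0x6000 (0x80000..0x86000) stays below the 640KB boundary */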
+
+/* ------------------------------------------------------------------ */
+
+.section .text.start
+.code32
+.globl _start
+.type _start, @function
+_start:
+ /* Load the segment registers to match the GDT loaded in start16.S */
+ movl $(X86_GDT_ENTRY_32BIT_DS * X86_GDT_ENTRY_SIZE), %eax
+ movl %eax, %ds
+	movl	%eax, %es
+	movl	%eax, %fs
+	movl	%eax, %gs
+	movl	%eax, %ss
+
+ /* Set up the stack in the CAR/SRAM region */
+ movl $(CONFIG_SYS_CAR_ADDR + CONFIG_SYS_CAR_SIZE - 4), %esp
+
+ /* Clear IDT */
+ subl $8, %esp
+	movl	$0, 4(%esp)		/* base[31:16], plus 2 pad bytes */
+	movw	$0, 2(%esp)		/* base[15:0] */
+	movw	$0, (%esp)		/* limit = 0 */
+ lidt (%esp)
+ addl $8, %esp
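+
+	/*
+	 * Any interrupt or exception taken with this zero-limit IDT
+	 * triple-faults and resets the CPU, rather than jumping through
+	 * a stale real-mode vector.
+	 */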
+
+ /*
+ * Get our runtime address into %ebx so we can reference data
+ * position-independently (the 64-bit binary is linked as PIE)
+ */
+ call 2f
+2: popl %ebx
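+	/*
+	 * %ebx now holds the runtime address of label 2, so any symbol
+	 * can be addressed position-independently as (sym - 2b)(%ebx).
+	 */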
+
+ /*
+ * Copy the boot GDT from ROM to RAM and load it from there.
+ * KVM's EPT may not allow data reads from the ROM region, so
+	 * the GDT must be in RAM for the far return (lret) below to
+	 * read the 64-bit CS descriptor.
+ */
+#define GDT_RAM 0x2000
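+/* assumed free scratch in low conventional RAM, clear of PT_BASE */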
+ leal (boot_gdt - 2b)(%ebx), %esi
+ movl $GDT_RAM, %edi
+ movl $((boot_gdt_end - boot_gdt) / 4), %ecx
+ cld
+ rep movsl
+
+ subl $8, %esp
+ movl $GDT_RAM, 2(%esp) /* base in RAM */
+ movw $(boot_gdt_end - boot_gdt - 1), (%esp) /* limit */
+ lgdt (%esp)
+ addl $8, %esp
+
+ /*
+ * Build identity-mapped page tables at PT_BASE (maps 4GB with
+ * 2MB pages). This is similar to build_pagetable() in
+	 * arch/x86/cpu/i386/cpu.c (which also sets the US/A/D bits)
+ * but must be done in assembly because page tables are needed
+ * to enter 64-bit mode and all C code in this build is compiled
+ * for 64-bit.
+ *
+ * Layout (24KB total):
+ * PT_BASE + 0x0000 PML4 (512 entries, only [0] used)
+ * PT_BASE + 0x1000 PDPT (512 entries, [0]..[3] used)
+ * PT_BASE + 0x2000 PD for 0-1GB (512 * 2MB entries)
+ * PT_BASE + 0x3000 PD for 1-2GB
+ * PT_BASE + 0x4000 PD for 2-3GB
+ * PT_BASE + 0x5000 PD for 3-4GB
+ */
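+	/*
+	 * Example encoding: the first entry of the 3-4GB PD (at
+	 * PT_BASE + 0x5000) maps the 2MB page at 0xc0000000, so it
+	 * holds 0xc0000083 (address | PS | RW | P).
+	 */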
+
+ /* Zero 24KB */
+ movl $PT_BASE, %edi
+ xorl %eax, %eax
+ movl $(6 * 4096 / 4), %ecx
+ rep stosl
+
+ /* PML4[0] -> PDPT */
+ movl $(PT_BASE + 0x1000 + 0x03), %eax /* Present + RW */
+ movl %eax, PT_BASE
+
+ /* PDPT[0..3] -> four PD tables */
+ movl $(PT_BASE + 0x2000 + 0x03), %eax
+ movl %eax, (PT_BASE + 0x1000 + 0 * 8)
+ addl $0x1000, %eax
+ movl %eax, (PT_BASE + 0x1000 + 1 * 8)
+ addl $0x1000, %eax
+ movl %eax, (PT_BASE + 0x1000 + 2 * 8)
+ addl $0x1000, %eax
+ movl %eax, (PT_BASE + 0x1000 + 3 * 8)
+
+ /*
+ * Fill the four PD tables (2048 entries total).
+ * Each entry maps a 2MB page: address | PS(bit7) | RW | P
+ */
+ movl $(PT_BASE + 0x2000), %edi
+ movl $0x00000083, %eax /* 0MB, PS + RW + P */
+ movl $2048, %ecx
+1:
+ movl %eax, (%edi)
+ movl $0, 4(%edi) /* high 32 bits = 0 */
+ addl $0x200000, %eax /* next 2MB page */
+ addl $8, %edi
+ decl %ecx
+ jnz 1b
+
+ /*
+ * Transition to 64-bit long mode. This is similar to
+ * cpu_call64() in arch/x86/cpu/i386/call64.S but uses lret
+ * instead of ljmp (which would emit a PIE-incompatible
+ * relocation). It also enables SSE which call64.S does not
+ * need to do.
+ */
+
+ /* Disable paging (should already be off after reset) */
+ movl %cr0, %eax
+ andl $~X86_CR0_PG, %eax
+ movl %eax, %cr0
+
+ /* Enable PAE and SSE (x86_64 gcc assumes SSE2 is available) */
+ movl %cr4, %eax
+ orl $(X86_CR4_PAE | X86_CR4_OSFXSR), %eax
+ movl %eax, %cr4
+
+ /* Clear CR0.EM so SSE instructions do not fault */
+ movl %cr0, %eax
+ andl $~X86_CR0_EM, %eax
+ movl %eax, %cr0
+
+ /* Point CR3 at PML4 */
+ movl $PT_BASE, %eax
+ movl %eax, %cr3
+
+ /* Enable Long Mode in EFER */
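+	/* (EFER is MSR 0xc0000080; LME is bit 8) */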
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ /* Enable paging -> activates long mode */
+ movl %cr0, %eax
+ orl $X86_CR0_PG, %eax
+ movl %eax, %cr0
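+
+	/*
+	 * The CPU is now in IA-32e compatibility mode; the far return
+	 * below loads the 64-bit code segment to complete the switch.
+	 */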
+
+ /*
+ * Jump to 64-bit code segment. Use lret to avoid the
+ * PIE-incompatible relocation that a direct ljmp would emit.
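+	 * lret pops the new %eip first and the CS selector second,
+	 * which is why the selector is pushed before the entry point.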
+ */
+ leal (start64 - 2b)(%ebx), %eax
+ pushl $(X86_GDT_ENTRY_64BIT_CS * X86_GDT_ENTRY_SIZE)
+ pushl %eax
+ lret
+
+/* ------------------------------------------------------------------ */
+.code64
+start64:
+ /* Set up memory using the existing stack */
+ mov %rsp, %rdi
+ call board_init_f_alloc_reserve
+	mov	%rax, %rsp
+
+	/* board_init_f_init_reserve() takes the reserve base in %rdi */
+	mov	%rax, %rdi
+	call	board_init_f_init_reserve
+
+ xor %rdi, %rdi
+ call board_init_f
+ call board_init_f_r
+
+ /* Should not return here */
+ jmp .
+
+.globl board_init_f_r_trampoline64
+.type board_init_f_r_trampoline64, @function
+board_init_f_r_trampoline64:
+ /*
+ * SDRAM has been initialised, U-Boot code has been copied into
+ * RAM, BSS has been cleared and relocation adjustments have been
+ * made. It is now time to jump into the in-RAM copy of U-Boot
+ *
+ * %rsi = Address of top of new stack
+ * %rdi = New gd
+ */
+
+ /* Stack grows down from top of SDRAM */
+ movq %rsi, %rsp
+
+	/* Re-enter U-Boot by calling board_init_f_r() */
+	call	board_init_f_r
+
+	/* Should not return here */
+	jmp	.
+
+/* ------------------------------------------------------------------ */
+/* Data */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Boot GDT - includes valid 32-bit CS/DS entries (matching start16.S's
+ * selectors 0x10 and 0x18) plus the 64-bit CS at entry 9 (selector
+ * 0x48, matching U-Boot's standard GDT numbering).
+ *
+ * This is copied to RAM before use because KVM cannot perform the
+ * implicit GDT data read from the ROM region during the far jump
+ * to 64-bit mode.
+ *
+ * When arch_setup_gd() later loads the real GDT the CS selector (0x48)
+ * remains valid.
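+ *
+ * Descriptor encoding, for reference: 0x00cf9b000000ffff is base 0,
+ * limit 0xfffff with 4KB granularity, access byte 0x9b (present,
+ * ring 0, readable code). The 64-bit CS (0x00af9a...) differs in the
+ * flags nibble, where L=1 selects long mode.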
+ */
+.align 16
+boot_gdt:
+ /* Entry 0: NULL */
+ .quad 0
+ /* Entry 1: unused (matches start16.S layout) */
+ .quad 0
+ /* Entry 2: 32-bit code segment (selector 0x10) */
+ .quad 0x00cf9b000000ffff
+ /* Entry 3: 32-bit data segment (selector 0x18) */
+ .quad 0x00cf93000000ffff
+ /* Entries 4-8: unused */
+ .fill 5, 8, 0
+
+ /* Entry 9: 64-bit code segment (selector 0x48) */
+ .quad 0x00af9a000000ffff
+
+	/* Entries 10-11: unused (keep the GDT the same size as the real one) */
+ .quad 0
+ .quad 0
+boot_gdt_end:
@@ -12,7 +12,7 @@
u-boot-spl-with-ucode-ptr {
optional-ucode;
};
-#else
+#elif !defined(CONFIG_X86_64)
u-boot-with-ucode-ptr {
optional-ucode;
};