[Concept,08/12] x86: Add 32-to-64-bit startup code

Message ID 20260211143309.1183113-9-sjg@u-boot.org
State New
Headers
Series x86: Add single 64-bit U-Boot without SPL for QEMU |

Commit Message

Simon Glass Feb. 11, 2026, 2:32 p.m. UTC
  From: Simon Glass <simon.glass@canonical.com>

Add start_from_32.S which, together with start16.S, provides a
complete startup path from the 16-bit reset vector through to
64-bit board_init_f(). The entry point is .code32, called from
start16.S after the 16-to-32-bit transition. It builds
identity-mapped page tables with 2MB pages, enables PAE and SSE,
then transitions to long mode.

The 32-bit section uses position-independent code (call/pop for
the instruction pointer) because the 64-bit binary is linked as
PIE. The far jump to 64-bit mode uses lret rather than ljmp to
avoid PIE-incompatible relocations.

The boot GDT is copied from ROM to RAM before use because KVM
cannot perform the implicit GDT data read from the ROM region
during the far return (lret) that loads the 64-bit CS descriptor.

Update the Makefiles to select start_from_32.o when X86_64 and
X86_16BIT_INIT are both enabled, update the Kconfig help text to
match, and skip the ucode-ptr binman entry for x86_64 non-SPL builds.

Co-developed-by: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Simon Glass <simon.glass@canonical.com>
---

 arch/x86/Kconfig                   |   8 +-
 arch/x86/Makefile                  |   4 +
 arch/x86/cpu/Makefile              |   4 +
 arch/x86/cpu/start_from_32.S       | 248 +++++++++++++++++++++++++++++
 arch/x86/dts/emulation-u-boot.dtsi |   2 +-
 5 files changed, 262 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/cpu/start_from_32.S

-- 
2.43.0
  

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index db3f16a7222..4fdfe5b90fe 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -44,9 +44,11 @@  config X86_RUN_64BIT_NO_SPL
 	bool "64-bit"
 	select X86_64
 	help
-          Build U-Boot as a 64-bit binary without SPL. As U-Boot enters
-          in 64-bit mode, the assumption is that the silicon is fully
-          initialized (MP, page tables, etc.).
+	  Build U-Boot as a single 64-bit binary without SPL. If the
+	  board selects X86_RESET_VECTOR, the binary includes 16-bit
+	  and 32-bit startup code that transitions to 64-bit mode
+	  before running any C code. Otherwise U-Boot enters directly
+	  in 64-bit mode (e.g. when launched from coreboot).
 
 endchoice
 
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 955a728e361..e8813aa7e28 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -2,7 +2,11 @@ 
 
 ifeq ($(CONFIG_EFI_APP),)
 ifdef CONFIG_$(PHASE_)X86_64
+ifeq ($(CONFIG_$(PHASE_)X86_16BIT_INIT),y)
+head-y := arch/x86/cpu/start_from_32.o
+else
 head-y := arch/x86/cpu/start64.o
+endif
 else
 ifeq ($(CONFIG_$(PHASE_)X86_16BIT_INIT),y)
 head-y := arch/x86/cpu/start.o
diff --git a/arch/x86/cpu/Makefile b/arch/x86/cpu/Makefile
index 71feca3bf29..210290364fb 100644
--- a/arch/x86/cpu/Makefile
+++ b/arch/x86/cpu/Makefile
@@ -7,7 +7,11 @@ 
 # Daniel Engström, Omicron Ceti AB, daniel@omicron.se.
 
 ifeq ($(CONFIG_$(PHASE_)X86_64),y)
+ifeq ($(CONFIG_$(PHASE_)X86_16BIT_INIT),y)
+extra-y	= start_from_32.o
+else
 extra-y	= start64.o
+endif
 else
 ifeq ($(CONFIG_$(PHASE_)X86_16BIT_INIT),y)
 extra-y	= start.o
diff --git a/arch/x86/cpu/start_from_32.S b/arch/x86/cpu/start_from_32.S
new file mode 100644
index 00000000000..197e4398cdb
--- /dev/null
+++ b/arch/x86/cpu/start_from_32.S
@@ -0,0 +1,248 @@ 
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * 64-bit x86 Startup Code with integrated 32-bit init
+ *
+ * Entry point _start is .code32, called from start16.S after the
+ * 16-to-32-bit transition.  This sets up an identity-mapped page table
+ * and transitions to 64-bit mode before calling into the normal
+ * board_init_f() flow.
+ *
+ * The 32-bit section uses position-independent code (call/pop for the
+ * instruction pointer) because the 64-bit binary is linked as PIE.
+ *
+ * Copyright 2026 Canonical Ltd
+ * Written by Simon Glass <simon.glass@canonical.com>
+ */
+
+#include <config.h>
+#include <asm/msr-index.h>
+#include <asm/processor.h>
+#include <asm/processor-flags.h>
+
+/*
+ * Page-table base address - must be 4KB aligned and below 4GB.
+ * Uses 24KB total: PML4 (4KB) + PDPT (4KB) + 4 PD tables (4KB each)
+ */
+#define PT_BASE		0x80000
+
+/* ------------------------------------------------------------------ */
+
+.section .text.start
+.code32
+.globl _start
+.type _start, @function
+_start:
+	/* Load DS/ES/GS/SS with the 32-bit data selector of start16.S's GDT */
+	movl	$(X86_GDT_ENTRY_32BIT_DS * X86_GDT_ENTRY_SIZE), %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %gs
+	movl	%eax, %ss
+
+	/* Stack starts 4 bytes below the top of the CAR/SRAM region */
+	movl	$(CONFIG_SYS_CAR_ADDR + CONFIG_SYS_CAR_SIZE - 4), %esp
+
+	/* Load a null IDT (limit 0, base 0) from a temporary on the stack */
+	subl	$8, %esp
+	movl	$0, 4(%esp)		/* base[31:16] + 2 spare bytes */
+	movw	$0, 2(%esp)		/* base[15:0] = 0 */
+	movw	$0, (%esp)		/* limit = 0 */
+	lidt	(%esp)
+	addl	$8, %esp
+
+	/*
+	 * %ebx = runtime address of label 2 (call/pop); data is addressed as
+	 * (sym - 2b)(%ebx), as the 64-bit binary is linked as PIE
+	 */
+	call	2f
+2:	popl	%ebx
+
+	/*
+	 * Copy the boot GDT from ROM to RAM and load it from there.
+	 * KVM cannot perform the implicit descriptor read from the ROM
+	 * region when lret loads the 64-bit CS, so the table must be in
+	 * RAM. The explicit copy below reads the ROM with rep movsl.
+	 */
+#define GDT_RAM		0x2000
+	leal	(boot_gdt - 2b)(%ebx), %esi
+	movl	$GDT_RAM, %edi
+	movl	$((boot_gdt_end - boot_gdt) / 4), %ecx
+	cld				/* DF = 0: copy/fill upwards */
+	rep movsl
+
+	subl	$8, %esp
+	movl	$GDT_RAM, 2(%esp)	/* base in RAM */
+	movw	$(boot_gdt_end - boot_gdt - 1), (%esp)	/* limit */
+	lgdt	(%esp)
+	addl	$8, %esp
+
+	/*
+	 * Build identity-mapped page tables at PT_BASE, covering the
+	 * first 4GB with 2MB pages.  This is similar to build_pagetable()
+	 * in arch/x86/cpu/i386/cpu.c (which also sets the US/A/DT bits)
+	 * but must be done in assembly because page tables are needed
+	 * to enter 64-bit mode and all C code in this build is compiled
+	 * for 64-bit.
+	 *
+	 * Layout (24KB total):
+	 *   PT_BASE + 0x0000  PML4           (512 entries, only [0] used)
+	 *   PT_BASE + 0x1000  PDPT           (512 entries, [0]..[3] used)
+	 *   PT_BASE + 0x2000  PD for 0-1GB   (512 * 2MB entries)
+	 *   PT_BASE + 0x3000  PD for 1-2GB
+	 *   PT_BASE + 0x4000  PD for 2-3GB
+	 *   PT_BASE + 0x5000  PD for 3-4GB
+	 */
+
+	/* Zero all six 4KB tables (%eax = 0, DF cleared above) */
+	movl	$PT_BASE, %edi
+	xorl	%eax, %eax
+	movl	$(6 * 4096 / 4), %ecx
+	rep stosl
+
+	/* PML4[0] -> PDPT (high dword stays 0 from the clear above) */
+	movl	$(PT_BASE + 0x1000 + 0x03), %eax	/* Present + RW */
+	movl	%eax, PT_BASE
+
+	/* PDPT[0..3] -> the four PD tables, each Present + RW */
+	movl	$(PT_BASE + 0x2000 + 0x03), %eax
+	movl	%eax, (PT_BASE + 0x1000 + 0 * 8)
+	addl	$0x1000, %eax
+	movl	%eax, (PT_BASE + 0x1000 + 1 * 8)
+	addl	$0x1000, %eax
+	movl	%eax, (PT_BASE + 0x1000 + 2 * 8)
+	addl	$0x1000, %eax
+	movl	%eax, (PT_BASE + 0x1000 + 3 * 8)
+
+	/*
+	 * Fill the four contiguous PD tables (4 * 512 = 2048 entries).
+	 * Each entry maps a 2MB page: address | PS(bit7) | RW | P
+	 */
+	movl	$(PT_BASE + 0x2000), %edi
+	movl	$0x00000083, %eax		/* 0MB, PS + RW + P */
+	movl	$2048, %ecx
+1:
+	movl	%eax, (%edi)
+	movl	$0, 4(%edi)			/* high 32 bits = 0 */
+	addl	$0x200000, %eax			/* next 2MB page */
+	addl	$8, %edi
+	decl	%ecx
+	jnz	1b
+
+	/*
+	 * Transition to 64-bit long mode.  This is similar to
+	 * cpu_call64() in arch/x86/cpu/i386/call64.S but uses lret
+	 * instead of ljmp (which would emit a PIE-incompatible
+	 * relocation).  It also enables SSE, which call64.S does not
+	 * need to do.
+	 */
+
+	/* Make sure paging is off (should already be off after reset) */
+	movl	%cr0, %eax
+	andl	$~X86_CR0_PG, %eax
+	movl	%eax, %cr0
+
+	/* PAE for long mode; OSFXSR for SSE (x86_64 gcc assumes SSE2) */
+	movl	%cr4, %eax
+	orl	$(X86_CR4_PAE | X86_CR4_OSFXSR), %eax
+	movl	%eax, %cr4
+
+	/* Clear CR0.EM so SSE instructions do not fault */
+	movl	%cr0, %eax
+	andl	$~X86_CR0_EM, %eax
+	movl	%eax, %cr0
+
+	/* Point CR3 at the PML4 built above */
+	movl	$PT_BASE, %eax
+	movl	%eax, %cr3
+
+	/* Set EFER.LME (read-modify-write keeps the other EFER bits) */
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	btsl	$_EFER_LME, %eax
+	wrmsr
+
+	/* Enable paging; with EFER.LME set this activates long mode */
+	movl	%cr0, %eax
+	orl	$X86_CR0_PG, %eax
+	movl	%eax, %cr0
+
+	/*
+	 * Enter the 64-bit code segment at start64: push the CS selector
+	 * and the PIE-safe target address, then lret pops both.
+	 */
+	leal	(start64 - 2b)(%ebx), %eax
+	pushl	$(X86_GDT_ENTRY_64BIT_CS * X86_GDT_ENTRY_SIZE)
+	pushl	%eax
+	lret
+
+/* ------------------------------------------------------------------ */
+.code64
+start64:
+	/* Reserve early memory below the existing stack; result is new SP */
+	mov	%rsp, %rdi
+	call	board_init_f_alloc_reserve
+	mov	%rax, %rsp
+	mov	%rax, %rdi		/* base arg; %rdi is caller-saved */
+	call	board_init_f_init_reserve
+
+	xor	%rdi, %rdi		/* first argument = 0 */
+	call	board_init_f
+	call	board_init_f_r
+
+	/* Should not return here */
+	jmp	.
+
+.globl board_init_f_r_trampoline64
+.type board_init_f_r_trampoline64, @function
+board_init_f_r_trampoline64:
+	/*
+	 * SDRAM has been initialised, U-Boot code has been copied into
+	 * RAM, BSS has been cleared and relocation adjustments have been
+	 * made. It is now time to jump into the in-RAM copy of U-Boot
+	 *
+	 * %rsi = Address of top of new stack
+	 * %rdi = New gd
+	 */
+
+	/* Stack grows down from top of SDRAM */
+	movq	%rsi, %rsp
+
+	/* Re-enter U-Boot by calling board_init_f_r(); it should not return */
+	call	board_init_f_r
+	jmp	.			/* guard: never run into boot_gdt */
+/* ------------------------------------------------------------------ */
+/* Data */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Boot GDT - includes valid 32-bit CS/DS entries (matching start16.S's
+ * selectors 0x10 and 0x18) plus the 64-bit CS at entry 9 (selector
+ * 0x48, matching U-Boot's standard GDT numbering).
+ *
+ * This is copied to RAM before use because KVM cannot perform the
+ * implicit GDT data read from the ROM region during the far return
+ * (lret) to 64-bit mode.
+ *
+ * When arch_setup_gd() later loads the real GDT the CS selector (0x48)
+ * remains valid.
+ */
+.align 16
+boot_gdt:
+	/* Entry 0: NULL descriptor */
+	.quad	0
+	/* Entry 1: unused (matches start16.S layout) */
+	.quad	0
+	/* Entry 2: 32-bit code, base 0, limit 0xfffff, G=1 D=1 (sel 0x10) */
+	.quad	0x00cf9b000000ffff
+	/* Entry 3: 32-bit data, base 0, limit 0xfffff, G=1 B=1 (sel 0x18) */
+	.quad	0x00cf93000000ffff
+	/* Entries 4-8: unused, so that the next descriptor is entry 9 */
+	.fill	5, 8, 0
+
+	/* Entry 9: 64-bit code segment, L=1 D=0 (selector 0x48) */
+	.quad	0x00af9a000000ffff
+
+	/* Entries 10-11: unused (keep GDT same size as real one) */
+	.quad	0
+	.quad	0
+boot_gdt_end:
diff --git a/arch/x86/dts/emulation-u-boot.dtsi b/arch/x86/dts/emulation-u-boot.dtsi
index 7245fe51b3b..64d61b207da 100644
--- a/arch/x86/dts/emulation-u-boot.dtsi
+++ b/arch/x86/dts/emulation-u-boot.dtsi
@@ -12,7 +12,7 @@ 
 	u-boot-spl-with-ucode-ptr {
 		optional-ucode;
 	};
-#else
+#elif !defined(CONFIG_X86_64)
 	u-boot-with-ucode-ptr {
 		optional-ucode;
 	};