Page MenuHomeFreeBSD

D38259.diff
No OneTemporary

D38259.diff

diff --git a/stand/kboot/arch/amd64/amd64_tramp.S b/stand/kboot/arch/amd64/amd64_tramp.S
--- a/stand/kboot/arch/amd64/amd64_tramp.S
+++ b/stand/kboot/arch/amd64/amd64_tramp.S
@@ -1,9 +1,6 @@
/*-
- * Copyright (c) 2013 The FreeBSD Foundation
- * All rights reserved.
+ * Copyright (c) 2022 Netflix, Inc
*
- * This software was developed by Benno Rice under sponsorship from
- * the FreeBSD Foundation.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -24,53 +21,87 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
- *
- * $FreeBSD$
*/
-#include <machine/asmacros.h>
-
-#define ASM_FILE
-#include "multiboot2.h"
+/*
+ * This is the trampoline that starts the FreeBSD kernel. Since the Linux kernel
+ * calls this routine with no args, and has a different environment than the
+ * boot loader provides and that the kernel expects, this code is responsible
+ * for setting all that up and calling the normal kernel entry point. It's
+ * analogous to the "purgatory" code in the linux kernel. Details about these
+ * operations are contained in comments below. On amd64, the kernel starts all
+ * the APs so we don't have to worry about them here.
+ */
+/*
+ * Keep in sync with elf64_freebsd.c. Kexec starts tramp w/o any parameters, so
+ * store them here. This is constructed to be a useful stack:
+ *
+ * struct trampoline_data {
+ * uint64_t pt4; // Page table address to pop
+ * uint64_t entry; // return address to jump to kernel
+ * uint32_t fill1; // 0
+ * uint32_t modulep; // 4 module metadata
+ * uint32_t kernend; // 8 kernel end
+ * uint32_t fill2; // 12
+ * };
+ *
+ * loader.kboot will construct a stack that btext expects, which is arguments on
+ * the stack, not in registers, and these args are 32-bit not 64
+ *
+ * Processor is already in long mode when we're called, paging is enabled and
+ * boot loader loads things such that:
+ * - kernel mapped at KERNBASE, aligned to 2MB, below 4GB, contiguous memory
+ * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
+ * - kernel is mapped with 2M superpages
+ * - The kernel, modules and metadata is in first 4GB which is unity mapped
+ * - There's additional memory after loader provided data for early allocations
+ *
+ * Unlike EFI, we don't support copying the staging area. We tell Linux to land
+ * the kernel in its final location with the needed alignment, etc. We copy the
+ * trampoline code to 1MB offset above KERNBASE since that memory is otherwise
+ * free and safely above the lower 1MB swamp we inherited from IBM PC, though
+ * this code makes no assumptions about where that might be.
+ *
+ * Thus, the trampoline just needs to set %rsp to that stack pop the %cr3 value,
+ * set it and then retq to jump to the kernel with its stack args filled in.
+ * Since the handoff to this code used to be from 32-bit code, it uses the i386
+ * calling conventions which put the arguments on the stack. The kernel's btext
+ * routine expects this setup.
+ */
.text
- .globl amd64_tramp
-
+ .globl tramp
+tramp:
+ cli /* Make sure we don't get interrupted. */
+ leaq tramp_pt4(%rip), %rsp /* Setup our pre-filled-in stack */
+ popq %rax /* Pop off the PT4 ptr for %cr3 */
+ movq %rax, %cr3 /* set the page table */
+ retq /* Return addr and args already on stack */
/*
- * void amd64_tramp(uint64_t stack, void *copy_finish, uint64_t kernend,
- * uint64_t modulep, uint64_t pagetable, uint64_t entry)
+ * The following is the stack for the above code. The stack will increase in
+ * address as things are popped off of it, so we start with the stack pointing
+ * to tramp_pt4.
*/
-amd64_tramp:
- cli /* Make sure we don't get interrupted. */
- movq %rdi,%rsp /* Switch to our temporary stack. */
-
- movq %rdx,%r12 /* Stash the kernel values for later. */
- movq %rcx,%r13
- movq %r8,%r14
- movq %r9,%r15
-
- callq *%rsi /* Call copy_finish so we're all ready to go. */
-
- pushq %r12 /* Push kernend. */
- salq $32,%r13 /* Shift modulep and push it. */
- pushq %r13
- pushq %r15 /* Push the entry address. */
- movq %r14,%cr3 /* Switch page tables. */
- ret /* "Return" to kernel entry. */
-
- ALIGN_TEXT
-amd64_tramp_end:
-
-/* void multiboot2_exec(uint64_t entry, uint64_t multiboot_info, uint64_t stack) */
- .globl multiboot2_exec
-multiboot2_exec:
- movq %rdx,%rsp
- pushq %rdi
- movq %rsi,%rbx
- movq $MULTIBOOT2_BOOTLOADER_MAGIC,%rax
- ret
+ .p2align 3 /* Stack has to be 8 byte aligned */
+trampoline_data:
+tramp_pt4: .quad 0 /* New %cr3 value */
+tramp_entry: .quad 0 /* Entry to kernel (btext) */
+ /* %rsp points here on entry to amd64 kernel's btext */
+ .long 0 /* 0 filler, ignored (current loaders set to 0) */
+tramp_modulep: .long 0 /* 4 modulep */
+tramp_kernend: .long 0 /* 8 kernend */
+ .long 0 /* 12 alignment filler (also 0) */
+tramp_end:
.data
- .globl amd64_tramp_size
-amd64_tramp_size:
- .long amd64_tramp_end-amd64_tramp
+ .type tramp_size,@object
+ .globl tramp_size
+tramp_size:
+ .long tramp_end-tramp
+ .size tramp_size, 4
+
+ .type tramp_data_offset,@object
+ .globl tramp_data_offset
+tramp_data_offset:
+ .long trampoline_data-tramp
+ .size tramp_data_offset, 4
diff --git a/stand/kboot/arch/amd64/elf64_freebsd.c b/stand/kboot/arch/amd64/elf64_freebsd.c
--- a/stand/kboot/arch/amd64/elf64_freebsd.c
+++ b/stand/kboot/arch/amd64/elf64_freebsd.c
@@ -41,9 +41,12 @@
#ifdef EFI
#include <efi.h>
#include <efilib.h>
+#else
+#include "host_syscall.h"
#endif
#include "bootstrap.h"
+#include "kboot.h"
#include "platform/acfreebsd.h"
#include "acconfig.h"
@@ -53,9 +56,7 @@
#ifdef EFI
#include "loader_efi.h"
-#endif
-#ifdef EFI
static EFI_GUID acpi_guid = ACPI_TABLE_GUID;
static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID;
#endif
@@ -63,9 +64,11 @@
#ifdef EFI
#define LOADER_PAGE_SIZE EFI_PAGE_SIZE
#else
-#define LOADER_PAGE_SIZE 8192
+#define LOADER_PAGE_SIZE PAGE_SIZE
#endif
+extern vm_offset_t kboot_get_phys_load_segment(void);
+
extern int bi_load(char *args, vm_offset_t *modulep, vm_offset_t *kernendp,
bool exit_bs);
@@ -81,13 +84,13 @@
.l_exec = elf64_obj_exec,
};
-#if 0
+#ifdef EFI
extern struct file_format multiboot2;
extern struct file_format multiboot2_obj;
#endif
struct file_format *file_formats[] = {
-#if 0
+#ifdef EFI
&multiboot2,
&multiboot2_obj,
#endif
@@ -96,21 +99,44 @@
NULL
};
-#ifdef EFI
+#ifndef EFI
+/*
+ * We create the stack that we want. We have the address of the page tables
+ * we make on top (so we pop that off and set %cr3). We have the entry point
+ * to the kernel (which retq pops off) This leaves the stack that the btext
+ * wants: offset 4 is modulep and offset8 is kernend, with the filler bytes
+ * to keep this aligned. This makes the trampoline very simple.
+ */
+struct trampoline_data {
+ uint64_t pt4; // Page table address to pop
+ uint64_t entry; // return address to jump to kernel
+ uint32_t fill1; // 0
+ uint32_t modulep; // 4 module metadata
+ uint32_t kernend; // 8 kernel end
+ uint32_t fill2; // 12
+};
+_Static_assert(sizeof(struct trampoline_data) == 32, "Bad size for trampoline data");
+#endif
+
static pml4_entry_t *PT4;
-static pdp_entry_t *PT3;
static pdp_entry_t *PT3_l, *PT3_u;
-static pd_entry_t *PT2;
static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1;
+#ifdef EFI
+static pdp_entry_t *PT3;
+static pd_entry_t *PT2;
+
extern EFI_PHYSICAL_ADDRESS staging;
static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend,
uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry);
#endif
-extern uintptr_t amd64_tramp;
-extern uint32_t amd64_tramp_size;
+extern uintptr_t tramp;
+extern uint32_t tramp_size;
+#ifndef EFI
+extern uint32_t tramp_data_offset;
+#endif
/*
* There is an ELF kernel and one or more ELF modules loaded.
@@ -120,15 +146,27 @@
static int
elf64_exec(struct preloaded_file *fp)
{
-#ifdef EFI
struct file_metadata *md;
Elf_Ehdr *ehdr;
- vm_offset_t modulep, kernend, trampcode, trampstack;
+ vm_offset_t modulep, kernend;
int err, i;
- ACPI_TABLE_RSDP *rsdp;
char buf[24];
+#ifdef EFI
+ ACPI_TABLE_RSDP *rsdp = NULL;
int revision;
- bool copy_auto;
+ int copy_auto;
+ vm_offset_t trampstack, trampcode;
+#else
+ vm_offset_t rsdp = 0;
+ void *trampcode;
+ int nseg;
+ void *kseg;
+ vm_offset_t trampolinebase;
+ uint64_t *trampoline;
+ struct trampoline_data *trampoline_data;
+ vm_offset_t staging;
+ int error;
+#endif
#ifdef EFI
copy_auto = copy_staging == COPY_STAGING_AUTO;
@@ -136,66 +174,49 @@
copy_staging = fp->f_kernphys_relocatable ?
COPY_STAGING_DISABLE : COPY_STAGING_ENABLE;
#else
- copy_auto = COPY_STAGING_DISABLE; /* XXX */
+ /*
+ * Figure out where to put it.
+ *
+ * Linux does not allow to do kexec_load into any part of memory. Ask
+ * arch_loadaddr to resolve the first available chunk of physical memory
+ * where loading is possible (load_addr).
+ *
+ * The kernel is loaded at the 'base' address in contiguous physical
+ * pages (using 2MB super pages). The first such page is unused by the
+ * kernel and serves as a good place to put not only the trampoline, but
+ * the page table pages that the trampoline needs to setup the proper
+ * kernel starting environment.
+ */
+ staging = trampolinebase = kboot_get_phys_load_segment();
+ trampolinebase += 1ULL << 20; /* Copy trampoline to base + 1MB, kernel will wind up at 2MB */
+ printf("Load address at %#jx\n", (uintmax_t)trampolinebase);
+ printf("Relocation offset is %#jx\n", (uintmax_t)elf64_relocation_offset);
#endif
/*
* Report the RSDP to the kernel. While this can be found with
* a BIOS boot, the RSDP may be elsewhere when booted from UEFI.
- * The old code used the 'hints' method to communite this to
- * the kernel. However, while convenient, the 'hints' method
- * is fragile and does not work when static hints are compiled
- * into the kernel. Instead, move to setting different tunables
- * that start with acpi. The old 'hints' can be removed before
- * we branch for FreeBSD 12.
*/
-
#ifdef EFI
rsdp = efi_get_table(&acpi20_guid);
if (rsdp == NULL) {
rsdp = efi_get_table(&acpi_guid);
}
#else
- rsdp = NULL;
-#warning "write me"
+ rsdp = acpi_rsdp();
#endif
- if (rsdp != NULL) {
+ if (rsdp != 0) {
sprintf(buf, "0x%016llx", (unsigned long long)rsdp);
- setenv("hint.acpi.0.rsdp", buf, 1);
setenv("acpi.rsdp", buf, 1);
- revision = rsdp->Revision;
- if (revision == 0)
- revision = 1;
- sprintf(buf, "%d", revision);
- setenv("hint.acpi.0.revision", buf, 1);
- setenv("acpi.revision", buf, 1);
- strncpy(buf, rsdp->OemId, sizeof(rsdp->OemId));
- buf[sizeof(rsdp->OemId)] = '\0';
- setenv("hint.acpi.0.oem", buf, 1);
- setenv("acpi.oem", buf, 1);
- sprintf(buf, "0x%016x", rsdp->RsdtPhysicalAddress);
- setenv("hint.acpi.0.rsdt", buf, 1);
- setenv("acpi.rsdt", buf, 1);
- if (revision >= 2) {
- /* XXX extended checksum? */
- sprintf(buf, "0x%016llx",
- (unsigned long long)rsdp->XsdtPhysicalAddress);
- setenv("hint.acpi.0.xsdt", buf, 1);
- setenv("acpi.xsdt", buf, 1);
- sprintf(buf, "%d", rsdp->Length);
- setenv("hint.acpi.0.xsdt_length", buf, 1);
- setenv("acpi.xsdt_length", buf, 1);
- }
}
-
if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL)
return (EFTYPE);
ehdr = (Elf_Ehdr *)&(md->md_data);
+#ifdef EFI
trampcode = copy_staging == COPY_STAGING_ENABLE ?
(vm_offset_t)0x0000000040000000 /* 1G */ :
(vm_offset_t)0x0000000100000000; /* 4G */;
-#ifdef EFI
err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1,
(EFI_PHYSICAL_ADDRESS *)&trampcode);
if (EFI_ERROR(err)) {
@@ -204,17 +225,22 @@
copy_staging = COPY_STAGING_AUTO;
return (ENOMEM);
}
+ trampstack = trampcode + LOADER_PAGE_SIZE - 8;
#else
-#warning "Write me"
+ // XXX Question: why not just use malloc?
+ trampcode = host_getmem(LOADER_PAGE_SIZE);
+ if (trampcode == NULL) {
+ printf("Unable to allocate trampoline\n");
+ return (ENOMEM);
+ }
#endif
bzero((void *)trampcode, LOADER_PAGE_SIZE);
- trampstack = trampcode + LOADER_PAGE_SIZE - 8;
- bcopy((void *)&amd64_tramp, (void *)trampcode, amd64_tramp_size);
+ bcopy((void *)&tramp, (void *)trampcode, tramp_size);
trampoline = (void *)trampcode;
+#ifdef EFI
if (copy_staging == COPY_STAGING_ENABLE) {
PT4 = (pml4_entry_t *)0x0000000040000000;
-#ifdef EFI
err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3,
(EFI_PHYSICAL_ADDRESS *)&PT4);
if (EFI_ERROR(err)) {
@@ -224,9 +250,6 @@
copy_staging = COPY_STAGING_AUTO;
return (ENOMEM);
}
-#else
-#warning "Write me"
-#endif
bzero(PT4, 3 * LOADER_PAGE_SIZE);
PT3 = &PT4[512];
PT2 = &PT3[512];
@@ -259,7 +282,6 @@
}
} else {
PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */
-#ifdef EFI
err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9,
(EFI_PHYSICAL_ADDRESS *)&PT4);
if (EFI_ERROR(err)) {
@@ -269,10 +291,6 @@
copy_staging = COPY_STAGING_AUTO;
return (ENOMEM);
}
-#else
-#warning "Write me"
-#endif
-
bzero(PT4, 9 * LOADER_PAGE_SIZE);
PT3_l = &PT4[NPML4EPG * 1];
@@ -308,10 +326,84 @@
PG_V | PG_RW | PG_PS;
}
}
+#else
+ {
+ vm_offset_t pabase, pa_pt3_l, pa_pt3_u, pa_pt2_l0, pa_pt2_l1, pa_pt2_l2, pa_pt2_l3, pa_pt2_u0, pa_pt2_u1;
+ /* We'll find a place for these later */
+ PT4 = (pml4_entry_t *)host_getmem(9 * LOADER_PAGE_SIZE);
+ bzero(PT4, 9 * LOADER_PAGE_SIZE);
+
+ PT3_l = &PT4[NPML4EPG * 1];
+ PT3_u = &PT4[NPML4EPG * 2];
+ PT2_l0 = &PT4[NPML4EPG * 3];
+ PT2_l1 = &PT4[NPML4EPG * 4];
+ PT2_l2 = &PT4[NPML4EPG * 5];
+ PT2_l3 = &PT4[NPML4EPG * 6];
+ PT2_u0 = &PT4[NPML4EPG * 7];
+ PT2_u1 = &PT4[NPML4EPG * 8];
+
+ pabase = trampolinebase + LOADER_PAGE_SIZE;
+ pa_pt3_l = pabase + LOADER_PAGE_SIZE * 1;
+ pa_pt3_u = pabase + LOADER_PAGE_SIZE * 2;
+ pa_pt2_l0 = pabase + LOADER_PAGE_SIZE * 3;
+ pa_pt2_l1 = pabase + LOADER_PAGE_SIZE * 4;
+ pa_pt2_l2 = pabase + LOADER_PAGE_SIZE * 5;
+ pa_pt2_l3 = pabase + LOADER_PAGE_SIZE * 6;
+ pa_pt2_u0 = pabase + LOADER_PAGE_SIZE * 7;
+ pa_pt2_u1 = pabase + LOADER_PAGE_SIZE * 8;
+
+ /* 1:1 mapping of lower 4G */
+ PT4[0] = (pml4_entry_t)pa_pt3_l | PG_V | PG_RW;
+ PT3_l[0] = (pdp_entry_t)pa_pt2_l0 | PG_V | PG_RW;
+ PT3_l[1] = (pdp_entry_t)pa_pt2_l1 | PG_V | PG_RW;
+ PT3_l[2] = (pdp_entry_t)pa_pt2_l2 | PG_V | PG_RW;
+ PT3_l[3] = (pdp_entry_t)pa_pt2_l3 | PG_V | PG_RW;
+ for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PT2_l0 into _l1, etc */
+ PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
+ PG_RW | PG_PS;
+ }
+
+ /* mapping of kernel 2G below top */
+ PT4[NPML4EPG - 1] = (pml4_entry_t)pa_pt3_u | PG_V | PG_RW;
+ PT3_u[NPDPEPG - 2] = (pdp_entry_t)pa_pt2_u0 | PG_V | PG_RW;
+ PT3_u[NPDPEPG - 1] = (pdp_entry_t)pa_pt2_u1 | PG_V | PG_RW;
+ /* compat mapping of phys @0 */
+ PT2_u0[0] = PG_PS | PG_V | PG_RW;
+ /* this maps past staging area */
+ /*
+ * Kernel uses the KERNSTART (== KERNBASE + 2MB) entry to figure
+ * out where we loaded the kernel. This is PT2_u0[1] (since
+ * these map 2MB pages. So the PA that this maps has to be
+ * kboot's staging + 2MB. For UEFI we do 'i - 1' since we load
+ * the kernel right at staging (and assume the first address we
+ * load is 2MB in efi_copyin). However for kboot, staging + 1 *
+ * NBPDR == staging + 2MB which is where the kernel starts. Our
+ * trampoline need not be mapped into the kernel space since we
+ * execute PA==VA for that, and the trampoline can just go away
+ * once the kernel is called.
+ *
+ * Staging should likely be as low as possible, though, because
+ * all the 'early' allocations are at kernend (which the kernel
+ * calls physfree).
+ */
+ for (i = 1; i < 2 * NPDEPG; i++) { /* we overflow PT2_u0 into _u1 */
+ PT2_u0[i] = ((pd_entry_t)staging +
+ ((pd_entry_t)i) * NBPDR) |
+ PG_V | PG_RW | PG_PS;
+ if (i < 10) printf("Mapping %d to %#lx staging %#lx\n", i, PT2_u0[i], staging);
+ }
+ }
+#endif
+
+#ifdef EFI
printf("staging %#lx (%scopying) tramp %p PT4 %p\n",
staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ",
trampoline, PT4);
+#else
+ printf("staging %#lx tramp %p PT4 %p\n", staging, (void *)trampolinebase,
+ (void *)trampolinebase + LOADER_PAGE_SIZE);
+#endif
printf("Start @ 0x%lx ...\n", ehdr->e_entry);
#ifdef EFI
@@ -321,17 +413,46 @@
if (err != 0) {
#ifdef EFI
efi_time_init();
-#endif
if (copy_auto)
copy_staging = COPY_STAGING_AUTO;
+#endif
return (err);
}
dev_cleanup();
+#ifdef EFI
trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ?
efi_copy_finish : efi_copy_finish_nop, kernend, modulep,
PT4, ehdr->e_entry);
+#else
+ trampoline_data = (void *)trampoline + tramp_data_offset;
+ trampoline_data->entry = ehdr->e_entry;
+ trampoline_data->pt4 = trampolinebase + LOADER_PAGE_SIZE;
+ /*
+ * So we compute the VA of the module data by modulep + KERNBASE....
+ * need to make sure that that address is mapped right. We calculate
+ * the start of available memory to allocate via kernend (which is
+ * calculated with a physaddr of "kernend + PA(PT_u0[1])"), so we better
+ * make sure we're not overwriting the last 2MB of the kernel :).
+ */
+ trampoline_data->modulep = modulep; /* Offset from KERNBASE */
+ trampoline_data->kernend = kernend; /* Offset from the load address */
+ trampoline_data->fill1 = trampoline_data->fill2 = 0;
+ printf("Modulep = %lx kernend %lx\n", modulep, kernend);
+ /* NOTE: when copying in, it's relative to the start of our 'area' not an abs addr */
+ /* Copy the trampoline to the ksegs */
+ archsw.arch_copyin((void *)trampcode, trampolinebase - staging, tramp_size);
+ /* Copy the page table to the ksegs */
+ archsw.arch_copyin(PT4, trampoline_data->pt4 - staging, 9 * LOADER_PAGE_SIZE);
+
+ if (archsw.arch_kexec_kseg_get == NULL)
+ panic("architecture did not provide kexec segment mapping");
+ archsw.arch_kexec_kseg_get(&nseg, &kseg);
+ error = host_kexec_load(trampolinebase, nseg, kseg, HOST_KEXEC_ARCH_X86_64);
+ if (error != 0)
+ panic("kexec_load returned error: %d", error);
+ host_reboot(HOST_REBOOT_MAGIC1, HOST_REBOOT_MAGIC2, HOST_REBOOT_CMD_KEXEC, 0);
#endif
panic("exec returned");

File Metadata

Mime Type
text/plain
Expires
Sat, Jan 18, 1:44 PM (17 h, 51 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
15862853
Default Alt Text
D38259.diff (18 KB)

Event Timeline