D24758: Introduce a dynamic pcpu layout on amd64
D24758.diff
Index: sys/amd64/amd64/machdep.c
===================================================================
--- sys/amd64/amd64/machdep.c
+++ sys/amd64/amd64/machdep.c
@@ -216,9 +216,6 @@
static struct trapframe proc0_tf;
struct region_descriptor r_idt;
-struct pcpu *__pcpu;
-struct pcpu temp_bsp_pcpu;
-
struct mtx icu_lock;
struct mem_range_softc mem_range_softc;
@@ -1670,13 +1667,20 @@
*/
pmap_thread_init_invl_gen(&thread0);
- pc = &temp_bsp_pcpu;
+ /*
+ * Initialize the static and dynamic per-CPU areas. The latter must
+ * immediately follow the former.
+ */
+ pc = (struct pcpu *)(physfree + KERNBASE);
+ physfree += sizeof(struct pcpu);
pcpu_init(pc, 0, sizeof(struct pcpu));
- gdt = &temp_bsp_pcpu.pc_gdt[0];
+ dpcpu_init((void *)(physfree + KERNBASE), 0);
+ physfree += DPCPU_SIZE;
/*
* make gdt memory segments
*/
+ gdt = &pc->pc_gdt[0];
for (x = 0; x < NGDT; x++) {
if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
@@ -1694,8 +1698,6 @@
wrmsr(MSR_GSBASE, (u_int64_t)pc);
wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
- dpcpu_init((void *)(physfree + KERNBASE), 0);
- physfree += DPCPU_SIZE;
amd64_bsp_pcpu_init1(pc);
/* Non-late cninit() and printf() can be moved up to here. */
Index: sys/amd64/amd64/mp_machdep.c
===================================================================
--- sys/amd64/amd64/mp_machdep.c
+++ sys/amd64/amd64/mp_machdep.c
@@ -61,8 +61,13 @@
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
+#include <vm/uma.h>
+#include <vm/uma_int.h>
#include <x86/apicreg.h>
#include <machine/clock.h>
@@ -124,6 +129,173 @@
return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem);
}
+/*
+ * Initialize the bootstrap allocator for dynamic per-CPU memory allocations.
+ * 2MB is reserved by pmap_bootstrap() for the BSP, from which its pcpu and
+ * dpcpu regions are allocated. The rest can be used by UMA to satisfy dynamic
+ * per-CPU allocations until SI_SUB_CPU, when the pcpu regions for the APs are
+ * laid out. At that point, unused portions of the initial 2MB allocation may
+ * be used for APs as well as the BSP.
+ */
+static void
+pcpu_bootstrap(void *arg __unused)
+{
+ uma_pcpu_init1(VM_PCPU_BASE_START + sizeof(struct pcpu) + DPCPU_SIZE,
+ VM_PCPU_BOOTSTRAP_SIZE - (sizeof(struct pcpu) + DPCPU_SIZE));
+}
+SYSINIT(pcpu_bootstrap, SI_SUB_VM, SI_ORDER_ANY, pcpu_bootstrap, NULL);
+
+static int
+pcpu_domidx(int domain)
+{
+ int bspdom;
+
+ bspdom = PCPU_GET(domain);
+ if (bspdom == 0)
+ return (domain);
+ if (domain == bspdom)
+ return (0);
+ return (domain > bspdom ? domain : domain + 1);
+}
+
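As a quick check of the remapping above: when the BSP sits in a nonzero domain, that domain takes index 0 and every lower-numbered domain shifts up by one, leaving higher domains in place. A minimal userland sketch (the BSP domain is hard-coded here as a hypothetical; not part of the patch):

    #include <stdio.h>

    static int bspdom = 2;          /* hypothetical: BSP lives in domain 2 */

    static int
    pcpu_domidx(int domain)
    {
            if (bspdom == 0)
                    return (domain);
            if (domain == bspdom)
                    return (0);
            return (domain > bspdom ? domain : domain + 1);
    }

    int
    main(void)
    {
            /* Prints 0->1, 1->2, 2->0, 3->3. */
            for (int d = 0; d < 4; d++)
                    printf("%d->%d\n", d, pcpu_domidx(d));
            return (0);
    }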
+/*
+ * Place per-CPU structures. Each AP requires a pcpu and dpcpu region. The
+ * pcpu region of a CPU is its base pcpu address. A pointer to per-CPU data is
+ * an offset relative to the base pcpu address, and UMA's per-CPU allocator
+ * ensures that adding that offset to the base address always gives the address
+ * of memory allocated for the corresponding CPU.
+ *
+ * The layout attempts to maximize use of 2MB mappings while also providing
+ * domain-local memory on NUMA systems. It uses two parameters: N, the number of
+ * 4KB pages per CPU, and M, the number of 2MB pages per allocation quantum. M
+ * is a multiple of vm_ndomains and they are usually equal. N has a lower bound
+ * of L = sizeof(struct pcpu) + DPCPU_SIZE + uma_pcpu_bootstrap_used(), where
+ * the last term is the amount of memory used by the bootstrap per-CPU
+ * allocator. Each 2MB page hosts per-CPU data for CPUs belonging to the domain
+ * from which the page was allocated, so we first compute M by determining the
+ * maximum number of CPUs per domain and multiplying that by L. Then N is given
+ * by M*2MB divided by the number of CPUs per domain.
+ *
+ * __________ N 4KB pages __________
+ * / \
+ * VM_PCPU_BASE_START -------> +----------+-----------+--------------+
+ * | BSP pcpu | BSP dpcpu | UMA data ... |\
+ * +----------+-----------+--------------+ |
+ * | AP1 pcpu | AP1 dpcpu | UMA data ... | |
+ * +----------+-----------+--------------+ |
+ * | ... | | M 2MB
+ * +----------+--------------------------+ | pages
+ * | APi pcpu | APi dpcpu | UMA data ... | |
+ * +----------+-----------+--------------+ |
+ * | ... | |
+ * | ... |/
+ * +-------------------------------------+
+ *
+ * If the original region is exhausted, for example because a subsystem
+ * allocates many per-CPU counters, UMA allocates another M*2MB region of KVA
+ * to mirror the base region.
+ */
+static void
+pcpu_layout(void)
+{
+ vm_offset_t addr;
+ vm_size_t size, used;
+ int count[MAXMEMDOM], domoff[MAXMEMDOM];
+ int domain, error, i, maxcpupdom, n2mpgpdom, n4kpgpcpu, nbpdom;
+
+ /*
+ * Compute the maximum count of CPUs in a single domain. Domains are
+ * typically symmetric but this is not required.
+ */
+ memset(count, 0, sizeof(count));
+ for (i = 0; i <= mp_maxid; i++) {
+ if (vm_ndomains > 1 && cpu_apic_ids[i] != -1)
+ domain = acpi_pxm_get_cpu_locality(cpu_apic_ids[i]);
+ else
+ domain = 0;
+ count[domain]++;
+ }
+ for (i = 0, maxcpupdom = -1; i < vm_ndomains; i++)
+ if (count[i] > maxcpupdom)
+ maxcpupdom = count[i];
+
+ /*
+ * Compute layout parameters: the number of 4KB pages per CPU, and the
+ * number of 2MB pages per domain. The amount of memory already
+ * allocated by the bootstrap allocator gives a lower bound for the
+ * former, and we use that bound to compute the number of 2MB pages
+ * per domain.
+ */
+ used = uma_pcpu_bootstrap_used();
+ n2mpgpdom = howmany(atop(used) * maxcpupdom, NPDEPG);
+ n4kpgpcpu = atop(NBPDR * n2mpgpdom) / maxcpupdom;
+
+ /*
+ * Assign a pcpu base address to each CPU. Handle the possibility that
+ * the BSP is not local to domain 0.
+ */
+ memset(domoff, 0, sizeof(domoff));
+ for (i = 0; i <= mp_maxid; i++) {
+ if (vm_ndomains > 1 && cpu_apic_ids[i] != -1)
+ domain = acpi_pxm_get_cpu_locality(cpu_apic_ids[i]);
+ else
+ domain = 0;
+
+ addr = VM_PCPU_BASE_START +
+ pcpu_domidx(domain) * n2mpgpdom * NBPDR +
+ domoff[domain] * n4kpgpcpu * PAGE_SIZE;
+ cpuid_to_pcpu[i] = (struct pcpu *)addr;
+ domoff[domain]++;
+ }
+
+ /*
+ * Ensure that the rest of the per-CPU base region, beyond the 2MB
+ * bootstrap page, is backed by physical pages.
+ */
+ nbpdom = n2mpgpdom * NBPDR;
+ for (domain = 0; domain < vm_ndomains; domain++) {
+ addr = VM_PCPU_BASE_START + nbpdom * pcpu_domidx(domain);
+ size = nbpdom;
+ if (domain == PCPU_GET(domain)) {
+ /* This 2MB page was allocated by pmap_bootstrap(). */
+ addr += NBPDR;
+ size -= NBPDR;
+ if (size == 0)
+ continue;
+ }
+ if (VM_DOMAIN_EMPTY(domain))
+ error = kmem_back(kernel_object, addr, size,
+ M_WAITOK | M_ZERO);
+ else
+ error = kmem_back_domain(domain, kernel_object, addr,
+ size, M_WAITOK | M_ZERO);
+ if (error != KERN_SUCCESS)
+ panic("%s: failed to allocate memory: %d",
+ __func__, error);
+ }
+
+ /*
+ * Release reserved, unused KVA back to the system.
+ */
+ vm_map_lock(kernel_map);
+ error = vm_map_delete(kernel_map,
+ VM_PCPU_BASE_START + vm_ndomains * nbpdom,
+ VM_PCPU_BASE_START + VM_PCPU_BASE_SIZE);
+ if (error != KERN_SUCCESS)
+ panic("%s: failed to release KVA: %d", __func__, error);
+ vm_map_unlock(kernel_map);
+
+ /*
+ * Finally, provide layout parameters to the allocator so that it can
+ * finish bootstrapping.
+ */
+ uma_pcpu_init2(n4kpgpcpu, n2mpgpdom);
+
+ if (bootverbose)
+ printf("%s: %d 2MB pages per domain, %d 4KB pages per CPU\n",
+ __func__, n2mpgpdom, n4kpgpcpu);
+}
+
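To make the parameter computation concrete, here is a userland sketch of the same arithmetic with hypothetical inputs (the kernel macros howmany() and atop() are re-defined to match):

    #include <stdio.h>

    #define PAGE_SIZE       4096
    #define NBPDR           (2 * 1024 * 1024)      /* bytes per 2MB page */
    #define NPDEPG          512                    /* 4KB pages per 2MB page */
    #define howmany(x, y)   (((x) + (y) - 1) / (y))
    #define atop(x)         ((x) / PAGE_SIZE)

    int
    main(void)
    {
            size_t used = 6 * PAGE_SIZE;    /* hypothetical L from the bootstrap allocator */
            int maxcpupdom = 48;            /* hypothetical max CPUs in one domain */

            int n2mpgpdom = howmany(atop(used) * maxcpupdom, NPDEPG);
            int n4kpgpcpu = atop((size_t)NBPDR * n2mpgpdom) / maxcpupdom;

            /* Prints M = 1, N = 10: 48 CPUs at 10 pages each fit in one 2MB page. */
            printf("M = %d, N = %d\n", n2mpgpdom, n4kpgpcpu);
            return (0);
    }

Because M is rounded up and N rounded down, N * maxcpupdom pages always fit in the M 2MB pages backing each domain, and N never drops below atop(used).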
/*
* Calculate usable address in base memory for AP trampoline code.
*/
@@ -263,6 +435,9 @@
assign_cpu_ids();
+ /* Place AP pcpu structures now that CPU IDs are defined. */
+ pcpu_layout();
+
/* Start each Application Processor */
init_ops.start_all_aps();
@@ -292,12 +467,9 @@
/* Update microcode before doing anything else. */
ucode_load_ap(cpu);
- /* Get per-cpu data and save */
- pc = &__pcpu[cpu];
-
- /* prime data page for it to use */
+ pc = cpuid_to_pcpu[cpu];
pcpu_init(pc, cpu, sizeof(struct pcpu));
- dpcpu_init(dpcpu, cpu);
+ dpcpu_init((void *)DPCPU_BASE(pc), cpu);
pc->pc_apic_id = cpu_apic_ids[cpu];
pc->pc_prvspace = pc;
pc->pc_curthread = 0;
@@ -315,7 +487,7 @@
pc->pc_pcid_gen = 1;
/* Init tss */
- pc->pc_common_tss = __pcpu[0].pc_common_tss;
+ pc->pc_common_tss = cpuid_to_pcpu[0]->pc_common_tss;
pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
IOPERM_BITMAP_SIZE;
pc->pc_common_tss.tss_rsp0 = 0;
@@ -388,27 +560,6 @@
* local functions and data
*/
-#ifdef NUMA
-static void
-mp_realloc_pcpu(int cpuid, int domain)
-{
- vm_page_t m;
- vm_offset_t oa, na;
-
- oa = (vm_offset_t)&__pcpu[cpuid];
- if (_vm_phys_domain(pmap_kextract(oa)) == domain)
- return;
- m = vm_page_alloc_domain(NULL, 0, domain,
- VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
- if (m == NULL)
- return;
- na = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
- pagecopy((void *)oa, (void *)na);
- pmap_qenter((vm_offset_t)&__pcpu[cpuid], &m, 1);
- /* XXX old pcpu page leaked. */
-}
-#endif
-
/*
* start each AP in our list
*/
@@ -456,16 +607,6 @@
outb(CMOS_REG, BIOS_RESET);
outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */
- /* Relocate pcpu areas to the correct domain. */
-#ifdef NUMA
- if (vm_ndomains > 1)
- for (cpu = 1; cpu < mp_ncpus; cpu++) {
- apic_id = cpu_apic_ids[cpu];
- domain = acpi_pxm_get_cpu_locality(apic_id);
- mp_realloc_pcpu(cpu, domain);
- }
-#endif
-
/* start each AP */
domain = 0;
for (cpu = 1; cpu < mp_ncpus; cpu++) {
@@ -484,8 +625,6 @@
DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO);
dbg_stack = (char *)kmem_malloc_domainset(
DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO);
- dpcpu = (void *)kmem_malloc_domainset(DOMAINSET_PREF(domain),
- DPCPU_SIZE, M_WAITOK | M_ZERO);
bootSTK = (char *)bootstacks[cpu] +
kstack_pages * PAGE_SIZE - 8;
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -1415,6 +1415,17 @@
return (ret);
}
+static uint64_t
+alloc2mpage(vm_paddr_t *firstaddr)
+{
+ uint64_t ret;
+
+ ret = roundup2(*firstaddr, NBPDR);
+ bzero((void *)ret, NBPDR);
+ *firstaddr = ret + NBPDR;
+ return (ret);
+}
+
CTASSERT(powerof2(NDMPML4E));
/* number of kernel PDP slots */
@@ -1660,6 +1671,59 @@
}
}
+static void
+bootstrap_pcpu(vm_paddr_t pcpupg, vm_paddr_t pdppg)
+{
+ struct region_descriptor r_gdt;
+ struct pcpu *oldpc, *pc;
+ void *dpcpu;
+ vm_offset_t va;
+ pdp_entry_t *pdpe;
+ pd_entry_t *pde;
+
+ /*
+ * Map the bootstrap per-CPU region.
+ */
+ va = VM_PCPU_BASE_START;
+ pdpe = pmap_pdpe(kernel_pmap, va);
+ if ((*pdpe & X86_PG_V) != 0)
+ panic("pdpe for %#lx is already valid", va);
+ *pdpe = pdppg | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
+ pde = pmap_pde(kernel_pmap, va);
+ pde_store(pde, pcpupg | X86_PG_V | X86_PG_PS | X86_PG_RW | X86_PG_A |
+ X86_PG_M | pg_nx | pg_g);
+
+ /*
+ * Re-initialize PCPU area for BSP after switching.
+ * Make hardware use gdt and common_tss from the new PCPU.
+ * Copy dynamic PCPU data following the PCPU structure.
+ */
+ STAILQ_INIT(&cpuhead);
+ pc = (struct pcpu *)va;
+ oldpc = get_pcpu();
+ wrmsr(MSR_GSBASE, (uintptr_t)pc);
+ pcpu_init(pc, 0, sizeof(struct pcpu));
+ amd64_bsp_pcpu_init1(pc);
+ amd64_bsp_ist_init(pc);
+ pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
+ IOPERM_BITMAP_SIZE;
+ memcpy(pc->pc_gdt, oldpc->pc_gdt, NGDT *
+ sizeof(struct user_segment_descriptor));
+ gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
+ ssdtosyssd(&gdt_segs[GPROC0_SEL],
+ (struct system_segment_descriptor *)&pc->pc_gdt[GPROC0_SEL]);
+ r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
+ r_gdt.rd_base = (long)pc->pc_gdt;
+ lgdt(&r_gdt);
+ wrmsr(MSR_GSBASE, (uintptr_t)pc);
+ ltr(GSEL(GPROC0_SEL, SEL_KPL));
+ pc->pc_acpi_id = oldpc->pc_acpi_id;
+
+ dpcpu = (void *)DPCPU_BASE(pc);
+ dpcpu_init(dpcpu, 0);
+ memcpy(dpcpu, (void *)DPCPU_BASE(oldpc), DPCPU_BYTES);
+}
+
/*
* Bootstrap the system enough to run with virtual memory.
*
@@ -1674,10 +1738,9 @@
pmap_bootstrap(vm_paddr_t *firstaddr)
{
vm_offset_t va;
- pt_entry_t *pte, *pcpu_pte;
- struct region_descriptor r_gdt;
- uint64_t cr4, pcpu_phys;
- u_long res;
+ pt_entry_t *pte;
+ uint64_t cr4;
+ u_long res, pcpupg, pdppg;
int i;
KERNend = *firstaddr;
@@ -1691,8 +1754,6 @@
*/
create_pagetables(firstaddr);
- pcpu_phys = allocpages(firstaddr, MAXCPU);
-
/*
* Add a physical memory segment (vm_phys_seg) corresponding to the
* preallocated kernel page table pages so that vm_page structures
@@ -1708,6 +1769,20 @@
virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend);
virtual_end = VM_MAX_KERNEL_ADDRESS;
+ /*
+ * Reserve physical memory to bootstrap the per-CPU allocator, as well
+ * as a PD page used to map it into the kernel map. Minimize the amount
+ * of memory wasted to maintain alignment.
+ */
+ if ((*firstaddr & PDRMASK) != 0) {
+ pdppg = allocpages(firstaddr, 1);
+ pcpupg = alloc2mpage(firstaddr);
+ } else {
+ pcpupg = alloc2mpage(firstaddr);
+ pdppg = allocpages(firstaddr, 1);
+ }
+ vm_phys_early_add_seg(pcpupg, pcpupg + NBPDR);
+
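The branch above implements the "minimize waste" note: when *firstaddr is partway into a 2MB page, carving the lone 4KB PD page out of that slack before alloc2mpage() rounds up avoids pushing everything one extra page past the aligned region. A toy comparison of the two orders (the start address is hypothetical):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE       4096ull
    #define NBPDR           (2ull * 1024 * 1024)
    #define roundup2(x, y)  (((x) + ((y) - 1)) & ~((y) - 1))

    int
    main(void)
    {
            uint64_t fa = 0x1401000;        /* hypothetical; not 2MB-aligned */

            /* PD page first: the 4KB page reuses slack below the boundary. */
            uint64_t end1 = roundup2(fa + PAGE_SIZE, NBPDR) + NBPDR;
            /* 2MB page first: the PD page then spills past the 2MB region. */
            uint64_t end2 = roundup2(fa, NBPDR) + NBPDR + PAGE_SIZE;

            /* Prints 0x1800000 vs 0x1801000: PD-first saves a page. */
            printf("%#llx vs %#llx\n", (unsigned long long)end1,
                (unsigned long long)end2);
            return (0);
    }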
/*
* Enable PG_G global pages, then switch to the kernel page
* table from the bootstrap page table. After the switch, it
@@ -1759,38 +1834,12 @@
*/
SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
CADDR1 = crashdumpmap;
-
- SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU);
virtual_avail = va;
- for (i = 0; i < MAXCPU; i++) {
- pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW |
- pg_g | pg_nx | X86_PG_M | X86_PG_A;
- }
-
/*
- * Re-initialize PCPU area for BSP after switching.
- * Make hardware use gdt and common_tss from the new PCPU.
+ * Bootstrap the per-CPU allocator.
*/
- STAILQ_INIT(&cpuhead);
- wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
- pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu));
- amd64_bsp_pcpu_init1(&__pcpu[0]);
- amd64_bsp_ist_init(&__pcpu[0]);
- __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
- IOPERM_BITMAP_SIZE;
- memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT *
- sizeof(struct user_segment_descriptor));
- gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss;
- ssdtosyssd(&gdt_segs[GPROC0_SEL],
- (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
- r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
- r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
- lgdt(&r_gdt);
- wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
- ltr(GSEL(GPROC0_SEL, SEL_KPL));
- __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic;
- __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id;
+ bootstrap_pcpu(pcpupg, pdppg);
/*
* Initialize the PAT MSR.
@@ -4109,7 +4158,7 @@
vm_page_array_size = pages;
- start = VM_MIN_KERNEL_ADDRESS;
+ start = VM_PAGE_ARRAY_START;
end = start + pages * sizeof(struct vm_page);
for (va = start; va < end; va += NBPDR) {
pfn = first_page + (va - start) / sizeof(struct vm_page);
@@ -9818,6 +9867,7 @@
{
vm_page_t pml4_pg;
pdp_entry_t *pdpe;
+ struct pcpu *pc;
vm_offset_t va;
int i;
@@ -9832,23 +9882,24 @@
pdpe = pmap_pti_pdpe(va);
pmap_pti_wire_pte(pdpe);
}
- pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
- (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
sizeof(struct gate_descriptor) * NIDT, false);
CPU_FOREACH(i) {
+ pc = cpuid_to_pcpu[i];
+ pmap_pti_add_kva_locked((vm_offset_t)pc, (vm_offset_t)(pc + 1),
+ false);
/* Doublefault stack IST 1 */
- va = __pcpu[i].pc_common_tss.tss_ist1;
+ va = pc->pc_common_tss.tss_ist1;
pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
/* NMI stack IST 2 */
- va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu);
+ va = pc->pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu);
pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
/* MC# stack IST 3 */
- va = __pcpu[i].pc_common_tss.tss_ist3 +
+ va = pc->pc_common_tss.tss_ist3 +
sizeof(struct nmi_pcpu);
pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
/* DB# stack IST 4 */
- va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
+ va = pc->pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
}
pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
Index: sys/amd64/amd64/uma_machdep.c
===================================================================
--- sys/amd64/amd64/uma_machdep.c
+++ sys/amd64/amd64/uma_machdep.c
@@ -3,6 +3,10 @@
*
* Copyright (c) 2003 Alan L. Cox <alc@cs.rice.edu>
* All rights reserved.
+ * Copyright (c) 2020 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by Mark Johnston under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -31,18 +35,36 @@
#include <sys/param.h>
#include <sys/lock.h>
+#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/smp.h>
#include <sys/systm.h>
+#include <sys/vmem.h>
#include <sys/vmmeter.h>
+
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
#include <vm/uma.h>
#include <vm/uma_int.h>
+
#include <machine/md_var.h>
#include <machine/vmparam.h>
+/* Bootstrap data. */
+static bool uma_pcpu_bootstrapped = false;
+static vm_offset_t uma_pcpu_bootstrap_addr;
+static vm_size_t uma_pcpu_bootstrap_size;
+
+static vmem_t *uma_pcpu_arena;
+
void *
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
int wait)
@@ -77,3 +99,150 @@
vm_page_unwire_noq(m);
vm_page_free(m);
}
+
+void *
+uma_pcpu_alloc(uma_zone_t zone, vm_size_t size, int domain, uint8_t *flags,
+ int wait)
+{
+ void *pcpu_offset;
+ vm_offset_t addr, pcpu_addr;
+ vm_size_t pcpu_size;
+ int error, i;
+
+ KASSERT(size == (mp_maxid + 1) * PAGE_SIZE,
+ ("%s: unexpected alloc size %#lx", __func__, size));
+
+ *flags = UMA_SLAB_PRIV;
+ pcpu_size = PAGE_SIZE;
+
+ if (!uma_pcpu_bootstrapped) {
+ if (uma_pcpu_bootstrap_size == 0)
+ panic("%s: ran out of per-CPU pages", __func__);
+ addr = uma_pcpu_bootstrap_addr;
+ uma_pcpu_bootstrap_addr += pcpu_size;
+ uma_pcpu_bootstrap_size -= pcpu_size;
+ return ((void *)addr);
+ }
+
+ error = vmem_alloc(uma_pcpu_arena, pcpu_size, M_BESTFIT | wait, &addr);
+ if (error != 0)
+ return (NULL);
+
+ /*
+ * If the address comes from the bootstrap region, it is already backed
+ * by physical memory. Otherwise we must allocate memory.
+ */
+ pcpu_offset = zpcpu_base_to_offset((void *)addr);
+ if ((vm_offset_t)pcpu_offset >= VM_PCPU_BOOTSTRAP_SIZE) {
+ for (i = 0; i <= mp_maxid; i++) {
+ domain = cpuid_to_pcpu[i]->pc_domain;
+ pcpu_addr = (vm_offset_t)zpcpu_get_cpu(pcpu_offset, i);
+ if (VM_DOMAIN_EMPTY(domain))
+ error = kmem_back(kernel_object, pcpu_addr,
+ pcpu_size, wait | M_ZERO);
+ else
+ error = kmem_back_domain(domain, kernel_object,
+ pcpu_addr, pcpu_size, wait | M_ZERO);
+ if (error != KERN_SUCCESS)
+ goto fail;
+ }
+ }
+ return ((void *)addr);
+
+fail:
+ for (; i > 0; i--) {
+ pcpu_addr = (vm_offset_t)zpcpu_get_cpu(pcpu_offset, i - 1);
+ kmem_unback(kernel_object, pcpu_addr, pcpu_size);
+ }
+ vmem_xfree(uma_pcpu_arena, addr, pcpu_size);
+ return (NULL);
+}
+
+void
+uma_pcpu_free(void *mem, vm_size_t size, uint8_t flags)
+{
+ void *pcpu_offset;
+ vm_offset_t pcpu_addr;
+ vm_size_t pcpu_size;
+ int i;
+
+ KASSERT(uma_pcpu_bootstrapped,
+ ("%s: not bootstrapped", __func__));
+ KASSERT(size == (mp_maxid + 1) * PAGE_SIZE,
+ ("%s: unexpected free size %#lx", __func__, size));
+
+ pcpu_offset = zpcpu_base_to_offset(mem);
+ pcpu_size = PAGE_SIZE;
+
+ /*
+ * Memory allocated from the bootstrap region remains permanently
+ * allocated.
+ */
+ if ((vm_offset_t)pcpu_offset >= VM_PCPU_BOOTSTRAP_SIZE)
+ for (i = 0; i <= mp_maxid; i++) {
+ pcpu_addr = (vm_offset_t)zpcpu_get_cpu(pcpu_offset, i);
+ kmem_unback(kernel_object, pcpu_addr, pcpu_size);
+ }
+
+ vmem_free(uma_pcpu_arena, (vm_offset_t)mem, pcpu_size);
+}
+
+static int
+pcpu_import(void *arg, vmem_size_t size, int flags, vmem_addr_t *addrp)
+{
+ vm_size_t kvasize, nbpdom;
+
+ nbpdom = (int)(uintptr_t)arg * NBPDR;
+ kvasize = nbpdom * vm_ndomains;
+ return (vmem_xalloc(kernel_arena, kvasize, VM_PCPU_ALIGN, 0, 0,
+ 0, ~(vmem_addr_t)0, M_BESTFIT | flags, addrp));
+}
+
+void
+uma_pcpu_init1(vm_offset_t addr, vm_size_t size)
+{
+ uma_pcpu_bootstrap_addr = addr;
+ uma_pcpu_bootstrap_size = size;
+}
+
+void
+uma_pcpu_init2(int n4kpgpcpu, int n2mpgpdom)
+{
+ vmem_addr_t addr, addr1;
+ vmem_size_t pcpu_size;
+ int error;
+
+ KASSERT(!smp_started, ("%s: called after SMP is started", __func__));
+
+ pcpu_size = PAGE_SIZE;
+
+ uma_pcpu_arena = vmem_create("UMA pcpu arena", 0, 0, pcpu_size, 0,
+ M_WAITOK);
+ vmem_set_import(uma_pcpu_arena, pcpu_import, NULL,
+ (void *)(uintptr_t)n2mpgpdom, ptoa(n4kpgpcpu));
+
+ /*
+ * Add the bootstrap region. Structures allocated during boot may be
+ * freed, for example if a preloaded module is unloaded, so they are
+ * marked here as allocated.
+ */
+ error = vmem_add(uma_pcpu_arena, VM_PCPU_BASE_START, ptoa(n4kpgpcpu),
+ M_WAITOK);
+ if (error != 0)
+ panic("%s: vmem_add() failed: %d", __func__, error);
+ for (addr = VM_PCPU_BASE_START; addr < uma_pcpu_bootstrap_addr;
+ addr += pcpu_size) {
+ error = vmem_xalloc(uma_pcpu_arena, pcpu_size, 0, 0, 0,
+ addr, addr + pcpu_size, M_BESTFIT | M_WAITOK, &addr1);
+ if (error != 0)
+ panic("%s: vmem_xalloc() failed: %d", __func__, error);
+ }
+
+ uma_pcpu_bootstrapped = true;
+}
+
+vm_size_t
+uma_pcpu_bootstrap_used(void)
+{
+ return (uma_pcpu_bootstrap_addr - VM_PCPU_BASE_START);
+}
Index: sys/amd64/include/pcpu.h
===================================================================
--- sys/amd64/include/pcpu.h
+++ sys/amd64/include/pcpu.h
@@ -37,6 +37,7 @@
#include <machine/segments.h>
#include <machine/tss.h>
+#include <machine/vmparam.h>
#define PC_PTI_STACK_SZ 16
@@ -238,11 +239,23 @@
#define PCPU_PTR(member) __PCPU_PTR(pc_ ## member)
#define PCPU_SET(member, val) __PCPU_SET(pc_ ## member, val)
+#define DPCPU_BASE(pc) ((uintptr_t)((struct pcpu *)(pc) + 1))
+
+/*
+ * Kernel modules use a dynamically allocated region in the DPCPU area,
+ * so they must fall back to the indirection through pc_dynamic.
+ */
+#ifndef KLD_MODULE
+#define DPCPU_BASE_OFFSET(pc) (DPCPU_BASE(pc) - DPCPU_START)
+#endif
+
#define IS_BSP() (PCPU_GET(cpuid) == 0)
-#define zpcpu_offset_cpu(cpu) ((uintptr_t)&__pcpu[0] + UMA_PCPU_ALLOC_SIZE * cpu)
-#define zpcpu_base_to_offset(base) (void *)((uintptr_t)(base) - (uintptr_t)&__pcpu[0])
-#define zpcpu_offset_to_base(base) (void *)((uintptr_t)(base) + (uintptr_t)&__pcpu[0])
+#define zpcpu_offset_cpu(cpu) ((uintptr_t)cpuid_to_pcpu[cpu])
+#define zpcpu_base_to_offset(base) ((void *)((uintptr_t)(base) - \
+ (uintptr_t)VM_PCPU_BASE_START))
+#define zpcpu_offset_to_base(base) ((void *)((uintptr_t)(base) + \
+ (uintptr_t)VM_PCPU_BASE_START))
#define zpcpu_sub_protected(base, n) do { \
ZPCPU_ASSERT_PROTECTED(); \
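With these macros, a per-CPU pointer is stored as an offset from VM_PCPU_BASE_START, and each CPU's copy is recovered by adding that CPU's pcpu base. A round-trip sketch for a flat single-domain layout (N is hypothetical; the kernel macros are stood in by plain functions):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE               4096ull
    #define VM_PCPU_BASE_START      0xfffffe0000000000ull
    #define N4KPGPCPU               10      /* hypothetical pages per CPU */

    /* CPU i's pcpu base in a flat single-domain layout. */
    static uint64_t
    pcpu_base(int cpu)
    {
            return (VM_PCPU_BASE_START + cpu * N4KPGPCPU * PAGE_SIZE);
    }

    int
    main(void)
    {
            /* An item in CPU 0's strip... */
            uint64_t item = pcpu_base(0) + 6 * PAGE_SIZE;
            /* ...stored base-relative, as zpcpu_base_to_offset() does... */
            uint64_t off = item - VM_PCPU_BASE_START;
            /* ...then rebased per CPU, as zpcpu_get_cpu() does. */
            for (int cpu = 0; cpu < 3; cpu++)
                    printf("cpu%d copy at %#llx\n", cpu,
                        (unsigned long long)(off + pcpu_base(cpu)));
            return (0);
    }

The removed definitions did the same arithmetic against &__pcpu[0] with a fixed UMA_PCPU_ALLOC_SIZE stride; the new ones substitute the fixed VM_PCPU_BASE_START window and the per-CPU bases recorded in cpuid_to_pcpu[].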
Index: sys/amd64/include/pcpu_aux.h
===================================================================
--- sys/amd64/include/pcpu_aux.h
+++ sys/amd64/include/pcpu_aux.h
@@ -42,10 +42,7 @@
#endif
/* Required for counters(9) to work on x86. */
-_Static_assert(sizeof(struct pcpu) == UMA_PCPU_ALLOC_SIZE, "fix pcpu size");
-
-extern struct pcpu *__pcpu;
-extern struct pcpu temp_bsp_pcpu;
+_Static_assert(sizeof(struct pcpu) % PAGE_SIZE == 0, "fix pcpu size");
static __inline __pure2 struct thread *
__curthread(void)
Index: sys/amd64/include/vmparam.h
===================================================================
--- sys/amd64/include/vmparam.h
+++ sys/amd64/include/vmparam.h
@@ -78,6 +78,12 @@
*/
#define UMA_MD_SMALL_ALLOC
+/*
+ * We provide a machine-specific per-CPU allocator which returns 2MB mappings
+ * when possible.
+ */
+#define UMA_MD_PCPU_ALLOC
+
/*
* The physical address space is densely populated.
*/
@@ -165,7 +171,8 @@
*
* Within the kernel map:
*
- * 0xfffffe0000000000 vm_page_array
+ * 0xfffffe0000000000 bootstrap pcpu region
+ * 0xfffffe0020000000 vm_page_array
* 0xffffffff80000000 KERNBASE
*/
@@ -192,6 +199,13 @@
#define VM_MAX_ADDRESS UPT_MAX_ADDRESS
#define VM_MIN_ADDRESS (0)
+#define VM_PCPU_BASE_START VM_MIN_KERNEL_ADDRESS
+#define VM_PCPU_BASE_SIZE (MAXCPU * NBPDR)
+#define VM_PCPU_BOOTSTRAP_SIZE NBPDR
+#define VM_PCPU_ALIGN NBPDR
+
+#define VM_PAGE_ARRAY_START (VM_PCPU_BASE_START + VM_PCPU_BASE_SIZE)
+
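These constants are consistent with the kernel map comment above: with the stock amd64 MAXCPU of 256, VM_PCPU_BASE_SIZE is 256 * 2MB = 0x20000000 bytes, exactly the gap between the bootstrap pcpu region at 0xfffffe0000000000 and vm_page_array at 0xfffffe0020000000. A one-line check:

    #include <stdio.h>

    #define MAXCPU  256                     /* amd64 default */
    #define NBPDR   (2 * 1024 * 1024)

    int
    main(void)
    {
            /* Prints 0x20000000: the pcpu window in the kernel map. */
            printf("%#lx\n", (long)MAXCPU * NBPDR);
            return (0);
    }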
/*
* XXX Allowing dmaplimit == 0 is a temporary workaround for vt(4) efifb's
* early use of PHYS_TO_DMAP before the mapping is actually setup. This works
Index: sys/i386/i386/mp_machdep.c
===================================================================
--- sys/i386/i386/mp_machdep.c
+++ sys/i386/i386/mp_machdep.c
@@ -146,6 +146,9 @@
static char *ap_copyout_buf;
static char *ap_tramp_stack_base;
+
+static void *dpcpu;
+
/*
* Initialize the IPI handlers and start up the AP's.
*/
Index: sys/sys/pcpu.h
===================================================================
--- sys/sys/pcpu.h
+++ sys/sys/pcpu.h
@@ -109,6 +109,10 @@
static t DPCPU_NAME(n) __section(DPCPU_SETNAME) __used
#endif
+#ifndef DPCPU_BASE_OFFSET
+#define DPCPU_BASE_OFFSET(pc) ((pc)->pc_dynamic)
+#endif
+
/*
* Accessors with a given base.
*/
@@ -120,7 +124,7 @@
/*
* Accessors for the current cpu.
*/
-#define DPCPU_PTR(n) _DPCPU_PTR(PCPU_GET(dynamic), n)
+#define DPCPU_PTR(n) _DPCPU_PTR(DPCPU_BASE_OFFSET(get_pcpu()), n)
#define DPCPU_GET(n) (*DPCPU_PTR(n))
#define DPCPU_SET(n, v) (*DPCPU_PTR(n) = v)
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -280,11 +280,13 @@
static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
+#ifndef UMA_MD_SMALL_ALLOC
static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
+static void pcpu_page_free(void *, vm_size_t, uint8_t);
+#endif
static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void page_free(void *, vm_size_t, uint8_t);
-static void pcpu_page_free(void *, vm_size_t, uint8_t);
static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
@@ -1514,6 +1516,7 @@
uma_alloc allocf;
uma_slab_t slab;
unsigned long size;
+ int pperslab;
uint8_t *mem;
uint8_t sflags;
int i;
@@ -1569,10 +1572,18 @@
else
slab_tohashslab(slab)->uhs_data = mem;
- if (keg->uk_flags & UMA_ZFLAG_VTOSLAB)
- for (i = 0; i < keg->uk_ppera; i++)
- vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE),
+ if ((keg->uk_flags & UMA_ZFLAG_VTOSLAB) != 0) {
+ /*
+ * Per-CPU slabs have a special layout. Only pages belonging to
+ * the base of the allocation need to be marked, and the slab
+ * may not be contiguous.
+ */
+ pperslab = (keg->uk_flags & UMA_ZONE_PCPU) != 0 ?
+ atop(UMA_PCPU_ALLOC_SIZE) : keg->uk_ppera;
+ for (i = 0; i < pperslab; i++)
+ vsetzoneslab((vm_offset_t)mem + i * PAGE_SIZE,
zone, slab);
+ }
slab->us_freecount = keg->uk_ipers;
slab->us_flags = sflags;
@@ -1701,6 +1712,7 @@
return (p);
}
+#ifndef UMA_MD_PCPU_ALLOC
static void *
pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
int wait)
@@ -1755,6 +1767,7 @@
}
return (NULL);
}
+#endif
/*
* Allocates a number of pages from within an object
@@ -1856,6 +1869,7 @@
kmem_free((vm_offset_t)mem, size);
}
+#ifndef UMA_MD_PCPU_ALLOC
/*
* Frees pcpu zone allocations
*
@@ -1891,7 +1905,7 @@
pmap_qremove(sva, size >> PAGE_SHIFT);
kva_free(sva, size);
}
-
+#endif
/*
* Zero fill initializer
@@ -2243,7 +2257,11 @@
if (booted < BOOT_KVA)
keg->uk_allocf = startup_alloc;
else if (keg->uk_flags & UMA_ZONE_PCPU)
+#ifdef UMA_MD_PCPU_ALLOC
+ keg->uk_allocf = uma_pcpu_alloc;
+#else
keg->uk_allocf = pcpu_page_alloc;
+#endif
else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1)
keg->uk_allocf = contig_alloc;
else
@@ -2254,7 +2272,11 @@
else
#endif
if (keg->uk_flags & UMA_ZONE_PCPU)
+#ifdef UMA_MD_PCPU_ALLOC
+ keg->uk_freef = uma_pcpu_free;
+#else
keg->uk_freef = pcpu_page_free;
+#endif
else
keg->uk_freef = page_free;
@@ -3114,10 +3136,21 @@
if (item == NULL)
return (NULL);
pcpu_item = zpcpu_base_to_offset(item);
- if (flags & M_ZERO) {
+ if ((flags & M_ZERO) != 0) {
#ifdef SMP
- for (i = 0; i <= mp_maxid; i++)
+ for (i = 0; i <= mp_maxid; i++) {
bzero(zpcpu_get_cpu(pcpu_item, i), zone->uz_size);
+#ifdef UMA_MD_PCPU_ALLOC
+ if (__predict_false(booted < BOOT_RUNNING))
+ /*
+ * Only CPU 0's memory is accessible if the
+ * per-CPU allocator is still being
+ * bootstrapped. The allocator guarantees that
+ * early allocations will be zero-filled.
+ */
+ break;
+#endif
+ }
#else
bzero(item, zone->uz_size);
#endif
Index: sys/vm/uma_int.h
===================================================================
--- sys/vm/uma_int.h
+++ sys/vm/uma_int.h
@@ -664,6 +664,7 @@
uma_reclaim_wakeup();
}
+#ifdef UMA_MD_SMALL_ALLOC
/*
* The following two functions may be defined by architecture specific code
* if they can provide more efficient allocation functions. This is useful
@@ -672,6 +673,19 @@
void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
uint8_t *pflag, int wait);
void uma_small_free(void *mem, vm_size_t size, uint8_t flags);
+#endif
+
+#ifdef UMA_MD_PCPU_ALLOC
+void *uma_pcpu_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
+ uint8_t *pflag, int wait);
+void uma_pcpu_free(void *mem, vm_size_t size, uint8_t flags);
+
+#ifdef __amd64__
+void uma_pcpu_init1(vm_offset_t addr, vm_size_t size);
+void uma_pcpu_init2(int ptpstride, int npdepdom);
+vm_size_t uma_pcpu_bootstrap_used(void);
+#endif
+#endif
/* Set a global soft limit on UMA managed memory. */
void uma_set_limit(unsigned long limit);
Index: sys/vm/vm_kern.c
===================================================================
--- sys/vm/vm_kern.c
+++ sys/vm/vm_kern.c
@@ -767,6 +767,14 @@
/* ... and ending with the completion of the above `insert' */
#ifdef __amd64__
+ /*
+ * Mark the PCPU bootstrap region as allocated. In practice most of
+ * this region will be released back to the VM during boot.
+ */
+ (void)vm_map_insert(m, NULL, 0, VM_PCPU_BASE_START,
+ VM_PCPU_BASE_START + VM_PCPU_BASE_SIZE,
+ VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
+
/*
* Mark KVA used for the page array as allocated. Other platforms
* that handle vm_page_array allocation can simply adjust virtual_avail
Index: sys/x86/include/x86_smp.h
===================================================================
--- sys/x86/include/x86_smp.h
+++ sys/x86/include/x86_smp.h
@@ -29,7 +29,6 @@
extern struct pcb stoppcbs[];
extern int cpu_apic_ids[];
extern int bootAP;
-extern void *dpcpu;
extern char *bootSTK;
extern void *bootstacks[];
extern unsigned int boot_address;
Index: sys/x86/x86/mp_x86.c
===================================================================
--- sys/x86/x86/mp_x86.c
+++ sys/x86/x86/mp_x86.c
@@ -94,7 +94,6 @@
/* Free these after use */
void *bootstacks[MAXCPU];
-void *dpcpu;
struct pcb stoppcbs[MAXCPU];
struct susppcb **susppcbs;
Index: sys/x86/xen/pv.c
===================================================================
--- sys/x86/xen/pv.c
+++ sys/x86/xen/pv.c
@@ -365,7 +365,6 @@
mce_stack = (char *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO);
nmi_stack = (char *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO);
dbg_stack = (void *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO);
- dpcpu = (void *)kmem_malloc(DPCPU_SIZE, M_WAITOK | M_ZERO);
bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 8;
bootAP = cpu;