D24758.diff

Index: sys/amd64/amd64/machdep.c
===================================================================
--- sys/amd64/amd64/machdep.c
+++ sys/amd64/amd64/machdep.c
@@ -216,9 +216,6 @@
static struct trapframe proc0_tf;
struct region_descriptor r_idt;
-struct pcpu *__pcpu;
-struct pcpu temp_bsp_pcpu;
-
struct mtx icu_lock;
struct mem_range_softc mem_range_softc;
@@ -1670,13 +1667,20 @@
*/
pmap_thread_init_invl_gen(&thread0);
- pc = &temp_bsp_pcpu;
+ /*
+ * Initialize the static and dynamic per-CPU areas. The latter must
+ * immediately follow the former.
+ */
+ pc = (struct pcpu *)(physfree + KERNBASE);
+ physfree += sizeof(struct pcpu);
pcpu_init(pc, 0, sizeof(struct pcpu));
- gdt = &temp_bsp_pcpu.pc_gdt[0];
+ dpcpu_init((void *)(physfree + KERNBASE), 0);
+ physfree += DPCPU_SIZE;
/*
* make gdt memory segments
*/
+ gdt = &pc->pc_gdt[0];
for (x = 0; x < NGDT; x++) {
if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
@@ -1694,8 +1698,6 @@
wrmsr(MSR_GSBASE, (u_int64_t)pc);
wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
- dpcpu_init((void *)(physfree + KERNBASE), 0);
- physfree += DPCPU_SIZE;
amd64_bsp_pcpu_init1(pc);
/* Non-late cninit() and printf() can be moved up to here. */
Index: sys/amd64/amd64/mp_machdep.c
===================================================================
--- sys/amd64/amd64/mp_machdep.c
+++ sys/amd64/amd64/mp_machdep.c
@@ -61,8 +61,13 @@
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
+#include <vm/uma.h>
+#include <vm/uma_int.h>
#include <x86/apicreg.h>
#include <machine/clock.h>
@@ -124,6 +129,173 @@
return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem);
}
+/*
+ * Initialize the bootstrap allocator for dynamic per-CPU memory allocations.
+ * 2MB is reserved by pmap_bootstrap() for the BSP, from which its pcpu and
+ * dpcpu regions are allocated. The rest can be used by UMA to satisfy dynamic
+ * per-CPU allocations until SI_SUB_CPU, when the pcpu regions for the APs are
+ * laid out. At that point, unused portions of the initial 2MB allocation may
+ * be used for APs as well as the BSP.
+ */
+static void
+pcpu_bootstrap(void *arg __unused)
+{
+ uma_pcpu_init1(VM_PCPU_BASE_START + sizeof(struct pcpu) + DPCPU_SIZE,
+ VM_PCPU_BOOTSTRAP_SIZE - (sizeof(struct pcpu) + DPCPU_SIZE));
+}
+SYSINIT(pcpu_bootstrap, SI_SUB_VM, SI_ORDER_ANY, pcpu_bootstrap, NULL);
+
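For reference, the window handed to uma_pcpu_init1() above is the tail of the
BSP's 2MB page reserved by pmap_bootstrap(). An illustrative sketch of that
page's layout:

/*
 * VM_PCPU_BASE_START                            BSP struct pcpu
 *   + sizeof(struct pcpu)                       BSP dpcpu (DPCPU_SIZE bytes)
 *   + sizeof(struct pcpu) + DPCPU_SIZE          UMA bootstrap window, up to
 *       ...                                     VM_PCPU_BOOTSTRAP_SIZE (2MB)
 */
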
+static int
+pcpu_domidx(int domain)
+{
+ int bspdom;
+
+ bspdom = PCPU_GET(domain);
+ if (bspdom == 0)
+ return (domain);
+ if (domain == bspdom)
+ return (0);
+ return (domain > bspdom ? domain : domain + 1);
+}
+
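pcpu_domidx() swaps the BSP's domain with index 0, so the 2MB page that
pmap_bootstrap() already allocated from the BSP's domain always occupies the
first slot. An illustrative mapping, assuming vm_ndomains == 4 and
PCPU_GET(domain) == 2:

/*
 * pcpu_domidx(2) == 0    BSP's domain takes slot 0
 * pcpu_domidx(0) == 1
 * pcpu_domidx(1) == 2
 * pcpu_domidx(3) == 3    domains above the BSP's keep their index
 */
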
+/*
+ * Place per-CPU structures. Each AP requires a pcpu and dpcpu region. The
+ * pcpu region of a CPU is its base pcpu address. A pointer to per-CPU data is
+ * an offset relative to the base pcpu address, and UMA's per-CPU allocator
+ * ensures that adding that offset to the base address always gives the address
+ * of memory allocated for the corresponding CPU.
+ *
+ * The layout attempts to maximize use of 2MB mappings while also providing
+ * domain-local memory on NUMA systems. It uses 2 parameters, N, the number of
+ * 4KB pages per CPU, and M, the number of 2MB pages per allocation quantum. M
+ * is a multiple of vm_ndomains and they are usually equal. N has a lower bound
+ * of L = sizeof(struct pcpu) + DPCPU_SIZE + uma_pcpu_bootstrap_used(), where
+ * the last term is the amount of memory used by the bootstrap per-CPU
+ * allocator. Each 2MB page hosts per-CPU data for CPUs belonging to the domain
+ * from which the page was allocated, so we first compute M by determining the
+ * maximum number of CPUs per domain and multiplying that by L. Then N is given
+ * by M*2MB divided by the number of CPUs per domain.
+ *
+ * __________ N 4KB pages __________
+ * / \
+ * VM_PCPU_BASE_START -------> +----------+-----------+--------------+
+ * | BSP pcpu | BSP dpcpu | UMA data ... |\
+ * +----------+-----------+--------------+ |
+ * | AP1 pcpu | AP1 dpcpu | UMA data ... | |
+ * +----------+-----------+--------------+ |
+ * | ... | | M 2MB
+ * +----------+--------------------------+ | pages
+ * | APi pcpu | APi dpcpu | UMA data ... | |
+ * +----------+-----------+--------------+ |
+ * | ... | |
+ * | ... |/
+ * +-------------------------------------+
+ *
+ * If the original region is exhausted, for example because a subsystem
+ * allocates many per-CPU counters, UMA allocates another M*2MB region of KVA
+ * to mirror the base region.
+ */
+static void
+pcpu_layout(void)
+{
+ vm_offset_t addr;
+ vm_size_t size, used;
+ int count[MAXMEMDOM], domoff[MAXMEMDOM];
+ int domain, error, i, maxcpupdom, n2mpgpdom, n4kpgpcpu, nbpdom;
+
+ /*
+ * Compute the maximum count of CPUs in a single domain. Domains are
+ * typically symmetric but this is not required.
+ */
+ memset(count, 0, sizeof(count));
+ for (i = 0; i <= mp_maxid; i++) {
+ if (vm_ndomains > 1 && cpu_apic_ids[i] != -1)
+ domain = acpi_pxm_get_cpu_locality(cpu_apic_ids[i]);
+ else
+ domain = 0;
+ count[domain]++;
+ }
+ for (i = 0, maxcpupdom = -1; i < vm_ndomains; i++)
+ if (count[i] > maxcpupdom)
+ maxcpupdom = count[i];
+
+ /*
+ * Compute layout parameters: the number of 4KB pages per CPU, and the
+ * number of 2MB pages per domain. The amount of memory already
+ * allocated by the bootstrap allocator gives a lower bound for the
+ * former, and we use that bound to compute the number of 2MB pages
+ * per domain.
+ */
+ used = uma_pcpu_bootstrap_used();
+ n2mpgpdom = howmany(atop(used) * maxcpupdom, NPDEPG);
+ n4kpgpcpu = atop(NBPDR * n2mpgpdom) / maxcpupdom;
+
+ /*
+ * Assign a pcpu base address to each CPU. Handle the possibility that
+ * the BSP is not local to domain 0.
+ */
+ memset(domoff, 0, sizeof(domoff));
+ for (i = 0; i <= mp_maxid; i++) {
+ if (vm_ndomains > 1 && cpu_apic_ids[i] != -1)
+ domain = acpi_pxm_get_cpu_locality(cpu_apic_ids[i]);
+ else
+ domain = 0;
+
+ addr = VM_PCPU_BASE_START +
+ pcpu_domidx(domain) * n2mpgpdom * NBPDR +
+ domoff[domain] * n4kpgpcpu * PAGE_SIZE;
+ cpuid_to_pcpu[i] = (struct pcpu *)addr;
+ domoff[domain]++;
+ }
+
+ /*
+ * Ensure that the remaining bootstrap region is backed by physical
+ * pages.
+ */
+ nbpdom = n2mpgpdom * NBPDR;
+ for (domain = 0; domain < vm_ndomains; domain++) {
+ addr = VM_PCPU_BASE_START + nbpdom * pcpu_domidx(domain);
+ size = nbpdom;
+ if (domain == PCPU_GET(domain)) {
+ /* This 2MB page was allocated by pmap_bootstrap(). */
+ addr += NBPDR;
+ size -= NBPDR;
+ if (size == 0)
+ continue;
+ }
+ if (VM_DOMAIN_EMPTY(domain))
+ error = kmem_back(kernel_object, addr, size,
+ M_WAITOK | M_ZERO);
+ else
+ error = kmem_back_domain(domain, kernel_object, addr,
+ size, M_WAITOK | M_ZERO);
+ if (error != KERN_SUCCESS)
+ panic("%s: failed to allocate memory: %d",
+ __func__, error);
+ }
+
+ /*
+ * Release reserved, unused KVA back to the system.
+ */
+ vm_map_lock(kernel_map);
+ error = vm_map_delete(kernel_map,
+ VM_PCPU_BASE_START + vm_ndomains * nbpdom,
+ VM_PCPU_BASE_START + VM_PCPU_BASE_SIZE);
+ if (error != KERN_SUCCESS)
+ panic("%s: failed to release KVA: %d", __func__, error);
+ vm_map_unlock(kernel_map);
+
+ /*
+ * Finally, provide layout parameters to the allocator so that it can
+ * finish bootstrapping.
+ */
+ uma_pcpu_init2(n4kpgpcpu, n2mpgpdom);
+
+ if (bootverbose)
+ printf("%s: %d 2MB pages per domain, %d 4KB pages per CPU\n",
+ __func__, n2mpgpdom, n4kpgpcpu);
+}
+
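To make the parameter computation concrete, here is a worked example with
assumed inputs (the page counts are illustrative, not values taken from this
patch). Suppose the bootstrap allocator consumed 24 4KB pages, i.e. the
per-CPU lower bound L is 96KB, and the largest domain has 64 CPUs:

/*
 * n2mpgpdom = howmany(atop(used) * maxcpupdom, NPDEPG)
 *           = howmany(24 * 64, 512) = 3 2MB pages per domain
 * n4kpgpcpu = atop(NBPDR * n2mpgpdom) / maxcpupdom
 *           = (512 * 3) / 64 = 24 4KB pages per CPU
 */

Each domain thus backs its CPUs with three 2MB pages, and each CPU receives a
96KB strip within them.
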
/*
* Calculate usable address in base memory for AP trampoline code.
*/
@@ -263,6 +435,9 @@
assign_cpu_ids();
+ /* Place AP pcpu structures now that CPU IDs are defined. */
+ pcpu_layout();
+
/* Start each Application Processor */
init_ops.start_all_aps();
@@ -292,12 +467,9 @@
/* Update microcode before doing anything else. */
ucode_load_ap(cpu);
- /* Get per-cpu data and save */
- pc = &__pcpu[cpu];
-
- /* prime data page for it to use */
+ pc = cpuid_to_pcpu[cpu];
pcpu_init(pc, cpu, sizeof(struct pcpu));
- dpcpu_init(dpcpu, cpu);
+ dpcpu_init((void *)DPCPU_BASE(pc), cpu);
pc->pc_apic_id = cpu_apic_ids[cpu];
pc->pc_prvspace = pc;
pc->pc_curthread = 0;
@@ -315,7 +487,7 @@
pc->pc_pcid_gen = 1;
/* Init tss */
- pc->pc_common_tss = __pcpu[0].pc_common_tss;
+ pc->pc_common_tss = cpuid_to_pcpu[0]->pc_common_tss;
pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
IOPERM_BITMAP_SIZE;
pc->pc_common_tss.tss_rsp0 = 0;
@@ -388,27 +560,6 @@
* local functions and data
*/
-#ifdef NUMA
-static void
-mp_realloc_pcpu(int cpuid, int domain)
-{
- vm_page_t m;
- vm_offset_t oa, na;
-
- oa = (vm_offset_t)&__pcpu[cpuid];
- if (_vm_phys_domain(pmap_kextract(oa)) == domain)
- return;
- m = vm_page_alloc_domain(NULL, 0, domain,
- VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
- if (m == NULL)
- return;
- na = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
- pagecopy((void *)oa, (void *)na);
- pmap_qenter((vm_offset_t)&__pcpu[cpuid], &m, 1);
- /* XXX old pcpu page leaked. */
-}
-#endif
-
/*
* start each AP in our list
*/
@@ -456,16 +607,6 @@
outb(CMOS_REG, BIOS_RESET);
outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */
- /* Relocate pcpu areas to the correct domain. */
-#ifdef NUMA
- if (vm_ndomains > 1)
- for (cpu = 1; cpu < mp_ncpus; cpu++) {
- apic_id = cpu_apic_ids[cpu];
- domain = acpi_pxm_get_cpu_locality(apic_id);
- mp_realloc_pcpu(cpu, domain);
- }
-#endif
-
/* start each AP */
domain = 0;
for (cpu = 1; cpu < mp_ncpus; cpu++) {
@@ -484,8 +625,6 @@
DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO);
dbg_stack = (char *)kmem_malloc_domainset(
DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO);
- dpcpu = (void *)kmem_malloc_domainset(DOMAINSET_PREF(domain),
- DPCPU_SIZE, M_WAITOK | M_ZERO);
bootSTK = (char *)bootstacks[cpu] +
kstack_pages * PAGE_SIZE - 8;
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -1415,6 +1415,17 @@
return (ret);
}
+static uint64_t
+alloc2mpage(vm_paddr_t *firstaddr)
+{
+ uint64_t ret;
+
+ ret = roundup2(*firstaddr, NBPDR);
+ bzero((void *)ret, NBPDR);
+ *firstaddr = ret + NBPDR;
+ return (ret);
+}
+
CTASSERT(powerof2(NDMPML4E));
/* number of kernel PDP slots */
@@ -1660,6 +1671,59 @@
}
}
+static void
+bootstrap_pcpu(vm_paddr_t pcpupg, vm_paddr_t pdppg)
+{
+ struct region_descriptor r_gdt;
+ struct pcpu *oldpc, *pc;
+ void *dpcpu;
+ vm_offset_t va;
+ pdp_entry_t *pdpe;
+ pd_entry_t *pde;
+
+ /*
+ * Map the bootstrap per-CPU region.
+ */
+ va = VM_PCPU_BASE_START;
+ pdpe = pmap_pdpe(kernel_pmap, va);
+ if ((*pdpe & X86_PG_V) != 0)
+ panic("pdpe for %#lx is already valid", va);
+ *pdpe = pdppg | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
+ pde = pmap_pde(kernel_pmap, va);
+ pde_store(pde, pcpupg | X86_PG_V | X86_PG_PS | X86_PG_RW | X86_PG_A |
+ X86_PG_M | pg_nx | pg_g);
+
+ /*
+ * Re-initialize PCPU area for BSP after switching.
+ * Make hardware use gdt and common_tss from the new PCPU.
+ * Copy dynamic PCPU data following the PCPU structure.
+ */
+ STAILQ_INIT(&cpuhead);
+ pc = (struct pcpu *)va;
+ oldpc = get_pcpu();
+ wrmsr(MSR_GSBASE, (uintptr_t)pc);
+ pcpu_init(pc, 0, sizeof(struct pcpu));
+ amd64_bsp_pcpu_init1(pc);
+ amd64_bsp_ist_init(pc);
+ pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
+ IOPERM_BITMAP_SIZE;
+ memcpy(pc->pc_gdt, oldpc->pc_gdt, NGDT *
+ sizeof(struct user_segment_descriptor));
+ gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
+ ssdtosyssd(&gdt_segs[GPROC0_SEL],
+ (struct system_segment_descriptor *)&pc->pc_gdt[GPROC0_SEL]);
+ r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
+ r_gdt.rd_base = (long)pc->pc_gdt;
+ lgdt(&r_gdt);
+ wrmsr(MSR_GSBASE, (uintptr_t)pc);
+ ltr(GSEL(GPROC0_SEL, SEL_KPL));
+ pc->pc_acpi_id = oldpc->pc_acpi_id;
+
+ dpcpu = (void *)DPCPU_BASE(pc);
+ dpcpu_init(dpcpu, 0);
+ memcpy(dpcpu, (void *)DPCPU_BASE(oldpc), DPCPU_BYTES);
+}
+
/*
* Bootstrap the system enough to run with virtual memory.
*
@@ -1674,10 +1738,9 @@
pmap_bootstrap(vm_paddr_t *firstaddr)
{
vm_offset_t va;
- pt_entry_t *pte, *pcpu_pte;
- struct region_descriptor r_gdt;
- uint64_t cr4, pcpu_phys;
- u_long res;
+ pt_entry_t *pte;
+ uint64_t cr4;
+ u_long res, pcpupg, pdppg;
int i;
KERNend = *firstaddr;
@@ -1691,8 +1754,6 @@
*/
create_pagetables(firstaddr);
- pcpu_phys = allocpages(firstaddr, MAXCPU);
-
/*
* Add a physical memory segment (vm_phys_seg) corresponding to the
* preallocated kernel page table pages so that vm_page structures
@@ -1708,6 +1769,20 @@
virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend);
virtual_end = VM_MAX_KERNEL_ADDRESS;
+ /*
+ * Reserve physical memory to bootstrap the per-CPU allocator, as well
+ * as a PD page used to map it into the kernel map. Minimize the amount
+ * of memory wasted to maintain alignment.
+ */
+ if ((*firstaddr & PDRMASK) != 0) {
+ pdppg = allocpages(firstaddr, 1);
+ pcpupg = alloc2mpage(firstaddr);
+ } else {
+ pcpupg = alloc2mpage(firstaddr);
+ pdppg = allocpages(firstaddr, 1);
+ }
+ vm_phys_early_add_seg(pcpupg, pcpupg + NBPDR);
+
/*
* Enable PG_G global pages, then switch to the kernel page
* table from the bootstrap page table. After the switch, it
@@ -1759,38 +1834,12 @@
*/
SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
CADDR1 = crashdumpmap;
-
- SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU);
virtual_avail = va;
- for (i = 0; i < MAXCPU; i++) {
- pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW |
- pg_g | pg_nx | X86_PG_M | X86_PG_A;
- }
-
/*
- * Re-initialize PCPU area for BSP after switching.
- * Make hardware use gdt and common_tss from the new PCPU.
+ * Bootstrap the per-CPU allocator.
*/
- STAILQ_INIT(&cpuhead);
- wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
- pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu));
- amd64_bsp_pcpu_init1(&__pcpu[0]);
- amd64_bsp_ist_init(&__pcpu[0]);
- __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
- IOPERM_BITMAP_SIZE;
- memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT *
- sizeof(struct user_segment_descriptor));
- gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss;
- ssdtosyssd(&gdt_segs[GPROC0_SEL],
- (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
- r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
- r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
- lgdt(&r_gdt);
- wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
- ltr(GSEL(GPROC0_SEL, SEL_KPL));
- __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic;
- __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id;
+ bootstrap_pcpu(pcpupg, pdppg);
/*
* Initialize the PAT MSR.
@@ -4109,7 +4158,7 @@
vm_page_array_size = pages;
- start = VM_MIN_KERNEL_ADDRESS;
+ start = VM_PAGE_ARRAY_START;
end = start + pages * sizeof(struct vm_page);
for (va = start; va < end; va += NBPDR) {
pfn = first_page + (va - start) / sizeof(struct vm_page);
@@ -9818,6 +9867,7 @@
{
vm_page_t pml4_pg;
pdp_entry_t *pdpe;
+ struct pcpu *pc;
vm_offset_t va;
int i;
@@ -9832,23 +9882,24 @@
pdpe = pmap_pti_pdpe(va);
pmap_pti_wire_pte(pdpe);
}
- pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
- (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
sizeof(struct gate_descriptor) * NIDT, false);
CPU_FOREACH(i) {
+ pc = cpuid_to_pcpu[i];
+ pmap_pti_add_kva_locked((vm_offset_t)pc, (vm_offset_t)(pc + 1),
+ false);
/* Doublefault stack IST 1 */
- va = __pcpu[i].pc_common_tss.tss_ist1;
+ va = pc->pc_common_tss.tss_ist1;
pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
/* NMI stack IST 2 */
- va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu);
+ va = pc->pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu);
pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
/* MC# stack IST 3 */
- va = __pcpu[i].pc_common_tss.tss_ist3 +
+ va = pc->pc_common_tss.tss_ist3 +
sizeof(struct nmi_pcpu);
pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
/* DB# stack IST 4 */
- va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
+ va = pc->pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
}
pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
Index: sys/amd64/amd64/uma_machdep.c
===================================================================
--- sys/amd64/amd64/uma_machdep.c
+++ sys/amd64/amd64/uma_machdep.c
@@ -3,6 +3,10 @@
*
* Copyright (c) 2003 Alan L. Cox <alc@cs.rice.edu>
* All rights reserved.
+ * Copyright (c) 2020 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by Mark Johnston under
+ * sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -31,18 +35,36 @@
#include <sys/param.h>
#include <sys/lock.h>
+#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/smp.h>
#include <sys/systm.h>
+#include <sys/vmem.h>
#include <sys/vmmeter.h>
+
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
#include <vm/uma.h>
#include <vm/uma_int.h>
+
#include <machine/md_var.h>
#include <machine/vmparam.h>
+/* Bootstrap data. */
+static bool uma_pcpu_bootstrapped = false;
+static vm_offset_t uma_pcpu_bootstrap_addr;
+static vm_size_t uma_pcpu_bootstrap_size;
+
+static vmem_t *uma_pcpu_arena;
+
void *
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
int wait)
@@ -77,3 +99,150 @@
vm_page_unwire_noq(m);
vm_page_free(m);
}
+
+void *
+uma_pcpu_alloc(uma_zone_t zone, vm_size_t size, int domain, uint8_t *flags,
+ int wait)
+{
+ void *pcpu_offset;
+ vm_offset_t addr, pcpu_addr;
+ vm_size_t pcpu_size;
+ int error, i;
+
+ KASSERT(size == (mp_maxid + 1) * PAGE_SIZE,
+ ("%s: unexpected alloc size %#lx", __func__, size));
+
+ *flags = UMA_SLAB_PRIV;
+ pcpu_size = PAGE_SIZE;
+
+ if (!uma_pcpu_bootstrapped) {
+ if (uma_pcpu_bootstrap_size == 0)
+ panic("%s: ran out of per-CPU pages", __func__);
+ addr = uma_pcpu_bootstrap_addr;
+ uma_pcpu_bootstrap_addr += pcpu_size;
+ uma_pcpu_bootstrap_size -= pcpu_size;
+ return ((void *)addr);
+ }
+
+ error = vmem_alloc(uma_pcpu_arena, pcpu_size, M_BESTFIT | wait, &addr);
+ if (error != 0)
+ return (NULL);
+
+ /*
+ * If the address comes from the bootstrap region, it is already backed
+ * by physical memory. Otherwise we must allocate memory.
+ */
+ pcpu_offset = zpcpu_base_to_offset((void *)addr);
+ if ((vm_offset_t)pcpu_offset >= VM_PCPU_BOOTSTRAP_SIZE) {
+ for (i = 0; i <= mp_maxid; i++) {
+ domain = cpuid_to_pcpu[i]->pc_domain;
+ pcpu_addr = (vm_offset_t)zpcpu_get_cpu(pcpu_offset, i);
+ if (VM_DOMAIN_EMPTY(domain))
+ error = kmem_back(kernel_object, pcpu_addr,
+ pcpu_size, wait | M_ZERO);
+ else
+ error = kmem_back_domain(domain, kernel_object,
+ pcpu_addr, pcpu_size, wait | M_ZERO);
+ if (error != KERN_SUCCESS)
+ goto fail;
+ }
+ }
+ return ((void *)addr);
+
+fail:
+ for (; i > 0; i--) {
+ pcpu_addr = (vm_offset_t)zpcpu_get_cpu(pcpu_offset, i - 1);
+ kmem_unback(kernel_object, pcpu_addr, pcpu_size);
+ }
+ vmem_xfree(uma_pcpu_arena, addr, pcpu_size);
+ return (NULL);
+}
+
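Consumers do not call uma_pcpu_alloc() directly; allocations arrive through a
UMA_ZONE_PCPU zone. A hedged usage sketch via counter(9), which is backed by
such a zone:

#include <sys/counter.h>

counter_u64_t c;

c = counter_u64_alloc(M_WAITOK);  /* ultimately serviced by uma_pcpu_alloc() */
counter_u64_add(c, 1);            /* touches only the current CPU's page */
printf("%ju\n", (uintmax_t)counter_u64_fetch(c));
counter_u64_free(c);
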
+void
+uma_pcpu_free(void *mem, vm_size_t size, uint8_t flags)
+{
+ void *pcpu_offset;
+ vm_offset_t pcpu_addr;
+ vm_size_t pcpu_size;
+ int i;
+
+ KASSERT(uma_pcpu_bootstrapped,
+ ("%s: not bootstrapped", __func__));
+ KASSERT(size == (mp_maxid + 1) * PAGE_SIZE,
+ ("%s: unexpected free size %#lx", __func__, size));
+
+ pcpu_offset = zpcpu_base_to_offset(mem);
+ pcpu_size = PAGE_SIZE;
+
+ /*
+ * Memory allocated from the bootstrap region remains permanently
+ * allocated.
+ */
+ if ((vm_offset_t)pcpu_offset >= VM_PCPU_BOOTSTRAP_SIZE)
+ for (i = 0; i <= mp_maxid; i++) {
+ pcpu_addr = (vm_offset_t)zpcpu_get_cpu(pcpu_offset, i);
+ kmem_unback(kernel_object, pcpu_addr, pcpu_size);
+ }
+
+ vmem_free(uma_pcpu_arena, (vm_offset_t)mem, pcpu_size);
+}
+
+static int
+pcpu_import(void *arg, vmem_size_t size, int flags, vmem_addr_t *addrp)
+{
+ vm_size_t kvasize, nbpdom;
+
+ nbpdom = (int)(uintptr_t)arg * NBPDR;
+ kvasize = nbpdom * vm_ndomains;
+ return (vmem_xalloc(kernel_arena, kvasize, VM_PCPU_ALIGN, 0, 0,
+ 0, ~(vmem_addr_t)0, M_BESTFIT | flags, addrp));
+}
+
+void
+uma_pcpu_init1(vm_offset_t addr, vm_size_t size)
+{
+ uma_pcpu_bootstrap_addr = addr;
+ uma_pcpu_bootstrap_size = size;
+}
+
+void
+uma_pcpu_init2(int n4kpgpcpu, int n2mpgpdom)
+{
+ vmem_addr_t addr, addr1;
+ vmem_size_t pcpu_size;
+ int error;
+
+ KASSERT(!smp_started, ("%s: called after SMP is started", __func__));
+
+ pcpu_size = PAGE_SIZE;
+
+ uma_pcpu_arena = vmem_create("UMA pcpu arena", 0, 0, pcpu_size, 0,
+ M_WAITOK);
+ vmem_set_import(uma_pcpu_arena, pcpu_import, NULL,
+ (void *)(uintptr_t)n2mpgpdom, ptoa(n4kpgpcpu));
+
+ /*
+ * Add the bootstrap region. Structures allocated during boot may be
+ * freed, for example if a preloaded module is unloaded, so they are
+ * marked here as allocated.
+ */
+ error = vmem_add(uma_pcpu_arena, VM_PCPU_BASE_START, ptoa(n4kpgpcpu),
+ M_WAITOK);
+ if (error != 0)
+ panic("%s: vmem_add() failed: %d", __func__, error);
+ for (addr = VM_PCPU_BASE_START; addr < uma_pcpu_bootstrap_addr;
+ addr += pcpu_size) {
+ error = vmem_xalloc(uma_pcpu_arena, pcpu_size, 0, 0, 0,
+ addr, addr + pcpu_size, M_BESTFIT | M_WAITOK, &addr1);
+ if (error != 0)
+ panic("%s: vmem_xalloc() failed: %d", __func__, error);
+ }
+
+ uma_pcpu_bootstrapped = true;
+}
+
+vm_size_t
+uma_pcpu_bootstrap_used(void)
+{
+ return (uma_pcpu_bootstrap_addr - VM_PCPU_BASE_START);
+}
Index: sys/amd64/include/pcpu.h
===================================================================
--- sys/amd64/include/pcpu.h
+++ sys/amd64/include/pcpu.h
@@ -37,6 +37,7 @@
#include <machine/segments.h>
#include <machine/tss.h>
+#include <machine/vmparam.h>
#define PC_PTI_STACK_SZ 16
@@ -238,11 +239,23 @@
#define PCPU_PTR(member) __PCPU_PTR(pc_ ## member)
#define PCPU_SET(member, val) __PCPU_SET(pc_ ## member, val)
+#define DPCPU_BASE(pc) ((uintptr_t)((struct pcpu *)(pc) + 1))
+
+/*
+ * Kernel modules use a dynamically allocated region in the DPCPU area,
+ * so they must fall back to the indirection through pc_dynamic.
+ */
+#ifndef KLD_MODULE
+#define DPCPU_BASE_OFFSET(pc) (DPCPU_BASE(pc) - DPCPU_START)
+#endif
+
#define IS_BSP() (PCPU_GET(cpuid) == 0)
-#define zpcpu_offset_cpu(cpu) ((uintptr_t)&__pcpu[0] + UMA_PCPU_ALLOC_SIZE * cpu)
-#define zpcpu_base_to_offset(base) (void *)((uintptr_t)(base) - (uintptr_t)&__pcpu[0])
-#define zpcpu_offset_to_base(base) (void *)((uintptr_t)(base) + (uintptr_t)&__pcpu[0])
+#define zpcpu_offset_cpu(cpu) ((uintptr_t)cpuid_to_pcpu[cpu])
+#define zpcpu_base_to_offset(base) ((void *)((uintptr_t)(base) - \
+ (uintptr_t)VM_PCPU_BASE_START))
+#define zpcpu_offset_to_base(base) ((void *)((uintptr_t)(base) + \
+ (uintptr_t)VM_PCPU_BASE_START))
#define zpcpu_sub_protected(base, n) do { \
ZPCPU_ASSERT_PROTECTED(); \
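The new zpcpu macros encode a per-CPU pointer as an offset from
VM_PCPU_BASE_START instead of from __pcpu[0]. A hedged sketch of their use,
with 'base' standing in for a pcpu-zone allocation (e.g. from
uma_zalloc_pcpu()):

void *handle = zpcpu_base_to_offset(base);
uint64_t *mine = zpcpu_get(handle);          /* current CPU's copy */
uint64_t *cpu3 = zpcpu_get_cpu(handle, 3);   /* CPU 3's copy */
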
Index: sys/amd64/include/pcpu_aux.h
===================================================================
--- sys/amd64/include/pcpu_aux.h
+++ sys/amd64/include/pcpu_aux.h
@@ -42,10 +42,7 @@
#endif
/* Required for counters(9) to work on x86. */
-_Static_assert(sizeof(struct pcpu) == UMA_PCPU_ALLOC_SIZE, "fix pcpu size");
-
-extern struct pcpu *__pcpu;
-extern struct pcpu temp_bsp_pcpu;
+_Static_assert(sizeof(struct pcpu) % PAGE_SIZE == 0, "fix pcpu size");
static __inline __pure2 struct thread *
__curthread(void)
Index: sys/amd64/include/vmparam.h
===================================================================
--- sys/amd64/include/vmparam.h
+++ sys/amd64/include/vmparam.h
@@ -78,6 +78,12 @@
*/
#define UMA_MD_SMALL_ALLOC
+/*
+ * We provide a machine specific per-CPU allocator which returns 2MB mappings
+ * when possible.
+ */
+#define UMA_MD_PCPU_ALLOC
+
/*
* The physical address space is densely populated.
*/
@@ -165,7 +171,8 @@
*
* Within the kernel map:
*
- * 0xfffffe0000000000 vm_page_array
+ * 0xfffffe0000000000 bootstrap pcpu region
+ * 0xfffffe0020000000 vm_page_array
* 0xffffffff80000000 KERNBASE
*/
@@ -192,6 +199,13 @@
#define VM_MAX_ADDRESS UPT_MAX_ADDRESS
#define VM_MIN_ADDRESS (0)
+#define VM_PCPU_BASE_START VM_MIN_KERNEL_ADDRESS
+#define VM_PCPU_BASE_SIZE (MAXCPU * NBPDR)
+#define VM_PCPU_BOOTSTRAP_SIZE NBPDR
+#define VM_PCPU_ALIGN NBPDR
+
+#define VM_PAGE_ARRAY_START (VM_PCPU_BASE_START + VM_PCPU_BASE_SIZE)
+
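These constants are consistent with the map comment above: assuming MAXCPU is
256 on amd64, VM_PCPU_BASE_SIZE = 256 * NBPDR = 256 * 2MB = 512MB =
0x20000000, which places VM_PAGE_ARRAY_START at 0xfffffe0000000000 +
0x20000000 = 0xfffffe0020000000.
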
/*
* XXX Allowing dmaplimit == 0 is a temporary workaround for vt(4) efifb's
* early use of PHYS_TO_DMAP before the mapping is actually setup. This works
Index: sys/i386/i386/mp_machdep.c
===================================================================
--- sys/i386/i386/mp_machdep.c
+++ sys/i386/i386/mp_machdep.c
@@ -146,6 +146,9 @@
static char *ap_copyout_buf;
static char *ap_tramp_stack_base;
+
+static void *dpcpu;
+
/*
* Initialize the IPI handlers and start up the AP's.
*/
Index: sys/sys/pcpu.h
===================================================================
--- sys/sys/pcpu.h
+++ sys/sys/pcpu.h
@@ -109,6 +109,10 @@
static t DPCPU_NAME(n) __section(DPCPU_SETNAME) __used
#endif
+#ifndef DPCPU_BASE_OFFSET
+#define DPCPU_BASE_OFFSET(pc) ((pc)->pc_dynamic)
+#endif
+
/*
* Accessors with a given base.
*/
@@ -120,7 +124,7 @@
/*
* Accessors for the current cpu.
*/
-#define DPCPU_PTR(n) _DPCPU_PTR(PCPU_GET(dynamic), n)
+#define DPCPU_PTR(n) _DPCPU_PTR(DPCPU_BASE_OFFSET(get_pcpu()), n)
#define DPCPU_GET(n) (*DPCPU_PTR(n))
#define DPCPU_SET(n, v) (*DPCPU_PTR(n) = v)
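With the static layout, DPCPU_PTR() in the kernel proper reduces to address
arithmetic on get_pcpu(); modules keep the pc_dynamic indirection. An
illustrative use (my_count is a hypothetical DPCPU variable):

DPCPU_DEFINE(u_int, my_count);

static void
bump(void)
{
	DPCPU_SET(my_count, DPCPU_GET(my_count) + 1);
}
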
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -280,11 +280,13 @@
static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
+#ifndef UMA_MD_SMALL_ALLOC
static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
+static void pcpu_page_free(void *, vm_size_t, uint8_t);
+#endif
static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void page_free(void *, vm_size_t, uint8_t);
-static void pcpu_page_free(void *, vm_size_t, uint8_t);
static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
@@ -1514,6 +1516,7 @@
uma_alloc allocf;
uma_slab_t slab;
unsigned long size;
+ int pperslab;
uint8_t *mem;
uint8_t sflags;
int i;
@@ -1569,10 +1572,18 @@
else
slab_tohashslab(slab)->uhs_data = mem;
- if (keg->uk_flags & UMA_ZFLAG_VTOSLAB)
- for (i = 0; i < keg->uk_ppera; i++)
- vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE),
+ if ((keg->uk_flags & UMA_ZFLAG_VTOSLAB) != 0) {
+ /*
+ * Per-CPU slabs have a special layout. Only pages belonging to
+ * the base of the allocation need to be marked, and the slab
+ * may not be contiguous.
+ */
+ pperslab = (keg->uk_flags & UMA_ZONE_PCPU) != 0 ?
+ atop(UMA_PCPU_ALLOC_SIZE) : keg->uk_ppera;
+ for (i = 0; i < pperslab; i++)
+ vsetzoneslab((vm_offset_t)mem + i * PAGE_SIZE,
zone, slab);
+ }
slab->us_freecount = keg->uk_ipers;
slab->us_flags = sflags;
@@ -1701,6 +1712,7 @@
return (p);
}
+#ifndef UMA_MD_PCPU_ALLOC
static void *
pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
int wait)
@@ -1755,6 +1767,7 @@
}
return (NULL);
}
+#endif
/*
* Allocates a number of pages from within an object
@@ -1856,6 +1869,7 @@
kmem_free((vm_offset_t)mem, size);
}
+#ifndef UMA_MD_PCPU_ALLOC
/*
* Frees pcpu zone allocations
*
@@ -1891,7 +1905,7 @@
pmap_qremove(sva, size >> PAGE_SHIFT);
kva_free(sva, size);
}
-
+#endif
/*
* Zero fill initializer
@@ -2243,7 +2257,11 @@
if (booted < BOOT_KVA)
keg->uk_allocf = startup_alloc;
else if (keg->uk_flags & UMA_ZONE_PCPU)
+#ifdef UMA_MD_PCPU_ALLOC
+ keg->uk_allocf = uma_pcpu_alloc;
+#else
keg->uk_allocf = pcpu_page_alloc;
+#endif
else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1)
keg->uk_allocf = contig_alloc;
else
@@ -2254,7 +2272,11 @@
else
#endif
if (keg->uk_flags & UMA_ZONE_PCPU)
+#ifdef UMA_MD_PCPU_ALLOC
+ keg->uk_freef = uma_pcpu_free;
+#else
keg->uk_freef = pcpu_page_free;
+#endif
else
keg->uk_freef = page_free;
@@ -3114,10 +3136,21 @@
if (item == NULL)
return (NULL);
pcpu_item = zpcpu_base_to_offset(item);
- if (flags & M_ZERO) {
+ if ((flags & M_ZERO) != 0) {
#ifdef SMP
- for (i = 0; i <= mp_maxid; i++)
+ for (i = 0; i <= mp_maxid; i++) {
bzero(zpcpu_get_cpu(pcpu_item, i), zone->uz_size);
+#ifdef UMA_MD_PCPU_ALLOC
+ if (__predict_false(booted < BOOT_RUNNING))
+ /*
+ * Only CPU's 0 memory is accessible if the
+ * per-CPU allocator is still being
+ * bootstrapped. The allocator guarantees that
+ * early allocations will be zero-filled.
+ */
+ break;
+#endif
+ }
#else
bzero(item, zone->uz_size);
#endif
Index: sys/vm/uma_int.h
===================================================================
--- sys/vm/uma_int.h
+++ sys/vm/uma_int.h
@@ -664,6 +664,7 @@
uma_reclaim_wakeup();
}
+#ifdef UMA_MD_SMALL_ALLOC
/*
* The following two functions may be defined by architecture specific code
* if they can provide more efficient allocation functions. This is useful
@@ -672,6 +673,19 @@
void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
uint8_t *pflag, int wait);
void uma_small_free(void *mem, vm_size_t size, uint8_t flags);
+#endif
+
+#ifdef UMA_MD_PCPU_ALLOC
+void *uma_pcpu_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
+ uint8_t *pflag, int wait);
+void uma_pcpu_free(void *mem, vm_size_t size, uint8_t flags);
+
+#ifdef __amd64__
+void uma_pcpu_init1(vm_offset_t addr, vm_size_t size);
+void uma_pcpu_init2(int ptpstride, int npdepdom);
+vm_size_t uma_pcpu_bootstrap_used(void);
+#endif
+#endif
/* Set a global soft limit on UMA managed memory. */
void uma_set_limit(unsigned long limit);
Index: sys/vm/vm_kern.c
===================================================================
--- sys/vm/vm_kern.c
+++ sys/vm/vm_kern.c
@@ -767,6 +767,14 @@
/* ... and ending with the completion of the above `insert' */
#ifdef __amd64__
+ /*
+ * Mark the PCPU bootstrap region as allocated. In practice most of
+ * this region will be released back to the VM during boot.
+ */
+ (void)vm_map_insert(m, NULL, 0, VM_PCPU_BASE_START,
+ VM_PCPU_BASE_START + VM_PCPU_BASE_SIZE,
+ VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
+
/*
* Mark KVA used for the page array as allocated. Other platforms
* that handle vm_page_array allocation can simply adjust virtual_avail
Index: sys/x86/include/x86_smp.h
===================================================================
--- sys/x86/include/x86_smp.h
+++ sys/x86/include/x86_smp.h
@@ -29,7 +29,6 @@
extern struct pcb stoppcbs[];
extern int cpu_apic_ids[];
extern int bootAP;
-extern void *dpcpu;
extern char *bootSTK;
extern void *bootstacks[];
extern unsigned int boot_address;
Index: sys/x86/x86/mp_x86.c
===================================================================
--- sys/x86/x86/mp_x86.c
+++ sys/x86/x86/mp_x86.c
@@ -94,7 +94,6 @@
/* Free these after use */
void *bootstacks[MAXCPU];
-void *dpcpu;
struct pcb stoppcbs[MAXCPU];
struct susppcb **susppcbs;
Index: sys/x86/xen/pv.c
===================================================================
--- sys/x86/xen/pv.c
+++ sys/x86/xen/pv.c
@@ -365,7 +365,6 @@
mce_stack = (char *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO);
nmi_stack = (char *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO);
dbg_stack = (void *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO);
- dpcpu = (void *)kmem_malloc(DPCPU_SIZE, M_WAITOK | M_ZERO);
bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 8;
bootAP = cpu;
