D40403.id133461.diff

diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -94,14 +94,16 @@
#define VM_PHYSSEG_MAX 63
/*
- * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
- * from which physical pages are allocated and VM_FREEPOOL_DIRECT is
- * the pool from which physical pages for page tables and small UMA
- * objects are allocated.
+ * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool from
+ * which physical pages are allocated and VM_FREEPOOL_DIRECT is the pool from
+ * which physical pages for page tables and small UMA objects are allocated.
+ * VM_FREEPOOL_LAZYINIT is a special-purpose pool that is populated only during
+ * boot and is used to implement deferred initialization of page structures.
*/
-#define VM_NFREEPOOL 2
-#define VM_FREEPOOL_DEFAULT 0
-#define VM_FREEPOOL_DIRECT 1
+#define VM_NFREEPOOL 3
+#define VM_FREEPOOL_LAZYINIT 0
+#define VM_FREEPOOL_DEFAULT 1
+#define VM_FREEPOOL_DIRECT 2
/*
* Create up to three free page lists: VM_FREELIST_DMA32 is for physical pages
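
Reviewer note: for context on what a pool index selects, vm_phys keeps its buddy queues in a four-dimensional array indexed by domain, freelist, pool, and order, so the new constant simply grows the pool dimension. A minimal sketch of the lookup, assuming the vm_phys_free_queues layout in sys/vm/vm_phys.c (illustrative only, not part of the patch):

    /* Sketch: resolve a (domain, flind, pool, order) tuple to its queue. */
    static struct vm_freelist *
    example_free_queue(int domain, int flind, int pool, int order)
    {
            return (&vm_phys_free_queues[domain][flind][pool][order]);
    }

Every loop in the patch that now starts at vm_default_freepool rather than 0 is walking the pool dimension of this array, skipping the lazy pool once it has been drained.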
diff --git a/sys/arm64/include/vmparam.h b/sys/arm64/include/vmparam.h
--- a/sys/arm64/include/vmparam.h
+++ b/sys/arm64/include/vmparam.h
@@ -73,14 +73,16 @@
#define VM_PHYSSEG_MAX 64
/*
- * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
- * from which physical pages are allocated and VM_FREEPOOL_DIRECT is
- * the pool from which physical pages for small UMA objects are
- * allocated.
+ * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool from
+ * which physical pages are allocated and VM_FREEPOOL_DIRECT is the pool from
+ * which physical pages for page tables and small UMA objects are allocated.
+ * VM_FREEPOOL_LAZYINIT is a special-purpose pool that is populated only during
+ * boot and is used to implement deferred initialization of page structures.
*/
-#define VM_NFREEPOOL 2
-#define VM_FREEPOOL_DEFAULT 0
-#define VM_FREEPOOL_DIRECT 1
+#define VM_NFREEPOOL 3
+#define VM_FREEPOOL_LAZYINIT 0
+#define VM_FREEPOOL_DEFAULT 1
+#define VM_FREEPOOL_DIRECT 2
/*
* Create two free page lists: VM_FREELIST_DMA32 is for physical pages that have
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -333,9 +333,9 @@
if (m == NULL)
return (true); /* page does not exist, no failure */
- vmd = vm_pagequeue_domain(m);
+ vmd = VM_DOMAIN(vm_phys_domain(pa));
vm_domain_free_lock(vmd);
- found = vm_phys_unfree_page(m);
+ found = vm_phys_unfree_page(pa);
vm_domain_free_unlock(vmd);
if (found) {
vm_domain_freecnt_inc(vmd, -1);
@@ -568,6 +568,9 @@
#if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
long ii;
#endif
+#ifdef VM_FREEPOOL_LAZYINIT
+ int lazyinit;
+#endif
vaddr = round_page(vaddr);
@@ -750,6 +753,11 @@
*/
vm_phys_init();
+#ifdef VM_FREEPOOL_LAZYINIT
+ lazyinit = 1;
+ TUNABLE_INT_FETCH("debug.vm.lazy_page_init", &lazyinit);
+#endif
+
/*
* Initialize the page structures and add every available page to the
* physical memory allocator's free lists.
@@ -765,9 +773,50 @@
vm_cnt.v_page_count = 0;
for (segind = 0; segind < vm_phys_nsegs; segind++) {
seg = &vm_phys_segs[segind];
- for (m = seg->first_page, pa = seg->start; pa < seg->end;
- m++, pa += PAGE_SIZE)
- vm_page_init_page(m, pa, segind, VM_FREEPOOL_DEFAULT);
+
+ /*
+ * If lazy vm_page initialization is not enabled, simply
+ * initialize all of the pages in the segment. Otherwise, we
+ * only initialize:
+ * 1. Pages not covered by phys_avail[], since they might be
+ * freed to the allocator at some future point, e.g., by
+ * kmem_bootstrap_free().
+ * 2. The first page of each run of free pages handed to the
+ * vm_phys allocator, which in turn defers initialization
+ * of pages until they are needed.
+ * This avoids blocking the boot process for long periods, which
+ * may be relevant for VMs (which ought to boot as quickly as
+ * possible) and/or systems with large amounts of physical
+ * memory.
+ */
+#ifdef VM_FREEPOOL_LAZYINIT
+ if (lazyinit) {
+ startp = seg->start;
+ for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+ if (startp >= seg->end)
+ break;
+
+ if (phys_avail[i + 1] < startp)
+ continue;
+ if (phys_avail[i] <= startp) {
+ startp = phys_avail[i + 1];
+ continue;
+ }
+
+ m = &seg->first_page[atop(startp - seg->start)];
+ for (endp = MIN(phys_avail[i], seg->end);
+ startp < endp; startp += PAGE_SIZE, m++) {
+ vm_page_init_page(m, startp, segind,
+ VM_FREEPOOL_DEFAULT);
+ }
+
+ /*
+ * Skip the run covered by this phys_avail[] range; its
+ * pages are initialized lazily when enqueued below.
+ */
+ startp = phys_avail[i + 1];
+ }
+ } else
+#endif
+ for (m = seg->first_page, pa = seg->start;
+ pa < seg->end; m++, pa += PAGE_SIZE) {
+ vm_page_init_page(m, pa, segind,
+ VM_FREEPOOL_DEFAULT);
+ }
/*
* Add the segment's pages that are covered by one of
@@ -785,6 +834,12 @@
continue;
m = seg->first_page + atop(startp - seg->start);
+#ifdef VM_FREEPOOL_LAZYINIT
+ if (lazyinit) {
+ vm_page_init_page(m, startp, segind,
+ VM_FREEPOOL_LAZYINIT);
+ }
+#endif
vmd = VM_DOMAIN(seg->domain);
vm_domain_free_lock(vmd);
vm_phys_enqueue_contig(m, pagecount);
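
Reviewer note: the interval walk above is easier to follow outside diff context. This userland sketch mirrors its control flow, including the advance past each covered run, using made-up addresses (only the traversal logic comes from the patch; the segment bounds and phys_avail contents are hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical segment [0, 0xa000) and phys_avail pairs, zero-terminated. */
    static uint64_t phys_avail[] = { 0x2000, 0x6000, 0x8000, 0xa000, 0, 0 };

    int
    main(void)
    {
            uint64_t seg_start = 0x0, seg_end = 0xa000;
            uint64_t startp = seg_start, endp;

            for (int i = 0; phys_avail[i + 1] != 0; i += 2) {
                    if (startp >= seg_end)
                            break;
                    if (phys_avail[i + 1] < startp)
                            continue;
                    if (phys_avail[i] <= startp) {
                            startp = phys_avail[i + 1];
                            continue;
                    }
                    /* Pages in [startp, endp) would be initialized eagerly. */
                    endp = phys_avail[i] < seg_end ? phys_avail[i] : seg_end;
                    printf("eager: [%#jx, %#jx)\n", (uintmax_t)startp,
                        (uintmax_t)endp);
                    startp = phys_avail[i + 1];     /* skip the covered run */
            }
            return (0);
    }

With these values the walk prints the two gaps, [0, 0x2000) and [0x6000, 0x8000); the runs covered by phys_avail[] are instead handed to vm_phys with only their first page initialized. To compare boot behavior with the optimization disabled, the tunable fetched in this hunk can be set from the loader:

    set debug.vm.lazy_page_init=0
    boot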
diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h
--- a/sys/vm/vm_phys.h
+++ b/sys/vm/vm_phys.h
@@ -79,7 +79,7 @@
vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
void vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
int *locality);
-bool vm_phys_unfree_page(vm_page_t m);
+bool vm_phys_unfree_page(vm_paddr_t pa);
int vm_phys_mem_affinity(int f, int t);
void vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end);
vm_paddr_t vm_phys_early_alloc(int domain, size_t alloc_size);
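
Reviewer note: because callers now pass a bare physical address, vm_phys has to map the address back to its segment itself; the vm_phys.c hunk below does this with vm_phys_paddr_to_seg() at the top of vm_phys_unfree_page(). A linear-scan sketch of such a lookup, for illustration only (the in-tree helper may be implemented differently):

    /* Sketch: find the vm_phys segment containing a physical address. */
    static struct vm_phys_seg *
    paddr_to_seg_sketch(vm_paddr_t pa)
    {
            for (int i = 0; i < vm_phys_nsegs; i++) {
                    if (pa >= vm_phys_segs[i].start && pa < vm_phys_segs[i].end)
                            return (&vm_phys_segs[i]);
            }
            return (NULL);
    }

The payoff of the new signature is that the blacklist caller in the first vm_page.c hunk no longer derives the domain from a struct vm_page that might not be initialized yet.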
diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c
--- a/sys/vm/vm_phys.c
+++ b/sys/vm/vm_phys.c
@@ -47,14 +47,18 @@
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
+#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
+#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
+#include <sys/tslog.h>
+#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <ddb/ddb.h>
@@ -141,6 +145,7 @@
* Provides the mapping from VM_FREELIST_* to free list indices (flind).
*/
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
+static int __read_mostly vm_default_freepool;
CTASSERT(VM_FREELIST_DEFAULT == 0);
@@ -184,6 +189,16 @@
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
int order, int tail);
+static bool __diagused
+vm_phys_pool_valid(int pool)
+{
+#ifdef VM_FREEPOOL_LAZYINIT
+ if (pool == VM_FREEPOOL_LAZYINIT)
+ return (false);
+#endif
+ return (pool >= 0 && pool < VM_NFREEPOOL);
+}
+
/*
* Red-black tree helpers for vm fictitious range management.
*/
@@ -620,6 +635,12 @@
}
}
+#ifdef VM_FREEPOOL_LAZYINIT
+ vm_default_freepool = VM_FREEPOOL_LAZYINIT;
+#else
+ vm_default_freepool = VM_FREEPOOL_DEFAULT;
+#endif
+
rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}
@@ -690,6 +711,17 @@
("%s: invalid order %d", __func__, order));
vm_freelist_add(fl, m, order, tail);
+#ifdef VM_FREEPOOL_LAZYINIT
+ if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
+ vm_page_t m_next;
+ vm_paddr_t pa;
+ int npages;
+
+ /*
+ * Initialize the first page of the following run, unless
+ * this chunk abuts the end of its segment, in which case
+ * there is no following page to initialize.
+ */
+ npages = 1 << order;
+ m_next = m + npages;
+ pa = m->phys_addr + ptoa(npages);
+ if (pa < vm_phys_segs[m->segind].end) {
+ vm_page_init_page(m_next, pa, m->segind,
+ VM_FREEPOOL_LAZYINIT);
+ }
+ }
+#endif
}
/*
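
Reviewer note: the block above preserves the lazy pool's central invariant: the first page of every run on the free queues must be initialized, because the allocator inspects fields such as m->pool and m->order through it. Enqueueing a chunk out of a lazily initialized region therefore initializes the head of whatever remains. Schematically (an illustration, not patch content):

    /*
     * Enqueueing an order-k chunk from a lazily initialized region:
     *
     *   before:  [ m ................ uninitialized .............. ]
     *             ^ head initialized at boot by vm_page_startup()
     *
     *   after:   [ m ... 2^k pages ... ][ m + 2^k ................ ]
     *             ^ on a free queue      ^ newly initialized head
     */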
@@ -761,15 +793,33 @@
}
/*
- * Set the pool for a contiguous, power of two-sized set of physical pages.
+ * Set the pool for a contiguous, power of two-sized set of physical pages.
+ *
+ * If the pages currently belong to the lazy init pool, then the corresponding
+ * page structures must be initialized. In this case it is assumed that the
+ * first page in the run has already been initialized.
*/
static void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
- vm_page_t m_tmp;
-
- for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
- m_tmp->pool = pool;
+#ifdef VM_FREEPOOL_LAZYINIT
+ if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
+ vm_paddr_t pa;
+ int segind;
+
+ m->pool = pool;
+
+ TSENTER();
+ pa = m->phys_addr + PAGE_SIZE;
+ segind = m->segind;
+ for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
+ m_tmp++, pa += PAGE_SIZE)
+ vm_page_init_page(m_tmp, pa, segind, pool);
+ TSEXIT();
+ } else
+#endif
+ for (vm_page_t m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
+ m_tmp->pool = pool;
}
/*
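
Reviewer note: the TSENTER()/TSEXIT() pair brackets the expensive branch so that its cost is visible in TSLOG traces. For a rough sense of scale (back-of-the-envelope figures, assuming amd64's VM_NFREEORDER of 13 and the 104-byte struct vm_page that vm_page.c statically asserts on 64-bit platforms):

    /*
     * pages in a max-order (12) chunk:   1 << 12      = 4096
     * structures initialized lazily:     4096 - 1     = 4095
     * memory written for one chunk:      4095 * 104 B ~ 416 KB
     */

Deferring that work to first allocation, and to the idle-priority thread added below, is what keeps it off the boot-time critical path.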
@@ -793,7 +843,7 @@
KASSERT(domain >= 0 && domain < vm_ndomains,
("vm_phys_alloc_npages: domain %d is out of range", domain));
- KASSERT(pool < VM_NFREEPOOL,
+ KASSERT(vm_phys_pool_valid(pool),
("vm_phys_alloc_npages: pool %d is out of range", pool));
KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
("vm_phys_alloc_npages: npages %d is out of range", npages));
@@ -822,7 +872,8 @@
}
}
for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
- for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
+ pind++) {
alt = vm_phys_free_queues[domain][flind][pind];
while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
NULL) {
@@ -889,7 +940,7 @@
KASSERT(freelist < VM_NFREELIST,
("vm_phys_alloc_freelist_pages: freelist %d is out of range",
freelist));
- KASSERT(pool < VM_NFREEPOOL,
+ KASSERT(vm_phys_pool_valid(pool),
("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
KASSERT(order < VM_NFREEORDER,
("vm_phys_alloc_freelist_pages: order %d is out of range", order));
@@ -918,7 +969,7 @@
* use them to satisfy the allocation.
*/
for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
- for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
alt = &vm_phys_free_queues[domain][flind][pind][0];
m = TAILQ_FIRST(&alt[oind].pl);
if (m != NULL) {
@@ -1158,7 +1209,7 @@
KASSERT(m->order == VM_NFREEORDER,
("vm_phys_free_pages: page %p has unexpected order %d",
m, m->order));
- KASSERT(m->pool < VM_NFREEPOOL,
+ KASSERT(vm_phys_pool_valid(m->pool),
("vm_phys_free_pages: page %p has unexpected pool %d",
m, m->pool));
KASSERT(order < VM_NFREEORDER,
@@ -1187,6 +1238,107 @@
vm_freelist_add(fl, m, order, 1);
}
+#ifdef VM_FREEPOOL_LAZYINIT
+/*
+ * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
+ * them to the default pool. This is a prerequisite for some rare operations
+ * which need to scan the page array and thus depend on all pages being
+ * initialized.
+ */
+static void
+vm_phys_lazy_init_domain(int domain, bool locked)
+{
+ static bool initdone[MAXMEMDOM];
+ struct vm_domain *vmd;
+ struct vm_freelist *fl;
+ vm_page_t m;
+ int pind;
+ bool unlocked;
+
+ if (__predict_true(atomic_load_bool(&initdone[domain])))
+ return;
+
+ vmd = VM_DOMAIN(domain);
+ if (locked)
+ vm_domain_free_assert_locked(vmd);
+ else
+ vm_domain_free_lock(vmd);
+ if (atomic_load_bool(&initdone[domain]))
+ goto out;
+ pind = VM_FREEPOOL_LAZYINIT;
+ for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
+ int flind;
+
+ flind = vm_freelist_to_flind[freelist];
+ if (flind < 0)
+ continue;
+ fl = vm_phys_free_queues[domain][flind][pind];
+ for (int oind = 0; oind < VM_NFREEORDER; oind++) {
+ if (atomic_load_int(&fl[oind].lcnt) == 0)
+ continue;
+ while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
+ /*
+ * Avoid holding the lock across the
+ * initialization unless there's a free page
+ * shortage.
+ */
+ vm_freelist_rem(fl, m, oind);
+ unlocked = vm_domain_allocate(vmd,
+ VM_ALLOC_NORMAL, 1 << oind);
+ if (unlocked)
+ vm_domain_free_unlock(vmd);
+ vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
+ if (unlocked) {
+ vm_domain_freecnt_inc(vmd, 1 << oind);
+ vm_domain_free_lock(vmd);
+ }
+ vm_phys_free_pages(m, oind);
+ }
+ }
+ }
+ atomic_store_bool(&initdone[domain], true);
+out:
+ if (!locked)
+ vm_domain_free_unlock(vmd);
+}
+
+static void
+vm_phys_lazy_init(void)
+{
+ for (int domain = 0; domain < vm_ndomains; domain++)
+ vm_phys_lazy_init_domain(domain, false);
+ atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
+}
+
+static void
+vm_phys_lazy_init_kthr(void *arg __unused)
+{
+ vm_phys_lazy_init();
+ kthread_exit();
+}
+
+static void
+vm_phys_lazy_sysinit(void *arg __unused)
+{
+ struct thread *td;
+ int error;
+
+ error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
+ RFSTOPPED, 0, "vmlazyinit");
+ if (error == 0) {
+ thread_lock(td);
+ sched_prio(td, PRI_MIN_IDLE);
+ sched_add(td, SRQ_BORING);
+ } else {
+ printf("%s: could not create lazy init thread: %d\n",
+ __func__, error);
+ vm_phys_lazy_init();
+ }
+}
+SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
+ NULL);
+#endif /* VM_FREEPOOL_LAZYINIT */
+
/*
* Free a contiguous, arbitrarily sized set of physical pages, without
* merging across set boundaries.
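
Reviewer note: vm_phys_lazy_init_domain() above uses a double-checked pattern around initdone[]: an unlocked atomic load short-circuits the common case, and the flag is re-read under the domain free lock before any draining starts. Reduced to its skeleton (names abbreviated; a sketch of the pattern, not the patch code):

    if (__predict_true(atomic_load_bool(&done)))
            return;                         /* fast path: no lock taken */
    lock();
    if (!atomic_load_bool(&done)) {         /* re-check under the lock */
            drain_lazy_pool();
            atomic_store_bool(&done, true);
    }
    unlock();

Note also the lock juggling inside the drain loop: the lock is dropped around vm_phys_set_pool() whenever vm_domain_allocate() succeeds, so a long initialization pass does not stall other free-queue operations; it is held throughout only under a free-page shortage.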
@@ -1292,6 +1444,12 @@
pa_end = MIN(high, seg->end);
if (pa_end - pa_start < ptoa(npages))
continue;
+#ifdef VM_FREEPOOL_LAZYINIT
+ /*
+ * The pages on the free lists must be initialized.
+ */
+ vm_phys_lazy_init_domain(domain, false);
+#endif
bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
bounds[1] = vm_phys_seg_paddr_to_vm_page(seg, pa_end);
return (seg - vm_phys_segs);
@@ -1307,21 +1465,30 @@
* The free page queues must be locked.
*/
bool
-vm_phys_unfree_page(vm_page_t m)
+vm_phys_unfree_page(vm_paddr_t pa)
{
struct vm_freelist *fl;
struct vm_phys_seg *seg;
- vm_paddr_t pa, pa_half;
- vm_page_t m_set, m_tmp;
+ vm_paddr_t pa_half;
+ vm_page_t m, m_set, m_tmp;
int order;
+ seg = vm_phys_paddr_to_seg(pa);
+ vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
+
+ /*
+ * The pages on the free lists must be initialized.
+ */
+#ifdef VM_FREEPOOL_LAZYINIT
+ vm_phys_lazy_init_domain(seg->domain, true);
+#endif
+
/*
* First, find the contiguous, power of two-sized set of free
- * physical pages containing the given physical page "m" and
+ * physical pages containing the given physical page "pa" and
* assign it to "m_set".
*/
- seg = &vm_phys_segs[m->segind];
- vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
+ m = vm_phys_paddr_to_vm_page(pa);
for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
order < VM_NFREEORDER - 1; ) {
order++;
@@ -1460,7 +1627,7 @@
/* Search for a large enough free block. */
size = npages << PAGE_SHIFT;
for (oind = order; oind < VM_NFREEORDER; oind++) {
- for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
fl = (*queues)[pind];
TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
/*
@@ -1480,7 +1647,7 @@
if (order < VM_NFREEORDER)
return (NULL);
/* Search for a long-enough sequence of max-order blocks. */
- for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
fl = (*queues)[pind];
m_ret = vm_phys_find_freelist_contig(fl, npages,
low, high, alignment, boundary);
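
Reviewer note on observability: TSENTER() and TSEXIT() expand to nothing unless the kernel is built with the TSLOG option, so the new timestamps cost nothing in production kernels. A possible workflow for inspecting them (the option name is real; treat the exact extraction commands as an assumption):

    # kernel configuration
    options         TSLOG

    # after boot, dump the timestamp records for post-processing
    sysctl debug.tslog > /tmp/tslog.txt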
