Page MenuHomeFreeBSD

D44174.diff
No OneTemporary

D44174.diff

diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h
--- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h
+++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h
@@ -286,6 +286,12 @@
extern uint_t zfs_fsyncer_key;
extern int zfs_super_owner;
+extern uint64_t zfs_znode_count;
+extern uint64_t zfs_znode_inuse_count;
+extern wmsum_t zfs_znode_pruning_requested;
+extern wmsum_t zfs_znode_pruning_skipped;
+extern wmsum_t zfs_znode_pruning_withwaiter;
+extern wmsum_t zfs_znode_pruning_withwaiter_throttled;
extern void zfs_init(void);
extern void zfs_fini(void);
diff --git a/sys/contrib/openzfs/include/sys/arc.h b/sys/contrib/openzfs/include/sys/arc.h
--- a/sys/contrib/openzfs/include/sys/arc.h
+++ b/sys/contrib/openzfs/include/sys/arc.h
@@ -321,6 +321,7 @@
void arc_set_limits(uint64_t);
void arc_init(void);
void arc_fini(void);
+boolean_t arc_is_waiting_evict(void);
/*
* Level 2 ARC
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
@@ -51,6 +51,7 @@
#include <machine/vmparam.h>
#include <sys/vm.h>
#include <sys/vmmeter.h>
+#include <vm/vm_pageout.h>
extern struct vfsops zfs_vfsops;
@@ -146,18 +147,45 @@
static eventhandler_tag arc_event_lowmem = NULL;
+/*
+ * The vm_lowmem event counters.
+ */
+wmsum_t zfs_arc_vm_lowmem_events;
+wmsum_t zfs_arc_vm_lowmem_kmem;
+wmsum_t zfs_arc_vm_lowmem_pages;
+wmsum_t zfs_arc_vm_lowmem_nofree;
+wmsum_t zfs_arc_vm_lowmem_pagedaemon;
+
static void
-arc_lowmem(void *arg __unused, int howto __unused)
+arc_lowmem(void *arg __unused, int howto)
{
int64_t free_memory, to_free;
+ wmsum_add(&zfs_arc_vm_lowmem_events, 1);
+ switch (howto) {
+ case VM_LOW_KMEM:
+ wmsum_add(&zfs_arc_vm_lowmem_kmem, 1);
+ break;
+
+ case VM_LOW_PAGES:
+ wmsum_add(&zfs_arc_vm_lowmem_pages, 1);
+ break;
+
+ default:
+ break;
+ }
+ if (curproc == pageproc)
+ wmsum_add(&zfs_arc_vm_lowmem_pagedaemon, 1);
+
arc_no_grow = B_TRUE;
arc_warm = B_TRUE;
arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
free_memory = arc_available_memory();
int64_t can_free = arc_c - arc_c_min;
- if (can_free <= 0)
+ if (can_free <= 0) {
+ wmsum_add(&zfs_arc_vm_lowmem_nofree, 1);
return;
+ }
to_free = (can_free >> arc_shrink_shift) - MIN(free_memory, 0);
DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
arc_reduce_target_size(to_free);
@@ -174,6 +202,11 @@
void
arc_lowmem_init(void)
{
+ wmsum_init(&zfs_arc_vm_lowmem_events, 0);
+ wmsum_init(&zfs_arc_vm_lowmem_kmem, 0);
+ wmsum_init(&zfs_arc_vm_lowmem_pages, 0);
+ wmsum_init(&zfs_arc_vm_lowmem_nofree, 0);
+ wmsum_init(&zfs_arc_vm_lowmem_pagedaemon, 0);
arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
EVENTHANDLER_PRI_FIRST);
}
@@ -183,6 +216,11 @@
{
if (arc_event_lowmem != NULL)
EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+ wmsum_fini(&zfs_arc_vm_lowmem_events);
+ wmsum_fini(&zfs_arc_vm_lowmem_kmem);
+ wmsum_fini(&zfs_arc_vm_lowmem_pages);
+ wmsum_fini(&zfs_arc_vm_lowmem_nofree);
+ wmsum_fini(&zfs_arc_vm_lowmem_pagedaemon);
}
void
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -124,6 +124,7 @@
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, znode, CTLFLAG_RW, 0, "ZFS znode");
SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0,
"ZFS livelist condense");
@@ -471,6 +472,35 @@
"size of l2c_only state");
/* END CSTYLED */
+/* arc_os.c */
+
+extern counter_u64_t zfs_arc_vm_lowmem_events;
+extern counter_u64_t zfs_arc_vm_lowmem_kmem;
+extern counter_u64_t zfs_arc_vm_lowmem_pages;
+extern counter_u64_t zfs_arc_vm_lowmem_nofree;
+extern counter_u64_t zfs_arc_vm_lowmem_pagedaemon;
+
+SYSCTL_NODE(_vfs_zfs_arc, OID_AUTO, vm_lowmem, CTLFLAG_RW, 0,
+ "vm_lowmem kernel event received by ARC");
+
+/* BEGIN CSTYLED */
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, events,
+ CTLFLAG_RD, &zfs_arc_vm_lowmem_events,
+ "total vm_lowmem events");
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, kmem,
+ CTLFLAG_RD, &zfs_arc_vm_lowmem_kmem,
+ "low kernel memory events");
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, pages,
+ CTLFLAG_RD, &zfs_arc_vm_lowmem_pages,
+ "low page events");
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, nofree,
+ CTLFLAG_RD, &zfs_arc_vm_lowmem_nofree,
+ "ARC memory not freed");
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, pagedaemon,
+ CTLFLAG_RD, &zfs_arc_vm_lowmem_pagedaemon,
+ "calls by pagedaemon");
+/* END CSTYLED */
+
/* dbuf.c */
/* dmu.c */
@@ -870,3 +900,43 @@
CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
"Exclude metadata buffers from dumps as well");
/* END CSTYLED */
+
+/* zfs_vfsops.c */
+
+static int
+param_get_znode_prunable_count(SYSCTL_HANDLER_ARGS)
+{
+ int64_t val;
+ uint64_t count, inuse;
+
+ count = atomic_load_acq_64(&zfs_znode_count);
+ inuse = atomic_load_acq_64(&zfs_znode_inuse_count);
+
+ val = count - inuse;
+ return (sysctl_handle_64(oidp, &val, 0, req));
+}
+
+/* BEGIN CSTYLED */
+SYSCTL_UQUAD(_vfs_zfs_znode, OID_AUTO, count,
+ CTLFLAG_RD, &zfs_znode_count, 0,
+ "number of zfs vnodes");
+SYSCTL_UQUAD(_vfs_zfs_znode, OID_AUTO, inuse,
+ CTLFLAG_RD, &zfs_znode_inuse_count, 0,
+ "number of zfs vnodes in use");
+SYSCTL_PROC(_vfs_zfs_znode, OID_AUTO, prunable,
+ CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ NULL, 0, param_get_znode_prunable_count, "Q",
+ "number of ARC-prunable zfs vnodes");
+SYSCTL_COUNTER_U64(_vfs_zfs_znode, OID_AUTO, pruning_requested,
+ CTLFLAG_RD, &zfs_znode_pruning_requested,
+ "number of ARC pruning requests");
+SYSCTL_COUNTER_U64(_vfs_zfs_znode, OID_AUTO, pruning_skipped,
+ CTLFLAG_RD, &zfs_znode_pruning_skipped,
+ "number of ARC pruning skips");
+SYSCTL_COUNTER_U64(_vfs_zfs_znode, OID_AUTO, pruning_withwaiter,
+ CTLFLAG_RD, &zfs_znode_pruning_withwaiter,
+ "number of ARC pruning executed due to waiters");
+SYSCTL_COUNTER_U64(_vfs_zfs_znode, OID_AUTO, pruning_withwaiter_throttled,
+ CTLFLAG_RD, &zfs_znode_pruning_withwaiter_throttled,
+ "number of ARC pruning with waiters, throttled");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -185,6 +185,11 @@
return (error);
}
+ /*
+	 * Do not account for the ZFS sfs (.zfs control directory) vnodes here;
+	 * such vnodes are not subject to ARC pruning.
+ */
+
/*
* Exclusively lock the vnode vnode while it's being constructed.
*/
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -40,6 +40,7 @@
#include <sys/vfs.h>
#include <sys/mntent.h>
#include <sys/mount.h>
+#include <sys/arc_impl.h>
#include <sys/cmn_err.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vnops.h>
@@ -167,6 +168,36 @@
*/
static uint32_t zfs_active_fs_count = 0;
+/*
+ * The counts of znodes and of znodes in use (vp->v_usecount > 0).
+ * They are used to estimate the number of ARC-prunable [vz]nodes and
+ * dnodes.
+ */
+uint64_t zfs_znode_count;
+uint64_t zfs_znode_inuse_count;
+
+/*
+ * The stats of the ARC pruning.
+ *
+ * - zfs_znode_pruning_requested
+ *   The number of ARC pruning requests.
+ *
+ * - zfs_znode_pruning_skipped
+ *   The number of pruning attempts skipped because the prunable znodes
+ *   do not cover the requested amount.
+ *
+ * - zfs_znode_pruning_withwaiter
+ *   The number of pruning attempts executed because at least one thread
+ *   is waiting for ARC eviction.
+ *
+ * - zfs_znode_pruning_withwaiter_throttled
+ *   The number of pruning attempts not boosted due to the rate limit.
+ */
+wmsum_t zfs_znode_pruning_requested;
+wmsum_t zfs_znode_pruning_skipped;
+wmsum_t zfs_znode_pruning_withwaiter;
+wmsum_t zfs_znode_pruning_withwaiter_throttled;
+
int
zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
char *setpoint)
@@ -1208,6 +1239,9 @@
#if defined(_KERNEL) && !defined(KMEM_DEBUG)
vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
#endif
+
+ vfsp->mnt_fsvninusep = &zfs_znode_inuse_count;
+
/*
* The fsid is 64 bits, composed of an 8-bit fs type, which
* separates our fsid from any other filesystem types, and a
@@ -2077,17 +2111,102 @@
static arc_prune_t *zfs_prune;
static void
-zfs_prune_task(uint64_t nr_to_scan, void *arg __unused)
+zfs_prune_task(uint64_t dn_to_scan, void *arg __unused)
{
- if (nr_to_scan > INT_MAX)
- nr_to_scan = INT_MAX;
+ boolean_t update_ts_last_withwaiter;
+ int64_t zn_prunable, dn_total, zn_delta;
+ uint64_t zn_total, zn_inuse, zn_to_scan;
+ struct timespec ts_now, ts_delta;
+ static struct timespec ts_last_withwaiter;
+ static const struct timespec ts_pause_withwaiter =
+ {.tv_sec = 1, .tv_nsec = 0};
+
+ wmsum_add(&zfs_znode_pruning_requested, 1);
+
+ zn_total = atomic_load_acq_64(&zfs_znode_count);
+ zn_inuse = atomic_load_acq_64(&zfs_znode_inuse_count);
+
+ /*
+	 * Work around an in-use counter error that may happen under heavy load.
+	 *
+	 * Fix the in-use counter value only when the counters are stable, i.e.
+	 * their values do not change across multiple reads. Otherwise, defer
+	 * the fix until the next pruning attempt.
+ */
+ if (__predict_false(zn_total < zn_inuse))
+ zn_delta = zn_inuse - zn_total;
+ else if (__predict_false(((int64_t)zn_inuse) < 0))
+ zn_delta = (int64_t)zn_inuse;
+ else
+ zn_delta = 0;
+
+ if (__predict_false(0 != zn_delta)) {
+ if (zn_total == atomic_load_64(&zfs_znode_count)) {
+ if (atomic_cmpset_64(&zfs_znode_inuse_count, zn_inuse,
+ zn_inuse - zn_delta)) {
+ if (__predict_false(
+ zn_total != atomic_load_64(&zfs_znode_count))) {
+ atomic_add_64(&zfs_znode_inuse_count, zn_delta);
+ }
+ }
+ }
+ }
+
+ zn_prunable = zn_total - zn_inuse - zn_delta;
+
+ /*
+	 * Scale the requested number of prunable dnodes into znodes, using the
+	 * total counts of znodes and dnodes. A znode may span multiple dnodes,
+	 * but a precise estimate of the span is both complicated and opaque to
+	 * the znode and vnode layers.
+	 *
+	 * Assume the znode and dnode counts fit within a 32-bit integer.
+	 * NOTE(review): confirm dn_total cannot be zero before the division below.
+ */
+ zn_to_scan = dn_to_scan * zn_total;
+ dn_total = wmsum_value(&arc_sums.arcstat_dnode_size) / sizeof(dnode_t);
+ zn_to_scan /= dn_total;
+
+ update_ts_last_withwaiter = B_FALSE;
+
+ if (arc_is_waiting_evict()) {
+ /*
+		 * Someone is waiting for ARC eviction. Prune everything unless
+		 * there are no prunable vnodes at all.
+		 *
+		 * Limit the rate to at most 1 Hz because this eviction makes
+		 * vnode allocation very expensive.
+ */
+ wmsum_add(&zfs_znode_pruning_withwaiter, 1);
+ getnanotime(&ts_now);
+ timespecsub(&ts_now, &ts_last_withwaiter, &ts_delta);
+ if (timespeccmp(&ts_delta, &ts_pause_withwaiter, >=)) {
+ if (zn_prunable < zn_to_scan)
+ zn_to_scan = zn_prunable;
+ update_ts_last_withwaiter = B_TRUE;
+ } else
+ wmsum_add(&zfs_znode_pruning_withwaiter_throttled, 1);
+ }
+ if ((zn_prunable < zn_to_scan) || (0 == zn_to_scan)) {
+ wmsum_add(&zfs_znode_pruning_skipped, 1);
+ return;
+ }
+
+ if (zn_to_scan > INT_MAX)
+ zn_to_scan = INT_MAX;
+
+ if (zn_to_scan > 0) {
#if __FreeBSD_version >= 1300139
- sx_xlock(&zfs_vnlru_lock);
- vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker);
- sx_xunlock(&zfs_vnlru_lock);
+ sx_xlock(&zfs_vnlru_lock);
+ vnlru_free_vfsops(zn_to_scan, &zfs_vfsops, zfs_vnlru_marker);
+ sx_xunlock(&zfs_vnlru_lock);
#else
- vnlru_free(nr_to_scan, &zfs_vfsops);
+ vnlru_free(zn_to_scan, &zfs_vfsops);
#endif
+ }
+
+ if (update_ts_last_withwaiter)
+ getnanotime(&ts_last_withwaiter);
}
void
@@ -2113,6 +2232,11 @@
*/
zfs_vnodes_adjust();
+ wmsum_init(&zfs_znode_pruning_requested, 0);
+ wmsum_init(&zfs_znode_pruning_skipped, 0);
+ wmsum_init(&zfs_znode_pruning_withwaiter, 0);
+ wmsum_init(&zfs_znode_pruning_withwaiter_throttled, 0);
+
dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
@@ -2133,6 +2257,11 @@
sx_destroy(&zfs_vnlru_lock);
#endif
+ wmsum_fini(&zfs_znode_pruning_requested);
+ wmsum_fini(&zfs_znode_pruning_skipped);
+ wmsum_fini(&zfs_znode_pruning_withwaiter);
+ wmsum_fini(&zfs_znode_pruning_withwaiter_throttled);
+
taskq_destroy(zfsvfs_taskq);
zfsctl_fini();
zfs_znode_fini();
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -82,6 +82,7 @@
#include <sys/vmmeter.h>
#include <vm/vm_param.h>
#include <sys/zil.h>
+#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/module.h>
#include <sys/sysent.h>
@@ -5171,6 +5172,7 @@
ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
vp->v_data = NULL;
+ atomic_subtract_rel_64(&zfs_znode_count, 1);
return (0);
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
@@ -50,6 +50,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
+#include <sys/zfs_vfsops.h>
#include <sys/dnode.h>
#include <sys/fs/zfs.h>
#endif /* _KERNEL */
@@ -547,6 +548,12 @@
if (vp->v_type != VFIFO)
VN_LOCK_ASHARE(vp);
+ atomic_add_rel_64(&zfs_znode_count, 1);
+ /*
+	 * Defer the increment of zfs_znode_inuse_count until vp is inserted
+	 * into the mount point (insmntque()).
+ */
+
return (zp);
}
@@ -827,6 +834,7 @@
vp->v_vflag &= ~VV_FORCEINSMQ;
(void) err;
KASSERT(err == 0, ("insmntque() failed: error %d", err));
+ atomic_add_rel_64(&zfs_znode_inuse_count, 1);
}
kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
@@ -1056,6 +1064,7 @@
if (err == 0) {
vp->v_hash = obj_num;
VOP_UNLOCK1(vp);
+ atomic_add_rel_64(&zfs_znode_inuse_count, 1);
} else {
zp->z_vnode = NULL;
zfs_znode_dmu_fini(zp);
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -7761,6 +7761,18 @@
ASSERT0(arc_loaned_bytes);
}
+boolean_t
+arc_is_waiting_evict(void)
+{
+ boolean_t is_empty;
+
+ mutex_enter(&arc_evict_lock);
+ is_empty = list_is_empty(&arc_evict_waiters);
+ mutex_exit(&arc_evict_lock);
+
+ return (!is_empty);
+}
+
/*
* Level 2 ARC
*
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1313,11 +1313,74 @@
}
static int max_free_per_call = 10000;
+static bool recycle_vnode_bufs_pages = true;
+static bool recycle_vnode_nc_src = true;
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_free_per_call, 0,
"limit on vnode free requests per call to the vnlru_free routine (legacy)");
SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, max_free_per_call, CTLFLAG_RW,
&max_free_per_call, 0,
"limit on vnode free requests per call to the vnlru_free routine");
+SYSCTL_BOOL(_vfs_vnode_vnlru, OID_AUTO, recycle_bufs_pages, CTLFLAG_RW,
+ &recycle_vnode_bufs_pages, 0,
+ "enable recycling vnodes with clean buffers and clean/dirty VM pages");
+SYSCTL_BOOL(_vfs_vnode_vnlru, OID_AUTO, recycle_nc_src, CTLFLAG_RW,
+ &recycle_vnode_nc_src, 0,
+ "enable recycling vnodes acting as namecache source");
+
+/*
+ * Count the hold sources on a regular file vnode.
+ */
+static void
+vnlru_count_hold_sources_reg(struct vnode * restrict vp,
+ int * restrict vn_holdcnt,
+ int * restrict cleanbuf_holdcnt,
+ int * restrict dirtybuf_holdcnt,
+ int * restrict vmpage_holdcnt,
+ int * restrict unknown_holdcnt)
+{
+ struct vm_object *object;
+ struct bufobj *bo;
+
+ VNPASS(VREG == vp->v_type, vp);
+
+ *vn_holdcnt = atomic_load_int(&vp->v_holdcnt);
+
+ bo = &vp->v_bufobj;
+ *cleanbuf_holdcnt = atomic_load_int(&bo->bo_clean.bv_cnt);
+ *dirtybuf_holdcnt = atomic_load_int(&bo->bo_dirty.bv_cnt);
+
+ object = atomic_load_ptr(&vp->v_object);
+ if (object != NULL &&
+ object->type == OBJT_VNODE &&
+ object->resident_page_count > 0)
+ *vmpage_holdcnt = 1;
+ else
+ *vmpage_holdcnt = 0;
+
+ *unknown_holdcnt = *vn_holdcnt -
+ (*cleanbuf_holdcnt + *dirtybuf_holdcnt + *vmpage_holdcnt);
+}
+
+/*
+ * Count the hold sources on a directory vnode.
+ */
+static void
+vnlru_count_hold_sources_dir(struct vnode * restrict vp,
+ int * restrict vn_holdcnt,
+ int * restrict nc_src_holdcnt,
+ int * restrict unknown_holdcnt)
+{
+ VNPASS(VDIR == vp->v_type, vp);
+
+ *vn_holdcnt = atomic_load_int(&vp->v_holdcnt);
+
+ if (LIST_EMPTY(&vp->v_cache_src))
+ *nc_src_holdcnt = 0;
+ else
+ *nc_src_holdcnt = 1;
+
+ *unknown_holdcnt = *vn_holdcnt - *nc_src_holdcnt;
+}
/*
* Attempt to reduce the free list by the requested amount.
@@ -1327,8 +1390,9 @@
{
struct vnode *vp;
struct mount *mp;
- int ocount;
- bool retried;
+ int ocount, vn_holdcnt, cleanbuf_holdcnt, dirtybuf_holdcnt, vmpage_holdcnt,
+ nc_src_holdcnt, unknown_holdcnt;
+ bool retried, *phase2_go_toggle, phase2_go;
mtx_assert(&vnode_list_mtx, MA_OWNED);
if (count > max_free_per_call)
@@ -1368,8 +1432,6 @@
}
if (__predict_false(vp->v_type == VMARKER))
continue;
- if (vp->v_holdcnt > 0)
- continue;
/*
* Don't recycle if our vnode is from different type
* of mount point. Note that mp is type-safe, the
@@ -1380,9 +1442,71 @@
mp->mnt_op != mnt_op) {
continue;
}
- if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
+ if (vp->v_type == VBAD || __predict_false(vp->v_type == VNON)) {
continue;
}
+ vn_holdcnt = atomic_load_int(&vp->v_holdcnt);
+ if (vn_holdcnt > 0) {
+ phase2_go_toggle = NULL;
+ phase2_go = false;
+
+ switch (vp->v_type) {
+ case VREG:
+ phase2_go_toggle = &recycle_vnode_bufs_pages;
+
+ /*
+ * Count the holds by the bufs and VM pages in the object,
+ * and compare them to the actual hold count.
+ */
+ vnlru_count_hold_sources_reg(vp,
+ &vn_holdcnt,
+ &cleanbuf_holdcnt,
+ &dirtybuf_holdcnt,
+ &vmpage_holdcnt,
+ &unknown_holdcnt);
+
+ if ((cleanbuf_holdcnt == vn_holdcnt) &&
+ (0 == dirtybuf_holdcnt) && (0 == vmpage_holdcnt)) {
+ phase2_go = true;
+ } else if (
+ ((cleanbuf_holdcnt + vmpage_holdcnt) == vn_holdcnt) &&
+ (0 == dirtybuf_holdcnt)) {
+ phase2_go = true;
+ }
+ break;
+
+ case VDIR:
+ phase2_go_toggle = &recycle_vnode_nc_src;
+
+ /*
+ * Count the holds by the namecache entries from this
+ * vnode, and compare them to the actual hold count.
+ */
+
+ vnlru_count_hold_sources_dir(vp,
+ &vn_holdcnt,
+ &nc_src_holdcnt,
+ &unknown_holdcnt);
+
+ if (nc_src_holdcnt == vn_holdcnt) {
+ phase2_go = true;
+ }
+
+ break;
+
+ default:
+ /*
+				 * NOP; the remaining vnode types are not
+				 * expected to occur often enough to matter here.
+ */
+ break;
+ }
+
+ if ((NULL == phase2_go_toggle) ||
+ !(*phase2_go_toggle) ||
+ !phase2_go)
+ continue;
+ }
if (!vhold_recycle_free(vp))
continue;
TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
@@ -3768,7 +3892,9 @@
static bool
vhold_recycle_free(struct vnode *vp)
{
- int count;
+ int count, vn_holdcnt, cleanbuf_holdcnt, dirtybuf_holdcnt, vmpage_holdcnt,
+ nc_src_holdcnt, unknown_holdcnt;
+ bool *phase2_go_toggle, phase2_go;
mtx_assert(&vnode_list_mtx, MA_OWNED);
@@ -3781,10 +3907,61 @@
}
VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
if (count > 0) {
- return (false);
+ /*
+		 * Check the vnode hold sources again. Refer to the phase 2
+		 * test in vnlru_free_impl() for details.
+ */
+ phase2_go_toggle = NULL;
+ phase2_go = false;
+
+ switch (vp->v_type) {
+ case VREG:
+ phase2_go_toggle = &recycle_vnode_bufs_pages;
+
+ vnlru_count_hold_sources_reg(vp,
+ &vn_holdcnt,
+ &cleanbuf_holdcnt,
+ &dirtybuf_holdcnt,
+ &vmpage_holdcnt,
+ &unknown_holdcnt);
+
+ if ((cleanbuf_holdcnt == vn_holdcnt) &&
+ (0 == vmpage_holdcnt) && (0 == dirtybuf_holdcnt)) {
+ phase2_go = true;
+ } else if (
+ ((cleanbuf_holdcnt + vmpage_holdcnt) == vn_holdcnt) &&
+ (0 == dirtybuf_holdcnt)) {
+ phase2_go = true;
+ }
+
+ break;
+
+ case VDIR:
+ phase2_go_toggle = &recycle_vnode_nc_src;
+
+ vnlru_count_hold_sources_dir(vp,
+ &vn_holdcnt,
+ &nc_src_holdcnt,
+ &unknown_holdcnt);
+
+ if (nc_src_holdcnt == vn_holdcnt) {
+ phase2_go = true;
+ }
+
+ break;
+
+ default:
+ return (false);
+ }
+
+ if ((NULL == phase2_go_toggle) ||
+ !(*phase2_go_toggle) ||
+ !phase2_go)
+ return (false);
}
if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
- vfs_freevnodes_dec();
+ if (0 == count)
+ vfs_freevnodes_dec();
return (true);
}
}

File Metadata

Mime Type
text/plain
Expires
Wed, Sep 25, 2:41 PM (4 h, 21 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
12762078
Default Alt Text
D44174.diff (21 KB)

Event Timeline