D23915.diff

Index: sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
===================================================================
--- sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
+++ sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
@@ -154,6 +154,7 @@
vput(vp);
return (error);
}
+ vn_seqc_write_begin(vp);
VOP_UNLOCK(vp);
/*
@@ -206,6 +207,7 @@
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
+ vn_seqc_write_end(vp);
vput(vp);
vfs_unbusy(mp);
vfs_freeopts(mp->mnt_optnew);
@@ -241,6 +243,7 @@
vfs_event_signal(NULL, VQ_MOUNT, 0);
if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
panic("mount: lost mount");
+ vn_seqc_write_end(vp);
VOP_UNLOCK(vp);
vfs_op_exit(mp);
vfs_unbusy(mp);
Index: sys/fs/tmpfs/tmpfs.h
===================================================================
--- sys/fs/tmpfs/tmpfs.h
+++ sys/fs/tmpfs/tmpfs.h
@@ -526,6 +526,14 @@
return (node);
}
+static inline struct tmpfs_node *
+VP_TO_TMPFS_NODE_SMR(struct vnode *vp)
+{
+
+ MPASS(vp != NULL);
+ return (atomic_load_ptr(&vp->v_data));
+}
+
static inline struct tmpfs_node *
VP_TO_TMPFS_DIR(struct vnode *vp)
{
Index: sys/fs/tmpfs/tmpfs_subr.c
===================================================================
--- sys/fs/tmpfs/tmpfs_subr.c
+++ sys/fs/tmpfs/tmpfs_subr.c
@@ -75,6 +75,7 @@
static uma_zone_t tmpfs_dirent_pool;
static uma_zone_t tmpfs_node_pool;
+VFS_SMR_DECLARE;
static int
tmpfs_node_ctor(void *mem, int size, void *arg, int flags)
@@ -131,6 +132,7 @@
tmpfs_node_pool = uma_zcreate("TMPFS node",
sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor,
tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0);
+ VFS_SMR_ZONE_SET(tmpfs_node_pool);
}
void
@@ -288,7 +290,7 @@
if ((mp->mnt_kern_flag & MNT_RDONLY) != 0)
return (EROFS);
- nnode = uma_zalloc_arg(tmpfs_node_pool, tmp, M_WAITOK);
+ nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK);
/* Generic initialization. */
nnode->tn_type = type;
@@ -435,7 +437,7 @@
panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type);
}
- uma_zfree(tmpfs_node_pool, node);
+ uma_zfree_smr(tmpfs_node_pool, node);
TMPFS_LOCK(tmp);
tmpfs_free_tmp(tmp);
return (true);
@@ -1619,10 +1621,11 @@
int
tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p)
{
- int error;
+ int error, newmode;
struct tmpfs_node *node;
ASSERT_VOP_ELOCKED(vp, "chmod");
+ ASSERT_VOP_IN_SEQC(vp);
node = VP_TO_TMPFS_NODE(vp);
@@ -1656,9 +1659,9 @@
return (error);
}
-
- node->tn_mode &= ~ALLPERMS;
- node->tn_mode |= mode & ALLPERMS;
+ newmode = node->tn_mode & ~ALLPERMS;
+ newmode |= mode & ALLPERMS;
+ atomic_store_int(&node->tn_mode, newmode);
node->tn_status |= TMPFS_NODE_CHANGED;
@@ -1684,6 +1687,7 @@
gid_t ogid;
ASSERT_VOP_ELOCKED(vp, "chown");
+ ASSERT_VOP_IN_SEQC(vp);
node = VP_TO_TMPFS_NODE(vp);
@@ -1730,7 +1734,7 @@
if ((node->tn_mode & (S_ISUID | S_ISGID)) && (ouid != uid || ogid != gid)) {
if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID))
- node->tn_mode &= ~(S_ISUID | S_ISGID);
+ atomic_store_int(&node->tn_mode, node->tn_mode & ~(S_ISUID | S_ISGID));
}
ASSERT_VOP_ELOCKED(vp, "chown2");
Index: sys/fs/tmpfs/tmpfs_vfsops.c
===================================================================
--- sys/fs/tmpfs/tmpfs_vfsops.c
+++ sys/fs/tmpfs/tmpfs_vfsops.c
@@ -462,6 +462,8 @@
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED |
MNTK_TEXT_REFS | MNTK_NOMSYNC;
+ if (!nonc)
+ mp->mnt_kern_flag |= MNTK_FPLOOKUP;
MNT_IUNLOCK(mp);
mp->mnt_data = tmp;
Index: sys/fs/tmpfs/tmpfs_vnops.h
===================================================================
--- sys/fs/tmpfs/tmpfs_vnops.h
+++ sys/fs/tmpfs/tmpfs_vnops.h
@@ -49,6 +49,7 @@
extern struct vop_vector tmpfs_vnodeop_nonc_entries;
vop_access_t tmpfs_access;
+vop_fplookup_vexec_t tmpfs_fplookup_vexec;
vop_getattr_t tmpfs_getattr;
vop_setattr_t tmpfs_setattr;
vop_pathconf_t tmpfs_pathconf;
Index: sys/fs/tmpfs/tmpfs_vnops.c
===================================================================
--- sys/fs/tmpfs/tmpfs_vnops.c
+++ sys/fs/tmpfs/tmpfs_vnops.c
@@ -317,6 +317,32 @@
return (0);
}
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ */
+int
+tmpfs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
+{
+ struct vnode *vp;
+ struct tmpfs_node *node;
+ struct ucred *cred;
+ mode_t all_x, mode;
+
+ vp = v->a_vp;
+ node = VP_TO_TMPFS_NODE_SMR(vp);
+ if (__predict_false(node == NULL))
+ return (EAGAIN);
+
+ all_x = S_IXUSR | S_IXGRP | S_IXOTH;
+ mode = atomic_load_int(&node->tn_mode);
+ if (__predict_true((mode & all_x) == all_x))
+ return (0);
+
+ cred = v->a_cred;
+ return (vaccess_vexec_smr(mode, node->tn_uid, node->tn_gid, cred));
+}
+
int
tmpfs_access(struct vop_access_args *v)
{
@@ -428,6 +454,8 @@
MPASS(VOP_ISLOCKED(vp));
+ vn_seqc_write_begin(vp);
+
error = 0;
/* Abort if any unsettable attribute is given. */
@@ -466,6 +494,8 @@
* from tmpfs_update. */
tmpfs_update(vp);
+ vn_seqc_write_end(vp);
+
MPASS(VOP_ISLOCKED(vp));
return error;
@@ -806,12 +836,15 @@
struct tmpfs_node *tnode;
struct tmpfs_node *tdnode;
int error;
+ bool want_seqc_end;
MPASS(VOP_ISLOCKED(tdvp));
MPASS(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp)));
MPASS(fcnp->cn_flags & HASBUF);
MPASS(tcnp->cn_flags & HASBUF);
+ want_seqc_end = false;
+
/*
* Disallow cross-device renames.
* XXX Why isn't this done by the caller?
@@ -852,6 +885,13 @@
}
}
+ if (tvp != NULL)
+ vn_seqc_write_begin(tvp);
+ vn_seqc_write_begin(tdvp);
+ vn_seqc_write_begin(fvp);
+ vn_seqc_write_begin(fdvp);
+ want_seqc_end = true;
+
tmp = VFS_TO_TMPFS(tdvp->v_mount);
tdnode = VP_TO_TMPFS_DIR(tdvp);
tnode = (tvp == NULL) ? NULL : VP_TO_TMPFS_NODE(tvp);
@@ -1065,6 +1105,14 @@
VOP_UNLOCK(fdvp);
out:
+ if (want_seqc_end) {
+ if (tvp != NULL)
+ vn_seqc_write_end(tvp);
+ vn_seqc_write_end(tdvp);
+ vn_seqc_write_end(fvp);
+ vn_seqc_write_end(fdvp);
+ }
+
/*
* Release target nodes.
* XXX: I don't understand when tdvp can be the same as tvp, but
@@ -1621,6 +1669,7 @@
.vop_mknod = tmpfs_mknod,
.vop_open = tmpfs_open,
.vop_close = tmpfs_close,
+ .vop_fplookup_vexec = tmpfs_fplookup_vexec,
.vop_access = tmpfs_access,
.vop_getattr = tmpfs_getattr,
.vop_setattr = tmpfs_setattr,
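
Taken together, the tmpfs changes above show the opt-in recipe for a filesystem: publish MNTK_FPLOOKUP at mount time, provide a vop_fplookup_vexec method that follows the SMR rules, and bracket every attribute or namespace change with vn_seqc_write_begin/end so the fast path can detect concurrent modification. The condensed sketch of the wiring below is an illustration only and not part of the diff; examplefs and its fplookup method are hypothetical.

	/* In the filesystem's mount routine, alongside the other MNTK flags: */
	MNT_ILOCK(mp);
	mp->mnt_kern_flag |= MNTK_FPLOOKUP;
	MNT_IUNLOCK(mp);

	/* In the vnode operations vector: */
	struct vop_vector examplefs_vnodeops = {
		.vop_default =		&default_vnodeops,
		.vop_fplookup_vexec =	examplefs_fplookup_vexec,	/* obeys the contract in vfs_cache.c */
		/* ... remaining ops ... */
	};
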
Index: sys/kern/kern_descrip.c
===================================================================
--- sys/kern/kern_descrip.c
+++ sys/kern/kern_descrip.c
@@ -102,8 +102,8 @@
static __read_mostly uma_zone_t file_zone;
static __read_mostly uma_zone_t filedesc0_zone;
-static __read_mostly uma_zone_t pwd_zone;
-static __read_mostly smr_t pwd_smr;
+__read_mostly uma_zone_t pwd_zone;
+VFS_SMR_DECLARE;
static int closefp(struct filedesc *fdp, int fd, struct file *fp,
struct thread *td, int holdleaders);
@@ -3346,14 +3346,24 @@
fdp = td->td_proc->p_fd;
- smr_enter(pwd_smr);
+ vfs_smr_enter();
for (;;) {
- pwd = smr_entered_load(&fdp->fd_pwd, pwd_smr);
+ pwd = smr_entered_load(&fdp->fd_pwd, VFS_SMR());
MPASS(pwd != NULL);
if (refcount_acquire_if_not_zero(&pwd->pwd_refcount))
break;
}
- smr_exit(pwd_smr);
+ vfs_smr_exit();
+ return (pwd);
+}
+
+struct pwd *
+pwd_get_smr(void)
+{
+ struct pwd *pwd;
+
+ pwd = smr_entered_load(&curproc->p_fd->fd_pwd, VFS_SMR());
+ MPASS(pwd != NULL);
return (pwd);
}
@@ -4363,7 +4373,11 @@
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
- pwd_smr = uma_zone_get_smr(pwd_zone);
+ /*
+ * XXXMJG this is a temporary hack due to boot ordering issues against
+ * the vnode zone.
+ */
+ vfs_smr = uma_zone_get_smr(pwd_zone);
mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
}
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
Index: sys/kern/vfs_cache.c
===================================================================
--- sys/kern/vfs_cache.c
+++ sys/kern/vfs_cache.c
@@ -55,6 +55,7 @@
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
+#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
@@ -67,6 +68,11 @@
#include <sys/ktrace.h>
#endif
+#include <sys/capsicum.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
#ifdef DDB
#include <ddb/ddb.h>
#endif
@@ -100,6 +106,8 @@
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
"char *");
+SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
+
/*
* This structure describes the elements in the cache of recent
* names looked up by namei.
@@ -2810,3 +2818,841 @@
}
#endif
+
+extern uma_zone_t namei_zone;
+
+static bool __read_frequently cache_fast_lookup = true;
+SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
+ &cache_fast_lookup, 0, "");
+
+#define CACHE_FPL_FAILED -2020
+
+static void
+cache_fpl_cleanup_cnp(struct componentname *cnp)
+{
+
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+}
+
+static void
+cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
+{
+ struct componentname *cnp;
+
+ cnp = &ndp->ni_cnd;
+ while (*(cnp->cn_nameptr) == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+}
+
+static void
+cache_fpl_handle_root_initial(struct nameidata *ndp, struct vnode **dpp)
+{
+
+ cache_fpl_handle_root(ndp, dpp);
+ *dpp = ndp->ni_rootdir;
+}
+
+/*
+ * Components of nameidata (or objects it can point to) which may
+ * need restoring in case fast path lookup fails.
+ */
+struct nameidata_saved {
+ int cn_flags;
+ long cn_namelen;
+ char *cn_nameptr;
+ size_t ni_pathlen;
+};
+
+struct cache_fpl {
+ int line;
+ enum cache_fpl_status status;
+ bool in_smr;
+ struct nameidata *ndp;
+ struct nameidata_saved snd;
+ struct componentname *cnp;
+ struct vnode *dvp;
+ seqc_t dvp_seqc;
+ struct vnode *tvp;
+ seqc_t tvp_seqc;
+ struct pwd *pwd;
+};
+
+static void
+cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
+{
+
+ snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
+ snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
+ snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
+ snd->ni_pathlen = fpl->ndp->ni_pathlen;
+}
+
+static void
+cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
+{
+
+ fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
+ fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
+ fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
+ fpl->ndp->ni_pathlen = snd->ni_pathlen;
+}
+
+#ifdef INVARIANTS
+#define cache_fpl_smr_assert_entered(fpl) ({ \
+ struct cache_fpl *_fpl = (fpl); \
+ MPASS(_fpl->in_smr == true); \
+ VFS_SMR_ASSERT_ENTERED(); \
+})
+#define cache_fpl_smr_assert_not_entered(fpl) ({ \
+ struct cache_fpl *_fpl = (fpl); \
+ MPASS(_fpl->in_smr == false); \
+ VFS_SMR_ASSERT_NOT_ENTERED(); \
+})
+#else
+#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
+#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
+#endif
+
+#define cache_fpl_smr_enter(fpl) ({ \
+ struct cache_fpl *_fpl = (fpl); \
+ MPASS(_fpl->in_smr == false); \
+ vfs_smr_enter(); \
+ _fpl->in_smr = true; \
+})
+
+#define cache_fpl_smr_exit(fpl) ({ \
+ struct cache_fpl *_fpl = (fpl); \
+ MPASS(_fpl->in_smr == true); \
+ vfs_smr_exit(); \
+ _fpl->in_smr = false; \
+})
+
+static int
+cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
+{
+
+ KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
+ ("%s: lookup status already set at %d\n", __func__, fpl->line));
+ fpl->status = CACHE_FPL_STATUS_ABORTED;
+ fpl->line = line;
+ return (CACHE_FPL_FAILED);
+}
+
+#define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
+
+static int
+cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
+{
+
+ KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
+ ("%s: lookup status already set at %d\n", __func__, fpl->line));
+ cache_fpl_smr_assert_entered(fpl);
+ fpl->status = CACHE_FPL_STATUS_PARTIAL;
+ fpl->line = line;
+ return (CACHE_FPL_FAILED);
+}
+
+#define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
+
+static int
+cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
+{
+
+ KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
+ ("%s: lookup status already set at %d\n", __func__, fpl->line));
+ cache_fpl_smr_assert_not_entered(fpl);
+ MPASS(error != CACHE_FPL_FAILED);
+ fpl->status = CACHE_FPL_STATUS_HANDLED;
+ fpl->line = line;
+ return (error);
+}
+
+#define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
+
+#define CACHE_FPL_SUPPORTED_CN_FLAGS \
+ (LOCKLEAF | FOLLOW | LOCKSHARED | SAVENAME | ISOPEN | AUDITVNODE1)
+
+static bool
+cache_can_fplookup(struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ struct thread *td;
+
+ ndp = fpl->ndp;
+ cnp = fpl->cnp;
+ td = cnp->cn_thread;
+
+ if (!cache_fast_lookup) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (mac_vnode_check_lookup_enabled()) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if ((cnp->cn_flags & LOCKLEAF) == 0) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (cnp->cn_nameiop != LOOKUP) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (ndp->ni_dirfd != AT_FDCWD) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (IN_CAPABILITY_MODE(td)) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (AUDITING_TD(td)) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (ndp->ni_startdir != NULL) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ return (true);
+}
+
+static bool
+cache_fplookup_vnode_supported(struct vnode *vp)
+{
+
+ switch (vp->v_type) {
+ case VLNK:
+ return (false);
+ default:
+ break;
+ }
+ return (true);
+}
+
+/*
+ * The target vnode is not supported, prepare for the slow path to take over.
+ */
+static int
+cache_fplookup_partial_setup(struct cache_fpl *fpl)
+{
+ struct componentname *cnp;
+ struct vnode *dvp;
+ struct pwd *pwd;
+ seqc_t dvp_seqc;
+
+ cnp = fpl->cnp;
+ dvp = fpl->dvp;
+ dvp_seqc = fpl->dvp_seqc;
+
+ if (!vref_smr(dvp)) {
+ fpl->status = CACHE_FPL_STATUS_ABORTED;
+ cache_fpl_smr_exit(fpl);
+ return (CACHE_FPL_FAILED);
+ }
+
+ cache_fpl_smr_exit(fpl);
+ if (!seqc_consistent(&dvp->v_seqc, dvp_seqc)) {
+ fpl->status = CACHE_FPL_STATUS_ABORTED;
+ vrele(dvp);
+ return (CACHE_FPL_FAILED);
+ }
+
+ pwd = pwd_hold(curthread);
+ if (fpl->pwd != pwd) {
+ fpl->status = CACHE_FPL_STATUS_ABORTED;
+ vrele(dvp);
+ pwd_drop(pwd);
+ return (CACHE_FPL_FAILED);
+ }
+
+ fpl->ndp->ni_startdir = dvp;
+ return (0);
+}
+
+static int
+cache_fplookup_final(struct cache_fpl *fpl)
+{
+ struct componentname *cnp;
+ enum vgetstate tvs;
+ struct vnode *dvp, *tvp;
+ seqc_t dvp_seqc, tvp_seqc;
+ int error;
+
+ cnp = fpl->cnp;
+ dvp = fpl->dvp;
+ dvp_seqc = fpl->dvp_seqc;
+ tvp = fpl->tvp;
+ tvp_seqc = fpl->tvp_seqc;
+
+ VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
+ MPASS((cnp->cn_flags & LOCKLEAF) != 0);
+
+ tvs = vget_prep_smr(tvp);
+ if (tvs == VGET_NONE) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ if (!seqc_consistent(&dvp->v_seqc, dvp_seqc)) {
+ cache_fpl_smr_exit(fpl);
+ vget_abort(tvp, tvs);
+ return (cache_fpl_aborted(fpl));
+ }
+
+ cache_fpl_smr_exit(fpl);
+
+ error = vget_finish(tvp, cnp->cn_lkflags, tvs);
+ if (error != 0) {
+ return (cache_fpl_aborted(fpl));
+ }
+
+ if (!seqc_consistent(&tvp->v_seqc, tvp_seqc)) {
+ vput(tvp);
+ return (cache_fpl_aborted(fpl));
+ }
+
+ return (cache_fpl_handled(fpl, 0));
+}
+
+static int
+cache_fplookup_next(struct cache_fpl *fpl)
+{
+ struct componentname *cnp;
+ struct namecache *ncp;
+ struct vnode *dvp, *tvp;
+ u_char nc_flag;
+ uint32_t hash;
+
+ cnp = fpl->cnp;
+ dvp = fpl->dvp;
+
+ if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
+ fpl->tvp = dvp;
+ fpl->tvp_seqc = seqc_read_any(&dvp->v_seqc);
+ if (seqc_in_modify(fpl->tvp_seqc)) {
+ return (cache_fpl_partial(fpl));
+ }
+ return (0);
+ }
+
+ hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+
+ CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+ counter_u64_add(numchecks, 1);
+ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
+ !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
+ break;
+ }
+
+ /*
+ * If there is no entry we have to punt to the slow path to perform
+ * actual lookup. Should there be nothing with this name a negative
+ * entry will be created.
+ */
+ if (__predict_false(ncp == NULL)) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ tvp = atomic_load_ptr(&ncp->nc_vp);
+ nc_flag = atomic_load_char(&ncp->nc_flag);
+ if (__predict_false(cache_ncp_invalid(ncp))) {
+ return (cache_fpl_partial(fpl));
+ }
+ if (__predict_false(nc_flag & NCF_WHITE)) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ fpl->tvp = tvp;
+ if (nc_flag & NCF_NEGATIVE) {
+ if ((nc_flag & NCF_HOTNEGATIVE) == 0) {
+ /*
+ * TODO
+ * Promoting to hot negative requires locks which are
+ * not yet supported for simplicity.
+ */
+ return (cache_fpl_partial(fpl));
+ }
+ SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
+ ncp->nc_name);
+ counter_u64_add(numneghits, 1);
+ cache_fpl_smr_exit(fpl);
+ return (cache_fpl_handled(fpl, ENOENT));
+ }
+
+ fpl->tvp_seqc = seqc_read_any(&tvp->v_seqc);
+ if (seqc_in_modify(fpl->tvp_seqc)) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ if (!cache_fplookup_vnode_supported(tvp)) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ counter_u64_add(numposhits, 1);
+ SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
+ return (0);
+}
+
+static bool
+cache_fplookup_mp_supported(struct mount *mp)
+{
+
+ if (mp == NULL)
+ return (false);
+ if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
+ return (false);
+ if (mp->mnt_flag & MNT_UNION)
+ return (false);
+ return (true);
+}
+
+/*
+ * Walk up the mount stack (if any).
+ *
+ * Correctness is provided in the following ways:
+ * - all vnodes are protected from freeing with SMR
+ * - struct mount objects are type stable making them always safe to access
+ * - stability of the particular mount is provided by busying it
+ * - relationship between the vnode which is mounted on and the mount is
+ * verified with the vnode sequence counter after busying
+ * - association between root vnode of the mount and the mount is protected
+ * by busy
+ *
+ * From that point on we can read the sequence counter of the root vnode
+ * and get the next mount on the stack (if any) using the same protection.
+ *
+ * By the end of successful walk we are guaranteed the reached state was
+ * indeed present at least at some point which matches the regular lookup.
+ */
+static int
+cache_fplookup_climb_mount(struct cache_fpl *fpl)
+{
+ struct mount *mp, *prev_mp;
+ struct vnode *vp;
+ seqc_t vp_seqc;
+
+ vp = fpl->tvp;
+ vp_seqc = fpl->tvp_seqc;
+ if (vp->v_type != VDIR)
+ return (0);
+
+ mp = atomic_load_ptr(&vp->v_mountedhere);
+ if (mp == NULL)
+ return (0);
+
+ prev_mp = NULL;
+ for (;;) {
+ if (!vfs_op_thread_enter(mp)) {
+ if (prev_mp != NULL)
+ vfs_op_thread_exit(prev_mp);
+ return (cache_fpl_partial(fpl));
+ }
+ if (prev_mp != NULL)
+ vfs_op_thread_exit(prev_mp);
+ if (!seqc_consistent(&vp->v_seqc, vp_seqc)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_partial(fpl));
+ }
+ if (!cache_fplookup_mp_supported(mp)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_partial(fpl));
+ }
+ vp = atomic_load_ptr(&mp->mnt_rootvnode);
+ if (vp == NULL || VN_IS_DOOMED(vp)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_partial(fpl));
+ }
+ vp_seqc = seqc_read_any(&vp->v_seqc);
+ if (seqc_in_modify(vp_seqc)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_partial(fpl));
+ }
+ prev_mp = mp;
+ mp = atomic_load_ptr(&vp->v_mountedhere);
+ if (mp == NULL)
+ break;
+ }
+
+ vfs_op_thread_exit(prev_mp);
+ fpl->tvp = vp;
+ fpl->tvp_seqc = vp_seqc;
+ return (0);
+}
+
+/*
+ * Parse the path.
+ *
+ * The code is mostly copy-pasted from regular lookup, see lookup().
+ * The structure is maintained along with comments for easier maintenance.
+ * Deduplicating the code will become feasible after fast path lookup
+ * becomes more feature-complete.
+ */
+static int
+cache_fplookup_parse(struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ char *cp;
+ char *prev_ni_next; /* saved ndp->ni_next */
+ size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */
+
+ ndp = fpl->ndp;
+ cnp = fpl->cnp;
+
+ /*
+ * Search a new directory.
+ *
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+ continue;
+ cnp->cn_namelen = cp - cnp->cn_nameptr;
+ if (cnp->cn_namelen > NAME_MAX) {
+ cache_fpl_smr_exit(fpl);
+ return (cache_fpl_handled(fpl, ENAMETOOLONG));
+ }
+ prev_ni_pathlen = ndp->ni_pathlen;
+ ndp->ni_pathlen -= cnp->cn_namelen;
+ KASSERT(ndp->ni_pathlen <= PATH_MAX,
+ ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
+ prev_ni_next = ndp->ni_next;
+ ndp->ni_next = cp;
+
+ /*
+ * Replace multiple slashes by a single slash and trailing slashes
+ * by a null. This must be done before VOP_LOOKUP() because some
+ * fs's don't know about trailing slashes. Remember if there were
+ * trailing slashes to handle symlinks, existing non-directories
+ * and non-existing files that won't be directories specially later.
+ */
+ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
+ cp++;
+ ndp->ni_pathlen--;
+ if (*cp == '\0') {
+ /*
+ * TODO
+ * Regular lookup performs the following:
+ * *ndp->ni_next = '\0';
+ * cnp->cn_flags |= TRAILINGSLASH;
+ *
+ * Which is problematic since it modifies data read
+ * from userspace. Then if fast path lookup was to
+ * abort we would have to either restore it or convey
+ * the flag. Since this is a corner case just ignore
+ * it for simplicity.
+ */
+ return (cache_fpl_partial(fpl));
+ }
+ }
+ ndp->ni_next = cp;
+
+ cnp->cn_flags |= MAKEENTRY;
+
+ if (cnp->cn_namelen == 2 &&
+ cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
+ cnp->cn_flags |= ISDOTDOT;
+ else
+ cnp->cn_flags &= ~ISDOTDOT;
+ if (*ndp->ni_next == 0)
+ cnp->cn_flags |= ISLASTCN;
+ else
+ cnp->cn_flags &= ~ISLASTCN;
+
+ /*
+ * Check for degenerate name (e.g. / or "")
+ * which is a way of talking about a directory,
+ * e.g. like "/." or ".".
+ *
+ * TODO
+ * Another corner case handled by the regular lookup
+ */
+ if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
+ return (cache_fpl_partial(fpl));
+ }
+ return (0);
+}
+
+static void
+cache_fplookup_parse_advance(struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+
+ ndp = fpl->ndp;
+ cnp = fpl->cnp;
+
+ cnp->cn_nameptr = ndp->ni_next;
+ while (*cnp->cn_nameptr == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+}
+
+static int
+cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ struct mount *mp;
+ int error;
+
+ error = CACHE_FPL_FAILED;
+ ndp = fpl->ndp;
+ ndp->ni_lcf = 0;
+ cnp = fpl->cnp;
+ cnp->cn_lkflags = LK_SHARED;
+ if ((cnp->cn_flags & LOCKSHARED) == 0)
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+
+ cache_fpl_checkpoint(fpl, &fpl->snd);
+
+ fpl->dvp = dvp;
+ fpl->dvp_seqc = seqc_read_any(&fpl->dvp->v_seqc);
+ if (seqc_in_modify(fpl->dvp_seqc)) {
+ cache_fpl_aborted(fpl);
+ goto out;
+ }
+ mp = atomic_load_ptr(&fpl->dvp->v_mount);
+ if (!cache_fplookup_mp_supported(mp)) {
+ cache_fpl_aborted(fpl);
+ goto out;
+ }
+
+ VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
+
+ for (;;) {
+ error = cache_fplookup_parse(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ if (cnp->cn_flags & ISDOTDOT) {
+ error = cache_fpl_partial(fpl);
+ break;
+ }
+
+ VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
+
+ error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
+ if (__predict_false(error != 0)) {
+ switch (error) {
+ case EAGAIN:
+ case EOPNOTSUPP: /* can happen when racing against vgone */
+ cache_fpl_partial(fpl);
+ break;
+ default:
+ /*
+ * See the API contract for VOP_FPLOOKUP_VEXEC.
+ */
+ if (!seqc_consistent(&fpl->dvp->v_seqc, fpl->dvp_seqc)) {
+ error = cache_fpl_aborted(fpl);
+ } else {
+ cache_fpl_smr_exit(fpl);
+ cache_fpl_handled(fpl, error);
+ }
+ break;
+ }
+ break;
+ }
+
+ error = cache_fplookup_next(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
+
+ error = cache_fplookup_climb_mount(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
+
+ if (cnp->cn_flags & ISLASTCN) {
+ error = cache_fplookup_final(fpl);
+ break;
+ }
+
+ if (!seqc_consistent(&fpl->dvp->v_seqc, fpl->dvp_seqc)) {
+ error = cache_fpl_aborted(fpl);
+ break;
+ }
+
+ fpl->dvp = fpl->tvp;
+ fpl->dvp_seqc = fpl->tvp_seqc;
+
+ cache_fplookup_parse_advance(fpl);
+ cache_fpl_checkpoint(fpl, &fpl->snd);
+ }
+out:
+ switch (fpl->status) {
+ case CACHE_FPL_STATUS_UNSET:
+ __assert_unreachable();
+ break;
+ case CACHE_FPL_STATUS_PARTIAL:
+ cache_fpl_smr_assert_entered(fpl);
+ return (cache_fplookup_partial_setup(fpl));
+ case CACHE_FPL_STATUS_ABORTED:
+ if (fpl->in_smr)
+ cache_fpl_smr_exit(fpl);
+ return (CACHE_FPL_FAILED);
+ case CACHE_FPL_STATUS_HANDLED:
+ cache_fpl_smr_assert_not_entered(fpl);
+ if (__predict_false(error != 0)) {
+ ndp->ni_dvp = NULL;
+ ndp->ni_vp = NULL;
+ cache_fpl_cleanup_cnp(cnp);
+ return (error);
+ }
+ ndp->ni_dvp = fpl->dvp;
+ ndp->ni_vp = fpl->tvp;
+ if (cnp->cn_flags & SAVENAME)
+ cnp->cn_flags |= HASBUF;
+ else
+ cache_fpl_cleanup_cnp(cnp);
+ return (error);
+ }
+}
+
+/*
+ * Fast path lookup protected with SMR and sequence counters.
+ *
+ * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
+ *
+ * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
+ * outlined below.
+ *
+ * Traditional vnode lookup conceptually looks like this:
+ *
+ * vn_lock(current);
+ * for (;;) {
+ * next = find();
+ * vn_lock(next);
+ * vn_unlock(current);
+ * current = next;
+ * if (last)
+ * break;
+ * }
+ *
+ * Each jump to the next vnode is safe memory-wise and atomic with respect to
+ * any modifications thanks to holding respective locks.
+ *
+ * The same guarantee can be provided with a combination of safe memory
+ * reclamation and sequence counters instead. If all operations which affect
+ * the relationship between the current vnode and the one we are looking for
+ * also modify the counter, we can verify whether all the conditions held as
+ * we made the jump. This includes things like permissions, mount point etc.
+ * You can grep for vn_seqc_write_begin to check all the places.
+ *
+ * Thus this translates to:
+ *
+ * vfs_smr_enter();
+ * current_seqc = seqc_read_any(current);
+ * if (seqc_in_modify(current_seqc)) // someone is altering the vnode
+ * abort();
+ * for (;;) {
+ * next = find();
+ * next_seqc = seqc_read_any(next);
+ * if (!seqc_consistent(current, current_seqc) // someone is altering the vnode
+ * abort();
+ * current = next; // we know nothing of importance has changed
+ * current_seqc = next_seqc; // store the counter for the next iteration
+ * if (last)
+ * break;
+ * }
+ *
+ * API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
+ * - they are called while within vfs_smr protection which they must never exit
+ * - EAGAIN can be returned to denote checking could not be performed, it is
+ * always valid to return it
+ * - if the sequence counter has not changed the result must be valid
+ * - if the sequence counter has changed both false positives and false negatives
+ * are permitted (since the result will be rejected later)
+ * - for simple cases of unix permission checks vaccess_vexec_smr can be used
+ *
+ * Caveats to watch out for:
+ * - vnodes are passed unlocked and unreferenced with nothing stopping
+ * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
+ * to use atomic_load_ptr to fetch it.
+ * - aforementioned object can also get freed, meaning absent other means it
+ * should be protected with vfs_smr
+ * - either safely checking permissions as they are modified or guaranteeing
+ * their stability is left to the routine
+ */
+int
+cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
+ struct pwd **pwdp)
+{
+ struct cache_fpl fpl;
+ struct pwd *pwd;
+ struct vnode *dvp;
+ struct componentname *cnp;
+ struct nameidata_saved orig;
+ int error;
+
+ *status = CACHE_FPL_STATUS_UNSET;
+ bzero(&fpl, sizeof(fpl));
+ fpl.status = CACHE_FPL_STATUS_UNSET;
+ fpl.ndp = ndp;
+ fpl.cnp = &ndp->ni_cnd;
+ MPASS(curthread == fpl.cnp->cn_thread);
+
+ if (!cache_can_fplookup(&fpl)) {
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
+ *status = fpl.status;
+ return (EOPNOTSUPP);
+ }
+
+ cache_fpl_checkpoint(&fpl, &orig);
+
+ cache_fpl_smr_enter(&fpl);
+ pwd = pwd_get_smr();
+ fpl.pwd = pwd;
+ ndp->ni_rootdir = pwd->pwd_rdir;
+ ndp->ni_topdir = pwd->pwd_jdir;
+
+ cnp = fpl.cnp;
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+ if (cnp->cn_pnbuf[0] == '/') {
+ cache_fpl_handle_root_initial(ndp, &dvp);
+ } else {
+ MPASS(ndp->ni_dirfd == AT_FDCWD);
+ dvp = pwd->pwd_cdir;
+ }
+
+ error = cache_fplookup_impl(dvp, &fpl);
+ cache_fpl_smr_assert_not_entered(&fpl);
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
+
+ *status = fpl.status;
+ switch (fpl.status) {
+ case CACHE_FPL_STATUS_UNSET:
+ __assert_unreachable();
+ break;
+ case CACHE_FPL_STATUS_HANDLED:
+ break;
+ case CACHE_FPL_STATUS_PARTIAL:
+ *pwdp = fpl.pwd;
+ cache_fpl_restore(&fpl, &fpl.snd);
+ break;
+ case CACHE_FPL_STATUS_ABORTED:
+ cache_fpl_restore(&fpl, &orig);
+ break;
+ }
+ return (error);
+}
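
For reference, the VOP_FPLOOKUP_VEXEC contract described above takes very little code to satisfy on the filesystem side. The sketch below is not part of the diff; it condenses the tmpfs implementation from this review into a generic form, with examplefs_node as a hypothetical stand-in for a filesystem's SMR-freed per-vnode data.

	struct examplefs_node {		/* hypothetical ->v_data payload, freed via vfs_smr */
		int	n_mode;		/* int-sized so atomic_load_int is safe */
		uid_t	n_uid;
		gid_t	n_gid;
	};

	int
	examplefs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
	{
		struct examplefs_node *node;
		mode_t mode;

		/*
		 * The vnode is neither locked nor referenced, so VOP_RECLAIM can
		 * clear ->v_data at any moment; EAGAIN is always a valid answer.
		 */
		node = atomic_load_ptr(&v->a_vp->v_data);
		if (__predict_false(node == NULL))
			return (EAGAIN);

		mode = atomic_load_int(&node->n_mode);
		return (vaccess_vexec_smr(mode, node->n_uid, node->n_gid, v->a_cred));
	}
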
Index: sys/kern/vfs_lookup.c
===================================================================
--- sys/kern/vfs_lookup.c
+++ sys/kern/vfs_lookup.c
@@ -280,77 +280,21 @@
return (0);
}
-/*
- * Convert a pathname into a pointer to a locked vnode.
- *
- * The FOLLOW flag is set when symbolic links are to be followed
- * when they occur at the end of the name translation process.
- * Symbolic links are always followed for all other pathname
- * components other than the last.
- *
- * The segflg defines whether the name is to be copied from user
- * space or kernel space.
- *
- * Overall outline of namei:
- *
- * copy in name
- * get starting directory
- * while (!done && !error) {
- * call lookup to search path.
- * if symbolic link, massage name in buffer and continue
- * }
- */
-int
-namei(struct nameidata *ndp)
+static int
+namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
{
- char *cp; /* pointer into pathname argument */
- struct vnode *dp; /* the directory we are searching */
- struct iovec aiov; /* uio for reading symbolic links */
struct componentname *cnp;
struct file *dfp;
struct thread *td;
- struct proc *p;
struct pwd *pwd;
cap_rights_t rights;
struct filecaps dirfd_caps;
- struct uio auio;
- int error, linklen, startdir_used;
+ int error, startdir_used;
cnp = &ndp->ni_cnd;
td = cnp->cn_thread;
- p = td->td_proc;
- ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
- KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
- KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
- ("namei: nameiop contaminated with flags"));
- KASSERT((cnp->cn_flags & OPMASK) == 0,
- ("namei: flags contaminated with nameiops"));
- MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
- ndp->ni_startdir->v_type == VBAD);
- TAILQ_INIT(&ndp->ni_cap_tracker);
- ndp->ni_lcf = 0;
-
- /* We will set this ourselves if we need it. */
- cnp->cn_flags &= ~TRAILINGSLASH;
- /*
- * Get a buffer for the name to be translated, and copy the
- * name into the buffer.
- */
- if ((cnp->cn_flags & HASBUF) == 0)
- cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
- if (ndp->ni_segflg == UIO_SYSSPACE)
- error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
- &ndp->ni_pathlen);
- else
- error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
- &ndp->ni_pathlen);
-
- /*
- * Don't allow empty pathnames.
- */
- if (error == 0 && *cnp->cn_pnbuf == '\0')
- error = ENOENT;
+ *pwdp = NULL;
#ifdef CAPABILITY_MODE
/*
@@ -366,24 +310,17 @@
* previously walked by us, which prevents an escape from
* the relative root.
*/
- if (error == 0 && IN_CAPABILITY_MODE(td) &&
- (cnp->cn_flags & NOCAPCHECK) == 0) {
+ if (IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) {
ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
if (ndp->ni_dirfd == AT_FDCWD) {
#ifdef KTRACE
if (KTRPOINT(td, KTR_CAPFAIL))
ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
#endif
- error = ECAPMODE;
+ return (ECAPMODE);
}
}
#endif
- if (error != 0) {
- namei_cleanup_cnp(cnp);
- ndp->ni_vp = NULL;
- return (error);
- }
- ndp->ni_loopcnt = 0;
#ifdef KTRACE
if (KTRPOINT(td, KTR_NAMEI)) {
KASSERT(cnp->cn_thread == curthread,
@@ -391,6 +328,8 @@
ktrnamei(cnp->cn_pnbuf);
}
#endif
+ error = 0;
+
/*
* Get starting point for the translation.
*/
@@ -402,19 +341,16 @@
ndp->ni_rootdir = pwd->pwd_rdir;
ndp->ni_topdir = pwd->pwd_jdir;
- startdir_used = 0;
- dp = NULL;
- cnp->cn_nameptr = cnp->cn_pnbuf;
if (cnp->cn_pnbuf[0] == '/') {
ndp->ni_resflags |= NIRES_ABS;
- error = namei_handle_root(ndp, &dp);
+ error = namei_handle_root(ndp, dpp);
} else {
if (ndp->ni_startdir != NULL) {
- dp = ndp->ni_startdir;
+ *dpp = ndp->ni_startdir;
startdir_used = 1;
} else if (ndp->ni_dirfd == AT_FDCWD) {
- dp = pwd->pwd_cdir;
- vrefact(dp);
+ *dpp = pwd->pwd_cdir;
+ vrefact(*dpp);
} else {
rights = ndp->ni_rightsneeded;
cap_rights_set_one(&rights, CAP_LOOKUP);
@@ -441,8 +377,8 @@
} else if (dfp->f_vnode == NULL) {
error = ENOTDIR;
} else {
- dp = dfp->f_vnode;
- vrefact(dp);
+ *dpp = dfp->f_vnode;
+ vrefact(*dpp);
if ((dfp->f_flag & FSEARCH) != 0)
cnp->cn_flags |= NOEXECCHECK;
@@ -464,7 +400,7 @@
}
#endif
}
- if (error == 0 && dp->v_type != VDIR)
+ if (error == 0 && (*dpp)->v_type != VDIR)
error = ENOTDIR;
}
if (error == 0 && (cnp->cn_flags & BENEATH) != 0) {
@@ -476,7 +412,7 @@
cap_rights_set_one(&rights, CAP_LOOKUP);
error = fgetvp_rights(td, ndp->ni_dirfd, &rights,
&dirfd_caps, &ndp->ni_beneath_latch);
- if (error == 0 && dp->v_type != VDIR) {
+ if (error == 0 && (*dpp)->v_type != VDIR) {
vrele(ndp->ni_beneath_latch);
error = ENOTDIR;
}
@@ -488,15 +424,15 @@
* If we are auditing the kernel pathname, save the user pathname.
*/
if (cnp->cn_flags & AUDITVNODE1)
- AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, dp, cnp->cn_pnbuf);
+ AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
if (cnp->cn_flags & AUDITVNODE2)
- AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, dp, cnp->cn_pnbuf);
+ AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
if (ndp->ni_startdir != NULL && !startdir_used)
vrele(ndp->ni_startdir);
if (error != 0) {
- if (dp != NULL)
- vrele(dp);
- goto out;
+ if (*dpp != NULL)
+ vrele(*dpp);
+ return (error);
}
MPASS((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_LATCH)) !=
NI_LCF_BENEATH_ABS);
@@ -505,8 +441,124 @@
((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
(cnp->cn_flags & BENEATH) != 0))
ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
- SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
+ SDT_PROBE3(vfs, namei, lookup, entry, *dpp, cnp->cn_pnbuf,
cnp->cn_flags);
+ *pwdp = pwd;
+ return (0);
+}
+
+/*
+ * Convert a pathname into a pointer to a locked vnode.
+ *
+ * The FOLLOW flag is set when symbolic links are to be followed
+ * when they occur at the end of the name translation process.
+ * Symbolic links are always followed for all other pathname
+ * components other than the last.
+ *
+ * The segflg defines whether the name is to be copied from user
+ * space or kernel space.
+ *
+ * Overall outline of namei:
+ *
+ * copy in name
+ * get starting directory
+ * while (!done && !error) {
+ * call lookup to search path.
+ * if symbolic link, massage name in buffer and continue
+ * }
+ */
+int
+namei(struct nameidata *ndp)
+{
+ char *cp; /* pointer into pathname argument */
+ struct vnode *dp; /* the directory we are searching */
+ struct iovec aiov; /* uio for reading symbolic links */
+ struct componentname *cnp;
+ struct thread *td;
+ struct proc *p;
+ struct pwd *pwd;
+ struct uio auio;
+ int error, linklen;
+ enum cache_fpl_status status;
+
+ cnp = &ndp->ni_cnd;
+ td = cnp->cn_thread;
+ p = td->td_proc;
+ ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
+ KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
+ KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
+ ("namei: nameiop contaminated with flags"));
+ KASSERT((cnp->cn_flags & OPMASK) == 0,
+ ("namei: flags contaminated with nameiops"));
+ MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
+ ndp->ni_startdir->v_type == VBAD);
+ TAILQ_INIT(&ndp->ni_cap_tracker);
+ ndp->ni_lcf = 0;
+ ndp->ni_loopcnt = 0;
+ dp = NULL;
+
+ /* We will set this ourselves if we need it. */
+ cnp->cn_flags &= ~TRAILINGSLASH;
+
+ ndp->ni_vp = NULL;
+
+ /*
+ * Get a buffer for the name to be translated, and copy the
+ * name into the buffer.
+ */
+ if ((cnp->cn_flags & HASBUF) == 0)
+ cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
+ if (ndp->ni_segflg == UIO_SYSSPACE)
+ error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+ &ndp->ni_pathlen);
+ else
+ error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+ &ndp->ni_pathlen);
+
+ if (error != 0) {
+ namei_cleanup_cnp(cnp);
+ return (error);
+ }
+
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+
+ /*
+ * Don't allow empty pathnames.
+ */
+ if (*cnp->cn_pnbuf == '\0') {
+ namei_cleanup_cnp(cnp);
+ return (ENOENT);
+ }
+
+ /*
+ * First try the fast path.
+ *
+ * If it fails to handle the lookup, we are going to perform it below.
+ * Note this means that we either start from scratch or continue where it
+ * left off.
+ */
+ error = cache_fplookup(ndp, &status, &pwd);
+ switch (status) {
+ case CACHE_FPL_STATUS_UNSET:
+ __assert_unreachable();
+ break;
+ case CACHE_FPL_STATUS_HANDLED:
+ return (error);
+ case CACHE_FPL_STATUS_PARTIAL:
+ dp = ndp->ni_startdir;
+ break;
+ case CACHE_FPL_STATUS_ABORTED:
+ error = namei_setup(ndp, &dp, &pwd);
+ if (error != 0) {
+ namei_cleanup_cnp(cnp);
+ return (error);
+ }
+ break;
+ }
+
+ /*
+ * Perform the lookup.
+ */
for (;;) {
ndp->ni_startdir = dp;
error = lookup(ndp);
Index: sys/kern/vfs_mount.c
===================================================================
--- sys/kern/vfs_mount.c
+++ sys/kern/vfs_mount.c
@@ -947,6 +947,7 @@
vput(vp);
return (error);
}
+ vn_seqc_write_begin(vp);
VOP_UNLOCK(vp);
/* Allocate and initialize the filesystem. */
@@ -979,9 +980,11 @@
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
+ vn_seqc_write_end(vp);
vrele(vp);
return (error);
}
+ vn_seqc_write_begin(newdp);
VOP_UNLOCK(newdp);
if (mp->mnt_opt != NULL)
@@ -1018,6 +1021,8 @@
EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td);
VOP_UNLOCK(newdp);
mountcheckdirs(vp, newdp);
+ vn_seqc_write_end(vp);
+ vn_seqc_write_end(newdp);
vrele(newdp);
if ((mp->mnt_flag & MNT_RDONLY) == 0)
vfs_allocate_syncvnode(mp);
@@ -1094,7 +1099,9 @@
VOP_UNLOCK(vp);
vfs_op_enter(mp);
+ vn_seqc_write_begin(vp);
+ rootvp = NULL;
MNT_ILOCK(mp);
if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
MNT_IUNLOCK(mp);
@@ -1108,8 +1115,6 @@
mp->mnt_kern_flag &= ~MNTK_ASYNC;
rootvp = vfs_cache_root_clear(mp);
MNT_IUNLOCK(mp);
- if (rootvp != NULL)
- vrele(rootvp);
mp->mnt_optnew = *optlist;
vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
@@ -1233,6 +1238,11 @@
vfs_deallocate_syncvnode(mp);
end:
vfs_op_exit(mp);
+ if (rootvp != NULL) {
+ vn_seqc_write_end(rootvp);
+ vrele(rootvp);
+ }
+ vn_seqc_write_end(vp);
vfs_unbusy(mp);
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
@@ -1723,14 +1733,19 @@
}
mp->mnt_kern_flag |= MNTK_UNMOUNT;
rootvp = vfs_cache_root_clear(mp);
+ if (coveredvp != NULL)
+ vn_seqc_write_begin(coveredvp);
if (flags & MNT_NONBUSY) {
MNT_IUNLOCK(mp);
error = vfs_check_usecounts(mp);
MNT_ILOCK(mp);
if (error != 0) {
+ vn_seqc_write_end(coveredvp);
dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT);
- if (rootvp != NULL)
+ if (rootvp != NULL) {
+ vn_seqc_write_end(rootvp);
vrele(rootvp);
+ }
return (error);
}
}
@@ -1759,22 +1774,19 @@
("%s: invalid return value for msleep in the drain path @ %s:%d",
__func__, __FILE__, __LINE__));
- if (rootvp != NULL)
+ /*
+ * We want to keep the vnode around so that we can vn_seqc_write_end
+ * after we are done with unmount. Downgrade our reference to a mere
+ * hold count so that we don't interfere with anything.
+ */
+ if (rootvp != NULL) {
+ vhold(rootvp);
vrele(rootvp);
+ }
if (mp->mnt_flag & MNT_EXPUBLIC)
vfs_setpublicfs(NULL, NULL, NULL);
- /*
- * From now, we can claim that the use reference on the
- * coveredvp is ours, and the ref can be released only by
- * successfull unmount by us, or left for later unmount
- * attempt. The previously acquired hold reference is no
- * longer needed to protect the vnode from reuse.
- */
- if (coveredvp != NULL)
- vdrop(coveredvp);
-
vfs_periodic(mp, MNT_WAIT);
MNT_ILOCK(mp);
async_flag = mp->mnt_flag & MNT_ASYNC;
@@ -1809,8 +1821,15 @@
}
vfs_op_exit_locked(mp);
MNT_IUNLOCK(mp);
- if (coveredvp)
+ if (coveredvp) {
+ vn_seqc_write_end(coveredvp);
VOP_UNLOCK(coveredvp);
+ vdrop(coveredvp);
+ }
+ if (rootvp != NULL) {
+ vn_seqc_write_end(rootvp);
+ vdrop(rootvp);
+ }
return (error);
}
mtx_lock(&mountlist_mtx);
@@ -1819,7 +1838,13 @@
EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td);
if (coveredvp != NULL) {
coveredvp->v_mountedhere = NULL;
+ vn_seqc_write_end(coveredvp);
VOP_UNLOCK(coveredvp);
+ vdrop(coveredvp);
+ }
+ if (rootvp != NULL) {
+ vn_seqc_write_end(rootvp);
+ vdrop(rootvp);
}
vfs_event_signal(NULL, VQ_UNMOUNT, 0);
if (rootvnode != NULL && mp == rootvnode->v_mount) {
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -664,8 +664,8 @@
vnode_list_reclaim_marker = vn_alloc_marker(NULL);
TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
- vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
- vfs_smr = uma_zone_get_smr(vnode_zone);
+ vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
+ uma_zone_set_smr(vnode_zone, vfs_smr);
vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
/*
@@ -1761,6 +1761,7 @@
*/
CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
bo = &vp->v_bufobj;
+ VNPASS(vp->v_seqc_users == 0, vp);
VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
@@ -2889,6 +2890,17 @@
return (vs);
}
+void
+vget_abort(struct vnode *vp, enum vgetstate vs)
+{
+
+ VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
+ if (vs == VGET_USECOUNT)
+ vrele(vp);
+ else
+ vdrop(vp);
+}
+
int
vget(struct vnode *vp, int flags, struct thread *td)
{
@@ -2951,10 +2963,7 @@
error = vn_lock(vp, flags);
if (__predict_false(error != 0)) {
- if (vs == VGET_USECOUNT)
- vrele(vp);
- else
- vdrop(vp);
+ vget_abort(vp, vs);
CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
vp);
return (error);
@@ -3032,6 +3041,44 @@
return;
}
+bool
+vref_smr(struct vnode *vp)
+{
+ int old;
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ VFS_SMR_ASSERT_ENTERED();
+
+ /*
+ * Devices are not supported since they may require taking the interlock.
+ */
+ VNPASS(vp->v_type != VCHR, vp);
+
+ if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
+ VNODE_REFCOUNT_FENCE_ACQ();
+ VNPASS(vp->v_holdcnt > 0, vp);
+ return (true);
+ }
+
+ if (!vhold_smr(vp))
+ return (false);
+
+ /*
+ * See the comment in vget_finish.
+ */
+ old = atomic_fetchadd_int(&vp->v_usecount, 1);
+ VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old));
+ if (old != 0) {
+#ifdef INVARIANTS
+ old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
+ VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
+#else
+ refcount_release(&vp->v_holdcnt);
+#endif
+ }
+ return (true);
+}
+
void
vref(struct vnode *vp)
{
@@ -3986,6 +4033,7 @@
*/
if (vp->v_irflag & VIRF_DOOMED)
return;
+ vn_seqc_write_begin_locked(vp);
vunlazy_gone(vp);
vp->v_irflag |= VIRF_DOOMED;
@@ -4088,6 +4136,7 @@
vp->v_vnlock = &vp->v_lock;
vp->v_op = &dead_vnodeops;
vp->v_type = VBAD;
+ vn_seqc_write_end_locked(vp);
}
/*
@@ -4128,8 +4177,9 @@
printf("%p: ", (void *)vp);
printf("type %s\n", typename[vp->v_type]);
holdcnt = atomic_load_int(&vp->v_holdcnt);
- printf(" usecount %d, writecount %d, refcount %d",
- vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS);
+ printf(" usecount %d, writecount %d, refcount %d seqc users %d",
+ vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS,
+ vp->v_seqc_users);
switch (vp->v_type) {
case VDIR:
printf(" mountedhere %p\n", vp->v_mountedhere);
@@ -4381,6 +4431,7 @@
MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
MNT_KERN_FLAG(MNTK_MARKER);
MNT_KERN_FLAG(MNTK_USES_BCACHE);
+ MNT_KERN_FLAG(MNTK_FPLOOKUP);
MNT_KERN_FLAG(MNTK_NOASYNC);
MNT_KERN_FLAG(MNTK_UNMOUNT);
MNT_KERN_FLAG(MNTK_MWAIT);
@@ -5196,6 +5247,38 @@
return (error == 0);
}
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ *
+ * We never deny as priv_check_cred calls are not yet supported, see vaccess.
+ */
+int
+vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
+{
+
+ VFS_SMR_ASSERT_ENTERED();
+
+ /* Check the owner. */
+ if (cred->cr_uid == file_uid) {
+ if (file_mode & S_IXUSR)
+ return (0);
+ return (EAGAIN);
+ }
+
+ /* Otherwise, check the groups (first match) */
+ if (groupmember(file_gid, cred)) {
+ if (file_mode & S_IXGRP)
+ return (0);
+ return (EAGAIN);
+ }
+
+ /* Otherwise, check everyone else. */
+ if (file_mode & S_IXOTH)
+ return (0);
+ return (EAGAIN);
+}
+
/*
* Common filesystem object access control check routine. Accepts a
* vnode's type, "mode", uid and gid, requested access mode, credentials,
@@ -5476,6 +5559,14 @@
ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
#endif
+ /*
+ * It may be tempting to add vn_seqc_write_begin/end calls here and
+ * in vop_rename_post but that's not going to work out since some
+ * filesystems relookup vnodes mid-rename. This is probably a bug.
+ *
+ * For now filesystems are expected to do the relevant calls after they
+ * decide what vnodes to operate on.
+ */
if (a->a_tdvp != a->a_fdvp)
vhold(a->a_fdvp);
if (a->a_tvp != a->a_fvp)
@@ -5486,6 +5577,20 @@
}
#ifdef DEBUG_VFS_LOCKS
+void
+vop_fplookup_vexec_pre(void *ap __unused)
+{
+
+ VFS_SMR_ASSERT_ENTERED();
+}
+
+void
+vop_fplookup_vexec_post(void *ap __unused, int rc __unused)
+{
+
+ VFS_SMR_ASSERT_ENTERED();
+}
+
void
vop_strategy_pre(void *ap)
{
@@ -5565,11 +5670,26 @@
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
+void
+vop_deleteextattr_pre(void *ap)
+{
+ struct vop_deleteextattr_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_begin(vp);
+}
+
void
vop_deleteextattr_post(void *ap, int rc)
{
- struct vop_deleteextattr_args *a = ap;
+ struct vop_deleteextattr_args *a;
+ struct vnode *vp;
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_end(vp);
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}
@@ -5672,22 +5792,74 @@
}
}
+void
+vop_setattr_pre(void *ap)
+{
+ struct vop_setattr_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_begin(vp);
+}
+
void
vop_setattr_post(void *ap, int rc)
{
- struct vop_setattr_args *a = ap;
+ struct vop_setattr_args *a;
+ struct vnode *vp;
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_end(vp);
if (!rc)
- VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+ VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
+}
+
+void
+vop_setacl_pre(void *ap)
+{
+ struct vop_setacl_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_begin(vp);
+}
+
+void
+vop_setacl_post(void *ap, int rc __unused)
+{
+ struct vop_setacl_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_end(vp);
+}
+
+void
+vop_setextattr_pre(void *ap)
+{
+ struct vop_setextattr_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_begin(vp);
}
void
vop_setextattr_post(void *ap, int rc)
{
- struct vop_setextattr_args *a = ap;
+ struct vop_setextattr_args *a;
+ struct vnode *vp;
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_end(vp);
if (!rc)
- VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+ VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
}
void
@@ -6249,6 +6421,8 @@
*/
MPASS(mp->mnt_vfs_ops > 0);
vp = mp->mnt_rootvnode;
+ if (vp != NULL)
+ vn_seqc_write_begin(vp);
mp->mnt_rootvnode = NULL;
return (vp);
}
@@ -6545,3 +6719,45 @@
return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread));
}
+
+void
+vn_seqc_write_begin_locked(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __func__);
+ VNPASS(vp->v_holdcnt > 0, vp);
+ VNPASS(vp->v_seqc_users >= 0, vp);
+ vp->v_seqc_users++;
+ if (vp->v_seqc_users == 1)
+ seqc_sleepable_write_begin(&vp->v_seqc);
+}
+
+void
+vn_seqc_write_begin(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ vn_seqc_write_begin_locked(vp);
+ VI_UNLOCK(vp);
+}
+
+void
+vn_seqc_write_end_locked(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __func__);
+ VNPASS(vp->v_holdcnt > 0, vp);
+ VNPASS(vp->v_seqc_users > 0, vp);
+ vp->v_seqc_users--;
+ if (vp->v_seqc_users == 0)
+ seqc_sleepable_write_end(&vp->v_seqc);
+}
+
+void
+vn_seqc_write_end(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ vn_seqc_write_end_locked(vp);
+ VI_UNLOCK(vp);
+}
Index: sys/kern/vnode_if.src
===================================================================
--- sys/kern/vnode_if.src
+++ sys/kern/vnode_if.src
@@ -142,6 +142,17 @@
};
+%% fplookup_vexec vp - - -
+%! fplookup_vexec pre vop_fplookup_vexec_pre
+%! fplookup_vexec post vop_fplookup_vexec_post
+
+vop_fplookup_vexec {
+ IN struct vnode *vp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
%% access vp L L L
vop_access {
@@ -172,6 +183,7 @@
%% setattr vp E E E
+%! setattr pre vop_setattr_pre
%! setattr post vop_setattr_post
vop_setattr {
@@ -523,6 +535,8 @@
%% setacl vp E E E
+%! setacl pre vop_setacl_pre
+%! setacl post vop_setacl_post
vop_setacl {
IN struct vnode *vp;
@@ -589,6 +603,7 @@
%% deleteextattr vp E E E
+%! deleteextattr pre vop_deleteextattr_pre
%! deleteextattr post vop_deleteextattr_post
vop_deleteextattr {
@@ -601,6 +616,7 @@
%% setextattr vp E E E
+%! setextattr pre vop_setextattr_pre
%! setextattr post vop_setextattr_post
vop_setextattr {
Index: sys/security/mac/mac_framework.h
===================================================================
--- sys/security/mac/mac_framework.h
+++ sys/security/mac/mac_framework.h
@@ -422,13 +422,14 @@
int mac_vnode_check_lookup_impl(struct ucred *cred, struct vnode *dvp,
struct componentname *cnp);
extern bool mac_vnode_check_lookup_fp_flag;
+#define mac_vnode_check_lookup_enabled() __predict_false(mac_vnode_check_lookup_fp_flag)
static inline int
mac_vnode_check_lookup(struct ucred *cred, struct vnode *dvp,
struct componentname *cnp)
{
mac_vnode_assert_locked(dvp, "mac_vnode_check_lookup");
- if (__predict_false(mac_vnode_check_lookup_fp_flag))
+ if (mac_vnode_check_lookup_enabled())
return (mac_vnode_check_lookup_impl(cred, dvp, cnp));
return (0);
}
Index: sys/sys/_seqc.h
===================================================================
--- /dev/null
+++ sys/sys/_seqc.h
@@ -0,0 +1,6 @@
+#ifndef _SYS__SEQC_H_
+#define _SYS__SEQC_H_
+
+typedef uint32_t seqc_t;
+
+#endif /* _SYS__SEQC_H_ */
Index: sys/sys/filedesc.h
===================================================================
--- sys/sys/filedesc.h
+++ sys/sys/filedesc.h
@@ -310,6 +310,7 @@
smr_serialized_store(&fdp->fd_pwd, newpwd,
(FILEDESC_XLOCK_ASSERT(fdp), true));
}
+struct pwd *pwd_get_smr(void);
#endif /* _KERNEL */
Index: sys/sys/mount.h
===================================================================
--- sys/sys/mount.h
+++ sys/sys/mount.h
@@ -420,6 +420,7 @@
#define MNTK_TEXT_REFS 0x00008000 /* Keep use ref for text */
#define MNTK_VMSETSIZE_BUG 0x00010000
#define MNTK_UNIONFS 0x00020000 /* A hack for F_ISUNIONSTACK */
+#define MNTK_FPLOOKUP 0x00040000 /* fast path lookup is supported */
#define MNTK_NOASYNC 0x00800000 /* disable async */
#define MNTK_UNMOUNT 0x01000000 /* unmount in progress */
#define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */
Index: sys/sys/namei.h
===================================================================
--- sys/sys/namei.h
+++ sys/sys/namei.h
@@ -108,6 +108,12 @@
};
#ifdef _KERNEL
+
+enum cache_fpl_status { CACHE_FPL_STATUS_ABORTED, CACHE_FPL_STATUS_PARTIAL,
+ CACHE_FPL_STATUS_HANDLED, CACHE_FPL_STATUS_UNSET };
+int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
+ struct pwd **pwdp);
+
/*
* namei operations
*/
Index: sys/sys/seqc.h
===================================================================
--- sys/sys/seqc.h
+++ sys/sys/seqc.h
@@ -36,7 +36,7 @@
/*
* seqc_t may be included in structs visible to userspace
*/
-typedef uint32_t seqc_t;
+#include <sys/_seqc.h>
#ifdef _KERNEL
@@ -111,5 +111,26 @@
return (seqc_consistent_nomb(seqcp, oldseqc));
}
+/*
+ * Variant which does not critical enter/exit.
+ */
+static __inline void
+seqc_sleepable_write_begin(seqc_t *seqcp)
+{
+
+ MPASS(!seqc_in_modify(*seqcp));
+ *seqcp += 1;
+ atomic_thread_fence_rel();
+}
+
+static __inline void
+seqc_sleepable_write_end(seqc_t *seqcp)
+{
+
+ atomic_thread_fence_rel();
+ *seqcp += 1;
+ MPASS(!seqc_in_modify(*seqcp));
+}
+
#endif /* _KERNEL */
#endif /* _SYS_SEQC_H_ */
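
The sleepable variants follow the usual seqc protocol: the writer moves the counter into the in-modify (odd) state for the duration of the change, and a lockless reader snapshots the counter, does its work, and then verifies the value is unchanged. Below is a rough sketch of how this pairs with the vn_seqc_write_* helpers added in vfs_subr.c; it is an illustration only and not part of the diff.

	/* Writer side: runs with the vnode held and brackets the modification. */
	static void
	example_attr_update(struct vnode *vp)
	{
		vn_seqc_write_begin(vp);	/* v_seqc becomes odd: change in progress */
		/* ... update the fields lockless readers may inspect ... */
		vn_seqc_write_end(vp);		/* v_seqc becomes even again */
	}

	/* Reader side: runs under vfs_smr and reports whether it raced a writer. */
	static bool
	example_lockless_check(struct vnode *vp)
	{
		seqc_t seqc;

		seqc = seqc_read_any(&vp->v_seqc);
		if (seqc_in_modify(seqc))
			return (false);		/* a writer is mid-change */
		/* ... perform the unlocked permission or namecache checks ... */
		return (seqc_consistent(&vp->v_seqc, seqc));
	}
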
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -45,6 +45,7 @@
#include <sys/uio.h>
#include <sys/acl.h>
#include <sys/ktr.h>
+#include <sys/_seqc.h>
/*
* The vnode is the focus of all file activity in UNIX. There is a
@@ -105,6 +106,7 @@
*/
enum vtype v_type:8; /* u vnode type */
short v_irflag; /* i frequently read flags */
+ seqc_t v_seqc; /* i modification count */
struct vop_vector *v_op; /* u vnode operations vector */
void *v_data; /* u private data for fs */
@@ -175,6 +177,7 @@
short v_dbatchcpu; /* i LRU requeue deferral batch */
int v_writecount; /* I ref count of writers or
(negative) text users */
+ int v_seqc_users; /* i modifications pending */
u_int v_hash;
};
@@ -539,6 +542,18 @@
#define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str))
#define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str))
+#define ASSERT_VOP_IN_SEQC(vp) do { \
+ struct vnode *_vp = (vp); \
+ \
+ VNPASS(seqc_in_modify(_vp->v_seqc), _vp); \
+} while (0)
+
+#define ASSERT_VOP_NOT_IN_SEQC(vp) do { \
+ struct vnode *_vp = (vp); \
+ \
+ VNPASS(!seqc_in_modify(_vp->v_seqc), _vp); \
+} while (0)
+
#else /* !DEBUG_VFS_LOCKS */
#define ASSERT_VI_LOCKED(vp, str) ((void)0)
@@ -546,6 +561,10 @@
#define ASSERT_VOP_ELOCKED(vp, str) ((void)0)
#define ASSERT_VOP_LOCKED(vp, str) ((void)0)
#define ASSERT_VOP_UNLOCKED(vp, str) ((void)0)
+
+#define ASSERT_VOP_IN_SEQC(vp) ((void)0)
+#define ASSERT_VOP_NOT_IN_SEQC(vp) ((void)0)
+
#endif /* DEBUG_VFS_LOCKS */
@@ -602,6 +621,7 @@
struct vattr;
struct vfsops;
struct vnode;
+struct pwd;
typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **);
@@ -619,6 +639,10 @@
void cache_purge(struct vnode *vp);
void cache_purge_negative(struct vnode *vp);
void cache_purgevfs(struct mount *mp, bool force);
+void vn_seqc_write_begin_locked(struct vnode *vp);
+void vn_seqc_write_begin(struct vnode *vp);
+void vn_seqc_write_end_locked(struct vnode *vp);
+void vn_seqc_write_end(struct vnode *vp);
int change_dir(struct vnode *vp, struct thread *td);
void cvtstat(struct stat *st, struct ostat *ost);
void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb);
@@ -644,6 +668,8 @@
int vn_commname(struct vnode *vn, char *buf, u_int buflen);
int vn_path_to_global_path(struct thread *td, struct vnode *vp,
char *path, u_int pathlen);
+int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid,
+ struct ucred *cred);
int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
gid_t file_gid, accmode_t accmode, struct ucred *cred,
int *privused);
@@ -663,6 +689,7 @@
enum vgetstate vget_prep_smr(struct vnode *vp);
enum vgetstate vget_prep(struct vnode *vp);
int vget_finish(struct vnode *vp, int flags, enum vgetstate vs);
+void vget_abort(struct vnode *vp, enum vgetstate vs);
void vgone(struct vnode *vp);
void vhold(struct vnode *);
void vholdl(struct vnode *);
@@ -805,6 +832,7 @@
/* These are called from within the actual VOPS. */
void vop_close_post(void *a, int rc);
void vop_create_post(void *a, int rc);
+void vop_deleteextattr_pre(void *a);
void vop_deleteextattr_post(void *a, int rc);
void vop_link_post(void *a, int rc);
void vop_lookup_post(void *a, int rc);
@@ -819,12 +847,18 @@
void vop_rename_post(void *a, int rc);
void vop_rename_pre(void *a);
void vop_rmdir_post(void *a, int rc);
+void vop_setattr_pre(void *a);
void vop_setattr_post(void *a, int rc);
+void vop_setacl_pre(void *a);
+void vop_setacl_post(void *a, int rc);
+void vop_setextattr_pre(void *a);
void vop_setextattr_post(void *a, int rc);
void vop_symlink_post(void *a, int rc);
int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a);
#ifdef DEBUG_VFS_LOCKS
+void vop_fplookup_vexec_pre(void *a);
+void vop_fplookup_vexec_post(void *a, int rc);
void vop_strategy_pre(void *a);
void vop_lock_pre(void *a);
void vop_lock_post(void *a, int rc);
@@ -832,6 +866,8 @@
void vop_need_inactive_pre(void *a);
void vop_need_inactive_post(void *a, int rc);
#else
+#define vop_fplookup_vexec_pre(x) do { } while (0)
+#define vop_fplookup_vexec_post(x, y) do { } while (0)
#define vop_strategy_pre(x) do { } while (0)
#define vop_lock_pre(x) do { } while (0)
#define vop_lock_post(x, y) do { } while (0)
@@ -901,6 +937,7 @@
void vput(struct vnode *vp);
void vrele(struct vnode *vp);
void vref(struct vnode *vp);
+bool vref_smr(struct vnode *vp);
void vrefl(struct vnode *vp);
void vrefact(struct vnode *vp);
void vrefactn(struct vnode *vp, u_int n);
