Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F107485162
D23915.id69198.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
40 KB
Referenced Files
None
Subscribers
None
D23915.id69198.diff
View Options
Index: sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
===================================================================
--- sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
+++ sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
@@ -154,6 +154,7 @@
vput(vp);
return (error);
}
+ vfs_seqc_write_begin(vp);
VOP_UNLOCK(vp);
/*
@@ -206,6 +207,7 @@
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
+ vfs_seqc_write_end(vp);
vput(vp);
vfs_unbusy(mp);
vfs_freeopts(mp->mnt_optnew);
@@ -241,6 +243,7 @@
vfs_event_signal(NULL, VQ_MOUNT, 0);
if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
panic("mount: lost mount");
+ vfs_seqc_write_end(vp);
VOP_UNLOCK(vp);
vfs_op_exit(mp);
vfs_unbusy(mp);
Index: sys/kern/kern_descrip.c
===================================================================
--- sys/kern/kern_descrip.c
+++ sys/kern/kern_descrip.c
@@ -102,8 +102,8 @@
static __read_mostly uma_zone_t file_zone;
static __read_mostly uma_zone_t filedesc0_zone;
-static __read_mostly uma_zone_t pwd_zone;
-static __read_mostly smr_t pwd_smr;
+__read_mostly uma_zone_t pwd_zone;
+extern smr_t vfs_smr;
static int closefp(struct filedesc *fdp, int fd, struct file *fp,
struct thread *td, int holdleaders);
@@ -3297,14 +3297,24 @@
fdp = td->td_proc->p_fd;
- smr_enter(pwd_smr);
+ smr_enter(vfs_smr);
for (;;) {
- pwd = smr_entered_load(&fdp->fd_pwd, pwd_smr);
+ pwd = smr_entered_load(&fdp->fd_pwd, vfs_smr);
MPASS(pwd != NULL);
if (refcount_acquire_if_not_zero(&pwd->pwd_refcount))
break;
}
- smr_exit(pwd_smr);
+ smr_exit(vfs_smr);
+ return (pwd);
+}
+
+struct pwd *
+pwd_get_smr(void)
+{
+ struct pwd *pwd;
+
+ pwd = smr_entered_load(&curproc->p_fd->fd_pwd, vfs_smr);
+ MPASS(pwd != NULL);
return (pwd);
}
@@ -4293,7 +4303,7 @@
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
- pwd_smr = uma_zone_get_smr(pwd_zone);
+ vfs_smr = uma_zone_get_smr(pwd_zone);
mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
}
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
Index: sys/kern/vfs_cache.c
===================================================================
--- sys/kern/vfs_cache.c
+++ sys/kern/vfs_cache.c
@@ -55,6 +55,7 @@
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
+#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
@@ -67,6 +68,11 @@
#include <sys/ktrace.h>
#endif
+#include <sys/capsicum.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
#ifdef DDB
#include <ddb/ddb.h>
#endif
@@ -100,6 +106,8 @@
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
"char *");
+SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
+
/*
* This structure describes the elements in the cache of recent
* names looked up by namei.
@@ -2817,3 +2825,736 @@
}
#endif
+
+extern uma_zone_t namei_zone;
+
+static bool __read_frequently cache_fast_lookup = true;
+SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
+ &cache_fast_lookup, 0, "");
+
+#define CACHE_FPL_UNHANDLED -2020
+
+static void
+cache_fpl_cleanup_cnp(struct componentname *cnp)
+{
+
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+}
+
+static void
+cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
+{
+ struct componentname *cnp;
+
+ cnp = &ndp->ni_cnd;
+ while (*(cnp->cn_nameptr) == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+}
+
+static void
+cache_fpl_handle_root_initial(struct nameidata *ndp, struct vnode **dpp)
+{
+
+ cache_fpl_handle_root(ndp, dpp);
+ *dpp = ndp->ni_rootdir;
+}
+
+enum cache_fpl_status { CACHE_FPL_STATUS_UNSET, CACHE_FPL_STATUS_HANDLED,
+ CACHE_FPL_STATUS_UNHANDLED };
+
+struct cache_fpl {
+ int line;
+ enum cache_fpl_status handled;
+ bool in_smr;
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ struct vnode *dvp;
+ seqc_t dvp_seqc;
+ struct vnode *tvp;
+ seqc_t tvp_seqc;
+};
+
+static void
+cache_fpl_smr_assert_not_entered(struct cache_fpl *fpl)
+{
+
+ SMR_ASSERT_NOT_ENTERED(vfs_smr);
+ MPASS(fpl->in_smr == false);
+}
+
+static void
+cache_fpl_smr_enter(struct cache_fpl *fpl)
+{
+
+ MPASS(fpl->in_smr == false);
+ smr_enter(vfs_smr);
+ fpl->in_smr = true;
+}
+
+static void
+cache_fpl_smr_exit(struct cache_fpl *fpl)
+{
+
+ MPASS(fpl->in_smr == true);
+ fpl->in_smr = false;
+ smr_exit(vfs_smr);
+}
+
+static int
+cache_fpl_unhandled_impl(struct cache_fpl *fpl, int line)
+{
+
+ KASSERT(fpl->handled == CACHE_FPL_STATUS_UNSET,
+ ("%s: lookup status already set at %d\n", __func__, fpl->line));
+ fpl->handled = CACHE_FPL_STATUS_UNHANDLED;
+ fpl->line = line;
+ return (CACHE_FPL_UNHANDLED);
+}
+
+#define cache_fpl_unhandled(x) cache_fpl_unhandled_impl((x), __LINE__)
+
+static int
+cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
+{
+
+ KASSERT(fpl->handled == CACHE_FPL_STATUS_UNSET,
+ ("%s: lookup status already set at %d\n", __func__, fpl->line));
+ fpl->handled = CACHE_FPL_STATUS_HANDLED;
+ fpl->line = line;
+ return (error);
+}
+
+#define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
+
+#define CACHE_FPL_SUPPORTED_CN_FLAGS \
+ (LOCKLEAF | FOLLOW | LOCKSHARED | SAVENAME | ISOPEN | AUDITVNODE1)
+
+static bool
+cache_can_fplookup(struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ struct thread *td;
+
+ ndp = fpl->ndp;
+ cnp = fpl->cnp;
+ td = cnp->cn_thread;
+
+ if (!cache_fast_lookup) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if (mac_vnode_check_lookup_enabled()) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if (cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if ((cnp->cn_flags & LOCKLEAF) == 0) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if (cnp->cn_nameiop != LOOKUP) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if (ndp->ni_dirfd != AT_FDCWD) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if (IN_CAPABILITY_MODE(td)) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if (AUDITING_TD(td)) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ return (true);
+}
+
+/*
+ * TODO
+ *
+ * Save and restore nameidata in case we need to fall back to the regular lookup.
+ *
+ * Everything which is modified (some direct fields and cnp) is covered, but the
+ * copy below is overzealous for simplicity. It can be modified to only save few
+ * fields.
+ */
+static void
+cache_save_nameidata(struct nameidata *ndp, struct nameidata *saved)
+{
+
+ *saved = *ndp;
+}
+
+static void
+cache_restore_nameidata(struct nameidata *ndp, struct nameidata *saved)
+{
+
+ *ndp = *saved;
+}
+
+static bool
+cache_fplookup_vnode_supported(struct vnode *vp)
+{
+
+ switch (vp->v_type) {
+ case VLNK:
+ return (false);
+ default:
+ break;
+ }
+ return (true);
+}
+
+static int
+cache_fplookup_final(struct cache_fpl *fpl)
+{
+ struct componentname *cnp;
+ enum vgetstate tvs;
+ struct vnode *dvp, *tvp;
+ seqc_t dvp_seqc, tvp_seqc;
+ int error;
+
+ cnp = fpl->cnp;
+ dvp = fpl->dvp;
+ dvp_seqc = fpl->dvp_seqc;
+ tvp = fpl->tvp;
+ tvp_seqc = fpl->tvp_seqc;
+
+ VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
+
+ tvs = vget_prep_smr(tvp);
+ cache_fpl_smr_exit(fpl);
+ if (tvs == VGET_NONE) {
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ if (!seqc_consistent(&dvp->v_seqc, dvp_seqc)) {
+ vget_abort(tvp, tvs);
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ MPASS((cnp->cn_flags & LOCKLEAF) != 0);
+
+ error = vget_finish(tvp, cnp->cn_lkflags, tvs);
+ if (error != 0) {
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ if (!seqc_consistent(&tvp->v_seqc, tvp_seqc)) {
+ vput(tvp);
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ return (cache_fpl_handled(fpl, 0));
+}
+
+static int
+cache_fplookup_next(struct cache_fpl *fpl)
+{
+ struct componentname *cnp;
+ struct namecache *ncp;
+ struct vnode *dvp, *tvp;
+ u_char nc_flag;
+ uint32_t hash;
+
+ cnp = fpl->cnp;
+ dvp = fpl->dvp;
+
+ if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
+ fpl->tvp = dvp;
+ fpl->tvp_seqc = seqc_read_any(&dvp->v_seqc);
+ if (seqc_in_modify(fpl->tvp_seqc)) {
+ return (cache_fpl_unhandled(fpl));
+ }
+ return (0);
+ }
+
+ hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+
+ CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+ counter_u64_add(numchecks, 1);
+ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
+ !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
+ break;
+ }
+
+ fpl->tvp = (void *)0xdeadbeef;
+
+ /*
+ * If there is no entry we have to punt to the slow path to perform
+ * actual lookup. Should there be nothing with this name a negative
+ * entry will be created.
+ */
+ if (__predict_false(ncp == NULL)) {
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ tvp = atomic_load_ptr(&ncp->nc_vp);
+ nc_flag = atomic_load_char(&ncp->nc_flag);
+ if (__predict_false(cache_ncp_invalid(ncp))) {
+ return (cache_fpl_unhandled(fpl));
+ }
+ if (__predict_false(nc_flag & NCF_WHITE)) {
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ fpl->tvp = tvp;
+ if (nc_flag & NCF_NEGATIVE) {
+ if ((nc_flag & NCF_HOTNEGATIVE) == 0) {
+ /*
+ * TODO
+ * Promoting to hot negative requires locks which are
+ * not yet supported for simplicity.
+ */
+ return (cache_fpl_unhandled(fpl));
+ }
+ SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
+ ncp->nc_name);
+ counter_u64_add(numneghits, 1);
+ cache_fpl_smr_exit(fpl);
+ return (cache_fpl_handled(fpl, ENOENT));
+ }
+
+ fpl->tvp_seqc = seqc_read_any(&tvp->v_seqc);
+ if (seqc_in_modify(fpl->tvp_seqc)) {
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ if (!cache_fplookup_vnode_supported(tvp)) {
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ counter_u64_add(numposhits, 1);
+ SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
+ return (0);
+}
+
+static bool
+cache_fplookup_mp_supported(struct mount *mp)
+{
+
+ if (mp == NULL)
+ return (false);
+ if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
+ return (false);
+ if (mp->mnt_flag & MNT_UNION)
+ return (false);
+ return (true);
+}
+
+/*
+ * Walk up the mount stack (if any).
+ *
+ * Correctness is provided in the following ways:
+ * - all vnodes are protected from freeing with SMR
+ * - struct mount objects are type stable making them always safe to access
+ * - stability of the particular mount is provided by busying it
+ * - relationship between the vnode which is mounted on and the mount is
+ * verified with the vnode sequence counter after busying
+ * - association between root vnode of the mount and the mount is protected
+ * by busy
+ *
+ * From that point on we can read the sequence counter of the root vnode
+ * and get the next mount on the stack (if any) using the same protection.
+ *
+ * By the end of successful walk we are guaranteed the reached state was
+ * indeed present at least at some point which matches the regular lookup.
+ */
+static int
+cache_fplookup_climb_mount(struct cache_fpl *fpl)
+{
+ struct mount *mp, *prev_mp;
+ struct vnode *vp;
+ seqc_t vp_seqc;
+
+ vp = fpl->tvp;
+ vp_seqc = fpl->tvp_seqc;
+ if (vp->v_type != VDIR)
+ return (0);
+
+ mp = atomic_load_ptr(&vp->v_mountedhere);
+ if (mp == NULL)
+ return (0);
+
+ prev_mp = NULL;
+ for (;;) {
+ if (!vfs_op_thread_enter(mp)) {
+ if (prev_mp != NULL)
+ vfs_op_thread_exit(prev_mp);
+ return (cache_fpl_unhandled(fpl));
+ }
+ if (prev_mp != NULL)
+ vfs_op_thread_exit(prev_mp);
+ if (!seqc_consistent(&vp->v_seqc, vp_seqc)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_unhandled(fpl));
+ }
+ if (!cache_fplookup_mp_supported(mp)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_unhandled(fpl));
+ }
+ vp = atomic_load_ptr(&mp->mnt_rootvnode);
+ if (vp == NULL || VN_IS_DOOMED(vp)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_unhandled(fpl));
+ }
+ vp_seqc = seqc_read_any(&vp->v_seqc);
+ if (seqc_in_modify(vp_seqc)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_unhandled(fpl));
+ }
+ prev_mp = mp;
+ mp = atomic_load_ptr(&vp->v_mountedhere);
+ if (mp == NULL)
+ break;
+ }
+
+ vfs_op_thread_exit(prev_mp);
+ fpl->tvp = vp;
+ fpl->tvp_seqc = vp_seqc;
+ return (0);
+}
+
+static int
+cache_fplookup_parse(struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ char *cp;
+ char *prev_ni_next; /* saved ndp->ni_next */
+ size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */
+
+ ndp = fpl->ndp;
+ cnp = fpl->cnp;
+
+ /*
+ * Search a new directory.
+ *
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+ continue;
+ cnp->cn_namelen = cp - cnp->cn_nameptr;
+ if (cnp->cn_namelen > NAME_MAX) {
+ return (cache_fpl_handled(fpl, ENAMETOOLONG));
+ }
+ prev_ni_pathlen = ndp->ni_pathlen;
+ ndp->ni_pathlen -= cnp->cn_namelen;
+ KASSERT(ndp->ni_pathlen <= PATH_MAX,
+ ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
+ prev_ni_next = ndp->ni_next;
+ ndp->ni_next = cp;
+
+ /*
+ * Replace multiple slashes by a single slash and trailing slashes
+ * by a null. This must be done before VOP_LOOKUP() because some
+ * fs's don't know about trailing slashes. Remember if there were
+ * trailing slashes to handle symlinks, existing non-directories
+ * and non-existing files that won't be directories specially later.
+ */
+ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
+ cp++;
+ ndp->ni_pathlen--;
+ if (*cp == '\0') {
+ /*
+ * TODO
+ * Regular lookup performs the following:
+ * *ndp->ni_next = '\0';
+ * cnp->cn_flags |= TRAILINGSLASH;
+ *
+ * Which is problematic since it modifies data read
+ * from userspace. Then if fast path lookup was to
+ * abort we would have to either restore it or convey
+ * the flag. Since this is a corner case just ignore
+ * it for simplicity.
+ */
+ return (cache_fpl_unhandled(fpl));
+ }
+ }
+ ndp->ni_next = cp;
+
+ cnp->cn_flags |= MAKEENTRY;
+
+ if (cnp->cn_namelen == 2 &&
+ cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
+ cnp->cn_flags |= ISDOTDOT;
+ else
+ cnp->cn_flags &= ~ISDOTDOT;
+ if (*ndp->ni_next == 0)
+ cnp->cn_flags |= ISLASTCN;
+ else
+ cnp->cn_flags &= ~ISLASTCN;
+
+ /*
+ * Check for degenerate name (e.g. / or "")
+ * which is a way of talking about a directory,
+ * e.g. like "/." or ".".
+ *
+ * TODO
+ * Another corner case handled by the regular lookup
+ */
+ if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
+ return (cache_fpl_unhandled(fpl));
+ }
+ return (0);
+}
+
+static int
+cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ struct mount *mp;
+ int error;
+
+ error = CACHE_FPL_UNHANDLED;
+ ndp = fpl->ndp;
+ ndp->ni_lcf = 0;
+ cnp = fpl->cnp;
+ cnp->cn_lkflags = LK_SHARED;
+ if ((cnp->cn_flags & LOCKSHARED) == 0)
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+
+ fpl->dvp = dvp;
+ fpl->dvp_seqc = seqc_read_any(&fpl->dvp->v_seqc);
+ if (seqc_in_modify(fpl->dvp_seqc)) {
+ cache_fpl_unhandled(fpl);
+ goto out;
+ }
+ mp = atomic_load_ptr(&fpl->dvp->v_mount);
+ if (!cache_fplookup_mp_supported(mp)) {
+ cache_fpl_unhandled(fpl);
+ goto out;
+ }
+
+ VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
+
+ for (;;) {
+ error = cache_fplookup_parse(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ if (cnp->cn_flags & ISDOTDOT) {
+ error = cache_fpl_unhandled(fpl);
+ break;
+ }
+
+ VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
+
+ error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
+ if (__predict_false(error != 0)) {
+ switch (error) {
+ case EAGAIN:
+ case EOPNOTSUPP: /* can happen when racing against vgone */
+ cache_fpl_unhandled(fpl);
+ break;
+ default:
+ /*
+ * TODO
+ */
+ panic("%s: untested handling of error code %d", __func__, error);
+ cache_fpl_handled(fpl, error);
+ break;
+ }
+ break;
+ }
+
+ error = cache_fplookup_next(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
+
+ error = cache_fplookup_climb_mount(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
+
+ if (cnp->cn_flags & ISLASTCN) {
+ error = cache_fplookup_final(fpl);
+ break;
+ }
+
+ if (!seqc_consistent(&fpl->dvp->v_seqc, fpl->dvp_seqc)) {
+ error = cache_fpl_unhandled(fpl);
+ break;
+ }
+
+ fpl->dvp = fpl->tvp;
+ fpl->dvp_seqc = fpl->tvp_seqc;
+
+ if (!cache_fplookup_vnode_supported(fpl->dvp)) {
+ error = cache_fpl_unhandled(fpl);
+ break;
+ }
+
+ cnp->cn_nameptr = ndp->ni_next;
+ while (*cnp->cn_nameptr == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+ }
+out:
+ MPASS(fpl->handled != CACHE_FPL_STATUS_UNSET);
+ if (fpl->handled == CACHE_FPL_STATUS_UNHANDLED) {
+ if (fpl->in_smr)
+ cache_fpl_smr_exit(fpl);
+ return (error);
+ }
+
+ cache_fpl_smr_assert_not_entered(fpl);
+ if (__predict_false(error != 0)) {
+ ndp->ni_dvp = NULL;
+ ndp->ni_vp = NULL;
+ cache_fpl_cleanup_cnp(cnp);
+ return (error);
+ }
+ ndp->ni_dvp = fpl->dvp;
+ ndp->ni_vp = fpl->tvp;
+ if (cnp->cn_flags & SAVENAME)
+ cnp->cn_flags |= HASBUF;
+ else
+ cache_fpl_cleanup_cnp(cnp);
+ return (error);
+}
+
+/*
+ * Fast path lookup protected with SMR and sequence counters.
+ *
+ * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
+ *
+ * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
+ * outlined below.
+ *
+ * Traditional vnode lookup conceptually looks like this:
+ *
+ * vn_lock(current);
+ * for (;;) {
+ * next = find();
+ * vn_lock(next);
+ * vn_unlock(current);
+ * current = next;
+ * if (last)
+ * break;
+ * }
+ *
+ * Each jump to the next vnode is safe memory-wise and atomic with respect to
+ * any modifications thanks to holding respective locks.
+ *
+ * The same guarantee can be provided with a combination of safe memory
+ * reclamation and sequence counters instead. If all places which affect the
+ * relationship or permissions also the counter, the following provides the
+ * same guarantee.
+ *
+ * smr_enter(vfs_smr);
+ * current_seqc = seqc_read_any(current);
+ * if (seqc_in_modify(current_seqc))
+ * abort();
+ * for (;;) {
+ * next = find();
+ * next_seqc = seqc_read_any(next);
+ * if (!seqc_consistent(current, current_seqc)
+ * abort();
+ * current = next; // if anything changed the above would fail
+ * current_seqc = next_seqc;
+ * if (last)
+ * break;
+ * }
+ *
+ * API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
+ * - they are called while within vfs_smr protection which they must never exit
+ * - EAGAIN can be returned to denote checking could not be performed, it is
+ * always valid to return it
+ * - if the sequence counter has not changed the result must be valid
+ * - if the sequence counter has changed both false positives and false negatives
+ * are permitted (since the result will be rejected later)
+ * - for simple cases of unix permission checks vaccess_vexec_smr can be used
+ *
+ * Caveats to watch out for:
+ * - vnodes are passed unlocked and unreferenced with nothing stopping
+ * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
+ * to use atomic_load_ptr to fetch it.
+ * - aforementioned object can also get freed, meaning absent other means it
+ * should be protected with vfs_smr
+ * - either safely checking permissions as they are modified or guaranteeing
+ * their stability is left to the routine
+ */
+int
+cache_fplookup(struct nameidata *ndp, bool *handled)
+{
+ struct nameidata saved_ndp;
+ struct cache_fpl fpl;
+ struct pwd *pwd;
+ struct vnode *dvp, *startdir;
+ struct componentname *cnp;
+ int error;
+
+ *handled = false;
+ bzero(&fpl, sizeof(fpl));
+ fpl.handled = CACHE_FPL_STATUS_UNSET;
+ fpl.ndp = ndp;
+ fpl.cnp = &ndp->ni_cnd;
+ MPASS(curthread == fpl.cnp->cn_thread);
+
+ if (!cache_can_fplookup(&fpl)) {
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.handled);
+ return (EOPNOTSUPP);
+ }
+
+ cache_save_nameidata(ndp, &saved_ndp);
+
+ cache_fpl_smr_enter(&fpl);
+ pwd = pwd_get_smr();
+ ndp->ni_rootdir = pwd->pwd_rdir;
+ ndp->ni_topdir = pwd->pwd_jdir;
+
+ cnp = fpl.cnp;
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+ startdir = ndp->ni_startdir;
+ if (cnp->cn_pnbuf[0] == '/') {
+ cache_fpl_handle_root_initial(ndp, &dvp);
+ } else {
+ if (ndp->ni_startdir != NULL) {
+ dvp = ndp->ni_startdir;
+ } else {
+ MPASS(ndp->ni_dirfd == AT_FDCWD);
+ dvp = pwd->pwd_cdir;
+ }
+ }
+
+ error = cache_fplookup_impl(dvp, &fpl);
+ cache_fpl_smr_assert_not_entered(&fpl);
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.handled);
+
+ MPASS(fpl.handled != CACHE_FPL_STATUS_UNSET);
+ if (fpl.handled == CACHE_FPL_STATUS_HANDLED) {
+ if (error == CACHE_FPL_UNHANDLED)
+ panic("cache: bad error code from line %d\n", fpl.line);
+ *handled = true;
+ if (startdir != NULL)
+ vrele(startdir);
+ return (error);
+ } else {
+ cache_restore_nameidata(ndp, &saved_ndp);
+ return (error);
+ }
+}
Index: sys/kern/vfs_lookup.c
===================================================================
--- sys/kern/vfs_lookup.c
+++ sys/kern/vfs_lookup.c
@@ -315,6 +315,7 @@
struct filecaps dirfd_caps;
struct uio auio;
int error, linklen, startdir_used;
+ bool handled;
cnp = &ndp->ni_cnd;
td = cnp->cn_thread;
@@ -329,10 +330,15 @@
ndp->ni_startdir->v_type == VBAD);
TAILQ_INIT(&ndp->ni_cap_tracker);
ndp->ni_lcf = 0;
+ ndp->ni_loopcnt = 0;
+ startdir_used = 0;
+ dp = NULL;
/* We will set this ourselves if we need it. */
cnp->cn_flags &= ~TRAILINGSLASH;
+ ndp->ni_vp = NULL;
+
/*
* Get a buffer for the name to be translated, and copy the
* name into the buffer.
@@ -346,12 +352,29 @@
error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
&ndp->ni_pathlen);
+ if (error != 0) {
+ namei_cleanup_cnp(cnp);
+ return (error);
+ }
+
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+
/*
* Don't allow empty pathnames.
*/
- if (error == 0 && *cnp->cn_pnbuf == '\0')
- error = ENOENT;
+ if (*cnp->cn_pnbuf == '\0') {
+ namei_cleanup_cnp(cnp);
+ return (ENOENT);
+ }
+ error = cache_fplookup(ndp, &handled);
+ if (handled)
+ return (error);
+
+ /*
+ * Ignore fast path.
+ */
+ error = 0;
#ifdef CAPABILITY_MODE
/*
* In capability mode, lookups must be restricted to happen in
@@ -380,10 +403,8 @@
#endif
if (error != 0) {
namei_cleanup_cnp(cnp);
- ndp->ni_vp = NULL;
return (error);
}
- ndp->ni_loopcnt = 0;
#ifdef KTRACE
if (KTRPOINT(td, KTR_NAMEI)) {
KASSERT(cnp->cn_thread == curthread,
@@ -402,9 +423,6 @@
ndp->ni_rootdir = pwd->pwd_rdir;
ndp->ni_topdir = pwd->pwd_jdir;
- startdir_used = 0;
- dp = NULL;
- cnp->cn_nameptr = cnp->cn_pnbuf;
if (cnp->cn_pnbuf[0] == '/') {
ndp->ni_resflags |= NIRES_ABS;
error = namei_handle_root(ndp, &dp);
Index: sys/kern/vfs_mount.c
===================================================================
--- sys/kern/vfs_mount.c
+++ sys/kern/vfs_mount.c
@@ -949,6 +949,8 @@
}
VOP_UNLOCK(vp);
+ vfs_seqc_write_begin(vp);
+
/* Allocate and initialize the filesystem. */
mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
/* XXXMAC: pass to vfs_mount_alloc? */
@@ -976,9 +978,11 @@
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
+ vfs_seqc_write_end(vp);
vrele(vp);
return (error);
}
+ vfs_seqc_write_begin(newdp);
VOP_UNLOCK(newdp);
if (mp->mnt_opt != NULL)
@@ -1015,6 +1019,8 @@
EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td);
VOP_UNLOCK(newdp);
mountcheckdirs(vp, newdp);
+ vfs_seqc_write_end(vp);
+ vfs_seqc_write_end(newdp);
vrele(newdp);
if ((mp->mnt_flag & MNT_RDONLY) == 0)
vfs_allocate_syncvnode(mp);
@@ -1089,7 +1095,9 @@
VOP_UNLOCK(vp);
vfs_op_enter(mp);
+ vfs_seqc_write_begin(vp);
+ rootvp = NULL;
MNT_ILOCK(mp);
if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
MNT_IUNLOCK(mp);
@@ -1103,8 +1111,6 @@
mp->mnt_kern_flag &= ~MNTK_ASYNC;
rootvp = vfs_cache_root_clear(mp);
MNT_IUNLOCK(mp);
- if (rootvp != NULL)
- vrele(rootvp);
mp->mnt_optnew = *optlist;
vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
@@ -1175,6 +1181,11 @@
vfs_deallocate_syncvnode(mp);
end:
vfs_op_exit(mp);
+ if (rootvp != NULL) {
+ vfs_seqc_write_end(rootvp);
+ vrele(rootvp);
+ }
+ vfs_seqc_write_end(vp);
vfs_unbusy(mp);
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
@@ -1665,14 +1676,19 @@
}
mp->mnt_kern_flag |= MNTK_UNMOUNT;
rootvp = vfs_cache_root_clear(mp);
+ if (coveredvp != NULL)
+ vfs_seqc_write_begin(coveredvp);
if (flags & MNT_NONBUSY) {
MNT_IUNLOCK(mp);
error = vfs_check_usecounts(mp);
MNT_ILOCK(mp);
if (error != 0) {
+ vfs_seqc_write_end(coveredvp);
dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT);
- if (rootvp != NULL)
+ if (rootvp != NULL) {
+ vfs_seqc_write_end(rootvp);
vrele(rootvp);
+ }
return (error);
}
}
@@ -1701,22 +1717,14 @@
("%s: invalid return value for msleep in the drain path @ %s:%d",
__func__, __FILE__, __LINE__));
- if (rootvp != NULL)
+ if (rootvp != NULL) {
+ vhold(rootvp);
vrele(rootvp);
+ }
if (mp->mnt_flag & MNT_EXPUBLIC)
vfs_setpublicfs(NULL, NULL, NULL);
- /*
- * From now, we can claim that the use reference on the
- * coveredvp is ours, and the ref can be released only by
- * successfull unmount by us, or left for later unmount
- * attempt. The previously acquired hold reference is no
- * longer needed to protect the vnode from reuse.
- */
- if (coveredvp != NULL)
- vdrop(coveredvp);
-
vfs_periodic(mp, MNT_WAIT);
MNT_ILOCK(mp);
async_flag = mp->mnt_flag & MNT_ASYNC;
@@ -1751,8 +1759,15 @@
}
vfs_op_exit_locked(mp);
MNT_IUNLOCK(mp);
- if (coveredvp)
+ if (coveredvp) {
+ vfs_seqc_write_end(coveredvp);
VOP_UNLOCK(coveredvp);
+ vdrop(coveredvp);
+ }
+ if (rootvp != NULL) {
+ vfs_seqc_write_end(rootvp);
+ vdrop(rootvp);
+ }
return (error);
}
mtx_lock(&mountlist_mtx);
@@ -1761,7 +1776,13 @@
EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td);
if (coveredvp != NULL) {
coveredvp->v_mountedhere = NULL;
+ vfs_seqc_write_end(coveredvp);
VOP_UNLOCK(coveredvp);
+ vdrop(coveredvp);
+ }
+ if (rootvp != NULL) {
+ vfs_seqc_write_end(rootvp);
+ vdrop(rootvp);
}
vfs_event_signal(NULL, VQ_UNMOUNT, 0);
if (rootvnode != NULL && mp == rootvnode->v_mount) {
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -664,8 +664,8 @@
vnode_list_reclaim_marker = vn_alloc_marker(NULL);
TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
- vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
- vfs_smr = uma_zone_get_smr(vnode_zone);
+ vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
+ uma_zone_set_smr(vnode_zone, vfs_smr);
vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
/*
@@ -1766,6 +1766,7 @@
*/
CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
bo = &vp->v_bufobj;
+ VNPASS(vp->v_seqc_users == 0, vp);
VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
VNASSERT(vp->v_holdcnt == VHOLD_NO_SMR, vp, ("Invalid hold count"));
VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
@@ -2892,6 +2893,17 @@
return (vs);
}
+void
+vget_abort(struct vnode *vp, enum vgetstate vs)
+{
+
+ VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
+ if (vs == VGET_USECOUNT)
+ vrele(vp);
+ else
+ vdrop(vp);
+}
+
int
vget(struct vnode *vp, int flags, struct thread *td)
{
@@ -2954,10 +2966,7 @@
error = vn_lock(vp, flags);
if (__predict_false(error != 0)) {
- if (vs == VGET_USECOUNT)
- vrele(vp);
- else
- vdrop(vp);
+ vget_abort(vp, vs);
CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
vp);
return (error);
@@ -3982,6 +3991,7 @@
*/
if (vp->v_irflag & VIRF_DOOMED)
return;
+ vfs_seqc_write_begin_locked(vp);
vunlazy_gone(vp);
vp->v_irflag |= VIRF_DOOMED;
@@ -4084,6 +4094,7 @@
vp->v_vnlock = &vp->v_lock;
vp->v_op = &dead_vnodeops;
vp->v_type = VBAD;
+ vfs_seqc_write_end_locked(vp);
}
/*
@@ -4124,6 +4135,7 @@
printf(" usecount %d, writecount %d, refcount %d (flags %s)",
vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_NO_SMR,
holdcnt & VHOLD_NO_SMR ? "VHOLD_NO_SMR" : "none");
+ printf(", seqc users %d", vp->v_seqc_users);
switch (vp->v_type) {
case VDIR:
printf(" mountedhere %p\n", vp->v_mountedhere);
@@ -4369,6 +4381,7 @@
MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
MNT_KERN_FLAG(MNTK_MARKER);
MNT_KERN_FLAG(MNTK_USES_BCACHE);
+ MNT_KERN_FLAG(MNTK_FPLOOKUP);
MNT_KERN_FLAG(MNTK_NOASYNC);
MNT_KERN_FLAG(MNTK_UNMOUNT);
MNT_KERN_FLAG(MNTK_MWAIT);
@@ -5184,6 +5197,38 @@
return (error == 0);
}
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ *
+ * We never deny as priv_check_cred calls are not yet supported, see vaccess.
+ */
+int
+vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
+{
+
+ SMR_ASSERT_ENTERED(vfs_smr);
+
+ /* Check the owner. */
+ if (cred->cr_uid == file_uid) {
+ if (file_mode & S_IXUSR)
+ return (0);
+ return (EAGAIN);
+ }
+
+ /* Otherwise, check the groups (first match) */
+ if (groupmember(file_gid, cred)) {
+ if (file_mode & S_IXGRP)
+ return (0);
+ return (EAGAIN);
+ }
+
+ /* Otherwise, check everyone else. */
+ if (file_mode & S_IXOTH)
+ return (0);
+ return (EAGAIN);
+}
+
/*
* Common filesystem object access control check routine. Accepts a
* vnode's type, "mode", uid and gid, requested access mode, credentials,
@@ -5464,6 +5509,14 @@
ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
#endif
+ /*
+ * It may be tempting to add vfs_seqc_write_begin/end calls here and
+ * in vop_rename_post but that's not going to work out since some
+ * filesystems relookup vnodes mid-rename. This is probably a bug.
+ *
+ * For now filesystems are expected to do the relevant calls after they
+ * decide what vnodes to operate on.
+ */
if (a->a_tdvp != a->a_fdvp)
vhold(a->a_fdvp);
if (a->a_tvp != a->a_fvp)
@@ -5553,11 +5606,26 @@
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
+void
+vop_deleteextattr_pre(void *ap)
+{
+ struct vop_deleteextattr_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_begin(vp);
+}
+
void
vop_deleteextattr_post(void *ap, int rc)
{
- struct vop_deleteextattr_args *a = ap;
+ struct vop_deleteextattr_args *a;
+ struct vnode *vp;
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_end(vp);
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}
@@ -5660,22 +5728,74 @@
}
}
+void
+vop_setattr_pre(void *ap)
+{
+ struct vop_setattr_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_begin(vp);
+}
+
void
vop_setattr_post(void *ap, int rc)
{
- struct vop_setattr_args *a = ap;
+ struct vop_setattr_args *a;
+ struct vnode *vp;
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_end(vp);
if (!rc)
- VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+ VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
+}
+
+void
+vop_setacl_pre(void *ap)
+{
+ struct vop_setacl_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_begin(vp);
+}
+
+void
+vop_setacl_post(void *ap, int rc __unused)
+{
+ struct vop_setacl_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_end(vp);
+}
+
+void
+vop_setextattr_pre(void *ap)
+{
+ struct vop_setextattr_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_begin(vp);
}
void
vop_setextattr_post(void *ap, int rc)
{
- struct vop_setextattr_args *a = ap;
+ struct vop_setextattr_args *a;
+ struct vnode *vp;
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_end(vp);
if (!rc)
- VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+ VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
}
void
@@ -6237,6 +6357,8 @@
*/
MPASS(mp->mnt_vfs_ops > 0);
vp = mp->mnt_rootvnode;
+ if (vp != NULL)
+ vfs_seqc_write_begin(vp);
mp->mnt_rootvnode = NULL;
return (vp);
}
@@ -6533,3 +6655,45 @@
return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread));
}
+
+void
+vfs_seqc_write_begin_locked(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __func__);
+ VNPASS(vp->v_holdcnt > 0, vp);
+ VNPASS(vp->v_seqc_users >= 0, vp);
+ vp->v_seqc_users++;
+ if (vp->v_seqc_users == 1)
+ seqc_sleepable_write_begin(&vp->v_seqc);
+}
+
+void
+vfs_seqc_write_begin(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ vfs_seqc_write_begin_locked(vp);
+ VI_UNLOCK(vp);
+}
+
+void
+vfs_seqc_write_end_locked(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __func__);
+ VNPASS(vp->v_holdcnt > 0, vp);
+ VNPASS(vp->v_seqc_users > 0, vp);
+ vp->v_seqc_users--;
+ if (vp->v_seqc_users == 0)
+ seqc_sleepable_write_end(&vp->v_seqc);
+}
+
+void
+vfs_seqc_write_end(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ vfs_seqc_write_end_locked(vp);
+ VI_UNLOCK(vp);
+}
Index: sys/kern/vnode_if.src
===================================================================
--- sys/kern/vnode_if.src
+++ sys/kern/vnode_if.src
@@ -142,6 +142,15 @@
};
+%% fplookup_vexec vp - - -
+
+vop_fplookup_vexec {
+ IN struct vnode *vp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
%% access vp L L L
vop_access {
@@ -172,6 +181,7 @@
%% setattr vp E E E
+%! setattr pre vop_setattr_pre
%! setattr post vop_setattr_post
vop_setattr {
@@ -523,6 +533,8 @@
%% setacl vp E E E
+%! setacl pre vop_setacl_pre
+%! setacl post vop_setacl_post
vop_setacl {
IN struct vnode *vp;
@@ -589,6 +601,7 @@
%% deleteextattr vp E E E
+%! deleteextattr pre vop_deleteextattr_pre
%! deleteextattr post vop_deleteextattr_post
vop_deleteextattr {
@@ -601,6 +614,7 @@
%% setextattr vp E E E
+%! setextattr pre vop_setextattr_pre
%! setextattr post vop_setextattr_post
vop_setextattr {
Index: sys/security/mac/mac_framework.h
===================================================================
--- sys/security/mac/mac_framework.h
+++ sys/security/mac/mac_framework.h
@@ -422,13 +422,14 @@
int mac_vnode_check_lookup_impl(struct ucred *cred, struct vnode *dvp,
struct componentname *cnp);
extern bool mac_vnode_check_lookup_fp_flag;
+#define mac_vnode_check_lookup_enabled() __predict_false(mac_vnode_check_lookup_fp_flag)
static inline int
mac_vnode_check_lookup(struct ucred *cred, struct vnode *dvp,
struct componentname *cnp)
{
mac_vnode_assert_locked(dvp, "mac_vnode_check_lookup");
- if (__predict_false(mac_vnode_check_lookup_fp_flag))
+ if (mac_vnode_check_lookup_enabled())
return (mac_vnode_check_lookup_impl(cred, dvp, cnp));
return (0);
}
Index: sys/sys/_seqc.h
===================================================================
--- /dev/null
+++ sys/sys/_seqc.h
@@ -0,0 +1,6 @@
+#ifndef _SYS__SEQC_H_
+#define _SYS__SEQC_H_
+
+typedef uint32_t seqc_t;
+
+#endif /* _SYS__SEQC_H */
Index: sys/sys/filedesc.h
===================================================================
--- sys/sys/filedesc.h
+++ sys/sys/filedesc.h
@@ -289,6 +289,7 @@
smr_serialized_store(&fdp->fd_pwd, newpwd,
(FILEDESC_XLOCK_ASSERT(fdp), true));
}
+struct pwd *pwd_get_smr(void);
#endif /* _KERNEL */
Index: sys/sys/mount.h
===================================================================
--- sys/sys/mount.h
+++ sys/sys/mount.h
@@ -415,6 +415,7 @@
#define MNTK_TEXT_REFS 0x00008000 /* Keep use ref for text */
#define MNTK_VMSETSIZE_BUG 0x00010000
#define MNTK_UNIONFS 0x00020000 /* A hack for F_ISUNIONSTACK */
+#define MNTK_FPLOOKUP 0x00040000 /* fast path lookup is supported */
#define MNTK_NOASYNC 0x00800000 /* disable async */
#define MNTK_UNMOUNT 0x01000000 /* unmount in progress */
#define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */
Index: sys/sys/seqc.h
===================================================================
--- sys/sys/seqc.h
+++ sys/sys/seqc.h
@@ -36,7 +36,7 @@
/*
* seqc_t may be included in structs visible to userspace
*/
-typedef uint32_t seqc_t;
+#include <sys/_seqc.h>
#ifdef _KERNEL
@@ -110,5 +110,22 @@
return (seqc_consistent_nomb(seqcp, oldseqc));
}
+static __inline void
+seqc_sleepable_write_begin(seqc_t *seqcp)
+{
+
+ MPASS(!seqc_in_modify(*seqcp));
+ *seqcp += 1;
+ atomic_thread_fence_rel();
+}
+
+static __inline void
+seqc_sleepable_write_end(seqc_t *seqcp)
+{
+
+ atomic_store_rel_int(seqcp, *seqcp + 1);
+ MPASS(!seqc_in_modify(*seqcp));
+}
+
#endif /* _KERNEL */
#endif /* _SYS_SEQC_H_ */
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -45,6 +45,7 @@
#include <sys/uio.h>
#include <sys/acl.h>
#include <sys/ktr.h>
+#include <sys/_seqc.h>
/*
* The vnode is the focus of all file activity in UNIX. There is a
@@ -105,6 +106,7 @@
*/
enum vtype v_type:8; /* u vnode type */
short v_irflag; /* i frequently read flags */
+ seqc_t v_seqc; /* i modification count */
struct vop_vector *v_op; /* u vnode operations vector */
void *v_data; /* u private data for fs */
@@ -175,6 +177,7 @@
short v_dbatchcpu; /* i LRU requeue deferral batch */
int v_writecount; /* I ref count of writers or
(negative) text users */
+ int v_seqc_users; /* i modifications pending */
u_int v_hash;
};
@@ -538,6 +541,18 @@
#define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str))
#define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str))
+#define ASSERT_VOP_IN_SEQC(vp) do { \
+ struct vnode *_vp = (vp); \
+ \
+ VNPASS(seqc_in_modify(_vp->v_seqc), _vp); \
+} while (0)
+
+#define ASSERT_VOP_NOT_IN_SEQC(vp) do { \
+ struct vnode *_vp = (vp); \
+ \
+ VNPASS(!seqc_in_modify(_vp->v_seqc), _vp); \
+} while (0)
+
#else /* !DEBUG_VFS_LOCKS */
#define ASSERT_VI_LOCKED(vp, str) ((void)0)
@@ -545,6 +560,10 @@
#define ASSERT_VOP_ELOCKED(vp, str) ((void)0)
#define ASSERT_VOP_LOCKED(vp, str) ((void)0)
#define ASSERT_VOP_UNLOCKED(vp, str) ((void)0)
+
+#define ASSERT_VOP_IN_SEQC(vp) ((void)0)
+#define ASSERT_VOP_NOT_IN_SEQC(vp) ((void)0)
+
#endif /* DEBUG_VFS_LOCKS */
@@ -601,6 +620,7 @@
struct vattr;
struct vfsops;
struct vnode;
+struct pwd;
typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **);
@@ -613,11 +633,16 @@
void cache_enter_time(struct vnode *dvp, struct vnode *vp,
struct componentname *cnp, struct timespec *tsp,
struct timespec *dtsp);
+int cache_fplookup(struct nameidata *ndp, bool *handled);
int cache_lookup(struct vnode *dvp, struct vnode **vpp,
struct componentname *cnp, struct timespec *tsp, int *ticksp);
void cache_purge(struct vnode *vp);
void cache_purge_negative(struct vnode *vp);
void cache_purgevfs(struct mount *mp, bool force);
+void vfs_seqc_write_begin_locked(struct vnode *vp);
+void vfs_seqc_write_begin(struct vnode *vp);
+void vfs_seqc_write_end_locked(struct vnode *vp);
+void vfs_seqc_write_end(struct vnode *vp);
int change_dir(struct vnode *vp, struct thread *td);
void cvtstat(struct stat *st, struct ostat *ost);
void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb);
@@ -643,6 +668,8 @@
int vn_commname(struct vnode *vn, char *buf, u_int buflen);
int vn_path_to_global_path(struct thread *td, struct vnode *vp,
char *path, u_int pathlen);
+int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid,
+ struct ucred *cred);
int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
gid_t file_gid, accmode_t accmode, struct ucred *cred,
int *privused);
@@ -662,6 +689,7 @@
enum vgetstate vget_prep_smr(struct vnode *vp);
enum vgetstate vget_prep(struct vnode *vp);
int vget_finish(struct vnode *vp, int flags, enum vgetstate vs);
+void vget_abort(struct vnode *vp, enum vgetstate vs);
void vgone(struct vnode *vp);
void vhold(struct vnode *);
void vholdl(struct vnode *);
@@ -804,6 +832,7 @@
/* These are called from within the actual VOPS. */
void vop_close_post(void *a, int rc);
void vop_create_post(void *a, int rc);
+void vop_deleteextattr_pre(void *a);
void vop_deleteextattr_post(void *a, int rc);
void vop_link_post(void *a, int rc);
void vop_lookup_post(void *a, int rc);
@@ -818,7 +847,11 @@
void vop_rename_post(void *a, int rc);
void vop_rename_pre(void *a);
void vop_rmdir_post(void *a, int rc);
+void vop_setattr_pre(void *a);
void vop_setattr_post(void *a, int rc);
+void vop_setacl_pre(void *a);
+void vop_setacl_post(void *a, int rc);
+void vop_setextattr_pre(void *a);
void vop_setextattr_post(void *a, int rc);
void vop_symlink_post(void *a, int rc);
int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a);
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Jan 15, 9:21 PM (5 h, 48 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
15815454
Default Alt Text
D23915.id69198.diff (40 KB)
Attached To
Mode
D23915: vfs: fully smr-protected path lookup
Attached
Detach File
Event Timeline
Log In to Comment