D23915: vfs: fully smr-protected path lookup
D23915.diff (58 KB, text/plain)
Index: sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
===================================================================
--- sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
+++ sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
@@ -154,6 +154,7 @@
vput(vp);
return (error);
}
+ vn_seqc_write_begin(vp);
VOP_UNLOCK(vp);
/*
@@ -206,6 +207,7 @@
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
+ vn_seqc_write_end(vp);
vput(vp);
vfs_unbusy(mp);
vfs_freeopts(mp->mnt_optnew);
@@ -241,6 +243,7 @@
vfs_event_signal(NULL, VQ_MOUNT, 0);
if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
panic("mount: lost mount");
+ vn_seqc_write_end(vp);
VOP_UNLOCK(vp);
vfs_op_exit(mp);
vfs_unbusy(mp);
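
The hunks above bracket the ZFS mount update with vn_seqc_write_begin()/vn_seqc_write_end(), the pair introduced later in this diff in vfs_subr.c. Any change the fast-path lookup depends on (here, a new mount covering the vnode) is published inside a write section, so a concurrent lockless lookup either sees the counter in the "modify" state or notices that it moved, and falls back to the locked path. A minimal sketch of the bracketing pattern follows; example_cover_vnode() and its callee are hypothetical, only the vn_seqc_* calls come from the patch:

    /*
     * Illustrative sketch, not part of the patch: bracket a state change
     * that lockless lookup relies on.  The vnode is held and locked here,
     * which vn_seqc_write_begin() asserts via v_holdcnt.
     */
    static int
    example_cover_vnode(struct vnode *vp, struct mount *mp)
    {
            int error;

            vn_seqc_write_begin(vp);                /* readers now observe "in modify" */
            error = example_attach_mount(vp, mp);   /* hypothetical helper */
            vn_seqc_write_end(vp);                  /* counter settles at a new even value */
            return (error);
    }
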
Index: sys/fs/tmpfs/tmpfs.h
===================================================================
--- sys/fs/tmpfs/tmpfs.h
+++ sys/fs/tmpfs/tmpfs.h
@@ -526,6 +526,14 @@
return (node);
}
+static inline struct tmpfs_node *
+VP_TO_TMPFS_NODE_SMR(struct vnode *vp)
+{
+
+ MPASS(vp != NULL);
+ return (atomic_load_ptr(&vp->v_data));
+}
+
static inline struct tmpfs_node *
VP_TO_TMPFS_DIR(struct vnode *vp)
{
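
VP_TO_TMPFS_NODE_SMR is the lockless counterpart of VP_TO_TMPFS_NODE: fast-path callers hold neither the vnode lock nor a reference, so VOP_RECLAIM can clear v_data at any time; the accessor therefore reads it with atomic_load_ptr and may legitimately return NULL. A short hedged sketch of the expected calling convention (tmpfs_fplookup_vexec further down is the real consumer; example_smr_peek_mode() is made up):

    /* Sketch only: must run between vfs_smr_enter() and vfs_smr_exit(). */
    static int
    example_smr_peek_mode(struct vnode *vp, mode_t *modep)
    {
            struct tmpfs_node *node;

            node = VP_TO_TMPFS_NODE_SMR(vp);
            if (__predict_false(node == NULL))
                    return (EAGAIN);        /* vnode is being reclaimed; fall back */
            *modep = atomic_load_int(&node->tn_mode);
            return (0);
    }
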
Index: sys/fs/tmpfs/tmpfs_subr.c
===================================================================
--- sys/fs/tmpfs/tmpfs_subr.c
+++ sys/fs/tmpfs/tmpfs_subr.c
@@ -75,6 +75,7 @@
static uma_zone_t tmpfs_dirent_pool;
static uma_zone_t tmpfs_node_pool;
+VFS_SMR_DECLARE;
static int
tmpfs_node_ctor(void *mem, int size, void *arg, int flags)
@@ -131,6 +132,7 @@
tmpfs_node_pool = uma_zcreate("TMPFS node",
sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor,
tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0);
+ VFS_SMR_ZONE_SET(tmpfs_node_pool);
}
void
@@ -288,7 +290,7 @@
if ((mp->mnt_kern_flag & MNT_RDONLY) != 0)
return (EROFS);
- nnode = uma_zalloc_arg(tmpfs_node_pool, tmp, M_WAITOK);
+ nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK);
/* Generic initialization. */
nnode->tn_type = type;
@@ -435,7 +437,7 @@
panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type);
}
- uma_zfree(tmpfs_node_pool, node);
+ uma_zfree_smr(tmpfs_node_pool, node);
TMPFS_LOCK(tmp);
tmpfs_free_tmp(tmp);
return (true);
@@ -1619,10 +1621,11 @@
int
tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p)
{
- int error;
+ int error, newmode;
struct tmpfs_node *node;
ASSERT_VOP_ELOCKED(vp, "chmod");
+ ASSERT_VOP_IN_SEQC(vp);
node = VP_TO_TMPFS_NODE(vp);
@@ -1656,9 +1659,9 @@
return (error);
}
-
- node->tn_mode &= ~ALLPERMS;
- node->tn_mode |= mode & ALLPERMS;
+ newmode = node->tn_mode & ~ALLPERMS;
+ newmode |= mode & ALLPERMS;
+ atomic_store_int(&node->tn_mode, newmode);
node->tn_status |= TMPFS_NODE_CHANGED;
@@ -1684,6 +1687,7 @@
gid_t ogid;
ASSERT_VOP_ELOCKED(vp, "chown");
+ ASSERT_VOP_IN_SEQC(vp);
node = VP_TO_TMPFS_NODE(vp);
@@ -1730,7 +1734,7 @@
if ((node->tn_mode & (S_ISUID | S_ISGID)) && (ouid != uid || ogid != gid)) {
if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID))
- node->tn_mode &= ~(S_ISUID | S_ISGID);
+ atomic_store_int(&node->tn_mode, node->tn_mode & ~(S_ISUID | S_ISGID));
}
ASSERT_VOP_ELOCKED(vp, "chown2");
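
Two cooperating changes appear in this file: the node zone becomes SMR-managed (VFS_SMR_DECLARE, VFS_SMR_ZONE_SET, uma_zalloc_smr/uma_zfree_smr), so a node freed by one CPU stays safe to read by a lookup that already entered vfs_smr; and tn_mode is now written with atomic_store_int so the lockless permission check never observes a torn or intermediate value. A sketch of the writer/reader pairing, assuming tn_mode remains an int-sized field as the atomics imply:

    /* Writer (vnode exclusively locked, inside the seqc write section): */
    newmode = (node->tn_mode & ~ALLPERMS) | (mode & ALLPERMS);
    atomic_store_int(&node->tn_mode, newmode);

    /* Reader (inside vfs_smr, no vnode lock or reference): */
    mode = atomic_load_int(&node->tn_mode);
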
Index: sys/fs/tmpfs/tmpfs_vfsops.c
===================================================================
--- sys/fs/tmpfs/tmpfs_vfsops.c
+++ sys/fs/tmpfs/tmpfs_vfsops.c
@@ -462,6 +462,8 @@
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED |
MNTK_TEXT_REFS | MNTK_NOMSYNC;
+ if (!nonc)
+ mp->mnt_kern_flag |= MNTK_FPLOOKUP;
MNT_IUNLOCK(mp);
mp->mnt_data = tmp;
Index: sys/fs/tmpfs/tmpfs_vnops.h
===================================================================
--- sys/fs/tmpfs/tmpfs_vnops.h
+++ sys/fs/tmpfs/tmpfs_vnops.h
@@ -49,6 +49,7 @@
extern struct vop_vector tmpfs_vnodeop_nonc_entries;
vop_access_t tmpfs_access;
+vop_fplookup_vexec_t tmpfs_fplookup_vexec;
vop_getattr_t tmpfs_getattr;
vop_setattr_t tmpfs_setattr;
vop_pathconf_t tmpfs_pathconf;
Index: sys/fs/tmpfs/tmpfs_vnops.c
===================================================================
--- sys/fs/tmpfs/tmpfs_vnops.c
+++ sys/fs/tmpfs/tmpfs_vnops.c
@@ -317,6 +317,32 @@
return (0);
}
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ */
+int
+tmpfs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
+{
+ struct vnode *vp;
+ struct tmpfs_node *node;
+ struct ucred *cred;
+ mode_t all_x, mode;
+
+ vp = v->a_vp;
+ node = VP_TO_TMPFS_NODE_SMR(vp);
+ if (__predict_false(node == NULL))
+ return (EAGAIN);
+
+ all_x = S_IXUSR | S_IXGRP | S_IXOTH;
+ mode = atomic_load_int(&node->tn_mode);
+ if (__predict_true((mode & all_x) == all_x))
+ return (0);
+
+ cred = v->a_cred;
+ return (vaccess_vexec_smr(mode, node->tn_uid, node->tn_gid, cred));
+}
+
int
tmpfs_access(struct vop_access_args *v)
{
@@ -428,6 +454,8 @@
MPASS(VOP_ISLOCKED(vp));
+ vn_seqc_write_begin(vp);
+
error = 0;
/* Abort if any unsettable attribute is given. */
@@ -466,6 +494,8 @@
* from tmpfs_update. */
tmpfs_update(vp);
+ vn_seqc_write_end(vp);
+
MPASS(VOP_ISLOCKED(vp));
return error;
@@ -806,12 +836,15 @@
struct tmpfs_node *tnode;
struct tmpfs_node *tdnode;
int error;
+ bool want_seqc_end;
MPASS(VOP_ISLOCKED(tdvp));
MPASS(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp)));
MPASS(fcnp->cn_flags & HASBUF);
MPASS(tcnp->cn_flags & HASBUF);
+ want_seqc_end = false;
+
/*
* Disallow cross-device renames.
* XXX Why isn't this done by the caller?
@@ -852,6 +885,13 @@
}
}
+ if (tvp != NULL)
+ vn_seqc_write_begin(tvp);
+ vn_seqc_write_begin(tdvp);
+ vn_seqc_write_begin(fvp);
+ vn_seqc_write_begin(fdvp);
+ want_seqc_end = true;
+
tmp = VFS_TO_TMPFS(tdvp->v_mount);
tdnode = VP_TO_TMPFS_DIR(tdvp);
tnode = (tvp == NULL) ? NULL : VP_TO_TMPFS_NODE(tvp);
@@ -1065,6 +1105,14 @@
VOP_UNLOCK(fdvp);
out:
+ if (want_seqc_end) {
+ if (tvp != NULL)
+ vn_seqc_write_end(tvp);
+ vn_seqc_write_end(tdvp);
+ vn_seqc_write_end(fvp);
+ vn_seqc_write_end(fdvp);
+ }
+
/*
* Release target nodes.
* XXX: I don't understand when tdvp can be the same as tvp, but
@@ -1621,6 +1669,7 @@
.vop_mknod = tmpfs_mknod,
.vop_open = tmpfs_open,
.vop_close = tmpfs_close,
+ .vop_fplookup_vexec = tmpfs_fplookup_vexec,
.vop_access = tmpfs_access,
.vop_getattr = tmpfs_getattr,
.vop_setattr = tmpfs_setattr,
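
Taken together the tmpfs hunks spell out the opt-in recipe for MNTK_FPLOOKUP: back the per-node allocations with the vfs_smr zone, publish the fields the check reads with atomics, assert ASSERT_VOP_IN_SEQC in the paths that change them, provide a VOP_FPLOOKUP_VEXEC that only ever succeeds, returns EAGAIN, or defers to vaccess_vexec_smr, and finally set the flag at mount time. A condensed skeleton for a hypothetical filesystem "foofs" (every foofs_* name is invented; the kernel interfaces are the ones added in this diff):

    VFS_SMR_DECLARE;

    static int
    foofs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
    {
            struct foofs_node *node;
            mode_t mode;

            /* v_data may be ripped out by reclaim; punt to the slow path. */
            node = atomic_load_ptr(&v->a_vp->v_data);
            if (__predict_false(node == NULL))
                    return (EAGAIN);
            mode = atomic_load_int(&node->fn_mode);
            return (vaccess_vexec_smr(mode, node->fn_uid, node->fn_gid,
                v->a_cred));
    }

    /* At mount time, once the node zone is registered with VFS_SMR_ZONE_SET: */
    MNT_ILOCK(mp);
    mp->mnt_kern_flag |= MNTK_FPLOOKUP;
    MNT_IUNLOCK(mp);
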
Index: sys/kern/kern_descrip.c
===================================================================
--- sys/kern/kern_descrip.c
+++ sys/kern/kern_descrip.c
@@ -102,8 +102,8 @@
static __read_mostly uma_zone_t file_zone;
static __read_mostly uma_zone_t filedesc0_zone;
-static __read_mostly uma_zone_t pwd_zone;
-static __read_mostly smr_t pwd_smr;
+__read_mostly uma_zone_t pwd_zone;
+VFS_SMR_DECLARE;
static int closefp(struct filedesc *fdp, int fd, struct file *fp,
struct thread *td, int holdleaders);
@@ -3346,14 +3346,24 @@
fdp = td->td_proc->p_fd;
- smr_enter(pwd_smr);
+ vfs_smr_enter();
for (;;) {
- pwd = smr_entered_load(&fdp->fd_pwd, pwd_smr);
+ pwd = smr_entered_load(&fdp->fd_pwd, VFS_SMR());
MPASS(pwd != NULL);
if (refcount_acquire_if_not_zero(&pwd->pwd_refcount))
break;
}
- smr_exit(pwd_smr);
+ vfs_smr_exit();
+ return (pwd);
+}
+
+struct pwd *
+pwd_get_smr(void)
+{
+ struct pwd *pwd;
+
+ pwd = smr_entered_load(&curproc->p_fd->fd_pwd, VFS_SMR());
+ MPASS(pwd != NULL);
return (pwd);
}
@@ -4363,7 +4373,11 @@
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
- pwd_smr = uma_zone_get_smr(pwd_zone);
+ /*
+ * XXXMJG this is a temporary hack due to boot ordering issues against
+ * the vnode zone.
+ */
+ vfs_smr = uma_zone_get_smr(pwd_zone);
mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
}
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
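
pwd_get_smr() returns the current struct pwd without acquiring a reference, so it is only valid while the caller stays inside the vfs_smr section; cache_fplookup relies on that and switches to pwd_hold() only when it has to drop out of SMR for the partial case. The XXXMJG hack of seeding vfs_smr from the pwd zone keeps the pwd and vnode zones in the same SMR domain, which is what lets a single vfs_smr_enter() cover both. A sketch of the intended usage (only the calls shown in the diff are real):

    vfs_smr_enter();
    pwd = pwd_get_smr();            /* valid only until vfs_smr_exit() */
    dvp = pwd->pwd_cdir;            /* dereference only inside the section */
    /* ... lockless walk ... */
    vfs_smr_exit();
    /* A reference that survives the section requires pwd_hold(curthread). */
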
Index: sys/kern/vfs_cache.c
===================================================================
--- sys/kern/vfs_cache.c
+++ sys/kern/vfs_cache.c
@@ -55,6 +55,7 @@
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
+#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
@@ -67,6 +68,11 @@
#include <sys/ktrace.h>
#endif
+#include <sys/capsicum.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
#ifdef DDB
#include <ddb/ddb.h>
#endif
@@ -100,6 +106,8 @@
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
"char *");
+SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
+
/*
* This structure describes the elements in the cache of recent
* names looked up by namei.
@@ -2810,3 +2818,841 @@
}
#endif
+
+extern uma_zone_t namei_zone;
+
+static bool __read_frequently cache_fast_lookup = true;
+SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
+ &cache_fast_lookup, 0, "");
+
+#define CACHE_FPL_FAILED -2020
+
+static void
+cache_fpl_cleanup_cnp(struct componentname *cnp)
+{
+
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+}
+
+static void
+cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
+{
+ struct componentname *cnp;
+
+ cnp = &ndp->ni_cnd;
+ while (*(cnp->cn_nameptr) == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+}
+
+static void
+cache_fpl_handle_root_initial(struct nameidata *ndp, struct vnode **dpp)
+{
+
+ cache_fpl_handle_root(ndp, dpp);
+ *dpp = ndp->ni_rootdir;
+}
+
+/*
+ * Components of nameidata (or objects it can point to) which may
+ * need restoring in case fast path lookup fails.
+ */
+struct nameidata_saved {
+ int cn_flags;
+ long cn_namelen;
+ char *cn_nameptr;
+ size_t ni_pathlen;
+};
+
+struct cache_fpl {
+ int line;
+ enum cache_fpl_status status;
+ bool in_smr;
+ struct nameidata *ndp;
+ struct nameidata_saved snd;
+ struct componentname *cnp;
+ struct vnode *dvp;
+ seqc_t dvp_seqc;
+ struct vnode *tvp;
+ seqc_t tvp_seqc;
+ struct pwd *pwd;
+};
+
+static void
+cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
+{
+
+ snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
+ snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
+ snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
+ snd->ni_pathlen = fpl->ndp->ni_pathlen;
+}
+
+static void
+cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
+{
+
+ fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
+ fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
+ fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
+ fpl->ndp->ni_pathlen = snd->ni_pathlen;
+}
+
+#ifdef INVARIANTS
+#define cache_fpl_smr_assert_entered(fpl) ({ \
+ struct cache_fpl *_fpl = (fpl); \
+ MPASS(_fpl->in_smr == true); \
+ VFS_SMR_ASSERT_ENTERED(); \
+})
+#define cache_fpl_smr_assert_not_entered(fpl) ({ \
+ struct cache_fpl *_fpl = (fpl); \
+ MPASS(_fpl->in_smr == false); \
+ VFS_SMR_ASSERT_NOT_ENTERED(); \
+})
+#else
+#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
+#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
+#endif
+
+#define cache_fpl_smr_enter(fpl) ({ \
+ struct cache_fpl *_fpl = (fpl); \
+ MPASS(_fpl->in_smr == false); \
+ vfs_smr_enter(); \
+ _fpl->in_smr = true; \
+})
+
+#define cache_fpl_smr_exit(fpl) ({ \
+ struct cache_fpl *_fpl = (fpl); \
+ MPASS(_fpl->in_smr == true); \
+ vfs_smr_exit(); \
+ _fpl->in_smr = false; \
+})
+
+static int
+cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
+{
+
+ KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
+ ("%s: lookup status already set at %d\n", __func__, fpl->line));
+ fpl->status = CACHE_FPL_STATUS_ABORTED;
+ fpl->line = line;
+ return (CACHE_FPL_FAILED);
+}
+
+#define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
+
+static int
+cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
+{
+
+ KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
+ ("%s: lookup status already set at %d\n", __func__, fpl->line));
+ cache_fpl_smr_assert_entered(fpl);
+ fpl->status = CACHE_FPL_STATUS_PARTIAL;
+ fpl->line = line;
+ return (CACHE_FPL_FAILED);
+}
+
+#define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
+
+static int
+cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
+{
+
+ KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
+ ("%s: lookup status already set at %d\n", __func__, fpl->line));
+ cache_fpl_smr_assert_not_entered(fpl);
+ MPASS(error != CACHE_FPL_FAILED);
+ fpl->status = CACHE_FPL_STATUS_HANDLED;
+ fpl->line = line;
+ return (error);
+}
+
+#define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
+
+#define CACHE_FPL_SUPPORTED_CN_FLAGS \
+ (LOCKLEAF | FOLLOW | LOCKSHARED | SAVENAME | ISOPEN | AUDITVNODE1)
+
+static bool
+cache_can_fplookup(struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ struct thread *td;
+
+ ndp = fpl->ndp;
+ cnp = fpl->cnp;
+ td = cnp->cn_thread;
+
+ if (!cache_fast_lookup) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (mac_vnode_check_lookup_enabled()) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if ((cnp->cn_flags & LOCKLEAF) == 0) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (cnp->cn_nameiop != LOOKUP) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (ndp->ni_dirfd != AT_FDCWD) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (IN_CAPABILITY_MODE(td)) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (AUDITING_TD(td)) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (ndp->ni_startdir != NULL) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ return (true);
+}
+
+static bool
+cache_fplookup_vnode_supported(struct vnode *vp)
+{
+
+ switch (vp->v_type) {
+ case VLNK:
+ return (false);
+ default:
+ break;
+ }
+ return (true);
+}
+
+/*
+ * The target vnode is not supported, prepare for the slow path to take over.
+ */
+static int
+cache_fplookup_partial_setup(struct cache_fpl *fpl)
+{
+ struct componentname *cnp;
+ struct vnode *dvp;
+ struct pwd *pwd;
+ seqc_t dvp_seqc;
+
+ cnp = fpl->cnp;
+ dvp = fpl->dvp;
+ dvp_seqc = fpl->dvp_seqc;
+
+ if (!vref_smr(dvp)) {
+ fpl->status = CACHE_FPL_STATUS_ABORTED;
+ cache_fpl_smr_exit(fpl);
+ return (CACHE_FPL_FAILED);
+ }
+
+ cache_fpl_smr_exit(fpl);
+ if (!seqc_consistent(&dvp->v_seqc, dvp_seqc)) {
+ fpl->status = CACHE_FPL_STATUS_ABORTED;
+ vrele(dvp);
+ return (CACHE_FPL_FAILED);
+ }
+
+ pwd = pwd_hold(curthread);
+ if (fpl->pwd != pwd) {
+ fpl->status = CACHE_FPL_STATUS_ABORTED;
+ vrele(dvp);
+ pwd_drop(pwd);
+ return (CACHE_FPL_FAILED);
+ }
+
+ fpl->ndp->ni_startdir = dvp;
+ return (0);
+}
+
+static int
+cache_fplookup_final(struct cache_fpl *fpl)
+{
+ struct componentname *cnp;
+ enum vgetstate tvs;
+ struct vnode *dvp, *tvp;
+ seqc_t dvp_seqc, tvp_seqc;
+ int error;
+
+ cnp = fpl->cnp;
+ dvp = fpl->dvp;
+ dvp_seqc = fpl->dvp_seqc;
+ tvp = fpl->tvp;
+ tvp_seqc = fpl->tvp_seqc;
+
+ VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
+ MPASS((cnp->cn_flags & LOCKLEAF) != 0);
+
+ tvs = vget_prep_smr(tvp);
+ if (tvs == VGET_NONE) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ if (!seqc_consistent(&dvp->v_seqc, dvp_seqc)) {
+ cache_fpl_smr_exit(fpl);
+ vget_abort(tvp, tvs);
+ return (cache_fpl_aborted(fpl));
+ }
+
+ cache_fpl_smr_exit(fpl);
+
+ error = vget_finish(tvp, cnp->cn_lkflags, tvs);
+ if (error != 0) {
+ return (cache_fpl_aborted(fpl));
+ }
+
+ if (!seqc_consistent(&tvp->v_seqc, tvp_seqc)) {
+ vput(tvp);
+ return (cache_fpl_aborted(fpl));
+ }
+
+ return (cache_fpl_handled(fpl, 0));
+}
+
+static int
+cache_fplookup_next(struct cache_fpl *fpl)
+{
+ struct componentname *cnp;
+ struct namecache *ncp;
+ struct vnode *dvp, *tvp;
+ u_char nc_flag;
+ uint32_t hash;
+
+ cnp = fpl->cnp;
+ dvp = fpl->dvp;
+
+ if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
+ fpl->tvp = dvp;
+ fpl->tvp_seqc = seqc_read_any(&dvp->v_seqc);
+ if (seqc_in_modify(fpl->tvp_seqc)) {
+ return (cache_fpl_partial(fpl));
+ }
+ return (0);
+ }
+
+ hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+
+ CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+ counter_u64_add(numchecks, 1);
+ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
+ !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
+ break;
+ }
+
+ /*
+ * If there is no entry we have to punt to the slow path to perform
+ * actual lookup. Should there be nothing with this name a negative
+ * entry will be created.
+ */
+ if (__predict_false(ncp == NULL)) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ tvp = atomic_load_ptr(&ncp->nc_vp);
+ nc_flag = atomic_load_char(&ncp->nc_flag);
+ if (__predict_false(cache_ncp_invalid(ncp))) {
+ return (cache_fpl_partial(fpl));
+ }
+ if (__predict_false(nc_flag & NCF_WHITE)) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ fpl->tvp = tvp;
+ if (nc_flag & NCF_NEGATIVE) {
+ if ((nc_flag & NCF_HOTNEGATIVE) == 0) {
+ /*
+ * TODO
+ * Promoting to hot negative requires locks which are
+ * not yet supported for simplicity.
+ */
+ return (cache_fpl_partial(fpl));
+ }
+ SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
+ ncp->nc_name);
+ counter_u64_add(numneghits, 1);
+ cache_fpl_smr_exit(fpl);
+ return (cache_fpl_handled(fpl, ENOENT));
+ }
+
+ fpl->tvp_seqc = seqc_read_any(&tvp->v_seqc);
+ if (seqc_in_modify(fpl->tvp_seqc)) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ if (!cache_fplookup_vnode_supported(tvp)) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ counter_u64_add(numposhits, 1);
+ SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
+ return (0);
+}
+
+static bool
+cache_fplookup_mp_supported(struct mount *mp)
+{
+
+ if (mp == NULL)
+ return (false);
+ if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
+ return (false);
+ if (mp->mnt_flag & MNT_UNION)
+ return (false);
+ return (true);
+}
+
+/*
+ * Walk up the mount stack (if any).
+ *
+ * Correctness is provided in the following ways:
+ * - all vnodes are protected from freeing with SMR
+ * - struct mount objects are type stable making them always safe to access
+ * - stability of the particular mount is provided by busying it
+ * - relationship between the vnode which is mounted on and the mount is
+ * verified with the vnode sequence counter after busying
+ * - association between root vnode of the mount and the mount is protected
+ * by busy
+ *
+ * From that point on we can read the sequence counter of the root vnode
+ * and get the next mount on the stack (if any) using the same protection.
+ *
+ * By the end of successful walk we are guaranteed the reached state was
+ * indeed present at least at some point which matches the regular lookup.
+ */
+static int
+cache_fplookup_climb_mount(struct cache_fpl *fpl)
+{
+ struct mount *mp, *prev_mp;
+ struct vnode *vp;
+ seqc_t vp_seqc;
+
+ vp = fpl->tvp;
+ vp_seqc = fpl->tvp_seqc;
+ if (vp->v_type != VDIR)
+ return (0);
+
+ mp = atomic_load_ptr(&vp->v_mountedhere);
+ if (mp == NULL)
+ return (0);
+
+ prev_mp = NULL;
+ for (;;) {
+ if (!vfs_op_thread_enter(mp)) {
+ if (prev_mp != NULL)
+ vfs_op_thread_exit(prev_mp);
+ return (cache_fpl_partial(fpl));
+ }
+ if (prev_mp != NULL)
+ vfs_op_thread_exit(prev_mp);
+ if (!seqc_consistent(&vp->v_seqc, vp_seqc)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_partial(fpl));
+ }
+ if (!cache_fplookup_mp_supported(mp)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_partial(fpl));
+ }
+ vp = atomic_load_ptr(&mp->mnt_rootvnode);
+ if (vp == NULL || VN_IS_DOOMED(vp)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_partial(fpl));
+ }
+ vp_seqc = seqc_read_any(&vp->v_seqc);
+ if (seqc_in_modify(vp_seqc)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_partial(fpl));
+ }
+ prev_mp = mp;
+ mp = atomic_load_ptr(&vp->v_mountedhere);
+ if (mp == NULL)
+ break;
+ }
+
+ vfs_op_thread_exit(prev_mp);
+ fpl->tvp = vp;
+ fpl->tvp_seqc = vp_seqc;
+ return (0);
+}
+
+/*
+ * Parse the path.
+ *
+ * The code is mostly copy-pasted from regular lookup, see lookup().
+ * The structure is maintained along with comments for easier maintenance.
+ * Deduplicating the code will become feasible after fast path lookup
+ * becomes more feature-complete.
+ */
+static int
+cache_fplookup_parse(struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ char *cp;
+ char *prev_ni_next; /* saved ndp->ni_next */
+ size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */
+
+ ndp = fpl->ndp;
+ cnp = fpl->cnp;
+
+ /*
+ * Search a new directory.
+ *
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+ continue;
+ cnp->cn_namelen = cp - cnp->cn_nameptr;
+ if (cnp->cn_namelen > NAME_MAX) {
+ cache_fpl_smr_exit(fpl);
+ return (cache_fpl_handled(fpl, ENAMETOOLONG));
+ }
+ prev_ni_pathlen = ndp->ni_pathlen;
+ ndp->ni_pathlen -= cnp->cn_namelen;
+ KASSERT(ndp->ni_pathlen <= PATH_MAX,
+ ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
+ prev_ni_next = ndp->ni_next;
+ ndp->ni_next = cp;
+
+ /*
+ * Replace multiple slashes by a single slash and trailing slashes
+ * by a null. This must be done before VOP_LOOKUP() because some
+ * fs's don't know about trailing slashes. Remember if there were
+ * trailing slashes to handle symlinks, existing non-directories
+ * and non-existing files that won't be directories specially later.
+ */
+ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
+ cp++;
+ ndp->ni_pathlen--;
+ if (*cp == '\0') {
+ /*
+ * TODO
+ * Regular lookup performs the following:
+ * *ndp->ni_next = '\0';
+ * cnp->cn_flags |= TRAILINGSLASH;
+ *
+ * Which is problematic since it modifies data read
+ * from userspace. Then if fast path lookup was to
+ * abort we would have to either restore it or convey
+ * the flag. Since this is a corner case just ignore
+ * it for simplicity.
+ */
+ return (cache_fpl_partial(fpl));
+ }
+ }
+ ndp->ni_next = cp;
+
+ cnp->cn_flags |= MAKEENTRY;
+
+ if (cnp->cn_namelen == 2 &&
+ cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
+ cnp->cn_flags |= ISDOTDOT;
+ else
+ cnp->cn_flags &= ~ISDOTDOT;
+ if (*ndp->ni_next == 0)
+ cnp->cn_flags |= ISLASTCN;
+ else
+ cnp->cn_flags &= ~ISLASTCN;
+
+ /*
+ * Check for degenerate name (e.g. / or "")
+ * which is a way of talking about a directory,
+ * e.g. like "/." or ".".
+ *
+ * TODO
+ * Another corner case handled by the regular lookup
+ */
+ if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
+ return (cache_fpl_partial(fpl));
+ }
+ return (0);
+}
+
+static void
+cache_fplookup_parse_advance(struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+
+ ndp = fpl->ndp;
+ cnp = fpl->cnp;
+
+ cnp->cn_nameptr = ndp->ni_next;
+ while (*cnp->cn_nameptr == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+}
+
+static int
+cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ struct mount *mp;
+ int error;
+
+ error = CACHE_FPL_FAILED;
+ ndp = fpl->ndp;
+ ndp->ni_lcf = 0;
+ cnp = fpl->cnp;
+ cnp->cn_lkflags = LK_SHARED;
+ if ((cnp->cn_flags & LOCKSHARED) == 0)
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+
+ cache_fpl_checkpoint(fpl, &fpl->snd);
+
+ fpl->dvp = dvp;
+ fpl->dvp_seqc = seqc_read_any(&fpl->dvp->v_seqc);
+ if (seqc_in_modify(fpl->dvp_seqc)) {
+ cache_fpl_aborted(fpl);
+ goto out;
+ }
+ mp = atomic_load_ptr(&fpl->dvp->v_mount);
+ if (!cache_fplookup_mp_supported(mp)) {
+ cache_fpl_aborted(fpl);
+ goto out;
+ }
+
+ VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
+
+ for (;;) {
+ error = cache_fplookup_parse(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ if (cnp->cn_flags & ISDOTDOT) {
+ error = cache_fpl_partial(fpl);
+ break;
+ }
+
+ VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
+
+ error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
+ if (__predict_false(error != 0)) {
+ switch (error) {
+ case EAGAIN:
+ case EOPNOTSUPP: /* can happen when racing against vgone */
+ cache_fpl_partial(fpl);
+ break;
+ default:
+ /*
+ * See the API contract for VOP_FPLOOKUP_VEXEC.
+ */
+ if (!seqc_consistent(&fpl->dvp->v_seqc, fpl->dvp_seqc)) {
+ error = cache_fpl_aborted(fpl);
+ } else {
+ cache_fpl_smr_exit(fpl);
+ cache_fpl_handled(fpl, error);
+ }
+ break;
+ }
+ break;
+ }
+
+ error = cache_fplookup_next(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
+
+ error = cache_fplookup_climb_mount(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
+
+ if (cnp->cn_flags & ISLASTCN) {
+ error = cache_fplookup_final(fpl);
+ break;
+ }
+
+ if (!seqc_consistent(&fpl->dvp->v_seqc, fpl->dvp_seqc)) {
+ error = cache_fpl_aborted(fpl);
+ break;
+ }
+
+ fpl->dvp = fpl->tvp;
+ fpl->dvp_seqc = fpl->tvp_seqc;
+
+ cache_fplookup_parse_advance(fpl);
+ cache_fpl_checkpoint(fpl, &fpl->snd);
+ }
+out:
+ switch (fpl->status) {
+ case CACHE_FPL_STATUS_UNSET:
+ __assert_unreachable();
+ break;
+ case CACHE_FPL_STATUS_PARTIAL:
+ cache_fpl_smr_assert_entered(fpl);
+ return (cache_fplookup_partial_setup(fpl));
+ case CACHE_FPL_STATUS_ABORTED:
+ if (fpl->in_smr)
+ cache_fpl_smr_exit(fpl);
+ return (CACHE_FPL_FAILED);
+ case CACHE_FPL_STATUS_HANDLED:
+ cache_fpl_smr_assert_not_entered(fpl);
+ if (__predict_false(error != 0)) {
+ ndp->ni_dvp = NULL;
+ ndp->ni_vp = NULL;
+ cache_fpl_cleanup_cnp(cnp);
+ return (error);
+ }
+ ndp->ni_dvp = fpl->dvp;
+ ndp->ni_vp = fpl->tvp;
+ if (cnp->cn_flags & SAVENAME)
+ cnp->cn_flags |= HASBUF;
+ else
+ cache_fpl_cleanup_cnp(cnp);
+ return (error);
+ }
+}
+
+/*
+ * Fast path lookup protected with SMR and sequence counters.
+ *
+ * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
+ *
+ * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
+ * outlined below.
+ *
+ * Traditional vnode lookup conceptually looks like this:
+ *
+ * vn_lock(current);
+ * for (;;) {
+ * next = find();
+ * vn_lock(next);
+ * vn_unlock(current);
+ * current = next;
+ * if (last)
+ * break;
+ * }
+ *
+ * Each jump to the next vnode is safe memory-wise and atomic with respect to
+ * any modifications thanks to holding respective locks.
+ *
+ * The same guarantee can be provided with a combination of safe memory
+ * reclamation and sequence counters instead. If all operations which affect
+ * the relationship between the current vnode and the one we are looking for
+ * also modify the counter, we can verify whether all the conditions held as
+ * we made the jump. This includes things like permissions, mount point etc.
+ * You can grep for vn_seqc_write_begin to check all the places.
+ *
+ * Thus this translates to:
+ *
+ * vfs_smr_enter();
+ * current_seqc = seqc_read_any(current);
+ * if (seqc_in_modify(current_seqc)) // someone is altering the vnode
+ * abort();
+ * for (;;) {
+ * next = find();
+ * next_seqc = seqc_read_any(next);
+ * if (!seqc_consistent(current, current_seqc) // someone is altering the vnode
+ * abort();
+ * current = next; // we know nothing of importance has changed
+ * current_seqc = next_seqc; // store the counter for the next iteration
+ * if (last)
+ * break;
+ * }
+ *
+ * API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
+ * - they are called while within vfs_smr protection which they must never exit
+ * - EAGAIN can be returned to denote checking could not be performed, it is
+ * always valid to return it
+ * - if the sequence counter has not changed the result must be valid
+ * - if the sequence counter has changed both false positives and false negatives
+ * are permitted (since the result will be rejected later)
+ * - for simple cases of unix permission checks vaccess_vexec_smr can be used
+ *
+ * Caveats to watch out for:
+ * - vnodes are passed unlocked and unreferenced with nothing stopping
+ * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
+ * to use atomic_load_ptr to fetch it.
+ * - aforementioned object can also get freed, meaning absent other means it
+ * should be protected with vfs_smr
+ * - either safely checking permissions as they are modified or guaranteeing
+ * their stability is left to the routine
+ */
+int
+cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
+ struct pwd **pwdp)
+{
+ struct cache_fpl fpl;
+ struct pwd *pwd;
+ struct vnode *dvp;
+ struct componentname *cnp;
+ struct nameidata_saved orig;
+ int error;
+
+ *status = CACHE_FPL_STATUS_UNSET;
+ bzero(&fpl, sizeof(fpl));
+ fpl.status = CACHE_FPL_STATUS_UNSET;
+ fpl.ndp = ndp;
+ fpl.cnp = &ndp->ni_cnd;
+ MPASS(curthread == fpl.cnp->cn_thread);
+
+ if (!cache_can_fplookup(&fpl)) {
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
+ *status = fpl.status;
+ return (EOPNOTSUPP);
+ }
+
+ cache_fpl_checkpoint(&fpl, &orig);
+
+ cache_fpl_smr_enter(&fpl);
+ pwd = pwd_get_smr();
+ fpl.pwd = pwd;
+ ndp->ni_rootdir = pwd->pwd_rdir;
+ ndp->ni_topdir = pwd->pwd_jdir;
+
+ cnp = fpl.cnp;
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+ if (cnp->cn_pnbuf[0] == '/') {
+ cache_fpl_handle_root_initial(ndp, &dvp);
+ } else {
+ MPASS(ndp->ni_dirfd == AT_FDCWD);
+ dvp = pwd->pwd_cdir;
+ }
+
+ error = cache_fplookup_impl(dvp, &fpl);
+ cache_fpl_smr_assert_not_entered(&fpl);
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
+
+ *status = fpl.status;
+ switch (fpl.status) {
+ case CACHE_FPL_STATUS_UNSET:
+ __assert_unreachable();
+ break;
+ case CACHE_FPL_STATUS_HANDLED:
+ break;
+ case CACHE_FPL_STATUS_PARTIAL:
+ *pwdp = fpl.pwd;
+ cache_fpl_restore(&fpl, &fpl.snd);
+ break;
+ case CACHE_FPL_STATUS_ABORTED:
+ cache_fpl_restore(&fpl, &orig);
+ break;
+ }
+ return (error);
+}
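
The long comment above cache_fplookup is the core of the design: each hop from dvp to tvp is validated by sampling tvp's sequence counter and then re-checking that dvp's counter has not moved, so the walk either observed a state that genuinely existed or it gives up. A compressed sketch of one such walk, using only primitives introduced in this diff; example_find_child() and example_is_last_component() are stand-ins for the namecache hash walk and the parser:

    static int
    example_fplookup_walk(struct vnode *dvp, struct nameidata *ndp)
    {
            struct vnode *tvp;
            seqc_t dvp_seqc, tvp_seqc;

            vfs_smr_enter();
            dvp_seqc = seqc_read_any(&dvp->v_seqc);
            if (seqc_in_modify(dvp_seqc))
                    goto abort;                     /* dvp is being modified */
            for (;;) {
                    tvp = example_find_child(dvp, ndp);     /* hash walk stand-in */
                    if (tvp == NULL)
                            goto abort;
                    tvp_seqc = seqc_read_any(&tvp->v_seqc);
                    if (seqc_in_modify(tvp_seqc))
                            goto abort;
                    if (!seqc_consistent(&dvp->v_seqc, dvp_seqc))
                            goto abort;             /* dvp changed under us */
                    dvp = tvp;                      /* the observed edge was valid */
                    dvp_seqc = tvp_seqc;
                    if (example_is_last_component(ndp))
                            break;
            }
            /* The real code now does vget_prep_smr/vget_finish + a final seqc check. */
            vfs_smr_exit();
            return (0);
    abort:
            vfs_smr_exit();
            return (CACHE_FPL_FAILED);
    }
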
Index: sys/kern/vfs_lookup.c
===================================================================
--- sys/kern/vfs_lookup.c
+++ sys/kern/vfs_lookup.c
@@ -280,77 +280,21 @@
return (0);
}
-/*
- * Convert a pathname into a pointer to a locked vnode.
- *
- * The FOLLOW flag is set when symbolic links are to be followed
- * when they occur at the end of the name translation process.
- * Symbolic links are always followed for all other pathname
- * components other than the last.
- *
- * The segflg defines whether the name is to be copied from user
- * space or kernel space.
- *
- * Overall outline of namei:
- *
- * copy in name
- * get starting directory
- * while (!done && !error) {
- * call lookup to search path.
- * if symbolic link, massage name in buffer and continue
- * }
- */
-int
-namei(struct nameidata *ndp)
+static int
+namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
{
- char *cp; /* pointer into pathname argument */
- struct vnode *dp; /* the directory we are searching */
- struct iovec aiov; /* uio for reading symbolic links */
struct componentname *cnp;
struct file *dfp;
struct thread *td;
- struct proc *p;
struct pwd *pwd;
cap_rights_t rights;
struct filecaps dirfd_caps;
- struct uio auio;
- int error, linklen, startdir_used;
+ int error, startdir_used;
cnp = &ndp->ni_cnd;
td = cnp->cn_thread;
- p = td->td_proc;
- ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
- KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
- KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
- ("namei: nameiop contaminated with flags"));
- KASSERT((cnp->cn_flags & OPMASK) == 0,
- ("namei: flags contaminated with nameiops"));
- MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
- ndp->ni_startdir->v_type == VBAD);
- TAILQ_INIT(&ndp->ni_cap_tracker);
- ndp->ni_lcf = 0;
-
- /* We will set this ourselves if we need it. */
- cnp->cn_flags &= ~TRAILINGSLASH;
- /*
- * Get a buffer for the name to be translated, and copy the
- * name into the buffer.
- */
- if ((cnp->cn_flags & HASBUF) == 0)
- cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
- if (ndp->ni_segflg == UIO_SYSSPACE)
- error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
- &ndp->ni_pathlen);
- else
- error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
- &ndp->ni_pathlen);
-
- /*
- * Don't allow empty pathnames.
- */
- if (error == 0 && *cnp->cn_pnbuf == '\0')
- error = ENOENT;
+ *pwdp = NULL;
#ifdef CAPABILITY_MODE
/*
@@ -366,24 +310,17 @@
* previously walked by us, which prevents an escape from
* the relative root.
*/
- if (error == 0 && IN_CAPABILITY_MODE(td) &&
- (cnp->cn_flags & NOCAPCHECK) == 0) {
+ if (IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) {
ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
if (ndp->ni_dirfd == AT_FDCWD) {
#ifdef KTRACE
if (KTRPOINT(td, KTR_CAPFAIL))
ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
#endif
- error = ECAPMODE;
+ return (ECAPMODE);
}
}
#endif
- if (error != 0) {
- namei_cleanup_cnp(cnp);
- ndp->ni_vp = NULL;
- return (error);
- }
- ndp->ni_loopcnt = 0;
#ifdef KTRACE
if (KTRPOINT(td, KTR_NAMEI)) {
KASSERT(cnp->cn_thread == curthread,
@@ -391,6 +328,8 @@
ktrnamei(cnp->cn_pnbuf);
}
#endif
+ error = 0;
+
/*
* Get starting point for the translation.
*/
@@ -402,19 +341,16 @@
ndp->ni_rootdir = pwd->pwd_rdir;
ndp->ni_topdir = pwd->pwd_jdir;
- startdir_used = 0;
- dp = NULL;
- cnp->cn_nameptr = cnp->cn_pnbuf;
if (cnp->cn_pnbuf[0] == '/') {
ndp->ni_resflags |= NIRES_ABS;
- error = namei_handle_root(ndp, &dp);
+ error = namei_handle_root(ndp, dpp);
} else {
if (ndp->ni_startdir != NULL) {
- dp = ndp->ni_startdir;
+ *dpp = ndp->ni_startdir;
startdir_used = 1;
} else if (ndp->ni_dirfd == AT_FDCWD) {
- dp = pwd->pwd_cdir;
- vrefact(dp);
+ *dpp = pwd->pwd_cdir;
+ vrefact(*dpp);
} else {
rights = ndp->ni_rightsneeded;
cap_rights_set_one(&rights, CAP_LOOKUP);
@@ -441,8 +377,8 @@
} else if (dfp->f_vnode == NULL) {
error = ENOTDIR;
} else {
- dp = dfp->f_vnode;
- vrefact(dp);
+ *dpp = dfp->f_vnode;
+ vrefact(*dpp);
if ((dfp->f_flag & FSEARCH) != 0)
cnp->cn_flags |= NOEXECCHECK;
@@ -464,7 +400,7 @@
}
#endif
}
- if (error == 0 && dp->v_type != VDIR)
+ if (error == 0 && (*dpp)->v_type != VDIR)
error = ENOTDIR;
}
if (error == 0 && (cnp->cn_flags & BENEATH) != 0) {
@@ -476,7 +412,7 @@
cap_rights_set_one(&rights, CAP_LOOKUP);
error = fgetvp_rights(td, ndp->ni_dirfd, &rights,
&dirfd_caps, &ndp->ni_beneath_latch);
- if (error == 0 && dp->v_type != VDIR) {
+ if (error == 0 && (*dpp)->v_type != VDIR) {
vrele(ndp->ni_beneath_latch);
error = ENOTDIR;
}
@@ -488,15 +424,15 @@
* If we are auditing the kernel pathname, save the user pathname.
*/
if (cnp->cn_flags & AUDITVNODE1)
- AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, dp, cnp->cn_pnbuf);
+ AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
if (cnp->cn_flags & AUDITVNODE2)
- AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, dp, cnp->cn_pnbuf);
+ AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
if (ndp->ni_startdir != NULL && !startdir_used)
vrele(ndp->ni_startdir);
if (error != 0) {
- if (dp != NULL)
- vrele(dp);
- goto out;
+ if (*dpp != NULL)
+ vrele(*dpp);
+ return (error);
}
MPASS((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_LATCH)) !=
NI_LCF_BENEATH_ABS);
@@ -505,8 +441,124 @@
((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
(cnp->cn_flags & BENEATH) != 0))
ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
- SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
+ SDT_PROBE3(vfs, namei, lookup, entry, *dpp, cnp->cn_pnbuf,
cnp->cn_flags);
+ *pwdp = pwd;
+ return (0);
+}
+
+/*
+ * Convert a pathname into a pointer to a locked vnode.
+ *
+ * The FOLLOW flag is set when symbolic links are to be followed
+ * when they occur at the end of the name translation process.
+ * Symbolic links are always followed for all other pathname
+ * components other than the last.
+ *
+ * The segflg defines whether the name is to be copied from user
+ * space or kernel space.
+ *
+ * Overall outline of namei:
+ *
+ * copy in name
+ * get starting directory
+ * while (!done && !error) {
+ * call lookup to search path.
+ * if symbolic link, massage name in buffer and continue
+ * }
+ */
+int
+namei(struct nameidata *ndp)
+{
+ char *cp; /* pointer into pathname argument */
+ struct vnode *dp; /* the directory we are searching */
+ struct iovec aiov; /* uio for reading symbolic links */
+ struct componentname *cnp;
+ struct thread *td;
+ struct proc *p;
+ struct pwd *pwd;
+ struct uio auio;
+ int error, linklen;
+ enum cache_fpl_status status;
+
+ cnp = &ndp->ni_cnd;
+ td = cnp->cn_thread;
+ p = td->td_proc;
+ ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
+ KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
+ KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
+ ("namei: nameiop contaminated with flags"));
+ KASSERT((cnp->cn_flags & OPMASK) == 0,
+ ("namei: flags contaminated with nameiops"));
+ MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
+ ndp->ni_startdir->v_type == VBAD);
+ TAILQ_INIT(&ndp->ni_cap_tracker);
+ ndp->ni_lcf = 0;
+ ndp->ni_loopcnt = 0;
+ dp = NULL;
+
+ /* We will set this ourselves if we need it. */
+ cnp->cn_flags &= ~TRAILINGSLASH;
+
+ ndp->ni_vp = NULL;
+
+ /*
+ * Get a buffer for the name to be translated, and copy the
+ * name into the buffer.
+ */
+ if ((cnp->cn_flags & HASBUF) == 0)
+ cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
+ if (ndp->ni_segflg == UIO_SYSSPACE)
+ error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+ &ndp->ni_pathlen);
+ else
+ error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+ &ndp->ni_pathlen);
+
+ if (error != 0) {
+ namei_cleanup_cnp(cnp);
+ return (error);
+ }
+
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+
+ /*
+ * Don't allow empty pathnames.
+ */
+ if (*cnp->cn_pnbuf == '\0') {
+ namei_cleanup_cnp(cnp);
+ return (ENOENT);
+ }
+
+ /*
+ * First try the fast path.
+ *
+ * If it fails to handle the lookup, we are going to perform it below.
+ * Note this means that we either start from scratch or continue where it
+ * left off.
+ */
+ error = cache_fplookup(ndp, &status, &pwd);
+ switch (status) {
+ case CACHE_FPL_STATUS_UNSET:
+ __assert_unreachable();
+ break;
+ case CACHE_FPL_STATUS_HANDLED:
+ return (error);
+ case CACHE_FPL_STATUS_PARTIAL:
+ dp = ndp->ni_startdir;
+ break;
+ case CACHE_FPL_STATUS_ABORTED:
+ error = namei_setup(ndp, &dp, &pwd);
+ if (error != 0) {
+ namei_cleanup_cnp(cnp);
+ return (error);
+ }
+ break;
+ }
+
+ /*
+ * Perform the lookup.
+ */
for (;;) {
ndp->ni_startdir = dp;
error = lookup(ndp);
Index: sys/kern/vfs_mount.c
===================================================================
--- sys/kern/vfs_mount.c
+++ sys/kern/vfs_mount.c
@@ -947,6 +947,7 @@
vput(vp);
return (error);
}
+ vn_seqc_write_begin(vp);
VOP_UNLOCK(vp);
/* Allocate and initialize the filesystem. */
@@ -979,9 +980,11 @@
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
+ vn_seqc_write_end(vp);
vrele(vp);
return (error);
}
+ vn_seqc_write_begin(newdp);
VOP_UNLOCK(newdp);
if (mp->mnt_opt != NULL)
@@ -1018,6 +1021,8 @@
EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td);
VOP_UNLOCK(newdp);
mountcheckdirs(vp, newdp);
+ vn_seqc_write_end(vp);
+ vn_seqc_write_end(newdp);
vrele(newdp);
if ((mp->mnt_flag & MNT_RDONLY) == 0)
vfs_allocate_syncvnode(mp);
@@ -1094,7 +1099,9 @@
VOP_UNLOCK(vp);
vfs_op_enter(mp);
+ vn_seqc_write_begin(vp);
+ rootvp = NULL;
MNT_ILOCK(mp);
if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
MNT_IUNLOCK(mp);
@@ -1108,8 +1115,6 @@
mp->mnt_kern_flag &= ~MNTK_ASYNC;
rootvp = vfs_cache_root_clear(mp);
MNT_IUNLOCK(mp);
- if (rootvp != NULL)
- vrele(rootvp);
mp->mnt_optnew = *optlist;
vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
@@ -1233,6 +1238,11 @@
vfs_deallocate_syncvnode(mp);
end:
vfs_op_exit(mp);
+ if (rootvp != NULL) {
+ vn_seqc_write_end(rootvp);
+ vrele(rootvp);
+ }
+ vn_seqc_write_end(vp);
vfs_unbusy(mp);
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
@@ -1723,14 +1733,19 @@
}
mp->mnt_kern_flag |= MNTK_UNMOUNT;
rootvp = vfs_cache_root_clear(mp);
+ if (coveredvp != NULL)
+ vn_seqc_write_begin(coveredvp);
if (flags & MNT_NONBUSY) {
MNT_IUNLOCK(mp);
error = vfs_check_usecounts(mp);
MNT_ILOCK(mp);
if (error != 0) {
+ vn_seqc_write_end(coveredvp);
dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT);
- if (rootvp != NULL)
+ if (rootvp != NULL) {
+ vn_seqc_write_end(rootvp);
vrele(rootvp);
+ }
return (error);
}
}
@@ -1759,22 +1774,19 @@
("%s: invalid return value for msleep in the drain path @ %s:%d",
__func__, __FILE__, __LINE__));
- if (rootvp != NULL)
+ /*
+ * We want to keep the vnode around so that we can vn_seqc_write_end
+ * after we are done with unmount. Downgrade our reference to a mere
+ * hold count so that we don't interfere with anything.
+ */
+ if (rootvp != NULL) {
+ vhold(rootvp);
vrele(rootvp);
+ }
if (mp->mnt_flag & MNT_EXPUBLIC)
vfs_setpublicfs(NULL, NULL, NULL);
- /*
- * From now, we can claim that the use reference on the
- * coveredvp is ours, and the ref can be released only by
- * successfull unmount by us, or left for later unmount
- * attempt. The previously acquired hold reference is no
- * longer needed to protect the vnode from reuse.
- */
- if (coveredvp != NULL)
- vdrop(coveredvp);
-
vfs_periodic(mp, MNT_WAIT);
MNT_ILOCK(mp);
async_flag = mp->mnt_flag & MNT_ASYNC;
@@ -1809,8 +1821,15 @@
}
vfs_op_exit_locked(mp);
MNT_IUNLOCK(mp);
- if (coveredvp)
+ if (coveredvp) {
+ vn_seqc_write_end(coveredvp);
VOP_UNLOCK(coveredvp);
+ vdrop(coveredvp);
+ }
+ if (rootvp != NULL) {
+ vn_seqc_write_end(rootvp);
+ vdrop(rootvp);
+ }
return (error);
}
mtx_lock(&mountlist_mtx);
@@ -1819,7 +1838,13 @@
EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td);
if (coveredvp != NULL) {
coveredvp->v_mountedhere = NULL;
+ vn_seqc_write_end(coveredvp);
VOP_UNLOCK(coveredvp);
+ vdrop(coveredvp);
+ }
+ if (rootvp != NULL) {
+ vn_seqc_write_end(rootvp);
+ vdrop(rootvp);
}
vfs_event_signal(NULL, VQ_UNMOUNT, 0);
if (rootvnode != NULL && mp == rootvnode->v_mount) {
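
dounmount used to vrele the cached root vnode as soon as it was cleared; with sequence counters the write section opened in vfs_cache_root_clear() has to stay open until the unmount is decided, so the use reference is downgraded to a hold (vhold + vrele) to keep the vnode's memory valid, and only after vn_seqc_write_end() is the hold dropped with vdrop(). A sketch of that pattern in isolation, with the surrounding unmount logic elided:

    /*
     * Keep the vnode's memory alive across a long operation so the seqc
     * write section opened earlier can still be closed afterwards.
     */
    vn_seqc_write_begin(rootvp);    /* done inside vfs_cache_root_clear() here */
    vhold(rootvp);                  /* memory stays valid ... */
    vrele(rootvp);                  /* ... but the use reference is given up */
    /* ... unmount proceeds, possibly dooming rootvp ... */
    vn_seqc_write_end(rootvp);
    vdrop(rootvp);
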
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -664,8 +664,8 @@
vnode_list_reclaim_marker = vn_alloc_marker(NULL);
TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
- vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
- vfs_smr = uma_zone_get_smr(vnode_zone);
+ vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
+ uma_zone_set_smr(vnode_zone, vfs_smr);
vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
/*
@@ -1761,6 +1761,7 @@
*/
CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
bo = &vp->v_bufobj;
+ VNPASS(vp->v_seqc_users == 0, vp);
VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
@@ -2889,6 +2890,17 @@
return (vs);
}
+void
+vget_abort(struct vnode *vp, enum vgetstate vs)
+{
+
+ VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
+ if (vs == VGET_USECOUNT)
+ vrele(vp);
+ else
+ vdrop(vp);
+}
+
int
vget(struct vnode *vp, int flags, struct thread *td)
{
@@ -2951,10 +2963,7 @@
error = vn_lock(vp, flags);
if (__predict_false(error != 0)) {
- if (vs == VGET_USECOUNT)
- vrele(vp);
- else
- vdrop(vp);
+ vget_abort(vp, vs);
CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
vp);
return (error);
@@ -3032,6 +3041,44 @@
return;
}
+bool
+vref_smr(struct vnode *vp)
+{
+ int old;
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ VFS_SMR_ASSERT_ENTERED();
+
+ /*
+ * Devices are not supported since they may require taking the interlock.
+ */
+ VNPASS(vp->v_type != VCHR, vp);
+
+ if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
+ VNODE_REFCOUNT_FENCE_ACQ();
+ VNPASS(vp->v_holdcnt > 0, vp);
+ return (true);
+ }
+
+ if (!vhold_smr(vp))
+ return (false);
+
+ /*
+ * See the comment in vget_finish.
+ */
+ old = atomic_fetchadd_int(&vp->v_usecount, 1);
+ VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old));
+ if (old != 0) {
+#ifdef INVARIANTS
+ old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
+ VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
+#else
+ refcount_release(&vp->v_holdcnt);
+#endif
+ }
+ return (true);
+}
+
void
vref(struct vnode *vp)
{
@@ -3986,6 +4033,7 @@
*/
if (vp->v_irflag & VIRF_DOOMED)
return;
+ vn_seqc_write_begin_locked(vp);
vunlazy_gone(vp);
vp->v_irflag |= VIRF_DOOMED;
@@ -4088,6 +4136,7 @@
vp->v_vnlock = &vp->v_lock;
vp->v_op = &dead_vnodeops;
vp->v_type = VBAD;
+ vn_seqc_write_end_locked(vp);
}
/*
@@ -4128,8 +4177,9 @@
printf("%p: ", (void *)vp);
printf("type %s\n", typename[vp->v_type]);
holdcnt = atomic_load_int(&vp->v_holdcnt);
- printf(" usecount %d, writecount %d, refcount %d",
- vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS);
+ printf(" usecount %d, writecount %d, refcount %d seqc users %d",
+ vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS,
+ vp->v_seqc_users);
switch (vp->v_type) {
case VDIR:
printf(" mountedhere %p\n", vp->v_mountedhere);
@@ -4381,6 +4431,7 @@
MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
MNT_KERN_FLAG(MNTK_MARKER);
MNT_KERN_FLAG(MNTK_USES_BCACHE);
+ MNT_KERN_FLAG(MNTK_FPLOOKUP);
MNT_KERN_FLAG(MNTK_NOASYNC);
MNT_KERN_FLAG(MNTK_UNMOUNT);
MNT_KERN_FLAG(MNTK_MWAIT);
@@ -5196,6 +5247,38 @@
return (error == 0);
}
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ *
+ * We never deny as priv_check_cred calls are not yet supported, see vaccess.
+ */
+int
+vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
+{
+
+ VFS_SMR_ASSERT_ENTERED();
+
+ /* Check the owner. */
+ if (cred->cr_uid == file_uid) {
+ if (file_mode & S_IXUSR)
+ return (0);
+ return (EAGAIN);
+ }
+
+ /* Otherwise, check the groups (first match) */
+ if (groupmember(file_gid, cred)) {
+ if (file_mode & S_IXGRP)
+ return (0);
+ return (EAGAIN);
+ }
+
+ /* Otherwise, check everyone else. */
+ if (file_mode & S_IXOTH)
+ return (0);
+ return (EAGAIN);
+}
+
/*
* Common filesystem object access control check routine. Accepts a
* vnode's type, "mode", uid and gid, requested access mode, credentials,
@@ -5476,6 +5559,14 @@
ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
#endif
+ /*
+ * It may be tempting to add vn_seqc_write_begin/end calls here and
+ * in vop_rename_post but that's not going to work out since some
+ * filesystems relookup vnodes mid-rename. This is probably a bug.
+ *
+ * For now filesystems are expected to do the relevant calls after they
+ * decide what vnodes to operate on.
+ */
if (a->a_tdvp != a->a_fdvp)
vhold(a->a_fdvp);
if (a->a_tvp != a->a_fvp)
@@ -5486,6 +5577,20 @@
}
#ifdef DEBUG_VFS_LOCKS
+void
+vop_fplookup_vexec_pre(void *ap __unused)
+{
+
+ VFS_SMR_ASSERT_ENTERED();
+}
+
+void
+vop_fplookup_vexec_post(void *ap __unused, int rc __unused)
+{
+
+ VFS_SMR_ASSERT_ENTERED();
+}
+
void
vop_strategy_pre(void *ap)
{
@@ -5565,11 +5670,26 @@
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
+void
+vop_deleteextattr_pre(void *ap)
+{
+ struct vop_deleteextattr_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_begin(vp);
+}
+
void
vop_deleteextattr_post(void *ap, int rc)
{
- struct vop_deleteextattr_args *a = ap;
+ struct vop_deleteextattr_args *a;
+ struct vnode *vp;
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_end(vp);
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}
@@ -5672,22 +5792,74 @@
}
}
+void
+vop_setattr_pre(void *ap)
+{
+ struct vop_setattr_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_begin(vp);
+}
+
void
vop_setattr_post(void *ap, int rc)
{
- struct vop_setattr_args *a = ap;
+ struct vop_setattr_args *a;
+ struct vnode *vp;
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_end(vp);
if (!rc)
- VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+ VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
+}
+
+void
+vop_setacl_pre(void *ap)
+{
+ struct vop_setacl_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_begin(vp);
+}
+
+void
+vop_setacl_post(void *ap, int rc __unused)
+{
+ struct vop_setacl_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_end(vp);
+}
+
+void
+vop_setextattr_pre(void *ap)
+{
+ struct vop_setextattr_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_begin(vp);
}
void
vop_setextattr_post(void *ap, int rc)
{
- struct vop_setextattr_args *a = ap;
+ struct vop_setextattr_args *a;
+ struct vnode *vp;
+ a = ap;
+ vp = a->a_vp;
+ vn_seqc_write_end(vp);
if (!rc)
- VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+ VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
}
void
@@ -6249,6 +6421,8 @@
*/
MPASS(mp->mnt_vfs_ops > 0);
vp = mp->mnt_rootvnode;
+ if (vp != NULL)
+ vn_seqc_write_begin(vp);
mp->mnt_rootvnode = NULL;
return (vp);
}
@@ -6545,3 +6719,45 @@
return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread));
}
+
+void
+vn_seqc_write_begin_locked(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __func__);
+ VNPASS(vp->v_holdcnt > 0, vp);
+ VNPASS(vp->v_seqc_users >= 0, vp);
+ vp->v_seqc_users++;
+ if (vp->v_seqc_users == 1)
+ seqc_sleepable_write_begin(&vp->v_seqc);
+}
+
+void
+vn_seqc_write_begin(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ vn_seqc_write_begin_locked(vp);
+ VI_UNLOCK(vp);
+}
+
+void
+vn_seqc_write_end_locked(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __func__);
+ VNPASS(vp->v_holdcnt > 0, vp);
+ VNPASS(vp->v_seqc_users > 0, vp);
+ vp->v_seqc_users--;
+ if (vp->v_seqc_users == 0)
+ seqc_sleepable_write_end(&vp->v_seqc);
+}
+
+void
+vn_seqc_write_end(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ vn_seqc_write_end_locked(vp);
+ VI_UNLOCK(vp);
+}
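
vn_seqc_write_begin/_end keep a per-vnode nesting count (v_seqc_users) under the vnode interlock, so write sections may nest: only the outermost begin flips v_seqc to the odd "in modify" state and only the matching outermost end releases it. That is what makes it safe for, say, vop_setattr_pre/post to run inside a bracket the caller already opened. A small hedged sketch of the nesting behaviour:

    vn_seqc_write_begin(vp);        /* v_seqc_users 0 -> 1, v_seqc becomes odd */
    vn_seqc_write_begin(vp);        /* 1 -> 2, counter untouched */
    /* ... e.g. a setattr bracket nested inside a mount/unmount bracket ... */
    vn_seqc_write_end(vp);          /* 2 -> 1 */
    vn_seqc_write_end(vp);          /* 1 -> 0, v_seqc advances to an even value */
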
Index: sys/kern/vnode_if.src
===================================================================
--- sys/kern/vnode_if.src
+++ sys/kern/vnode_if.src
@@ -142,6 +142,17 @@
};
+%% fplookup_vexec vp - - -
+%! fplookup_vexec pre vop_fplookup_vexec_pre
+%! fplookup_vexec post vop_fplookup_vexec_post
+
+vop_fplookup_vexec {
+ IN struct vnode *vp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
%% access vp L L L
vop_access {
@@ -172,6 +183,7 @@
%% setattr vp E E E
+%! setattr pre vop_setattr_pre
%! setattr post vop_setattr_post
vop_setattr {
@@ -523,6 +535,8 @@
%% setacl vp E E E
+%! setacl pre vop_setacl_pre
+%! setacl post vop_setacl_post
vop_setacl {
IN struct vnode *vp;
@@ -589,6 +603,7 @@
%% deleteextattr vp E E E
+%! deleteextattr pre vop_deleteextattr_pre
%! deleteextattr post vop_deleteextattr_post
vop_deleteextattr {
@@ -601,6 +616,7 @@
%% setextattr vp E E E
+%! setextattr pre vop_setextattr_pre
%! setextattr post vop_setextattr_post
vop_setextattr {
Index: sys/security/mac/mac_framework.h
===================================================================
--- sys/security/mac/mac_framework.h
+++ sys/security/mac/mac_framework.h
@@ -422,13 +422,14 @@
int mac_vnode_check_lookup_impl(struct ucred *cred, struct vnode *dvp,
struct componentname *cnp);
extern bool mac_vnode_check_lookup_fp_flag;
+#define mac_vnode_check_lookup_enabled() __predict_false(mac_vnode_check_lookup_fp_flag)
static inline int
mac_vnode_check_lookup(struct ucred *cred, struct vnode *dvp,
struct componentname *cnp)
{
mac_vnode_assert_locked(dvp, "mac_vnode_check_lookup");
- if (__predict_false(mac_vnode_check_lookup_fp_flag))
+ if (mac_vnode_check_lookup_enabled())
return (mac_vnode_check_lookup_impl(cred, dvp, cnp));
return (0);
}
Index: sys/sys/_seqc.h
===================================================================
--- /dev/null
+++ sys/sys/_seqc.h
@@ -0,0 +1,6 @@
+#ifndef _SYS__SEQC_H_
+#define _SYS__SEQC_H_
+
+typedef uint32_t seqc_t;
+
+#endif /* _SYS__SEQC_H_ */
Index: sys/sys/filedesc.h
===================================================================
--- sys/sys/filedesc.h
+++ sys/sys/filedesc.h
@@ -310,6 +310,7 @@
smr_serialized_store(&fdp->fd_pwd, newpwd,
(FILEDESC_XLOCK_ASSERT(fdp), true));
}
+struct pwd *pwd_get_smr(void);
#endif /* _KERNEL */
Index: sys/sys/mount.h
===================================================================
--- sys/sys/mount.h
+++ sys/sys/mount.h
@@ -420,6 +420,7 @@
#define MNTK_TEXT_REFS 0x00008000 /* Keep use ref for text */
#define MNTK_VMSETSIZE_BUG 0x00010000
#define MNTK_UNIONFS 0x00020000 /* A hack for F_ISUNIONSTACK */
+#define MNTK_FPLOOKUP 0x00040000 /* fast path lookup is supported */
#define MNTK_NOASYNC 0x00800000 /* disable async */
#define MNTK_UNMOUNT 0x01000000 /* unmount in progress */
#define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */
Index: sys/sys/namei.h
===================================================================
--- sys/sys/namei.h
+++ sys/sys/namei.h
@@ -108,6 +108,12 @@
};
#ifdef _KERNEL
+
+enum cache_fpl_status { CACHE_FPL_STATUS_ABORTED, CACHE_FPL_STATUS_PARTIAL,
+ CACHE_FPL_STATUS_HANDLED, CACHE_FPL_STATUS_UNSET };
+int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
+ struct pwd **pwdp);
+
/*
* namei operations
*/
Index: sys/sys/seqc.h
===================================================================
--- sys/sys/seqc.h
+++ sys/sys/seqc.h
@@ -36,7 +36,7 @@
/*
* seqc_t may be included in structs visible to userspace
*/
-typedef uint32_t seqc_t;
+#include <sys/_seqc.h>
#ifdef _KERNEL
@@ -111,5 +111,26 @@
return (seqc_consistent_nomb(seqcp, oldseqc));
}
+/*
+ * Variant which does not critical enter/exit.
+ */
+static __inline void
+seqc_sleepable_write_begin(seqc_t *seqcp)
+{
+
+ MPASS(!seqc_in_modify(*seqcp));
+ *seqcp += 1;
+ atomic_thread_fence_rel();
+}
+
+static __inline void
+seqc_sleepable_write_end(seqc_t *seqcp)
+{
+
+ atomic_thread_fence_rel();
+ *seqcp += 1;
+ MPASS(!seqc_in_modify(*seqcp));
+}
+
#endif /* _KERNEL */
#endif /* _SYS_SEQC_H_ */
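
seqc_sleepable_write_begin/_end differ from the existing seqc_write_begin/_end in that they do not enter a critical section: callers here already serialize writers via the vnode interlock and v_seqc_users, and readers never spin on the counter, they bail out to the locked path. The odd/even discipline itself is the usual seqlock one. A self-contained user-space analogue (C11 atomics, explicitly not kernel code) showing what seqc_in_modify and seqc_consistent actually test:

    /*
     * Standalone illustration of the odd/even seqc protocol; compile with
     * cc -std=c11.  Single-threaded on purpose: it only demonstrates the
     * bookkeeping the kernel inlines perform, fences mirror the _rel fences.
     */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t seqc_t;

    static _Atomic seqc_t seqc;             /* starts even: no writer active */
    static _Atomic int protected_value;

    static bool
    demo_seqc_in_modify(seqc_t s)
    {
            return ((s & 1) != 0);          /* odd means a write is in progress */
    }

    static seqc_t
    demo_seqc_read_any(void)
    {
            return (atomic_load_explicit(&seqc, memory_order_acquire));
    }

    static bool
    demo_seqc_consistent(seqc_t old)
    {
            atomic_thread_fence(memory_order_acquire);
            return (atomic_load_explicit(&seqc, memory_order_relaxed) == old);
    }

    static void
    demo_write(int v)
    {
            atomic_fetch_add_explicit(&seqc, 1, memory_order_relaxed);  /* -> odd */
            atomic_thread_fence(memory_order_release);
            atomic_store_explicit(&protected_value, v, memory_order_relaxed);
            atomic_thread_fence(memory_order_release);
            atomic_fetch_add_explicit(&seqc, 1, memory_order_relaxed);  /* -> even */
    }

    int
    main(void)
    {
            seqc_t s;
            int v;

            demo_write(42);
            s = demo_seqc_read_any();
            if (!demo_seqc_in_modify(s)) {
                    v = atomic_load_explicit(&protected_value, memory_order_relaxed);
                    if (demo_seqc_consistent(s))
                            printf("stable snapshot: %d\n", v);
            }
            return (0);
    }
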
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -45,6 +45,7 @@
#include <sys/uio.h>
#include <sys/acl.h>
#include <sys/ktr.h>
+#include <sys/_seqc.h>
/*
* The vnode is the focus of all file activity in UNIX. There is a
@@ -105,6 +106,7 @@
*/
enum vtype v_type:8; /* u vnode type */
short v_irflag; /* i frequently read flags */
+ seqc_t v_seqc; /* i modification count */
struct vop_vector *v_op; /* u vnode operations vector */
void *v_data; /* u private data for fs */
@@ -175,6 +177,7 @@
short v_dbatchcpu; /* i LRU requeue deferral batch */
int v_writecount; /* I ref count of writers or
(negative) text users */
+ int v_seqc_users; /* i modifications pending */
u_int v_hash;
};
@@ -539,6 +542,18 @@
#define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str))
#define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str))
+#define ASSERT_VOP_IN_SEQC(vp) do { \
+ struct vnode *_vp = (vp); \
+ \
+ VNPASS(seqc_in_modify(_vp->v_seqc), _vp); \
+} while (0)
+
+#define ASSERT_VOP_NOT_IN_SEQC(vp) do { \
+ struct vnode *_vp = (vp); \
+ \
+ VNPASS(!seqc_in_modify(_vp->v_seqc), _vp); \
+} while (0)
+
#else /* !DEBUG_VFS_LOCKS */
#define ASSERT_VI_LOCKED(vp, str) ((void)0)
@@ -546,6 +561,10 @@
#define ASSERT_VOP_ELOCKED(vp, str) ((void)0)
#define ASSERT_VOP_LOCKED(vp, str) ((void)0)
#define ASSERT_VOP_UNLOCKED(vp, str) ((void)0)
+
+#define ASSERT_VOP_IN_SEQC(vp) ((void)0)
+#define ASSERT_VOP_NOT_IN_SEQC(vp) ((void)0)
+
#endif /* DEBUG_VFS_LOCKS */
@@ -602,6 +621,7 @@
struct vattr;
struct vfsops;
struct vnode;
+struct pwd;
typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **);
@@ -619,6 +639,10 @@
void cache_purge(struct vnode *vp);
void cache_purge_negative(struct vnode *vp);
void cache_purgevfs(struct mount *mp, bool force);
+void vn_seqc_write_begin_locked(struct vnode *vp);
+void vn_seqc_write_begin(struct vnode *vp);
+void vn_seqc_write_end_locked(struct vnode *vp);
+void vn_seqc_write_end(struct vnode *vp);
int change_dir(struct vnode *vp, struct thread *td);
void cvtstat(struct stat *st, struct ostat *ost);
void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb);
@@ -644,6 +668,8 @@
int vn_commname(struct vnode *vn, char *buf, u_int buflen);
int vn_path_to_global_path(struct thread *td, struct vnode *vp,
char *path, u_int pathlen);
+int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid,
+ struct ucred *cred);
int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
gid_t file_gid, accmode_t accmode, struct ucred *cred,
int *privused);
@@ -663,6 +689,7 @@
enum vgetstate vget_prep_smr(struct vnode *vp);
enum vgetstate vget_prep(struct vnode *vp);
int vget_finish(struct vnode *vp, int flags, enum vgetstate vs);
+void vget_abort(struct vnode *vp, enum vgetstate vs);
void vgone(struct vnode *vp);
void vhold(struct vnode *);
void vholdl(struct vnode *);
@@ -805,6 +832,7 @@
/* These are called from within the actual VOPS. */
void vop_close_post(void *a, int rc);
void vop_create_post(void *a, int rc);
+void vop_deleteextattr_pre(void *a);
void vop_deleteextattr_post(void *a, int rc);
void vop_link_post(void *a, int rc);
void vop_lookup_post(void *a, int rc);
@@ -819,12 +847,18 @@
void vop_rename_post(void *a, int rc);
void vop_rename_pre(void *a);
void vop_rmdir_post(void *a, int rc);
+void vop_setattr_pre(void *a);
void vop_setattr_post(void *a, int rc);
+void vop_setacl_pre(void *a);
+void vop_setacl_post(void *a, int rc);
+void vop_setextattr_pre(void *a);
void vop_setextattr_post(void *a, int rc);
void vop_symlink_post(void *a, int rc);
int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a);
#ifdef DEBUG_VFS_LOCKS
+void vop_fplookup_vexec_pre(void *a);
+void vop_fplookup_vexec_post(void *a, int rc);
void vop_strategy_pre(void *a);
void vop_lock_pre(void *a);
void vop_lock_post(void *a, int rc);
@@ -832,6 +866,8 @@
void vop_need_inactive_pre(void *a);
void vop_need_inactive_post(void *a, int rc);
#else
+#define vop_fplookup_vexec_pre(x) do { } while (0)
+#define vop_fplookup_vexec_post(x, y) do { } while (0)
#define vop_strategy_pre(x) do { } while (0)
#define vop_lock_pre(x) do { } while (0)
#define vop_lock_post(x, y) do { } while (0)
@@ -901,6 +937,7 @@
void vput(struct vnode *vp);
void vrele(struct vnode *vp);
void vref(struct vnode *vp);
+bool vref_smr(struct vnode *vp);
void vrefl(struct vnode *vp);
void vrefact(struct vnode *vp);
void vrefactn(struct vnode *vp, u_int n);