Page MenuHomeFreeBSD

D25578.id74307.diff
No OneTemporary

D25578.id74307.diff

Index: sys/kern/vfs_cache.c
===================================================================
--- sys/kern/vfs_cache.c
+++ sys/kern/vfs_cache.c
@@ -55,6 +55,7 @@
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
+#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
@@ -67,6 +68,11 @@
#include <sys/ktrace.h>
#endif
+#include <sys/capsicum.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
#ifdef DDB
#include <ddb/ddb.h>
#endif
@@ -100,6 +106,10 @@
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
"char *");
+/*
+ * Fired once per fast path lookup attempt: arg0 is the nameidata being
+ * resolved (passed as a pointer), arg1 the source line which set the final
+ * status and arg2 the final enum cache_fpl_status value.
+ */
+SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata *", "int",
+    "int");
+/* Defined in vfs_lookup.c; declared here so the fast path can fire them. */
+SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
+SDT_PROBE_DECLARE(vfs, namei, lookup, return);
+
/*
* This structure describes the elements in the cache of recent
* names looked up by namei.
@@ -2810,3 +2820,838 @@
}
#endif
+
+/* Zone backing pathname buffers; defined in vfs_lookup.c. */
+extern uma_zone_t namei_zone;
+
+/* Runtime switch: when false every lookup is left to the regular path. */
+static bool __read_frequently cache_fast_lookup = true;
+SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
+    &cache_fast_lookup, 0, "Enable fast path name lookup");
+
+/*
+ * Internal return value meaning the fast path did not resolve the lookup.
+ * Parenthesized so the negative constant is safe inside any expression.
+ */
+#define	CACHE_FPL_FAILED	(-2020)
+
+/*
+ * Hand the pathname buffer back to namei_zone. DIAGNOSTIC kernels also
+ * clear the buffer pointers kept in the componentname.
+ */
+static void
+cache_fpl_cleanup_cnp(struct componentname *cnp)
+{
+	char *pnbuf;
+
+	pnbuf = cnp->cn_pnbuf;
+#ifdef DIAGNOSTIC
+	cnp->cn_nameptr = NULL;
+	cnp->cn_pnbuf = NULL;
+#endif
+	uma_zfree(namei_zone, pnbuf);
+}
+
+/*
+ * Begin an absolute lookup: step over every leading slash, keeping
+ * ni_pathlen in sync, and hand back the root directory as the starting
+ * point.
+ */
+static void
+cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
+{
+	struct componentname *cnp;
+
+	cnp = &ndp->ni_cnd;
+	for (; cnp->cn_nameptr[0] == '/'; cnp->cn_nameptr++)
+		ndp->ni_pathlen--;
+
+	*dpp = ndp->ni_rootdir;
+}
+
+/*
+ * Components of nameidata (or objects it can point to) which may
+ * need restoring in case fast path lookup fails.
+ *
+ * Filled by cache_fpl_checkpoint() and written back by cache_fpl_restore().
+ *
+ * NOTE(review): the namei entry probe describes cn_flags as "unsigned long";
+ * confirm that int is wide enough for every flag which may need restoring.
+ */
+struct nameidata_saved {
+ int cn_flags; /* copy of ndp->ni_cnd.cn_flags */
+ long cn_namelen; /* copy of ndp->ni_cnd.cn_namelen */
+ char *cn_nameptr; /* copy of ndp->ni_cnd.cn_nameptr */
+ size_t ni_pathlen; /* copy of ndp->ni_pathlen */
+};
+
+/*
+ * State of a single fast path lookup attempt.
+ */
+struct cache_fpl {
+ int line; /* source line which last set the status */
+ enum cache_fpl_status status; /* outcome so far, starts as UNSET */
+ bool in_smr; /* maintained by cache_fpl_smr_enter/exit */
+ struct nameidata *ndp; /* the lookup being serviced */
+ struct nameidata_saved snd; /* checkpoint taken before each component */
+ struct componentname *cnp; /* &ndp->ni_cnd */
+ struct vnode *dvp; /* directory currently being searched */
+ seqc_t dvp_seqc; /* sequence counter snapshot for dvp */
+ struct vnode *tvp; /* vnode found for the current component */
+ seqc_t tvp_seqc; /* sequence counter snapshot for tvp */
+ struct pwd *pwd; /* obtained with pwd_get_smr() at the start */
+};
+
+/*
+ * Record in *snd the parsing state of the lookup so that it can later be
+ * reinstated with cache_fpl_restore().
+ */
+static void
+cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
+{
+	struct nameidata *ndp;
+	struct componentname *cnp;
+
+	ndp = fpl->ndp;
+	cnp = &ndp->ni_cnd;
+
+	snd->ni_pathlen = ndp->ni_pathlen;
+	snd->cn_nameptr = cnp->cn_nameptr;
+	snd->cn_namelen = cnp->cn_namelen;
+	snd->cn_flags = cnp->cn_flags;
+}
+
+/*
+ * Reinstate the parsing state previously recorded in *snd by
+ * cache_fpl_checkpoint().
+ */
+static void
+cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
+{
+	struct nameidata *ndp;
+	struct componentname *cnp;
+
+	ndp = fpl->ndp;
+	cnp = &ndp->ni_cnd;
+
+	ndp->ni_pathlen = snd->ni_pathlen;
+	cnp->cn_nameptr = snd->cn_nameptr;
+	cnp->cn_namelen = snd->cn_namelen;
+	cnp->cn_flags = snd->cn_flags;
+}
+
+/*
+ * Assert that the in_smr bookkeeping agrees with the expected state.
+ * These are statements only and expand to nothing without INVARIANTS.
+ */
+#ifdef INVARIANTS
+#define	cache_fpl_smr_assert_entered(fpl) do {				\
+	struct cache_fpl *_fpl = (fpl);					\
+	MPASS(_fpl->in_smr == true);					\
+	VFS_SMR_ASSERT_ENTERED();					\
+} while (0)
+#define	cache_fpl_smr_assert_not_entered(fpl) do {			\
+	struct cache_fpl *_fpl = (fpl);					\
+	MPASS(_fpl->in_smr == false);					\
+	VFS_SMR_ASSERT_NOT_ENTERED();					\
+} while (0)
+#else
+#define	cache_fpl_smr_assert_entered(fpl) do { } while (0)
+#define	cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
+#endif
+
+/*
+ * Enter/exit the vfs SMR section while keeping fpl->in_smr in sync.
+ * To be used as statements.
+ */
+#define	cache_fpl_smr_enter(fpl) do {					\
+	struct cache_fpl *_fpl = (fpl);					\
+	MPASS(_fpl->in_smr == false);					\
+	vfs_smr_enter();						\
+	_fpl->in_smr = true;						\
+} while (0)
+
+#define	cache_fpl_smr_exit(fpl) do {					\
+	struct cache_fpl *_fpl = (fpl);					\
+	MPASS(_fpl->in_smr == true);					\
+	vfs_smr_exit();							\
+	_fpl->in_smr = false;						\
+} while (0)
+
+/*
+ * Mark the lookup as aborted, remembering the line which decided so.
+ * Only an unset or a partial status may be converted to aborted.
+ * Always returns CACHE_FPL_FAILED.
+ */
+static int
+cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
+{
+
+	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET ||
+	    fpl->status == CACHE_FPL_STATUS_PARTIAL,
+	    ("%s: converting to abort from %d at %d, set at %d\n",
+	    __func__, fpl->status, line, fpl->line));
+	fpl->line = line;
+	fpl->status = CACHE_FPL_STATUS_ABORTED;
+	return (CACHE_FPL_FAILED);
+}
+
+#define	cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
+
+/*
+ * Mark the lookup as partially handled, remembering the line which decided
+ * so. The status must still be unset and the caller must be within the SMR
+ * section. Always returns CACHE_FPL_FAILED.
+ */
+static int
+cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
+{
+
+	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
+	    ("%s: setting to partial at %d, but already set to %d at %d\n",
+	    __func__, line, fpl->status, fpl->line));
+	cache_fpl_smr_assert_entered(fpl);
+	fpl->line = line;
+	fpl->status = CACHE_FPL_STATUS_PARTIAL;
+	return (CACHE_FPL_FAILED);
+}
+
+#define	cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
+
+/*
+ * Mark the lookup as fully handled with the given result, remembering the
+ * line which decided so. The status must still be unset, the SMR section
+ * must already be exited and the result must not collide with
+ * CACHE_FPL_FAILED. Returns the result unchanged.
+ */
+static int
+cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
+{
+
+	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
+	    ("%s: setting to handled at %d, but already set to %d at %d\n",
+	    __func__, line, fpl->status, fpl->line));
+	cache_fpl_smr_assert_not_entered(fpl);
+	MPASS(error != CACHE_FPL_FAILED);
+	fpl->line = line;
+	fpl->status = CACHE_FPL_STATUS_HANDLED;
+	return (error);
+}
+
+#define	cache_fpl_handled(x, e)	cache_fpl_handled_impl((x), (e), __LINE__)
+
+/*
+ * Lookup flags the fast path knows how to handle. A request carrying any
+ * other flag is left to the regular lookup.
+ */
+#define CACHE_FPL_SUPPORTED_CN_FLAGS \
+ (LOCKLEAF | FOLLOW | LOCKSHARED | SAVENAME | ISOPEN | AUDITVNODE1)
+
+/*
+ * Decide whether the fast path may be attempted for this lookup at all.
+ *
+ * Returns true if so. Otherwise the status is set to aborted and false is
+ * returned. Every check calls cache_fpl_aborted() separately so that the
+ * recorded line identifies which condition was not met.
+ */
+static bool
+cache_can_fplookup(struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ struct thread *td;
+
+ ndp = fpl->ndp;
+ cnp = fpl->cnp;
+ td = cnp->cn_thread;
+
+ /* Disabled by the administrator. */
+ if (!cache_fast_lookup) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+#ifdef MAC
+ /* The fast path performs no MAC lookup checks. */
+ if (mac_vnode_check_lookup_enabled()) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+#endif
+ /* Any flag outside of the supported set. */
+ if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ /* cache_fplookup_final() asserts LOCKLEAF is set. */
+ if ((cnp->cn_flags & LOCKLEAF) == 0) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ /* Only plain lookups. */
+ if (cnp->cn_nameiop != LOOKUP) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ /* Relative lookups always start from the current directory here. */
+ if (ndp->ni_dirfd != AT_FDCWD) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (IN_CAPABILITY_MODE(td)) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ if (AUDITING_TD(td)) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ /* A caller-supplied starting directory is not supported. */
+ if (ndp->ni_startdir != NULL) {
+ cache_fpl_aborted(fpl);
+ return (false);
+ }
+ return (true);
+}
+
+/*
+ * Tell whether the fast path can deal with the vnode. Symbolic links are
+ * the only type turned down.
+ */
+static bool
+cache_fplookup_vnode_supported(struct vnode *vp)
+{
+
+	if (vp->v_type == VLNK)
+		return (false);
+	return (true);
+}
+
+/*
+ * The fast path cannot finish the lookup on its own, prepare for the slow
+ * path to take over from the directory reached so far.
+ *
+ * Called within the SMR section, which is exited on every path. On success
+ * 0 is returned, the directory is referenced and stored in ni_startdir and
+ * the reference obtained with pwd_hold() is kept. On failure the lookup is
+ * marked as aborted, no references are kept and CACHE_FPL_FAILED is
+ * returned.
+ */
+static int
+cache_fplookup_partial_setup(struct cache_fpl *fpl)
+{
+	struct vnode *dvp;
+	struct pwd *pwd;
+	seqc_t dvp_seqc;
+
+	dvp = fpl->dvp;
+	dvp_seqc = fpl->dvp_seqc;
+
+	if (!vref_smr(dvp)) {
+		cache_fpl_smr_exit(fpl);
+		return (cache_fpl_aborted(fpl));
+	}
+
+	cache_fpl_smr_exit(fpl);
+	/* The directory must not have changed since it was reached. */
+	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
+		vrele(dvp);
+		return (cache_fpl_aborted(fpl));
+	}
+
+	/*
+	 * The walk was performed against fpl->pwd. Should the thread see
+	 * a different pwd by now, start over.
+	 */
+	pwd = pwd_hold(curthread);
+	if (fpl->pwd != pwd) {
+		vrele(dvp);
+		pwd_drop(pwd);
+		return (cache_fpl_aborted(fpl));
+	}
+
+	fpl->ndp->ni_startdir = dvp;
+	return (0);
+}
+
+/*
+ * Finish the lookup once the last path component has been resolved to
+ * fpl->tvp.
+ *
+ * Called within the SMR section. The target vnode is obtained with
+ * vget_prep_smr()/vget_finish() using cn_lkflags and both the directory and
+ * the target are validated against their sequence counter snapshots.
+ * Returns 0 with the status set to handled on success and CACHE_FPL_FAILED
+ * with the status set to partial or aborted otherwise.
+ */
+static int
+cache_fplookup_final(struct cache_fpl *fpl)
+{
+ struct componentname *cnp;
+ enum vgetstate tvs;
+ struct vnode *dvp, *tvp;
+ seqc_t dvp_seqc, tvp_seqc;
+ int error;
+
+ cnp = fpl->cnp;
+ dvp = fpl->dvp;
+ dvp_seqc = fpl->dvp_seqc;
+ tvp = fpl->tvp;
+ tvp_seqc = fpl->tvp_seqc;
+
+ VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
+ MPASS((cnp->cn_flags & LOCKLEAF) != 0);
+
+ tvs = vget_prep_smr(tvp);
+ if (tvs == VGET_NONE) {
+ /* Still within SMR, as required for the partial status. */
+ return (cache_fpl_partial(fpl));
+ }
+
+ /* The directory changed while the target was being looked up. */
+ if (!vn_seqc_consistent(dvp, dvp_seqc)) {
+ cache_fpl_smr_exit(fpl);
+ vget_abort(tvp, tvs);
+ return (cache_fpl_aborted(fpl));
+ }
+
+ cache_fpl_smr_exit(fpl);
+
+ error = vget_finish(tvp, cnp->cn_lkflags, tvs);
+ if (error != 0) {
+ return (cache_fpl_aborted(fpl));
+ }
+
+ /* The target changed since its sequence counter was sampled. */
+ if (!vn_seqc_consistent(tvp, tvp_seqc)) {
+ vput(tvp);
+ return (cache_fpl_aborted(fpl));
+ }
+
+ return (cache_fpl_handled(fpl, 0));
+}
+
+/*
+ * Resolve the current path component within fpl->dvp using the name cache.
+ *
+ * On success 0 is returned with fpl->tvp and fpl->tvp_seqc filled in.
+ * A hit on a hot negative entry ends the lookup with ENOENT (status set to
+ * handled, SMR section exited). In all remaining cases CACHE_FPL_FAILED is
+ * returned with the status set to partial or aborted.
+ */
+static int
+cache_fplookup_next(struct cache_fpl *fpl)
+{
+ struct componentname *cnp;
+ struct namecache *ncp;
+ struct vnode *dvp, *tvp;
+ u_char nc_flag;
+ uint32_t hash;
+
+ cnp = fpl->cnp;
+ dvp = fpl->dvp;
+
+ /* "." resolves to the directory itself, no cache search needed. */
+ if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
+ fpl->tvp = dvp;
+ fpl->tvp_seqc = vn_seqc_read_any(dvp);
+ if (seqc_in_modify(fpl->tvp_seqc)) {
+ return (cache_fpl_aborted(fpl));
+ }
+ return (0);
+ }
+
+ hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+
+ /* ncp is NULL after the loop if no entry matched. */
+ CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+ counter_u64_add(numchecks, 1);
+ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
+ !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
+ break;
+ }
+
+ /*
+ * If there is no entry we have to punt to the slow path to perform
+ * actual lookup. Should there be nothing with this name a negative
+ * entry will be created.
+ */
+ if (__predict_false(ncp == NULL)) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ /* Sample the entry first, only then check whether it is still valid. */
+ tvp = atomic_load_ptr(&ncp->nc_vp);
+ nc_flag = atomic_load_char(&ncp->nc_flag);
+ if (__predict_false(cache_ncp_invalid(ncp))) {
+ return (cache_fpl_partial(fpl));
+ }
+ if (__predict_false(nc_flag & NCF_WHITE)) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ fpl->tvp = tvp;
+ if (nc_flag & NCF_NEGATIVE) {
+ if ((nc_flag & NCF_HOTNEGATIVE) == 0) {
+ /*
+ * TODO
+ * Promoting to hot negative requires locks which are
+ * not yet supported for simplicity.
+ */
+ return (cache_fpl_partial(fpl));
+ }
+ SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
+ ncp->nc_name);
+ counter_u64_add(numneghits, 1);
+ cache_fpl_smr_exit(fpl);
+ return (cache_fpl_handled(fpl, ENOENT));
+ }
+
+ fpl->tvp_seqc = vn_seqc_read_any(tvp);
+ if (seqc_in_modify(fpl->tvp_seqc)) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ /* Symlinks are left to the regular lookup. */
+ if (!cache_fplookup_vnode_supported(tvp)) {
+ return (cache_fpl_partial(fpl));
+ }
+
+ counter_u64_add(numposhits, 1);
+ SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
+ return (0);
+}
+
+/*
+ * Tell whether the fast path can cross into the given mount point: it has
+ * to exist, have opted in with MNTK_FPLOOKUP and not be a union mount.
+ */
+static bool
+cache_fplookup_mp_supported(struct mount *mp)
+{
+
+	if (mp == NULL)
+		return (false);
+	return ((mp->mnt_kern_flag & MNTK_FPLOOKUP) != 0 &&
+	    (mp->mnt_flag & MNT_UNION) == 0);
+}
+
+/*
+ * Walk up the mount stack (if any).
+ *
+ * Correctness is provided in the following ways:
+ * - all vnodes are protected from freeing with SMR
+ * - struct mount objects are type stable making them always safe to access
+ * - stability of the particular mount is provided by busying it
+ * - relationship between the vnode which is mounted on and the mount is
+ * verified with the vnode sequence counter after busying
+ * - association between root vnode of the mount and the mount is protected
+ * by busy
+ *
+ * From that point on we can read the sequence counter of the root vnode
+ * and get the next mount on the stack (if any) using the same protection.
+ *
+ * By the end of successful walk we are guaranteed the reached state was
+ * indeed present at least at some point which matches the regular lookup.
+ *
+ * Returns 0 with fpl->tvp and fpl->tvp_seqc updated to the topmost root
+ * vnode (or left alone if nothing is mounted on tvp). Any failure sets the
+ * status to partial and returns CACHE_FPL_FAILED.
+ */
+static int
+cache_fplookup_climb_mount(struct cache_fpl *fpl)
+{
+ struct mount *mp, *prev_mp;
+ struct vnode *vp;
+ seqc_t vp_seqc;
+
+ vp = fpl->tvp;
+ vp_seqc = fpl->tvp_seqc;
+ if (vp->v_type != VDIR)
+ return (0);
+
+ mp = atomic_load_ptr(&vp->v_mountedhere);
+ if (mp == NULL)
+ return (0);
+
+ prev_mp = NULL;
+ for (;;) {
+ /* Enter the next mount before letting go of the previous one. */
+ if (!vfs_op_thread_enter(mp)) {
+ if (prev_mp != NULL)
+ vfs_op_thread_exit(prev_mp);
+ return (cache_fpl_partial(fpl));
+ }
+ if (prev_mp != NULL)
+ vfs_op_thread_exit(prev_mp);
+ /* Verify vp did not change while mp was being entered. */
+ if (!vn_seqc_consistent(vp, vp_seqc)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_partial(fpl));
+ }
+ if (!cache_fplookup_mp_supported(mp)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_partial(fpl));
+ }
+ vp = atomic_load_ptr(&mp->mnt_rootvnode);
+ if (vp == NULL || VN_IS_DOOMED(vp)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_partial(fpl));
+ }
+ vp_seqc = vn_seqc_read_any(vp);
+ if (seqc_in_modify(vp_seqc)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_partial(fpl));
+ }
+ /* Something may in turn be mounted on the root vnode. */
+ prev_mp = mp;
+ mp = atomic_load_ptr(&vp->v_mountedhere);
+ if (mp == NULL)
+ break;
+ }
+
+ vfs_op_thread_exit(prev_mp);
+ fpl->tvp = vp;
+ fpl->tvp_seqc = vp_seqc;
+ return (0);
+}
+
+/*
+ * Parse the path.
+ *
+ * The code is mostly copy-pasted from regular lookup, see lookup().
+ * The structure is maintained along with comments for easier maintenance.
+ * Deduplicating the code will become feasible after fast path lookup
+ * becomes more feature-complete.
+ *
+ * Isolates the component starting at cn_nameptr: sets cn_namelen, advances
+ * ni_next past it, updates ni_pathlen and the MAKEENTRY, ISDOTDOT and
+ * ISLASTCN flags. Returns 0 on success. An overlong component ends the
+ * lookup with ENAMETOOLONG (status set to handled, SMR section exited).
+ * Trailing slashes and degenerate names set the status to partial and
+ * return CACHE_FPL_FAILED.
+ */
+static int
+cache_fplookup_parse(struct cache_fpl *fpl)
+{
+	struct nameidata *ndp;
+	struct componentname *cnp;
+	char *cp;
+
+	ndp = fpl->ndp;
+	cnp = fpl->cnp;
+
+	/*
+	 * Search a new directory.
+	 *
+	 * The last component of the filename is left accessible via
+	 * cnp->cn_nameptr for callers that need the name. Callers needing
+	 * the name set the SAVENAME flag. When done, they assume
+	 * responsibility for freeing the pathname buffer.
+	 */
+	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+		continue;
+	cnp->cn_namelen = cp - cnp->cn_nameptr;
+	if (cnp->cn_namelen > NAME_MAX) {
+		cache_fpl_smr_exit(fpl);
+		return (cache_fpl_handled(fpl, ENAMETOOLONG));
+	}
+	ndp->ni_pathlen -= cnp->cn_namelen;
+	KASSERT(ndp->ni_pathlen <= PATH_MAX,
+	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
+	ndp->ni_next = cp;
+
+	/*
+	 * Replace multiple slashes by a single slash and trailing slashes
+	 * by a null. This must be done before VOP_LOOKUP() because some
+	 * fs's don't know about trailing slashes. Remember if there were
+	 * trailing slashes to handle symlinks, existing non-directories
+	 * and non-existing files that won't be directories specially later.
+	 */
+	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
+		cp++;
+		ndp->ni_pathlen--;
+		if (*cp == '\0') {
+			/*
+			 * TODO
+			 * Regular lookup performs the following:
+			 * *ndp->ni_next = '\0';
+			 * cnp->cn_flags |= TRAILINGSLASH;
+			 *
+			 * Which is problematic since it modifies data read
+			 * from userspace. Then if fast path lookup was to
+			 * abort we would have to either restore it or convey
+			 * the flag. Since this is a corner case just ignore
+			 * it for simplicity.
+			 */
+			return (cache_fpl_partial(fpl));
+		}
+	}
+	ndp->ni_next = cp;
+
+	cnp->cn_flags |= MAKEENTRY;
+
+	if (cnp->cn_namelen == 2 &&
+	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
+		cnp->cn_flags |= ISDOTDOT;
+	else
+		cnp->cn_flags &= ~ISDOTDOT;
+	if (*ndp->ni_next == 0)
+		cnp->cn_flags |= ISLASTCN;
+	else
+		cnp->cn_flags &= ~ISLASTCN;
+
+	/*
+	 * Check for degenerate name (e.g. / or "")
+	 * which is a way of talking about a directory,
+	 * e.g. like "/." or ".".
+	 *
+	 * TODO
+	 * Another corner case handled by the regular lookup
+	 */
+	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
+		return (cache_fpl_partial(fpl));
+	}
+	return (0);
+}
+
+/*
+ * Move on to the next path component: continue from ni_next and step over
+ * the slashes in front of the component, keeping ni_pathlen in sync.
+ */
+static void
+cache_fplookup_parse_advance(struct cache_fpl *fpl)
+{
+	struct nameidata *ndp;
+	struct componentname *cnp;
+
+	cnp = fpl->cnp;
+	ndp = fpl->ndp;
+
+	for (cnp->cn_nameptr = ndp->ni_next; cnp->cn_nameptr[0] == '/';
+	    cnp->cn_nameptr++)
+		ndp->ni_pathlen--;
+}
+
+/*
+ * Main loop of the fast path lookup, walking the path one component at
+ * a time starting from dvp.
+ *
+ * Called within the SMR section, which is exited by the time the function
+ * returns. The outcome is recorded in fpl->status by the helpers and acted
+ * upon after the out label:
+ * - partial: the slow path is set up to continue from fpl->dvp
+ * - aborted: CACHE_FPL_FAILED is returned
+ * - handled: ni_dvp/ni_vp are filled in and the lookup result is returned
+ */
+static int
+cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ struct mount *mp;
+ int error;
+
+ error = CACHE_FPL_FAILED;
+ ndp = fpl->ndp;
+ ndp->ni_lcf = 0;
+ cnp = fpl->cnp;
+ cnp->cn_lkflags = LK_SHARED;
+ if ((cnp->cn_flags & LOCKSHARED) == 0)
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+
+ cache_fpl_checkpoint(fpl, &fpl->snd);
+
+ fpl->dvp = dvp;
+ fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
+ if (seqc_in_modify(fpl->dvp_seqc)) {
+ cache_fpl_aborted(fpl);
+ goto out;
+ }
+ mp = atomic_load_ptr(&fpl->dvp->v_mount);
+ if (!cache_fplookup_mp_supported(mp)) {
+ cache_fpl_aborted(fpl);
+ goto out;
+ }
+
+ VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
+
+ for (;;) {
+ error = cache_fplookup_parse(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ /* ".." is left to the regular lookup. */
+ if (cnp->cn_flags & ISDOTDOT) {
+ error = cache_fpl_partial(fpl);
+ break;
+ }
+
+ VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
+
+ error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
+ if (__predict_false(error != 0)) {
+ switch (error) {
+ case EAGAIN:
+ case EOPNOTSUPP: /* can happen when racing against vgone */
+ /* The return value is not needed, status drives the rest. */
+ cache_fpl_partial(fpl);
+ break;
+ default:
+ /*
+ * See the API contract for VOP_FPLOOKUP_VEXEC.
+ */
+ if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
+ error = cache_fpl_aborted(fpl);
+ } else {
+ cache_fpl_smr_exit(fpl);
+ cache_fpl_handled(fpl, error);
+ }
+ break;
+ }
+ break;
+ }
+
+ error = cache_fplookup_next(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
+
+ error = cache_fplookup_climb_mount(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
+
+ if (cnp->cn_flags & ISLASTCN) {
+ error = cache_fplookup_final(fpl);
+ break;
+ }
+
+ /* Validate the directory before stepping into the target. */
+ if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
+ error = cache_fpl_aborted(fpl);
+ break;
+ }
+
+ fpl->dvp = fpl->tvp;
+ fpl->dvp_seqc = fpl->tvp_seqc;
+
+ /* Checkpoint so that a partial result resumes at this component. */
+ cache_fplookup_parse_advance(fpl);
+ cache_fpl_checkpoint(fpl, &fpl->snd);
+ }
+out:
+ switch (fpl->status) {
+ case CACHE_FPL_STATUS_UNSET:
+ __assert_unreachable();
+ break;
+ case CACHE_FPL_STATUS_PARTIAL:
+ cache_fpl_smr_assert_entered(fpl);
+ return (cache_fplookup_partial_setup(fpl));
+ case CACHE_FPL_STATUS_ABORTED:
+ /* Some abort paths exit SMR on their own, others do not. */
+ if (fpl->in_smr)
+ cache_fpl_smr_exit(fpl);
+ return (CACHE_FPL_FAILED);
+ case CACHE_FPL_STATUS_HANDLED:
+ cache_fpl_smr_assert_not_entered(fpl);
+ if (__predict_false(error != 0)) {
+ ndp->ni_dvp = NULL;
+ ndp->ni_vp = NULL;
+ cache_fpl_cleanup_cnp(cnp);
+ return (error);
+ }
+ ndp->ni_dvp = fpl->dvp;
+ ndp->ni_vp = fpl->tvp;
+ /* With SAVENAME the buffer is left allocated for the caller. */
+ if (cnp->cn_flags & SAVENAME)
+ cnp->cn_flags |= HASBUF;
+ else
+ cache_fpl_cleanup_cnp(cnp);
+ return (error);
+ }
+}
+
+/*
+ * Fast path lookup protected with SMR and sequence counters.
+ *
+ * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
+ *
+ * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
+ * outlined below.
+ *
+ * Traditional vnode lookup conceptually looks like this:
+ *
+ * vn_lock(current);
+ * for (;;) {
+ * next = find();
+ * vn_lock(next);
+ * vn_unlock(current);
+ * current = next;
+ * if (last)
+ * break;
+ * }
+ *
+ * Each jump to the next vnode is safe memory-wise and atomic with respect to
+ * any modifications thanks to holding respective locks.
+ *
+ * The same guarantee can be provided with a combination of safe memory
+ * reclamation and sequence counters instead. If all operations which affect
+ * the relationship between the current vnode and the one we are looking for
+ * also modify the counter, we can verify whether all the conditions held as
+ * we made the jump. This includes things like permissions, mount points etc.
+ * In order to provide the guarantee all aforementioned places are enclosed by
+ * vn_seqc_write_begin()/end().
+ *
+ * Thus this translates to:
+ *
+ * vfs_smr_enter();
+ * current_seqc = seqc_read_any(current);
+ * if (seqc_in_modify(current_seqc)) // someone is altering the vnode
+ * abort();
+ * for (;;) {
+ * next = find();
+ * next_seqc = seqc_read_any(next);
+ * if (!seqc_consistent(current, current_seqc)) // someone is altering the vnode
+ * abort();
+ * current = next; // we know nothing of importance has changed
+ * current_seqc = next_seqc; // store the counter for the next iteration
+ * if (last)
+ * break;
+ * }
+ *
+ * API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
+ * - they are called while within vfs_smr protection which they must never exit
+ * - EAGAIN can be returned to denote checking could not be performed, it is
+ * always valid to return it
+ * - if the sequence counter has not changed the result must be valid
+ * - if the sequence counter has changed both false positives and false negatives
+ * are permitted (since the result will be rejected later)
+ * - for simple cases of unix permission checks vaccess_vexec_smr can be used
+ *
+ * Caveats to watch out for:
+ * - vnodes are passed unlocked and unreferenced with nothing stopping
+ * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
+ * to use atomic_load_ptr to fetch it.
+ * - aforementioned object can also get freed, meaning absent other means it
+ * should be protected with vfs_smr
+ * - either safely checking permissions as they are modified or guaranteeing
+ * their stability is left to the routine
+ *
+ * The outcome is reported through *status:
+ * - handled: the return value is the final result of the lookup
+ * - partial: *pwdp is set and the parsing state is rewound to the component
+ * at which the fast path stopped
+ * - aborted: the parsing state is rewound to what it was on entry
+ */
+int
+cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
+ struct pwd **pwdp)
+{
+ struct cache_fpl fpl;
+ struct pwd *pwd;
+ struct vnode *dvp;
+ struct componentname *cnp;
+ struct nameidata_saved orig;
+ int error;
+
+ *status = CACHE_FPL_STATUS_UNSET;
+ bzero(&fpl, sizeof(fpl));
+ fpl.status = CACHE_FPL_STATUS_UNSET;
+ fpl.ndp = ndp;
+ fpl.cnp = &ndp->ni_cnd;
+ MPASS(curthread == fpl.cnp->cn_thread);
+
+ /* On refusal the status has already been set to aborted. */
+ if (!cache_can_fplookup(&fpl)) {
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
+ *status = fpl.status;
+ return (EOPNOTSUPP);
+ }
+
+ /* State to rewind to should the lookup get aborted. */
+ cache_fpl_checkpoint(&fpl, &orig);
+
+ cache_fpl_smr_enter(&fpl);
+ pwd = pwd_get_smr();
+ fpl.pwd = pwd;
+ ndp->ni_rootdir = pwd->pwd_rdir;
+ ndp->ni_topdir = pwd->pwd_jdir;
+
+ cnp = fpl.cnp;
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+ if (cnp->cn_pnbuf[0] == '/') {
+ cache_fpl_handle_root(ndp, &dvp);
+ } else {
+ MPASS(ndp->ni_dirfd == AT_FDCWD);
+ dvp = pwd->pwd_cdir;
+ }
+
+ SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
+
+ error = cache_fplookup_impl(dvp, &fpl);
+ cache_fpl_smr_assert_not_entered(&fpl);
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
+
+ *status = fpl.status;
+ switch (fpl.status) {
+ case CACHE_FPL_STATUS_UNSET:
+ __assert_unreachable();
+ break;
+ case CACHE_FPL_STATUS_HANDLED:
+ SDT_PROBE3(vfs, namei, lookup, return, error,
+ (error == 0 ? ndp->ni_vp : NULL), true);
+ break;
+ case CACHE_FPL_STATUS_PARTIAL:
+ /* Rewind to the start of the component which was not resolved. */
+ *pwdp = fpl.pwd;
+ cache_fpl_restore(&fpl, &fpl.snd);
+ break;
+ case CACHE_FPL_STATUS_ABORTED:
+ /* Rewind to the state seen on entry. */
+ cache_fpl_restore(&fpl, &orig);
+ break;
+ }
+ return (error);
+}
Index: sys/kern/vfs_lookup.c
===================================================================
--- sys/kern/vfs_lookup.c
+++ sys/kern/vfs_lookup.c
@@ -71,9 +71,9 @@
#undef NAMEI_DIAGNOSTIC
SDT_PROVIDER_DECLARE(vfs);
-SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *",
- "unsigned long");
-SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *");
+SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *",
+ "unsigned long", "bool");
+SDT_PROBE_DEFINE3(vfs, namei, lookup, return, "int", "struct vnode *", "bool");
/* Allocation zone for namei. */
uma_zone_t namei_zone;
@@ -280,77 +280,21 @@
return (0);
}
-/*
- * Convert a pathname into a pointer to a locked vnode.
- *
- * The FOLLOW flag is set when symbolic links are to be followed
- * when they occur at the end of the name translation process.
- * Symbolic links are always followed for all other pathname
- * components other than the last.
- *
- * The segflg defines whether the name is to be copied from user
- * space or kernel space.
- *
- * Overall outline of namei:
- *
- * copy in name
- * get starting directory
- * while (!done && !error) {
- * call lookup to search path.
- * if symbolic link, massage name in buffer and continue
- * }
- */
-int
-namei(struct nameidata *ndp)
+static int
+namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
{
- char *cp; /* pointer into pathname argument */
- struct vnode *dp; /* the directory we are searching */
- struct iovec aiov; /* uio for reading symbolic links */
struct componentname *cnp;
struct file *dfp;
struct thread *td;
- struct proc *p;
struct pwd *pwd;
cap_rights_t rights;
struct filecaps dirfd_caps;
- struct uio auio;
- int error, linklen, startdir_used;
+ int error, startdir_used;
cnp = &ndp->ni_cnd;
td = cnp->cn_thread;
- p = td->td_proc;
- ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
- KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
- KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
- ("namei: nameiop contaminated with flags"));
- KASSERT((cnp->cn_flags & OPMASK) == 0,
- ("namei: flags contaminated with nameiops"));
- MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
- ndp->ni_startdir->v_type == VBAD);
- TAILQ_INIT(&ndp->ni_cap_tracker);
- ndp->ni_lcf = 0;
-
- /* We will set this ourselves if we need it. */
- cnp->cn_flags &= ~TRAILINGSLASH;
-
- /*
- * Get a buffer for the name to be translated, and copy the
- * name into the buffer.
- */
- if ((cnp->cn_flags & HASBUF) == 0)
- cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
- if (ndp->ni_segflg == UIO_SYSSPACE)
- error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
- &ndp->ni_pathlen);
- else
- error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
- &ndp->ni_pathlen);
- /*
- * Don't allow empty pathnames.
- */
- if (error == 0 && *cnp->cn_pnbuf == '\0')
- error = ENOENT;
+ *pwdp = NULL;
#ifdef CAPABILITY_MODE
/*
@@ -366,31 +310,19 @@
* previously walked by us, which prevents an escape from
* the relative root.
*/
- if (error == 0 && IN_CAPABILITY_MODE(td) &&
- (cnp->cn_flags & NOCAPCHECK) == 0) {
+ if (IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) {
ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
if (ndp->ni_dirfd == AT_FDCWD) {
#ifdef KTRACE
if (KTRPOINT(td, KTR_CAPFAIL))
ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
#endif
- error = ECAPMODE;
+ return (ECAPMODE);
}
}
#endif
- if (error != 0) {
- namei_cleanup_cnp(cnp);
- ndp->ni_vp = NULL;
- return (error);
- }
- ndp->ni_loopcnt = 0;
-#ifdef KTRACE
- if (KTRPOINT(td, KTR_NAMEI)) {
- KASSERT(cnp->cn_thread == curthread,
- ("namei not using curthread"));
- ktrnamei(cnp->cn_pnbuf);
- }
-#endif
+ error = 0;
+
/*
* Get starting point for the translation.
*/
@@ -402,19 +334,16 @@
ndp->ni_rootdir = pwd->pwd_rdir;
ndp->ni_topdir = pwd->pwd_jdir;
+ /* Must be cleared: it is read below even when ni_startdir is not consumed. */
startdir_used = 0;
- dp = NULL;
- cnp->cn_nameptr = cnp->cn_pnbuf;
if (cnp->cn_pnbuf[0] == '/') {
ndp->ni_resflags |= NIRES_ABS;
- error = namei_handle_root(ndp, &dp);
+ error = namei_handle_root(ndp, dpp);
} else {
if (ndp->ni_startdir != NULL) {
- dp = ndp->ni_startdir;
+ *dpp = ndp->ni_startdir;
startdir_used = 1;
} else if (ndp->ni_dirfd == AT_FDCWD) {
- dp = pwd->pwd_cdir;
- vrefact(dp);
+ *dpp = pwd->pwd_cdir;
+ vrefact(*dpp);
} else {
rights = ndp->ni_rightsneeded;
cap_rights_set_one(&rights, CAP_LOOKUP);
@@ -441,8 +370,8 @@
} else if (dfp->f_vnode == NULL) {
error = ENOTDIR;
} else {
- dp = dfp->f_vnode;
- vrefact(dp);
+ *dpp = dfp->f_vnode;
+ vrefact(*dpp);
if ((dfp->f_flag & FSEARCH) != 0)
cnp->cn_flags |= NOEXECCHECK;
@@ -464,7 +393,7 @@
}
#endif
}
- if (error == 0 && dp->v_type != VDIR)
+ if (error == 0 && (*dpp)->v_type != VDIR)
error = ENOTDIR;
}
if (error == 0 && (cnp->cn_flags & BENEATH) != 0) {
@@ -476,7 +405,7 @@
cap_rights_set_one(&rights, CAP_LOOKUP);
error = fgetvp_rights(td, ndp->ni_dirfd, &rights,
&dirfd_caps, &ndp->ni_beneath_latch);
- if (error == 0 && dp->v_type != VDIR) {
+ if (error == 0 && (*dpp)->v_type != VDIR) {
vrele(ndp->ni_beneath_latch);
error = ENOTDIR;
}
@@ -488,15 +417,15 @@
* If we are auditing the kernel pathname, save the user pathname.
*/
if (cnp->cn_flags & AUDITVNODE1)
- AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, dp, cnp->cn_pnbuf);
+ AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
if (cnp->cn_flags & AUDITVNODE2)
- AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, dp, cnp->cn_pnbuf);
+ AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
if (ndp->ni_startdir != NULL && !startdir_used)
vrele(ndp->ni_startdir);
if (error != 0) {
- if (dp != NULL)
- vrele(dp);
- goto out;
+ if (*dpp != NULL)
+ vrele(*dpp);
+ return (error);
}
MPASS((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_LATCH)) !=
NI_LCF_BENEATH_ABS);
@@ -505,8 +434,132 @@
((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
(cnp->cn_flags & BENEATH) != 0))
ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
- SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
- cnp->cn_flags);
+ SDT_PROBE4(vfs, namei, lookup, entry, *dpp, cnp->cn_pnbuf,
+ cnp->cn_flags, false);
+ *pwdp = pwd;
+ return (0);
+}
+
+/*
+ * Convert a pathname into a pointer to a locked vnode.
+ *
+ * The FOLLOW flag is set when symbolic links are to be followed
+ * when they occur at the end of the name translation process.
+ * Symbolic links are always followed for all other pathname
+ * components other than the last.
+ *
+ * The segflg defines whether the name is to be copied from user
+ * space or kernel space.
+ *
+ * Overall outline of namei:
+ *
+ * copy in name
+ * get starting directory
+ * while (!done && !error) {
+ * call lookup to search path.
+ * if symbolic link, massage name in buffer and continue
+ * }
+ */
+int
+namei(struct nameidata *ndp)
+{
+ char *cp; /* pointer into pathname argument */
+ struct vnode *dp; /* the directory we are searching */
+ struct iovec aiov; /* uio for reading symbolic links */
+ struct componentname *cnp;
+ struct thread *td;
+ struct proc *p;
+ struct pwd *pwd;
+ struct uio auio;
+ int error, linklen;
+ enum cache_fpl_status status;
+
+ cnp = &ndp->ni_cnd;
+ td = cnp->cn_thread;
+ p = td->td_proc;
+ ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
+ KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
+ KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
+ ("namei: nameiop contaminated with flags"));
+ KASSERT((cnp->cn_flags & OPMASK) == 0,
+ ("namei: flags contaminated with nameiops"));
+ MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
+ ndp->ni_startdir->v_type == VBAD);
+ TAILQ_INIT(&ndp->ni_cap_tracker);
+ ndp->ni_lcf = 0;
+ ndp->ni_loopcnt = 0;
+ dp = NULL;
+
+ /* We will set this ourselves if we need it. */
+ cnp->cn_flags &= ~TRAILINGSLASH;
+
+ ndp->ni_vp = NULL;
+
+ /*
+ * Get a buffer for the name to be translated, and copy the
+ * name into the buffer.
+ */
+ if ((cnp->cn_flags & HASBUF) == 0)
+ cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
+ if (ndp->ni_segflg == UIO_SYSSPACE)
+ error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+ &ndp->ni_pathlen);
+ else
+ error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+ &ndp->ni_pathlen);
+
+ if (error != 0) {
+ namei_cleanup_cnp(cnp);
+ return (error);
+ }
+
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+
+ /*
+ * Don't allow empty pathnames.
+ */
+ if (*cnp->cn_pnbuf == '\0') {
+ namei_cleanup_cnp(cnp);
+ return (ENOENT);
+ }
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_NAMEI)) {
+ KASSERT(cnp->cn_thread == curthread,
+ ("namei not using curthread"));
+ ktrnamei(cnp->cn_pnbuf);
+ }
+#endif
+
+ /*
+ * First try the fast path.
+ *
+ * If it fails to handle the lookup, we are going to perform it below.
+ * Note this means that we either start from scratch or continue where it
+ * left off.
+ */
+ error = cache_fplookup(ndp, &status, &pwd);
+ switch (status) {
+ case CACHE_FPL_STATUS_UNSET:
+ __assert_unreachable();
+ break;
+ case CACHE_FPL_STATUS_HANDLED:
+ return (error);
+ case CACHE_FPL_STATUS_PARTIAL:
+ dp = ndp->ni_startdir;
+ break;
+ case CACHE_FPL_STATUS_ABORTED:
+ error = namei_setup(ndp, &dp, &pwd);
+ if (error != 0) {
+ namei_cleanup_cnp(cnp);
+ return (error);
+ }
+ break;
+ }
+
+ /*
+ * Perform the lookup.
+ */
for (;;) {
ndp->ni_startdir = dp;
error = lookup(ndp);
@@ -526,8 +579,8 @@
error = ENOTCAPABLE;
}
nameicap_cleanup(ndp, true);
- SDT_PROBE2(vfs, namei, lookup, return, error,
- (error == 0 ? ndp->ni_vp : NULL));
+ SDT_PROBE3(vfs, namei, lookup, return, error,
+ (error == 0 ? ndp->ni_vp : NULL), false);
pwd_drop(pwd);
return (error);
}
@@ -602,7 +655,7 @@
MPASS(error != 0);
namei_cleanup_cnp(cnp);
nameicap_cleanup(ndp, true);
- SDT_PROBE2(vfs, namei, lookup, return, error, NULL);
+ SDT_PROBE3(vfs, namei, lookup, return, error, NULL, false);
pwd_drop(pwd);
return (error);
}
Index: sys/sys/namei.h
===================================================================
--- sys/sys/namei.h
+++ sys/sys/namei.h
@@ -108,6 +108,12 @@
};
#ifdef _KERNEL
+
+/*
+ * Outcome of a fast path lookup attempt as reported by cache_fplookup():
+ * - ABORTED: nothing was resolved, namei() starts over with namei_setup()
+ * - PARTIAL: namei() continues the lookup from ndp->ni_startdir
+ * - HANDLED: the lookup is complete, namei() returns the result as is
+ * - UNSET: initial value, never reported to the caller
+ */
+enum cache_fpl_status { CACHE_FPL_STATUS_ABORTED, CACHE_FPL_STATUS_PARTIAL,
+ CACHE_FPL_STATUS_HANDLED, CACHE_FPL_STATUS_UNSET };
+int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
+ struct pwd **pwdp);
+
/*
* namei operations
*/

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 16, 12:03 AM (1 h, 9 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
15817638
Default Alt Text
D25578.id74307.diff (32 KB)

Event Timeline