D40850.diff

diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -102,6 +102,9 @@
/* Allocation zone for namei. */
uma_zone_t namei_zone;
+/* Forward declaration (for the mount-crossing functions below). */
+static int enforce_lkflags(struct mount *mp, int lkflags);
+
/* Placeholder vnode for mp traversal. */
static struct vnode *vp_crossmp;
@@ -157,6 +160,233 @@
* gets allocated early. See nameiinit for the direct call below.
*/
+/*
+ * Busies and returns the mount point mounted on the passed vnode, if any.
+ *
+ * The vnode's lock must be held on entry and may be released on return, as
+ * indicated by '*unlocked'. The caller must also have an active reference on
+ * the vnode (vref() or vget()), which is preserved across the call. On
+ * success, the busied mount point is passed through 'mp'.
+ *
+ * If the vnode is not mounted-on, EJUSTRETURN is returned and '*mp' is set to
+ * NULL. Concurrent unmounts/remounts of the covering mount are handled
+ * transparently by restarting the process (doing so is not strictly necessary
+ * for correctness today, but it is closer to the historical behavior, where
+ * unmounts/remounts were prevented from happening in this case, and it will be
+ * required (though not sufficient) if we ever want to implement such things as
+ * atomic mount substitutions). ENOENT is returned if the vnode was doomed
+ * while trying to determine its covering mount, and '*mp' is set to NULL.
+ * Otherwise, '*mp' is set to the busied mount point and 0 is returned.
+ */
+int
+vn_busy_mountedhere(struct vnode *vp, bool *unlocked, struct mount **mp)
+{
+ int error;
+
+ ASSERT_VOP_LOCKED(vp, __func__);
+ ASSERT_VI_UNLOCKED(vp, __func__);
+
+ *unlocked = false;
+ *mp = NULL;
+
+ if (VN_IS_DOOMED(vp))
+ return (ENOENT);
+
+ if (__predict_true((vn_irflag_read(vp) & VIRF_MOUNTPOINT) == 0))
+ return (EJUSTRETURN);
+
+ *mp = vp->v_mountedhere;
+ MPASS(*mp != NULL);
+
+ /*
+ * Opportunistically try to busy the mount point. On success, this can
+ * avoid a superfluous unlock/relock cycle on 'vp' in some cases (in
+ * particular, the vfs_lookup() case), and always avoids a pair of
+ * vfs_ref()/vfs_rel() calls.
+ */
+ error = vfs_busy(*mp, MBF_NOWAIT);
+ if (__predict_true(error == 0))
+ return (error);
+
+ /* Make sure '*mp' survives the unlock of 'vp'. */
+ vfs_ref(*mp);
+ VOP_UNLOCK(vp);
+ *unlocked = true;
+
+ for (;;) {
+ error = vfs_busy(*mp, 0);
+ vfs_rel(*mp);
+
+ if (__predict_true(error == 0))
+ return (0);
+
+ *mp = NULL;
+ VI_LOCK(vp);
+
+ if (VN_IS_DOOMED(vp)) {
+ error = ENOENT;
+ goto unlock_exit;
+ }
+
+ if (__predict_true(
+ (vn_irflag_read(vp) & VIRF_MOUNTPOINT) == 0)) {
+ error = EJUSTRETURN;
+ goto unlock_exit;
+ }
+ /*
+ * We cannot retrieve the same (conceptual) mount point as before,
+ * since the vfs_busy() above returned an error only after
+ * 'v_mountedhere' was cleared on the covered vnode (although we
+ * could well see the same pointer again if the structure has been
+ * recycled).
+ */
+ *mp = vp->v_mountedhere;
+ MPASS(*mp != NULL);
+
+ /*
+ * This establishes the order "covered vnode's interlock" ->
+ * "mounted-here mount point's interlock". Note that this order
+ * between a vnode and a mount point is the reverse of that of
+ * "vnode's owning mount point" -> "vnode's interlock", but it causes
+ * no trouble since the mount point is different in the two cases.
+ * It does, however, trigger a spurious LOR report when the initial
+ * 'devfs' is mounted at '/' and then remounted at '/dev' (see
+ * vfs_mountroot()).
+ */
+ vfs_ref(*mp);
+ VI_UNLOCK(vp);
+ }
+
+ __assert_unreachable();
+
+unlock_exit:
+ VI_UNLOCK(vp);
+ return (error);
+}
+
+/*
+ * Cross a single mount over the given vnode, returning the mount's root vnode.
+ *
+ * The vnode's lock must be held on entry and may be released on return, as
+ * indicated by '*unlocked' (on success, it always is). The caller must also
+ * have an active reference on the vnode (vref() or vget()), which is preserved
+ * across the call. On success, the mount's root vnode is returned locked
+ * according to 'root_lkflags' and with an active reference.
+ *
+ * For errors and outputs, this behaves essentially as vn_busy_mountedhere(),
+ * which it calls first, with '*vpp' taking the role of '*mp'. If that step
+ * succeeds, VFS_ROOT() is called and its result returned. On any error,
+ * '*vpp' is set to NULL. On overall success, '*unlocked' is guaranteed to be
+ * set to true.
+ */
+int
+vn_cross_single_mount(struct vnode *vp, int root_lkflags,
+ bool *unlocked, struct vnode **vpp)
+{
+ struct mount *mp;
+ int error;
+
+ *vpp = NULL;
+
+ error = vn_busy_mountedhere(vp, unlocked, &mp);
+ if (__predict_false(error == 0)) {
+ if (__predict_true(!*unlocked)) {
+ VOP_UNLOCK(vp);
+ *unlocked = true;
+ }
+ error = VFS_ROOT(mp, root_lkflags, vpp);
+ vfs_unbusy(mp);
+ }
+
+ return (error);
+}
+
+static void
+vn_lock_enforced_flags(struct vnode *vp, int lkflags)
+{
+ int error __unused;
+
+ error = vn_lock(vp, enforce_lkflags(vp->v_mount, lkflags | LK_RETRY));
+ KASSERT(error == 0,
+ ("%s: vn_lock(LK_RETRY) returned %d", __func__, error));
+}
+
+/*
+ * Repeatedly cross mounts starting from a given vnode.
+ *
+ * Traverses all successive mounts stacked on the same path, locking each
+ * vnode as specified by enforce_lkflags() and unlocking it after obtaining its
+ * covering mount. Ensures the final vnode is locked and actively referenced.
+ * The initial vnode is returned unlocked and its active reference is released,
+ * unless it is also the final vnode (no mount points to cross).
+ *
+ * Mounts are crossed until reaching a vnode that is not covered by a mount,
+ * which is returned locked. If some traversed vnode happens to be doomed,
+ * ENOENT is returned. Errors reported by VFS_ROOT() may also be returned. On
+ * success, puts the final vnode into '*vpp' and returns 0.
+ *
+ * This function ensures that the crossed mount point is never busied while the
+ * initial vnode's lock is held, so that no lock order is established between
+ * them. This avoids deadlocks, first at lookup with stacked filesystems
+ * (nullfs, unionfs) mounted, where locking a mount point's root vnode leads to
+ * locking the covered vnode as well and vice versa, and second at unmount,
+ * where parallel vfs_busy() calls block while acquiring the covered vnode's
+ * lock, which establishes the acquisition order mount point -> covered vnode.
+ * This function (through the VFS_ROOT() call) only establishes the acquisition
+ * order mount point -> root vnode, which implies mount point -> covered vnode
+ * for stacked filesystems, i.e., the same order as that of dounmount(). In
+ * other words, the legal order is that a mount point reference must always be
+ * acquired before the vnode's lock, be it that of the root vnode under the
+ * mount point or of the covered vnode over it.
+ */
+int
+vn_cross_mounts(struct vnode *vp, int const lkflags, struct vnode **const vpp)
+{
+ int error;
+ bool unlocked;
+
+ for (;;) {
+ error = vn_cross_single_mount(vp, lkflags, &unlocked, vpp);
+
+ /* Optimize for the non-mount-point case. */
+ if (__predict_true(error == EJUSTRETURN)) {
+ /* No more mounts to cross. */
+ *vpp = vp;
+ error = 0;
+
+ if (__predict_false(unlocked)) {
+ vn_lock_enforced_flags(vp, lkflags);
+ if (VN_IS_DOOMED(vp)) {
+ vput(vp);
+
+ *vpp = NULL;
+ error = ENOENT;
+ }
+ }
+
+ return (error);
+ }
+
+ if (__predict_false(error != 0)) {
+ if (__predict_true(unlocked))
+ vrele(vp);
+ else
+ vput(vp);
+
+ return (error);
+ }
+
+ /* Crossed one mount. Try to cross another one. */
+ MPASS(unlocked);
+ ASSERT_VOP_UNLOCKED(vp, __func__);
+ vrele(vp);
+ vp = *vpp;
+ ASSERT_VOP_LOCKED(vp, __func__);
+ }
+
+ __assert_unreachable();
+}
+
struct nameicap_tracker {
struct vnode *dp;
TAILQ_ENTRY(nameicap_tracker) nm_link;
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -826,6 +826,12 @@
int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
struct uio *uio);
+int vn_busy_mountedhere(struct vnode *vp, bool *unlocked,
+ struct mount **mp);
+int vn_cross_single_mount(struct vnode *vp, int root_lkflags,
+ bool *unlocked, struct vnode **vpp);
+int vn_cross_mounts(struct vnode *vp, int lkflags, struct vnode **vpp);
+
void vn_seqc_write_begin_locked(struct vnode *vp);
void vn_seqc_write_begin(struct vnode *vp);
void vn_seqc_write_end_locked(struct vnode *vp);
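
A usage note (not part of the diff): the comment on vn_busy_mountedhere() above defines a three-way contract (0, EJUSTRETURN, ENOENT) plus an '*unlocked' output. The sketch below shows how a hypothetical caller might consume it; the function name and the choice to relock 'vp' before returning are illustrative assumptions, not taken from the patch.

/*
 * Hypothetical caller of vn_busy_mountedhere().  'vp' is locked and
 * actively referenced on entry; this sketch relocks it before return.
 */
static int
example_peek_covering_mount(struct vnode *vp)
{
	struct mount *mp;
	bool unlocked;
	int error;

	error = vn_busy_mountedhere(vp, &unlocked, &mp);
	switch (error) {
	case 0:
		/* 'mp' is busied; inspect it, then drop the busy count. */
		vfs_unbusy(mp);
		break;
	case EJUSTRETURN:
		/* 'vp' is not covered by a mount; not an error here. */
		error = 0;
		break;
	case ENOENT:
		/* 'vp' was doomed while its covering mount was chased. */
		break;
	}
	if (unlocked)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	return (error);
}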

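Similarly, a minimal sketch of driving vn_cross_mounts(), assuming a caller that starts from a locked, referenced vnode and wants the vnode actually exposed at that path once all stacked mounts are crossed. The wrapper name is hypothetical and the LK_SHARED choice is arbitrary.

/*
 * Hypothetical caller of vn_cross_mounts().  On entry 'vp' is locked
 * and referenced.  On success '*uvpp' is the uncovered vnode, locked
 * per LK_SHARED and referenced, and the reference on 'vp' has been
 * released unless 'vp' is itself the final vnode.  On error, 'vp'
 * has been released as well, so there is nothing to clean up here.
 */
static int
example_uncover(struct vnode *vp, struct vnode **uvpp)
{
	return (vn_cross_mounts(vp, LK_SHARED, uvpp));
}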