D40850.diff

diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -102,6 +102,9 @@
/* Allocation zone for namei. */
uma_zone_t namei_zone;
+/* Forward declaration (for the mount-crossing functions below). */
+static int enforce_lkflags(struct mount *mp, int lkflags);
+
/* Placeholder vnode for mp traversal. */
static struct vnode *vp_crossmp;
@@ -157,6 +160,233 @@
* gets allocated early. See nameiinit for the direct call below.
*/
+/*
+ * Busies and returns the mount point mounted on the passed vnode, if any.
+ *
+ * The vnode's lock must be held on entry and may be released on return, as
+ * indicated by '*unlocked'. The caller must also have an active reference on
+ * the vnode (vref() or vget()), which is preserved across the call. On
+ * success, the busied mount point is passed through 'mp'.
+ *
+ * If the vnode is not mounted-on, EJUSTRETURN is returned and '*mp' is set to
+ * NULL. Concurrent unmounts/remounts of the covering mount are handled
+ * transparently by restarting the process (doing so is not strictly necessary
+ * for correctness today, but it is closer to the historical behavior, where
+ * unmounts/remounts were prevented from happening in this case, and it will be
+ * required (though not sufficient) if we ever want to implement such things as
+ * atomic mount substitutions). ENOENT is returned if the vnode was doomed
+ * while trying to determine its covering mount, and '*mp' is set to NULL.
+ * Otherwise, '*mp' is set to the busied mount point and 0 is returned.
+ */
+int
+vn_busy_mountedhere(struct vnode *vp, bool *unlocked, struct mount **mp)
+{
+ int error;
+
+ ASSERT_VOP_LOCKED(vp, __func__);
+ ASSERT_VI_UNLOCKED(vp, __func__);
+
+ *unlocked = false;
+ *mp = NULL;
+
+ if (VN_IS_DOOMED(vp))
+ return (ENOENT);
+
+ if (__predict_true((vn_irflag_read(vp) & VIRF_MOUNTPOINT) == 0))
+ return (EJUSTRETURN);
+
+ *mp = vp->v_mountedhere;
+ MPASS(*mp != NULL);
+
+ /*
+ * Opportunistically try to busy the mount point. On success, this can
+ * avoid a superfluous unlock/relock cycle on 'vp' in some cases (in
+ * particular, the vfs_lookup() case), and always avoids a pair of
+ * vfs_ref()/vfs_rel() calls.
+ */
+ error = vfs_busy(*mp, MBF_NOWAIT);
+ if (__predict_true(error == 0))
+ return (error);
+
+ /* Make sure '*mp' survives the unlock of 'vp'. */
+ vfs_ref(*mp);
+ VOP_UNLOCK(vp);
+ *unlocked = true;
+
+ for (;;) {
+ error = vfs_busy(*mp, 0);
+ vfs_rel(*mp);
+
+ if (__predict_true(error == 0))
+ return (0);
+
+ *mp = NULL;
+ VI_LOCK(vp);
+
+ if (VN_IS_DOOMED(vp)) {
+ error = ENOENT;
+ goto unlock_exit;
+ }
+
+ if (__predict_true(
+ (vn_irflag_read(vp) & VIRF_MOUNTPOINT) == 0)) {
+ error = EJUSTRETURN;
+ goto unlock_exit;
+ }
+ /*
+ * We cannot retrieve the same (conceptual) mount point as before,
+ * since the vfs_busy() above returned an error only after
+ * 'v_mountedhere' was cleared on the covered vnode (although we
+ * could well see the same pointer again if the structure has been
+ * recycled).
+ */
+ *mp = vp->v_mountedhere;
+ MPASS(*mp != NULL);
+
+ /*
+ * This establishes the order "covered vnode's interlock" ->
+ * "mounted-here mount point's interlock". Note that this order
+ * between a vnode and a mount point is the reverse of that of
+ * "vnode's owning mount point" -> "vnode's interlock", but it causes
+ * no trouble since the mount point is different in the two cases.
+ * It does, however, trigger a spurious LOR report when the initial
+ * 'devfs' is mounted at '/' and then remounted at '/dev' (see
+ * vfs_mountroot()).
+ */
+ vfs_ref(*mp);
+ VI_UNLOCK(vp);
+ }
+
+ __assert_unreachable();
+
+unlock_exit:
+ VI_UNLOCK(vp);
+ return (error);
+}
+
+/*
+ * Cross a single mount over the given vnode, returning the mount's root vnode.
+ *
+ * The vnode's lock must be held on entry and may be released on return, as
+ * indicated by '*unlocked' (on success, it always is). The caller must also
+ * have an active reference on the vnode (vref() or vget()), which is preserved
+ * across the call. On success, the mount's root vnode is returned locked
+ * according to 'root_lkflags' and with an active reference.
+ *
+ * For errors and outputs, this behaves essentially as vn_busy_mountedhere(),
+ * which it calls first, with '*vpp' taking the role of '*mp'. If that step
+ * succeeds, VFS_ROOT() is called and its result returned. On any error,
+ * '*vpp' is set to NULL. On overall success, '*unlocked' is guaranteed to be
+ * set to true.
+ */
+int
+vn_cross_single_mount(struct vnode *vp, int root_lkflags,
+ bool *unlocked, struct vnode **vpp)
+{
+ struct mount *mp;
+ int error;
+
+ *vpp = NULL;
+
+ error = vn_busy_mountedhere(vp, unlocked, &mp);
+ if (__predict_false(error == 0)) {
+ if (__predict_true(!*unlocked)) {
+ VOP_UNLOCK(vp);
+ *unlocked = true;
+ }
+ error = VFS_ROOT(mp, root_lkflags, vpp);
+ vfs_unbusy(mp);
+ }
+
+ return (error);
+}
+
+static void
+vn_lock_enforced_flags(struct vnode *vp, int lkflags)
+{
+ int error __unused;
+
+ error = vn_lock(vp, enforce_lkflags(vp->v_mount, lkflags | LK_RETRY));
+ KASSERT(error == 0,
+ ("%s: vn_lock(LK_RETRY) returned %d", __func__, error));
+}
+
+/*
+ * Repeatedly cross mounts starting from a given vnode.
+ *
+ * Traverses all successive mounts stacked on the same path, locking each
+ * vnode as specified by enforce_lkflags() and unlocking it after obtaining its
+ * covering mount. Ensures the final vnode is locked and actively referenced.
+ * The initial vnode is returned unlocked and its active reference is released,
+ * unless it is also the final vnode (no mount points to cross).
+ *
+ * Mounts are crossed until reaching a vnode that is not covered by a mount,
+ * which is returned locked. If some traversed vnode happens to be doomed,
+ * ENOENT is returned. Errors reported by VFS_ROOT() may also be returned. On
+ * success, puts the final vnode into '*vpp' and returns 0.
+ *
+ * This function ensures that the crossed mount point is never busied while the
+ * initial vnode's lock is held, so that no lock order is established between
+ * them. This avoids deadlocks, first at lookup with stacked filesystems
+ * (nullfs, unionfs) mounted, where locking a mount point's root vnode leads to
+ * locking the covered vnode as well and vice versa, and second at unmount,
+ * where parallel vfs_busy() calls block while acquiring the covered vnode's
+ * lock, which establishes the acquisition order mount point -> covered vnode.
+ * This function (through the VFS_ROOT() call) only establishes the acquisition
+ * order mount point -> root vnode, which implies mount point -> covered vnode
+ * for stacked filesystems, i.e., the same order as that of dounmount(). In
+ * other words, the legal order is that a mount point reference must always be
+ * acquired before the vnode's lock, be it that of the root vnode under the
+ * mount point or of the covered vnode over it.
+ */
+int
+vn_cross_mounts(struct vnode *vp, int const lkflags, struct vnode **const vpp)
+{
+ int error;
+ bool unlocked;
+
+ for (;;) {
+ error = vn_cross_single_mount(vp, lkflags, &unlocked, vpp);
+
+ /* Optimize for the non-mount-point case. */
+ if (__predict_true(error == EJUSTRETURN)) {
+ /* No more mounts to cross. */
+ *vpp = vp;
+ error = 0;
+
+ if (__predict_false(unlocked)) {
+ vn_lock_enforced_flags(vp, lkflags);
+ if (VN_IS_DOOMED(vp)) {
+ vput(vp);
+
+ *vpp = NULL;
+ error = ENOENT;
+ }
+ }
+
+ return (error);
+ }
+
+ if (__predict_false(error != 0)) {
+ if (__predict_true(unlocked))
+ vrele(vp);
+ else
+ vput(vp);
+
+ return (error);
+ }
+
+ /* Crossed one mount. Try to cross another one. */
+ MPASS(unlocked);
+ ASSERT_VOP_UNLOCKED(vp, __func__);
+ vrele(vp);
+ vp = *vpp;
+ ASSERT_VOP_LOCKED(vp, __func__);
+ }
+
+ __assert_unreachable();
+}
+
struct nameicap_tracker {
struct vnode *dp;
TAILQ_ENTRY(nameicap_tracker) nm_link;
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -826,6 +826,12 @@
int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
struct uio *uio);
+int vn_busy_mountedhere(struct vnode *vp, bool *unlocked,
+ struct mount **mp);
+int vn_cross_single_mount(struct vnode *vp, int root_lkflags,
+ bool *unlocked, struct vnode **vpp);
+int vn_cross_mounts(struct vnode *vp, int lkflags, struct vnode **vpp);
+
void vn_seqc_write_begin_locked(struct vnode *vp);
void vn_seqc_write_begin(struct vnode *vp);
void vn_seqc_write_end_locked(struct vnode *vp);
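
A usage note (not part of the diff): the comment on vn_busy_mountedhere() above defines a three-way contract (0, EJUSTRETURN, ENOENT) plus an '*unlocked' output. The sketch below shows how a hypothetical caller might consume it; the function name and the choice to relock 'vp' before returning are illustrative assumptions, not taken from the patch.

/*
 * Hypothetical caller of vn_busy_mountedhere().  'vp' is locked and
 * actively referenced on entry; this sketch relocks it before return.
 */
static int
example_peek_covering_mount(struct vnode *vp)
{
	struct mount *mp;
	bool unlocked;
	int error;

	error = vn_busy_mountedhere(vp, &unlocked, &mp);
	switch (error) {
	case 0:
		/* 'mp' is busied; inspect it, then drop the busy count. */
		vfs_unbusy(mp);
		break;
	case EJUSTRETURN:
		/* 'vp' is not covered by a mount; not an error here. */
		error = 0;
		break;
	case ENOENT:
		/* 'vp' was doomed while its covering mount was chased. */
		break;
	}
	if (unlocked)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	return (error);
}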

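Similarly, a minimal sketch of driving vn_cross_mounts(), assuming a caller that starts from a locked, referenced vnode and wants the vnode actually exposed at that path once all stacked mounts are crossed. The wrapper name is hypothetical and the LK_SHARED choice is arbitrary.

/*
 * Hypothetical caller of vn_cross_mounts().  On entry 'vp' is locked
 * and referenced.  On success '*uvpp' is the uncovered vnode, locked
 * per LK_SHARED and referenced, and the reference on 'vp' has been
 * released unless 'vp' is itself the final vnode.  On error, 'vp'
 * has been released as well, so there is nothing to clean up here.
 */
static int
example_uncover(struct vnode *vp, struct vnode **uvpp)
{
	return (vn_cross_mounts(vp, LK_SHARED, uvpp));
}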