Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F108370475
D45398.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
72 KB
Referenced Files
None
Subscribers
None
D45398.diff
View Options
diff --git a/sys/fs/unionfs/union.h b/sys/fs/unionfs/union.h
--- a/sys/fs/unionfs/union.h
+++ b/sys/fs/unionfs/union.h
@@ -97,15 +97,17 @@
char *un_path; /* path */
int un_pathlen; /* strlen of path */
- int un_flag; /* unionfs node flag */
-};
-/*
- * unionfs node flags
- * It needs the vnode with exclusive lock, when changing the un_flag variable.
- */
-#define UNIONFS_OPENEXTL 0x01 /* openextattr (lower) */
-#define UNIONFS_OPENEXTU 0x02 /* openextattr (upper) */
+ /*
+ * unionfs node flags
+ * Changing these flags requires the vnode to be locked exclusive.
+ */
+ #define UNIONFS_OPENEXTL 0x01 /* openextattr (lower) */
+ #define UNIONFS_OPENEXTU 0x02 /* openextattr (upper) */
+ #define UNIONFS_COPY_IN_PROGRESS 0x04 /* copy/dir shadow in progress */
+ #define UNIONFS_LOOKUP_IN_PROGRESS 0x08
+ unsigned int un_flag; /* unionfs node flag */
+};
extern struct vop_vector unionfs_vnodeops;
@@ -136,29 +138,25 @@
void unionfs_tryrem_node_status(struct unionfs_node *,
struct unionfs_node_status *);
int unionfs_check_rmdir(struct vnode *, struct ucred *, struct thread *td);
-int unionfs_copyfile(struct unionfs_node *, int, struct ucred *,
+int unionfs_copyfile(struct vnode *, int, struct ucred *,
struct thread *);
void unionfs_create_uppervattr_core(struct unionfs_mount *, struct vattr *,
struct vattr *, struct thread *);
int unionfs_create_uppervattr(struct unionfs_mount *, struct vnode *,
struct vattr *, struct ucred *, struct thread *);
-int unionfs_mkshadowdir(struct unionfs_mount *, struct vnode *,
- struct unionfs_node *, struct componentname *, struct thread *);
+int unionfs_mkshadowdir(struct vnode *, struct vnode *,
+ struct componentname *, struct thread *);
int unionfs_mkwhiteout(struct vnode *, struct vnode *,
struct componentname *, struct thread *, char *, int);
int unionfs_relookup(struct vnode *, struct vnode **,
struct componentname *, struct componentname *, struct thread *,
char *, int, u_long);
-int unionfs_relookup_for_create(struct vnode *, struct componentname *,
- struct thread *);
-int unionfs_relookup_for_delete(struct vnode *, struct componentname *,
- struct thread *);
-int unionfs_relookup_for_rename(struct vnode *, struct componentname *,
- struct thread *);
void unionfs_forward_vop_start_pair(struct vnode *, int *,
struct vnode *, int *);
bool unionfs_forward_vop_finish_pair(struct vnode *, struct vnode *, int,
struct vnode *, struct vnode *, int);
+int unionfs_set_in_progress_flag(struct vnode *, unsigned int);
+void unionfs_clear_in_progress_flag(struct vnode *, unsigned int);
static inline void
unionfs_forward_vop_start(struct vnode *basevp, int *lkflags)
diff --git a/sys/fs/unionfs/union_subr.c b/sys/fs/unionfs/union_subr.c
--- a/sys/fs/unionfs/union_subr.c
+++ b/sys/fs/unionfs/union_subr.c
@@ -203,19 +203,19 @@
struct unionfs_node_hashhead *hd;
struct vnode *vp;
- ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__);
- ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__);
- KASSERT(uncp->un_uppervp == NULLVP || uncp->un_uppervp->v_type == VDIR,
- ("%s: v_type != VDIR", __func__));
- KASSERT(uncp->un_lowervp == NULLVP || uncp->un_lowervp->v_type == VDIR,
- ("%s: v_type != VDIR", __func__));
-
vp = NULLVP;
VI_LOCK(dvp);
- if (uncp->un_uppervp != NULL)
+ if (uncp->un_uppervp != NULLVP) {
+ ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__);
+ KASSERT(uncp->un_uppervp->v_type == VDIR,
+ ("%s: v_type != VDIR", __func__));
vp = unionfs_get_cached_vnode_locked(uncp->un_uppervp, dvp);
- else if (uncp->un_lowervp != NULL)
+ } else if (uncp->un_lowervp != NULLVP) {
+ ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__);
+ KASSERT(uncp->un_lowervp->v_type == VDIR,
+ ("%s: v_type != VDIR", __func__));
vp = unionfs_get_cached_vnode_locked(uncp->un_lowervp, dvp);
+ }
if (vp == NULLVP) {
hd = unionfs_get_hashhead(dvp, (uncp->un_uppervp != NULLVP ?
uncp->un_uppervp : uncp->un_lowervp));
@@ -276,9 +276,11 @@
if (unp->un_dvp != NULLVP)
vrele(unp->un_dvp);
- if (unp->un_uppervp != NULLVP)
+ if (unp->un_uppervp != NULLVP) {
vput(unp->un_uppervp);
- if (unp->un_lowervp != NULLVP)
+ if (unp->un_lowervp != NULLVP)
+ vrele(unp->un_lowervp);
+ } else if (unp->un_lowervp != NULLVP)
vput(unp->un_lowervp);
if (unp->un_hashtbl != NULL)
hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
@@ -314,7 +316,7 @@
*vpp = NULLVP;
if (uppervp == NULLVP && lowervp == NULLVP)
- panic("%s: upper and lower is null", __func__);
+ panic("%s: upper and lower are both null", __func__);
vt = (uppervp != NULLVP ? uppervp->v_type : lowervp->v_type);
@@ -327,7 +329,9 @@
vp = unionfs_get_cached_vnode(uppervp, lowervp, dvp);
if (vp != NULLVP) {
*vpp = vp;
- goto unionfs_nodeget_out;
+ if (lkflags != 0)
+ vn_lock(*vpp, lkflags | LK_RETRY);
+ return (0);
}
}
@@ -385,27 +389,47 @@
KASSERT(dvp != NULL || (vp->v_vflag & VV_ROOT) != 0,
("%s: NULL dvp for non-root vp %p", __func__, vp));
- vn_lock_pair(lowervp, false, LK_EXCLUSIVE, uppervp, false,
- LK_EXCLUSIVE);
+
+ /*
+ * NOTE: There is still a possibility for cross-filesystem locking here.
+ * If dvp has an upper FS component and is locked, while the new vnode
+ * created here only has a lower-layer FS component, then we will end
+ * up taking a lower-FS lock while holding an upper-FS lock.
+ * That situation could be dealt with here using vn_lock_pair().
+ * However, that would only address one instance out of many in which
+ * a child vnode lock is taken while holding a lock on its parent
+ * directory. This is done in many places in common VFS code, as well as
+ * a few places within unionfs (which could lead to the same cross-FS
+ * locking issue if, for example, the upper FS is another nested unionfs
+ * instance). Additionally, it is unclear under what circumstances this
+ * specific lock sequence (a directory on one FS followed by a child of
+ * its 'peer' directory on another FS) would present the practical
+ * possibility of deadlock due to some other agent on the system
+ * attempting to lock those two specific vnodes in the opposite order.
+ */
+ if (uppervp != NULLVP)
+ vn_lock(uppervp, LK_EXCLUSIVE | LK_RETRY);
+ else
+ vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY);
error = insmntque1(vp, mp);
if (error != 0) {
unionfs_nodeget_cleanup(vp, unp);
return (error);
}
- if (lowervp != NULL && VN_IS_DOOMED(lowervp)) {
- vput(lowervp);
- unp->un_lowervp = lowervp = NULL;
- }
- if (uppervp != NULL && VN_IS_DOOMED(uppervp)) {
- vput(uppervp);
- unp->un_uppervp = uppervp = NULL;
- if (lowervp != NULLVP)
- vp->v_vnlock = lowervp->v_vnlock;
- }
- if (lowervp == NULL && uppervp == NULL) {
- unionfs_nodeget_cleanup(vp, unp);
- return (ENOENT);
- }
+ /*
+ * lowervp and uppervp should only be doomed by a forced unmount of
+ * their respective filesystems, but that can only happen if the
+ * unionfs instance is first unmounted. We also effectively hold the
+ * lock on the new unionfs vnode at this point. Therefore, if a
+ * unionfs umount has not yet reached the point at which the above
+ * insmntque1() would fail, then its vflush() call will end up
+ * blocked on our vnode lock, effectively also preventing unmount
+ * of the underlying filesystems.
+ */
+ VNASSERT(lowervp == NULLVP || !VN_IS_DOOMED(lowervp), vp,
+ ("%s: doomed lowervp %p", __func__, lowervp));
+ VNASSERT(uppervp == NULLVP || !VN_IS_DOOMED(uppervp), vp,
+ ("%s: doomed uppervp %p", __func__, uppervp));
vn_set_state(vp, VSTATE_CONSTRUCTED);
@@ -413,18 +437,16 @@
*vpp = unionfs_ins_cached_vnode(unp, dvp);
if (*vpp != NULLVP) {
unionfs_nodeget_cleanup(vp, unp);
- vp = *vpp;
- } else {
- if (uppervp != NULL)
- VOP_UNLOCK(uppervp);
- if (lowervp != NULL)
- VOP_UNLOCK(lowervp);
+ if (lkflags != 0)
+ vn_lock(*vpp, lkflags | LK_RETRY);
+ return (0);
+ } else
*vpp = vp;
- }
-unionfs_nodeget_out:
- if (lkflags & LK_TYPE_MASK)
- vn_lock(vp, lkflags | LK_RETRY);
+ if ((lkflags & LK_SHARED) != 0)
+ vn_lock(vp, LK_DOWNGRADE);
+ else if ((lkflags & LK_EXCLUSIVE) == 0)
+ VOP_UNLOCK(vp);
return (0);
}
@@ -443,6 +465,7 @@
struct vnode *dvp;
int count;
int writerefs;
+ bool unlock_lvp;
/*
* The root vnode lock may be recursed during unmount, because
@@ -455,18 +478,36 @@
*/
KASSERT(vp->v_vnlock->lk_recurse == 0 || (vp->v_vflag & VV_ROOT) != 0,
("%s: vnode %p locked recursively", __func__, vp));
+
+ unp = VTOUNIONFS(vp);
+ VNASSERT(unp != NULL, vp, ("%s: already reclaimed", __func__));
+ lvp = unp->un_lowervp;
+ uvp = unp->un_uppervp;
+ dvp = unp->un_dvp;
+ unlock_lvp = (uvp == NULLVP);
+
+ /*
+ * Lock the lower vnode in addition to the upper vnode lock in order
+ * to synchronize against any unionfs_lock() operation which may still
+ * hold the lower vnode lock. We do not need to do this for the root
+ * vnode, as the root vnode should always have both upper and lower
+ * base vnodes for its entire lifecycled, so unionfs_lock() should
+ * never attempt to lock its lower vnode in the first place.
+ * Moreover, during unmount of a non-"below" unionfs mount, the lower
+ * root vnode will already be locked as it is the covered vnode.
+ */
+ if (uvp != NULLVP && lvp != NULLVP && (vp->v_vflag & VV_ROOT) == 0) {
+ vn_lock_pair(uvp, true, LK_EXCLUSIVE, lvp, false, LK_EXCLUSIVE);
+ unlock_lvp = true;
+ }
+
if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
panic("%s: failed to acquire lock for vnode lock", __func__);
-
/*
* Use the interlock to protect the clearing of v_data to
* prevent faults in unionfs_lock().
*/
VI_LOCK(vp);
- unp = VTOUNIONFS(vp);
- lvp = unp->un_lowervp;
- uvp = unp->un_uppervp;
- dvp = unp->un_dvp;
unp->un_lowervp = unp->un_uppervp = NULLVP;
vp->v_vnlock = &(vp->v_lock);
vp->v_data = NULL;
@@ -502,18 +543,16 @@
("%s: write reference without upper vnode", __func__));
VOP_ADD_WRITECOUNT(uvp, -writerefs);
}
- if (lvp != NULLVP)
- VOP_UNLOCK(lvp);
if (uvp != NULLVP)
- VOP_UNLOCK(uvp);
+ vput(uvp);
+ if (unlock_lvp)
+ vput(lvp);
+ else if (lvp != NULLVP)
+ vrele(lvp);
if (dvp != NULLVP)
unionfs_rem_cached_vnode(unp, dvp);
- if (lvp != NULLVP)
- vrele(lvp);
- if (uvp != NULLVP)
- vrele(uvp);
if (unp->un_path != NULL) {
free(unp->un_path, M_UNIONFSPATH);
unp->un_path = NULL;
@@ -696,110 +735,6 @@
return (error);
}
-/*
- * relookup for CREATE namei operation.
- *
- * dvp is unionfs vnode. dvp should be locked.
- *
- * If it called 'unionfs_copyfile' function by unionfs_link etc,
- * VOP_LOOKUP information is broken.
- * So it need relookup in order to create link etc.
- */
-int
-unionfs_relookup_for_create(struct vnode *dvp, struct componentname *cnp,
- struct thread *td)
-{
- struct vnode *udvp;
- struct vnode *vp;
- struct componentname cn;
- int error;
-
- udvp = UNIONFSVPTOUPPERVP(dvp);
- vp = NULLVP;
-
- error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
- cnp->cn_namelen, CREATE);
- if (error)
- return (error);
-
- if (vp != NULLVP) {
- if (udvp == vp)
- vrele(vp);
- else
- vput(vp);
-
- error = EEXIST;
- }
-
- return (error);
-}
-
-/*
- * relookup for DELETE namei operation.
- *
- * dvp is unionfs vnode. dvp should be locked.
- */
-int
-unionfs_relookup_for_delete(struct vnode *dvp, struct componentname *cnp,
- struct thread *td)
-{
- struct vnode *udvp;
- struct vnode *vp;
- struct componentname cn;
- int error;
-
- udvp = UNIONFSVPTOUPPERVP(dvp);
- vp = NULLVP;
-
- error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
- cnp->cn_namelen, DELETE);
- if (error)
- return (error);
-
- if (vp == NULLVP)
- error = ENOENT;
- else {
- if (udvp == vp)
- vrele(vp);
- else
- vput(vp);
- }
-
- return (error);
-}
-
-/*
- * relookup for RENAME namei operation.
- *
- * dvp is unionfs vnode. dvp should be locked.
- */
-int
-unionfs_relookup_for_rename(struct vnode *dvp, struct componentname *cnp,
- struct thread *td)
-{
- struct vnode *udvp;
- struct vnode *vp;
- struct componentname cn;
- int error;
-
- udvp = UNIONFSVPTOUPPERVP(dvp);
- vp = NULLVP;
-
- error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
- cnp->cn_namelen, RENAME);
- if (error)
- return (error);
-
- if (vp != NULLVP) {
- if (udvp == vp)
- vrele(vp);
- else
- vput(vp);
- }
-
- return (error);
-}
-
/*
* Update the unionfs_node.
*
@@ -836,6 +771,8 @@
vp->v_vnlock = uvp->v_vnlock;
VI_UNLOCK(vp);
+ for (count = 0; count < lockrec + 1; count++)
+ VOP_UNLOCK(lvp);
/*
* Re-cache the unionfs vnode against the upper vnode
*/
@@ -850,19 +787,88 @@
}
}
+/*
+ * Mark a unionfs operation as being in progress, sleeping if the
+ * same operation is already in progress.
+ * This is useful, for example, during copy-up operations in which
+ * we may drop the target vnode lock, but we want to avoid the
+ * possibility of a concurrent copy-up on the same vnode triggering
+ * a spurious failure.
+ */
+int
+unionfs_set_in_progress_flag(struct vnode *vp, unsigned int flag)
+{
+ struct unionfs_node *unp;
+ int error;
+
+ error = 0;
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ VI_LOCK(vp);
+ unp = VTOUNIONFS(vp);
+ while (error == 0 && (unp->un_flag & flag) != 0) {
+ VOP_UNLOCK(vp);
+ error = msleep(vp, VI_MTX(vp), PCATCH | PDROP, "unioncp", 0);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VI_LOCK(vp);
+ if (error == 0) {
+ /*
+ * If we waited on a concurrent copy-up and that
+ * copy-up was successful, return a non-fatal
+ * indication that the desired operation is already
+ * complete. If we waited on a concurrent lookup,
+ * return ERELOOKUP to indicate the VFS cache should
+ * be re-queried to avoid creating a duplicate unionfs
+ * vnode.
+ */
+ unp = VTOUNIONFS(vp);
+ if (unp == NULL)
+ error = ENOENT;
+ else if (flag == UNIONFS_COPY_IN_PROGRESS &&
+ unp->un_uppervp != NULLVP)
+ error = EJUSTRETURN;
+ else if (flag == UNIONFS_LOOKUP_IN_PROGRESS)
+ error = ERELOOKUP;
+ }
+ }
+ if (error == 0)
+ unp->un_flag |= flag;
+ VI_UNLOCK(vp);
+
+ return (error);
+}
+
+void
+unionfs_clear_in_progress_flag(struct vnode *vp, unsigned int flag)
+{
+ struct unionfs_node *unp;
+
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ unp = VTOUNIONFS(vp);
+ VI_LOCK(vp);
+ if (unp != NULL) {
+ VNASSERT((unp->un_flag & flag) != 0, vp,
+ ("%s: in-progress flag not set", __func__));
+ unp->un_flag &= ~flag;
+ }
+ wakeup(vp);
+ VI_UNLOCK(vp);
+}
+
/*
* Create a new shadow dir.
*
- * udvp should be locked on entry and will be locked on return.
+ * dvp and vp are unionfs vnodes representing a parent directory and
+ * child file, should be locked on entry, and will be locked on return.
*
* If no error returned, unp will be updated.
*/
int
-unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
- struct unionfs_node *unp, struct componentname *cnp, struct thread *td)
+unionfs_mkshadowdir(struct vnode *dvp, struct vnode *vp,
+ struct componentname *cnp, struct thread *td)
{
struct vnode *lvp;
struct vnode *uvp;
+ struct vnode *udvp;
struct vattr va;
struct vattr lva;
struct nameidata nd;
@@ -870,10 +876,25 @@
struct ucred *cred;
struct ucred *credbk;
struct uidinfo *rootinfo;
+ struct unionfs_mount *ump;
+ struct unionfs_node *dunp;
+ struct unionfs_node *unp;
int error;
+ ASSERT_VOP_ELOCKED(dvp, __func__);
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ ump = MOUNTTOUNIONFSMOUNT(vp->v_mount);
+ unp = VTOUNIONFS(vp);
if (unp->un_uppervp != NULLVP)
return (EEXIST);
+ dunp = VTOUNIONFS(dvp);
+ udvp = dunp->un_uppervp;
+
+ error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
+ if (error == EJUSTRETURN)
+ return (0);
+ else if (error != 0)
+ return (error);
lvp = unp->un_lowervp;
uvp = NULLVP;
@@ -897,11 +918,29 @@
NDPREINIT(&nd);
if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred)))
- goto unionfs_mkshadowdir_abort;
+ goto unionfs_mkshadowdir_finish;
+ vref(udvp);
+ VOP_UNLOCK(vp);
if ((error = unionfs_relookup(udvp, &uvp, cnp, &nd.ni_cnd, td,
- cnp->cn_nameptr, cnp->cn_namelen, CREATE)))
- goto unionfs_mkshadowdir_abort;
+ cnp->cn_nameptr, cnp->cn_namelen, CREATE))) {
+ /*
+ * When handling error cases here, we drop udvp's lock and
+ * then jump to exit code that relocks dvp, which in most
+ * cases will effectively relock udvp. However, this is
+ * not guaranteed to be the case, as various calls made
+ * here (such as unionfs_relookup() above and VOP_MKDIR()
+ * below) may unlock and then relock udvp, allowing dvp to
+ * be reclaimed in the meantime. In such a situation dvp
+ * will no longer share its lock with udvp. Since
+ * performance isn't a concern for these error cases, it
+ * makes more sense to reuse the common code that locks
+ * dvp on exit than to explicitly check for reclamation
+ * of dvp.
+ */
+ vput(udvp);
+ goto unionfs_mkshadowdir_relock;
+ }
if (uvp != NULLVP) {
if (udvp == uvp)
vrele(uvp);
@@ -909,11 +948,14 @@
vput(uvp);
error = EEXIST;
- goto unionfs_mkshadowdir_abort;
+ vput(udvp);
+ goto unionfs_mkshadowdir_relock;
}
- if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
- goto unionfs_mkshadowdir_abort;
+ if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH))) {
+ vput(udvp);
+ goto unionfs_mkshadowdir_relock;
+ }
unionfs_create_uppervattr_core(ump, &lva, &va, td);
/*
@@ -924,7 +966,7 @@
* component. This *should* be fine, as cn_namelen will still
* correctly indicate the length of only the current component,
* but ZFS in particular does not respect cn_namelen in its VOP_MKDIR
- * implementation
+ * implementation.
* Note that this assumes nd.ni_cnd.cn_pnbuf was allocated by
* something like a local namei() operation and the temporary
* NUL-termination will not have an effect on other threads.
@@ -934,27 +976,58 @@
*pathend = '\0';
error = VOP_MKDIR(udvp, &uvp, &nd.ni_cnd, &va);
*pathend = pathterm;
-
- if (!error) {
- /*
- * XXX The bug which cannot set uid/gid was corrected.
- * Ignore errors.
- */
- va.va_type = VNON;
- VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred);
-
+ if (error != 0) {
/*
- * VOP_SETATTR() may transiently drop uvp's lock, so it's
- * important to call it before unionfs_node_update() transfers
- * the unionfs vnode's lock from lvp to uvp; otherwise the
- * unionfs vnode itself would be transiently unlocked and
- * potentially doomed.
+ * See the comment after unionfs_relookup() above for an
+ * explanation of why we unlock udvp here only to relock
+ * dvp on exit.
*/
- unionfs_node_update(unp, uvp, td);
+ vput(udvp);
+ vn_finished_write(mp);
+ goto unionfs_mkshadowdir_relock;
}
+
+ /*
+ * XXX The bug which cannot set uid/gid was corrected.
+ * Ignore errors.
+ */
+ va.va_type = VNON;
+ /*
+ * VOP_SETATTR() may transiently drop uvp's lock, so it's
+ * important to call it before unionfs_node_update() transfers
+ * the unionfs vnode's lock from lvp to uvp; otherwise the
+ * unionfs vnode itself would be transiently unlocked and
+ * potentially doomed.
+ */
+ VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred);
+
+ /*
+ * uvp may become doomed during VOP_VPUT_PAIR() if the implementation
+ * must temporarily drop uvp's lock. However, since we hold a
+ * reference to uvp from the VOP_MKDIR() call above, this would require
+ * a forcible unmount of uvp's filesystem, which in turn can only
+ * happen if our unionfs instance is first forcibly unmounted. We'll
+ * therefore catch this case in the NULL check of unp below.
+ */
+ VOP_VPUT_PAIR(udvp, &uvp, false);
vn_finished_write(mp);
+ vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
+ unp = VTOUNIONFS(vp);
+ if (unp == NULL) {
+ vput(uvp);
+ error = ENOENT;
+ } else
+ unionfs_node_update(unp, uvp, td);
+ VOP_UNLOCK(vp);
+
+unionfs_mkshadowdir_relock:
+ vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error == 0 && (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(vp)))
+ error = ENOENT;
-unionfs_mkshadowdir_abort:
+unionfs_mkshadowdir_finish:
+ unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
cnp->cn_cred = credbk;
chgproccnt(cred->cr_ruidinfo, -1, 0);
crfree(cred);
@@ -1116,23 +1189,31 @@
/*
* Create a new whiteout.
*
- * udvp and dvp should be locked on entry and will be locked on return.
+ * dvp and vp are unionfs vnodes representing a parent directory and
+ * child file, should be locked on entry, and will be locked on return.
*/
int
-unionfs_mkwhiteout(struct vnode *dvp, struct vnode *udvp,
+unionfs_mkwhiteout(struct vnode *dvp, struct vnode *vp,
struct componentname *cnp, struct thread *td, char *path, int pathlen)
{
+ struct vnode *udvp;
struct vnode *wvp;
struct nameidata nd;
struct mount *mp;
int error;
- int lkflags;
+ bool dvp_locked;
+
+ ASSERT_VOP_ELOCKED(dvp, __func__);
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ udvp = VTOUNIONFS(dvp)->un_uppervp;
wvp = NULLVP;
NDPREINIT(&nd);
+ vref(udvp);
+ VOP_UNLOCK(vp);
if ((error = unionfs_relookup(udvp, &wvp, cnp, &nd.ni_cnd, td, path,
pathlen, CREATE))) {
- return (error);
+ goto unionfs_mkwhiteout_cleanup;
}
if (wvp != NULLVP) {
if (udvp == wvp)
@@ -1140,18 +1221,27 @@
else
vput(wvp);
- return (EEXIST);
+ if (nd.ni_cnd.cn_flags & ISWHITEOUT)
+ error = 0;
+ else
+ error = EEXIST;
+ goto unionfs_mkwhiteout_cleanup;
}
if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
- goto unionfs_mkwhiteout_free_out;
- unionfs_forward_vop_start(udvp, &lkflags);
+ goto unionfs_mkwhiteout_cleanup;
error = VOP_WHITEOUT(udvp, &nd.ni_cnd, CREATE);
- unionfs_forward_vop_finish(dvp, udvp, lkflags);
-
vn_finished_write(mp);
-unionfs_mkwhiteout_free_out:
+unionfs_mkwhiteout_cleanup:
+ if (VTOUNIONFS(dvp) == NULL) {
+ vput(udvp);
+ dvp_locked = false;
+ } else {
+ vrele(udvp);
+ dvp_locked = true;
+ }
+ vn_lock_pair(dvp, dvp_locked, LK_EXCLUSIVE, vp, false, LK_EXCLUSIVE);
return (error);
}
@@ -1165,10 +1255,11 @@
*/
static int
unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp,
- struct unionfs_node *unp, struct vattr *uvap, struct thread *td)
+ struct vnode *vp, struct vattr *uvap, struct thread *td)
{
struct unionfs_mount *ump;
- struct vnode *vp;
+ struct unionfs_node *unp;
+ struct vnode *uvp;
struct vnode *lvp;
struct ucred *cred;
struct vattr lva;
@@ -1176,8 +1267,10 @@
int fmode;
int error;
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ unp = VTOUNIONFS(vp);
ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount);
- vp = NULLVP;
+ uvp = NULLVP;
lvp = unp->un_lowervp;
cred = td->td_ucred;
fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL);
@@ -1200,42 +1293,39 @@
NDPREINIT(&nd);
vref(udvp);
- if ((error = vfs_relookup(udvp, &vp, &nd.ni_cnd, false)) != 0)
- goto unionfs_vn_create_on_upper_free_out2;
- vrele(udvp);
+ VOP_UNLOCK(vp);
+ if ((error = vfs_relookup(udvp, &uvp, &nd.ni_cnd, false)) != 0) {
+ vrele(udvp);
+ return (error);
+ }
- if (vp != NULLVP) {
- if (vp == udvp)
- vrele(vp);
+ if (uvp != NULLVP) {
+ if (uvp == udvp)
+ vrele(uvp);
else
- vput(vp);
+ vput(uvp);
error = EEXIST;
- goto unionfs_vn_create_on_upper_free_out1;
+ goto unionfs_vn_create_on_upper_cleanup;
}
- if ((error = VOP_CREATE(udvp, &vp, &nd.ni_cnd, uvap)) != 0)
- goto unionfs_vn_create_on_upper_free_out1;
+ if ((error = VOP_CREATE(udvp, &uvp, &nd.ni_cnd, uvap)) != 0)
+ goto unionfs_vn_create_on_upper_cleanup;
- if ((error = VOP_OPEN(vp, fmode, cred, td, NULL)) != 0) {
- vput(vp);
- goto unionfs_vn_create_on_upper_free_out1;
+ if ((error = VOP_OPEN(uvp, fmode, cred, td, NULL)) != 0) {
+ vput(uvp);
+ goto unionfs_vn_create_on_upper_cleanup;
}
- error = VOP_ADD_WRITECOUNT(vp, 1);
- CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
- __func__, vp, vp->v_writecount);
+ error = VOP_ADD_WRITECOUNT(uvp, 1);
+ CTR3(KTR_VFS, "%s: newvp %p v_writecount increased to %d",
+ __func__, uvp, uvp->v_writecount);
if (error == 0) {
- *vpp = vp;
+ *vpp = uvp;
} else {
- VOP_CLOSE(vp, fmode, cred, td);
+ VOP_CLOSE(uvp, fmode, cred, td);
}
-unionfs_vn_create_on_upper_free_out1:
- VOP_UNLOCK(udvp);
-
-unionfs_vn_create_on_upper_free_out2:
- KASSERT(nd.ni_cnd.cn_pnbuf == unp->un_path,
- ("%s: cn_pnbuf changed", __func__));
-
+unionfs_vn_create_on_upper_cleanup:
+ vput(udvp);
return (error);
}
@@ -1310,13 +1400,18 @@
*
* If you need copy of the contents, set 1 to docopy. Otherwise, set 0 to
* docopy.
+ *
+ * vp is a unionfs vnode that should be locked on entry and will be
+ * locked on return.
*
* If no error returned, unp will be updated.
*/
int
-unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred,
+unionfs_copyfile(struct vnode *vp, int docopy, struct ucred *cred,
struct thread *td)
{
+ struct unionfs_node *unp;
+ struct unionfs_node *dunp;
struct mount *mp;
struct vnode *udvp;
struct vnode *lvp;
@@ -1324,6 +1419,8 @@
struct vattr uva;
int error;
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ unp = VTOUNIONFS(vp);
lvp = unp->un_lowervp;
uvp = NULLVP;
@@ -1333,22 +1430,51 @@
return (EINVAL);
if (unp->un_uppervp != NULLVP)
return (EEXIST);
- udvp = VTOUNIONFS(unp->un_dvp)->un_uppervp;
+
+ udvp = NULLVP;
+ VI_LOCK(unp->un_dvp);
+ dunp = VTOUNIONFS(unp->un_dvp);
+ if (dunp != NULL)
+ udvp = dunp->un_uppervp;
+ VI_UNLOCK(unp->un_dvp);
+
if (udvp == NULLVP)
return (EROFS);
if ((udvp->v_mount->mnt_flag & MNT_RDONLY))
return (EROFS);
+ ASSERT_VOP_UNLOCKED(udvp, __func__);
+
+ error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
+ if (error == EJUSTRETURN)
+ return (0);
+ else if (error != 0)
+ return (error);
error = VOP_ACCESS(lvp, VREAD, cred, td);
if (error != 0)
- return (error);
+ goto unionfs_copyfile_cleanup;
if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)) != 0)
- return (error);
- error = unionfs_vn_create_on_upper(&uvp, udvp, unp, &uva, td);
+ goto unionfs_copyfile_cleanup;
+ error = unionfs_vn_create_on_upper(&uvp, udvp, vp, &uva, td);
if (error != 0) {
vn_finished_write(mp);
- return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ goto unionfs_copyfile_cleanup;
+ }
+
+ /*
+ * Note that it's still possible for e.g. VOP_WRITE to relock
+ * uvp below while holding vp[=lvp] locked. Replacing
+ * unionfs_copyfile_core with vn_generic_copy_file_range() will
+ * allow us to avoid the problem by moving this vn_lock_pair()
+ * call much later.
+ */
+ vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
+ unp = VTOUNIONFS(vp);
+ if (unp == NULL) {
+ error = ENOENT;
+ goto unionfs_copyfile_cleanup;
}
if (docopy != 0) {
@@ -1369,18 +1495,30 @@
/* Reset the attributes. Ignore errors. */
uva.va_type = VNON;
VOP_SETATTR(uvp, &uva, cred);
+ unionfs_node_update(unp, uvp, td);
}
- unionfs_node_update(unp, uvp, td);
-
+unionfs_copyfile_cleanup:
+ unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
return (error);
}
/*
- * It checks whether vp can rmdir. (check empty)
+ * Determine if the unionfs view of a directory is empty such that
+ * an rmdir operation can be permitted.
+ *
+ * We assume the VOP_RMDIR() against the upper layer vnode will take
+ * care of this check for us where the upper FS is concerned, so here
+ * we concentrate on the lower FS. We need to check for the presence
+ * of files other than "." and ".." in the lower FS directory and
+ * then cross-check any files we find against the upper FS to see if
+ * a whiteout is present (in which case we treat the lower file as
+ * non-present).
*
- * vp is unionfs vnode.
- * vp should be locked.
+ * The logic here is based heavily on vn_dir_check_empty().
+ *
+ * vp should be a locked unionfs node, and vp's lowervp should also be
+ * locked.
*/
int
unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td)
@@ -1388,115 +1526,127 @@
struct vnode *uvp;
struct vnode *lvp;
struct vnode *tvp;
+ char *dirbuf;
+ size_t dirbuflen, len;
+ off_t off;
struct dirent *dp;
- struct dirent *edp;
struct componentname cn;
- struct iovec iov;
- struct uio uio;
struct vattr va;
int error;
int eofflag;
- int lookuperr;
-
- /*
- * The size of buf needs to be larger than DIRBLKSIZ.
- */
- char buf[256 * 6];
-
- ASSERT_VOP_ELOCKED(vp, __func__);
eofflag = 0;
- uvp = UNIONFSVPTOUPPERVP(vp);
lvp = UNIONFSVPTOLOWERVP(vp);
+ uvp = UNIONFSVPTOUPPERVP(vp);
+
+ /*
+ * Note that the locking here still isn't ideal: We expect the caller
+ * to hold both the upper and lower layer locks as well as the upper
+ * parent directory lock, which it can do in a manner that avoids
+ * deadlock. However, if the cross-check logic below needs to call
+ * VOP_LOOKUP(), that may relock the upper vnode and lock any found
+ * child vnode in a way that doesn't protect against deadlock given
+ * the other held locks. Beyond that, the various other VOPs we issue
+ * below, such as VOP_OPEN() and VOP_READDIR(), may also re-lock the
+ * lower vnode.
+ * We might instead just handoff between the upper vnode lock
+ * (and its parent directory lock) and the lower vnode lock as needed,
+ * so that the lower lock is never held at the same time as the upper
+ * locks, but that opens up a wider window in which the upper
+ * directory (and also the lower directory if it isn't truly
+ * read-only) may change while the relevant lock is dropped. But
+ * since re-locking may happen here and open up such a window anyway,
+ * perhaps that is a worthwhile tradeoff? Or perhaps we can ultimately
+ * do sufficient tracking of empty state within the unionfs vnode
+ * (in conjunction with upcalls from the lower FSes to notify us
+ * of out-of-band state changes) that we can avoid these costly checks
+ * altogether.
+ */
+ ASSERT_VOP_LOCKED(lvp, __func__);
+ ASSERT_VOP_ELOCKED(uvp, __func__);
- /* check opaque */
if ((error = VOP_GETATTR(uvp, &va, cred)) != 0)
return (error);
if (va.va_flags & OPAQUE)
return (0);
- /* open vnode */
#ifdef MAC
- if ((error = mac_vnode_check_open(cred, vp, VEXEC|VREAD)) != 0)
+ if ((error = mac_vnode_check_open(cred, lvp, VEXEC | VREAD)) != 0)
return (error);
#endif
- if ((error = VOP_ACCESS(vp, VEXEC|VREAD, cred, td)) != 0)
+ if ((error = VOP_ACCESS(lvp, VEXEC | VREAD, cred, td)) != 0)
return (error);
- if ((error = VOP_OPEN(vp, FREAD, cred, td, NULL)) != 0)
+ if ((error = VOP_OPEN(lvp, FREAD, cred, td, NULL)) != 0)
+ return (error);
+ if ((error = VOP_GETATTR(lvp, &va, cred)) != 0)
return (error);
- uio.uio_rw = UIO_READ;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_td = td;
- uio.uio_offset = 0;
+ dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ);
+ if (dirbuflen < va.va_blocksize)
+ dirbuflen = va.va_blocksize;
+ dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);
-#ifdef MAC
- error = mac_vnode_check_readdir(td->td_ucred, lvp);
-#endif
- while (!error && !eofflag) {
- iov.iov_base = buf;
- iov.iov_len = sizeof(buf);
- uio.uio_iov = &iov;
- uio.uio_iovcnt = 1;
- uio.uio_resid = iov.iov_len;
+ len = 0;
+ off = 0;
+ eofflag = 0;
- error = VOP_READDIR(lvp, &uio, cred, &eofflag, NULL, NULL);
+ for (;;) {
+ error = vn_dir_next_dirent(lvp, td, dirbuf, dirbuflen,
+ &dp, &len, &off, &eofflag);
if (error != 0)
break;
- KASSERT(eofflag != 0 || uio.uio_resid < sizeof(buf),
- ("%s: empty read from lower FS", __func__));
-
- edp = (struct dirent*)&buf[sizeof(buf) - uio.uio_resid];
- for (dp = (struct dirent*)buf; !error && dp < edp;
- dp = (struct dirent*)((caddr_t)dp + dp->d_reclen)) {
- if (dp->d_type == DT_WHT || dp->d_fileno == 0 ||
- (dp->d_namlen == 1 && dp->d_name[0] == '.') ||
- (dp->d_namlen == 2 && !bcmp(dp->d_name, "..", 2)))
- continue;
-
- cn.cn_namelen = dp->d_namlen;
- cn.cn_pnbuf = NULL;
- cn.cn_nameptr = dp->d_name;
- cn.cn_nameiop = LOOKUP;
- cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN;
- cn.cn_lkflags = LK_EXCLUSIVE;
- cn.cn_cred = cred;
-
- /*
- * check entry in lower.
- * Sometimes, readdir function returns
- * wrong entry.
- */
- lookuperr = VOP_LOOKUP(lvp, &tvp, &cn);
- if (!lookuperr)
- vput(tvp);
- else
- continue; /* skip entry */
-
- /*
- * check entry
- * If it has no exist/whiteout entry in upper,
- * directory is not empty.
- */
- cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN;
- lookuperr = VOP_LOOKUP(uvp, &tvp, &cn);
+ if (len == 0) {
+ /* EOF */
+ error = 0;
+ break;
+ }
- if (!lookuperr)
- vput(tvp);
+ if (dp->d_type == DT_WHT)
+ continue;
- /* ignore exist or whiteout entry */
- if (!lookuperr ||
- (lookuperr == ENOENT && (cn.cn_flags & ISWHITEOUT)))
- continue;
+ /*
+ * Any file in the directory which is not '.' or '..' indicates
+ * the directory is not empty.
+ */
+ switch (dp->d_namlen) {
+ case 2:
+ if (dp->d_name[1] != '.') {
+ /* Can't be '..' (nor '.') */
+ break;
+ }
+ /* FALLTHROUGH */
+ case 1:
+ if (dp->d_name[0] != '.') {
+ /* Can't be '..' nor '.' */
+ break;
+ }
+ continue;
+ default:
+ break;
+ }
+ cn.cn_namelen = dp->d_namlen;
+ cn.cn_pnbuf = NULL;
+ cn.cn_nameptr = dp->d_name;
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN;
+ cn.cn_lkflags = LK_EXCLUSIVE;
+ cn.cn_cred = cred;
+
+ error = VOP_LOOKUP(uvp, &tvp, &cn);
+ if (tvp != NULLVP)
+ vput(tvp);
+ if (error != 0 && error != ENOENT && error != EJUSTRETURN)
+ break;
+ else if ((cn.cn_flags & ISWHITEOUT) == 0) {
error = ENOTEMPTY;
- }
+ break;
+ } else
+ error = 0;
}
- /* close vnode */
- VOP_CLOSE(vp, FREAD, cred, td);
-
+ VOP_CLOSE(lvp, FREAD, cred, td);
+ free(dirbuf, M_TEMP);
return (error);
}
-
diff --git a/sys/fs/unionfs/union_vfsops.c b/sys/fs/unionfs/union_vfsops.c
--- a/sys/fs/unionfs/union_vfsops.c
+++ b/sys/fs/unionfs/union_vfsops.c
@@ -327,18 +327,15 @@
* unionfs_lock()) and the mountpoint's busy count. Without this,
* unmount will lock the covered vnode lock (directly through the
* covered vnode) and wait for the busy count to drain, while a
- * concurrent lookup will increment the busy count and then lock
+ * concurrent lookup will increment the busy count and then may lock
* the covered vnode lock (indirectly through unionfs_lock()).
*
- * Note that we can't yet use this facility for the 'below' case
- * in which the upper vnode is the covered vnode, because that would
- * introduce a different LOR in which the cross-mount lookup would
- * effectively hold the upper vnode lock before acquiring the lower
- * vnode lock, while an unrelated lock operation would still acquire
- * the lower vnode lock before the upper vnode lock, which is the
- * order unionfs currently requires.
+ * Note that this is only needed for the 'below' case in which the
+ * upper vnode is also the covered vnode, because unionfs_lock()
+ * only locks the upper vnode as long as both lower and upper vnodes
+ * are present (which they will always be for the unionfs mount root).
*/
- if (!below) {
+ if (below) {
vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY | LK_CANRECURSE);
mp->mnt_vnodecovered->v_vflag |= VV_CROSSLOCK;
VOP_UNLOCK(mp->mnt_vnodecovered);
diff --git a/sys/fs/unionfs/union_vnops.c b/sys/fs/unionfs/union_vnops.c
--- a/sys/fs/unionfs/union_vnops.c
+++ b/sys/fs/unionfs/union_vnops.c
@@ -85,12 +85,11 @@
struct componentname *cnp;
struct thread *td;
u_long nameiop;
- u_long cnflags, cnflagsbk;
- int iswhiteout;
+ u_long cnflags;
int lockflag;
- int error , uerror, lerror;
+ int lkflags;
+ int error, uerror, lerror;
- iswhiteout = 0;
lockflag = 0;
error = uerror = lerror = ENOENT;
cnp = ap->a_cnp;
@@ -119,88 +118,180 @@
LOOKUP != nameiop)
return (EROFS);
+ /*
+ * Note that a lookup is in-flight, and block if another lookup
+ * is already in-flight against dvp. This is done because we may
+ * end up dropping dvp's lock to look up a lower vnode or to create
+ * a shadow directory, opening up the possibility of parallel lookups
+ * against the same directory creating duplicate unionfs vnodes for
+ * the same file(s). Note that if this function encounters an
+ * in-progress lookup for the directory, it will block until the
+ * lookup is complete and then return ERELOOKUP to allow any
+ * existing unionfs vnode to be loaded from the VFS cache.
+ * This is really a hack; filesystems that support MNTK_LOOKUP_SHARED
+ * (which unionfs currently doesn't) seem to deal with this by using
+ * the vfs_hash_* functions to manage a per-mount vnode cache keyed
+ * by the inode number (or some roughly equivalent unique ID
+ * usually associated with the storage medium). It may make sense
+ * for unionfs to adopt something similar as a replacement for its
+ * current half-baked directory-only cache implementation, particularly
+ * if we want to support MNTK_LOOKUP_SHARED here.
+ */
+ error = unionfs_set_in_progress_flag(dvp, UNIONFS_LOOKUP_IN_PROGRESS);
+ if (error != 0)
+ return (error);
/*
* lookup dotdot
*/
if (cnflags & ISDOTDOT) {
- if (LOOKUP != nameiop && udvp == NULLVP)
- return (EROFS);
+ if (LOOKUP != nameiop && udvp == NULLVP) {
+ error = EROFS;
+ goto unionfs_lookup_return;
+ }
- if (udvp != NULLVP) {
+ if (udvp != NULLVP)
dtmpvp = udvp;
- if (ldvp != NULLVP)
- VOP_UNLOCK(ldvp);
- }
else
dtmpvp = ldvp;
+ unionfs_forward_vop_start(dtmpvp, &lkflags);
error = VOP_LOOKUP(dtmpvp, &vp, cnp);
+ unionfs_forward_vop_finish(dvp, dtmpvp, lkflags);
- if (dtmpvp == udvp && ldvp != NULLVP) {
- VOP_UNLOCK(udvp);
- vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
- dunp = VTOUNIONFS(dvp);
- if (error == 0 && dunp == NULL)
- error = ENOENT;
- }
+ /*
+ * Drop the lock and reference on vp. If the lookup was
+ * successful, we'll either need to exchange vp's lock and
+ * reference for the unionfs parent vnode's lock and
+ * reference, or (if dvp was reclaimed) we'll need to drop
+ * vp's lock and reference to return early.
+ */
+ if (vp != NULLVP)
+ vput(vp);
+ dunp = VTOUNIONFS(dvp);
+ if (error == 0 && dunp == NULL)
+ error = ENOENT;
if (error == 0) {
- /*
- * Exchange lock and reference from vp to
- * dunp->un_dvp. vp is upper/lower vnode, but it
- * will need to return the unionfs vnode.
- */
- if (nameiop == DELETE || nameiop == RENAME ||
- (cnp->cn_lkflags & LK_TYPE_MASK))
- VOP_UNLOCK(vp);
- vrele(vp);
-
dtmpvp = dunp->un_dvp;
vref(dtmpvp);
VOP_UNLOCK(dvp);
*(ap->a_vpp) = dtmpvp;
- if (nameiop == DELETE || nameiop == RENAME)
- vn_lock(dtmpvp, LK_EXCLUSIVE | LK_RETRY);
- else if (cnp->cn_lkflags & LK_TYPE_MASK)
- vn_lock(dtmpvp, cnp->cn_lkflags |
- LK_RETRY);
+ vn_lock(dtmpvp, cnp->cn_lkflags | LK_RETRY);
+ if (VN_IS_DOOMED(dtmpvp)) {
+ vput(dtmpvp);
+ *(ap->a_vpp) = NULLVP;
+ error = ENOENT;
+ }
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
- } else if (error == ENOENT && (cnflags & MAKEENTRY) != 0)
- cache_enter(dvp, NULLVP, cnp);
+ }
- goto unionfs_lookup_return;
+ goto unionfs_lookup_cleanup;
}
+ /*
+ * Lookup lower layer. We do this before looking up the upper
+ * layer, as we may drop the upper parent directory's lock, and we
+ * want to ensure the upper parent remains locked from the point of
+ * lookup through any ensuing VOP that may require it to be locked.
+ * The cost of this is that we may end up performing an unnecessary
+ * lower layer lookup if a whiteout is present in the upper layer.
+ */
+ if (ldvp != NULLVP && !(cnflags & DOWHITEOUT)) {
+ struct componentname lcn;
+ bool is_dot;
+
+ if (udvp != NULLVP) {
+ vref(ldvp);
+ VOP_UNLOCK(dvp);
+ vn_lock(ldvp, LK_EXCLUSIVE | LK_RETRY);
+ }
+
+ lcn = *cnp;
+ /* always op is LOOKUP */
+ lcn.cn_nameiop = LOOKUP;
+ lcn.cn_flags = cnflags;
+ is_dot = false;
+
+ if (udvp == NULLVP)
+ unionfs_forward_vop_start(ldvp, &lkflags);
+ lerror = VOP_LOOKUP(ldvp, &lvp, &lcn);
+ if (udvp == NULLVP &&
+ unionfs_forward_vop_finish(dvp, ldvp, lkflags)) {
+ if (lvp != NULLVP)
+ VOP_UNLOCK(lvp);
+ error = ENOENT;
+ goto unionfs_lookup_cleanup;
+ }
+
+ if (udvp == NULLVP)
+ cnp->cn_flags = lcn.cn_flags;
+
+ if (lerror == 0) {
+ if (ldvp == lvp) { /* is dot */
+ vrele(lvp);
+ *(ap->a_vpp) = dvp;
+ vref(dvp);
+ is_dot = true;
+ error = lerror;
+ } else if (lvp != NULLVP)
+ VOP_UNLOCK(lvp);
+ }
+
+ if (udvp != NULLVP) {
+ vput(ldvp);
+ vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+ if (VN_IS_DOOMED(dvp))
+ error = ENOENT;
+ }
+ if (is_dot)
+ goto unionfs_lookup_return;
+ else if (error != 0)
+ goto unionfs_lookup_cleanup;
+ }
/*
* lookup upper layer
*/
if (udvp != NULLVP) {
+ bool iswhiteout = false;
+
+ unionfs_forward_vop_start(udvp, &lkflags);
uerror = VOP_LOOKUP(udvp, &uvp, cnp);
+ if (unionfs_forward_vop_finish(dvp, udvp, lkflags)) {
+ if (uvp != NULLVP)
+ VOP_UNLOCK(uvp);
+ error = ENOENT;
+ goto unionfs_lookup_cleanup;
+ }
if (uerror == 0) {
if (udvp == uvp) { /* is dot */
+ if (lvp != NULLVP)
+ vrele(lvp);
vrele(uvp);
*(ap->a_vpp) = dvp;
vref(dvp);
error = uerror;
goto unionfs_lookup_return;
- }
- if (nameiop == DELETE || nameiop == RENAME ||
- (cnp->cn_lkflags & LK_TYPE_MASK))
+ } else if (uvp != NULLVP)
VOP_UNLOCK(uvp);
}
/* check whiteout */
- if (uerror == ENOENT || uerror == EJUSTRETURN)
- if (cnp->cn_flags & ISWHITEOUT)
- iswhiteout = 1; /* don't lookup lower */
- if (iswhiteout == 0 && ldvp != NULLVP)
- if (!VOP_GETATTR(udvp, &va, cnp->cn_cred) &&
- (va.va_flags & OPAQUE))
- iswhiteout = 1; /* don't lookup lower */
+ if ((uerror == ENOENT || uerror == EJUSTRETURN) &&
+ (cnp->cn_flags & ISWHITEOUT))
+ iswhiteout = true;
+ else if (VOP_GETATTR(udvp, &va, cnp->cn_cred) == 0 &&
+ (va.va_flags & OPAQUE))
+ iswhiteout = true;
+
+ if (iswhiteout && lvp != NULLVP) {
+ vrele(lvp);
+ lvp = NULLVP;
+ }
+
#if 0
UNIONFS_INTERNAL_DEBUG(
"unionfs_lookup: debug: whiteout=%d, path=%s\n",
@@ -208,39 +299,6 @@
#endif
}
- /*
- * lookup lower layer
- */
- if (ldvp != NULLVP && !(cnflags & DOWHITEOUT) && iswhiteout == 0) {
- /* always op is LOOKUP */
- cnp->cn_nameiop = LOOKUP;
- cnflagsbk = cnp->cn_flags;
- cnp->cn_flags = cnflags;
-
- lerror = VOP_LOOKUP(ldvp, &lvp, cnp);
-
- cnp->cn_nameiop = nameiop;
- if (udvp != NULLVP && (uerror == 0 || uerror == EJUSTRETURN))
- cnp->cn_flags = cnflagsbk;
-
- if (lerror == 0) {
- if (ldvp == lvp) { /* is dot */
- if (uvp != NULLVP)
- vrele(uvp); /* no need? */
- vrele(lvp);
- *(ap->a_vpp) = dvp;
- vref(dvp);
-
- UNIONFS_INTERNAL_DEBUG(
- "unionfs_lookup: leave (%d)\n", lerror);
-
- return (lerror);
- }
- if (cnp->cn_lkflags & LK_TYPE_MASK)
- VOP_UNLOCK(lvp);
- }
- }
-
/*
* check lookup result
*/
@@ -280,8 +338,7 @@
if (unp == NULL)
error = ENOENT;
else
- error = unionfs_mkshadowdir(MOUNTTOUNIONFSMOUNT(dvp->v_mount),
- udvp, unp, cnp, td);
+ error = unionfs_mkshadowdir(dvp, vp, cnp, td);
if (lockflag != 0)
VOP_UNLOCK(vp);
if (error != 0) {
@@ -293,6 +350,10 @@
vrele(vp);
goto unionfs_lookup_cleanup;
}
+ /*
+ * TODO: Since unionfs_mkshadowdir() relocks udvp after
+ * creating the new directory, return ERELOOKUP here?
+ */
if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_SHARED)
vn_lock(vp, LK_SHARED | LK_RETRY);
}
@@ -313,9 +374,12 @@
"unionfs_lookup: Unable to create unionfs vnode.");
goto unionfs_lookup_cleanup;
}
- if ((nameiop == DELETE || nameiop == RENAME) &&
- (cnp->cn_lkflags & LK_TYPE_MASK) == 0)
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ }
+
+ if (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(vp)) {
+ error = ENOENT;
+ vput(vp);
+ goto unionfs_lookup_cleanup;
}
*(ap->a_vpp) = vp;
@@ -329,10 +393,12 @@
if (lvp != NULLVP)
vrele(lvp);
- if (error == ENOENT && (cnflags & MAKEENTRY) != 0)
+ if (error == ENOENT && (cnflags & MAKEENTRY) != 0 &&
+ !VN_IS_DOOMED(dvp))
cache_enter(dvp, NULLVP, cnp);
unionfs_lookup_return:
+ unionfs_clear_in_progress_flag(dvp, UNIONFS_LOOKUP_IN_PROGRESS);
UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", error);
@@ -492,6 +558,61 @@
vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
}
+/*
+ * Exchange the default (upper vnode) lock on a unionfs vnode for the lower
+ * vnode lock, in support of operations that require access to the lower vnode
+ * even when an upper vnode is present. We don't use vn_lock_pair() to hold
+ * both vnodes at the same time, primarily because the caller may proceed
+ * to issue VOPs to the lower layer which re-lock or perform other operations
+ * which may not be safe in the presence of a locked vnode from another FS.
+ * Moreover, vn_lock_pair()'s deadlock resolution approach can introduce
+ * additional overhead that isn't necessary on these paths.
+ *
+ * vp must be a locked unionfs vnode; the lock state of this vnode is
+ * returned through *lkflags for later use in unionfs_unlock_lvp().
+ *
+ * Returns the locked lower vnode, or NULL if the lower vnode (and therefore
+ * also the unionfs vnode above it) has been doomed.
+ */
+static struct vnode *
+unionfs_lock_lvp(struct vnode *vp, int *lkflags)
+{
+ struct unionfs_node *unp;
+ struct vnode *lvp;
+
+ unp = VTOUNIONFS(vp);
+ lvp = unp->un_lowervp;
+ ASSERT_VOP_LOCKED(vp, __func__);
+ ASSERT_VOP_UNLOCKED(lvp, __func__);
+ *lkflags = VOP_ISLOCKED(vp);
+ vref(lvp);
+ VOP_UNLOCK(vp);
+ vn_lock(lvp, *lkflags | LK_RETRY);
+ if (VN_IS_DOOMED(lvp)) {
+ vput(lvp);
+ lvp = NULLVP;
+ vn_lock(vp, *lkflags | LK_RETRY);
+ }
+ return (lvp);
+}
+
+/*
+ * Undo a previous call to unionfs_lock_lvp(), restoring the default lock
+ * on the unionfs vnode. This function reloads and returns the vnode
+ * private data for the unionfs vnode, which will be NULL if the unionfs
+ * vnode became doomed while its lock was dropped. The caller must check
+ * for this case.
+ */
+static struct unionfs_node *
+unionfs_unlock_lvp(struct vnode *vp, struct vnode *lvp, int lkflags)
+{
+ ASSERT_VOP_LOCKED(lvp, __func__);
+ ASSERT_VOP_UNLOCKED(vp, __func__);
+ vput(lvp);
+ vn_lock(vp, lkflags | LK_RETRY);
+ return (VTOUNIONFS(vp));
+}
+
static int
unionfs_open(struct vop_open_args *ap)
{
@@ -504,7 +625,9 @@
struct ucred *cred;
struct thread *td;
int error;
+ int lkflags;
enum unionfs_lkupgrade lkstatus;
+ bool lock_lvp, open_lvp;
UNIONFS_INTERNAL_DEBUG("unionfs_open: enter\n");
@@ -515,6 +638,7 @@
targetvp = NULLVP;
cred = ap->a_cred;
td = ap->a_td;
+ open_lvp = lock_lvp = false;
/*
* The executable loader path may call this function with vp locked
@@ -546,10 +670,12 @@
if (targetvp == NULLVP) {
if (uvp == NULLVP) {
if ((ap->a_mode & FWRITE) && lvp->v_type == VREG) {
- error = unionfs_copyfile(unp,
+ error = unionfs_copyfile(vp,
!(ap->a_mode & O_TRUNC), cred, td);
- if (error != 0)
+ if (error != 0) {
+ unp = VTOUNIONFS(vp);
goto unionfs_open_abort;
+ }
targetvp = uvp = unp->un_uppervp;
} else
targetvp = lvp;
@@ -557,30 +683,69 @@
targetvp = uvp;
}
+ if (targetvp == uvp && uvp->v_type == VDIR && lvp != NULLVP &&
+ unsp->uns_lower_opencnt <= 0)
+ open_lvp = true;
+ else if (targetvp == lvp && uvp != NULLVP)
+ lock_lvp = true;
+
+ if (lock_lvp) {
+ unp = NULL;
+ lvp = unionfs_lock_lvp(vp, &lkflags);
+ if (lvp == NULLVP) {
+ error = ENOENT;
+ goto unionfs_open_abort;
+ }
+ } else
+ unionfs_forward_vop_start(targetvp, &lkflags);
+
error = VOP_OPEN(targetvp, ap->a_mode, cred, td, ap->a_fp);
- if (error == 0) {
- if (targetvp == uvp) {
- if (uvp->v_type == VDIR && lvp != NULLVP &&
- unsp->uns_lower_opencnt <= 0) {
- /* open lower for readdir */
- error = VOP_OPEN(lvp, FREAD, cred, td, NULL);
- if (error != 0) {
- VOP_CLOSE(uvp, ap->a_mode, cred, td);
- goto unionfs_open_abort;
- }
- unsp->uns_node_flag |= UNS_OPENL_4_READDIR;
- unsp->uns_lower_opencnt++;
+
+ if (lock_lvp) {
+ unp = unionfs_unlock_lvp(vp, lvp, lkflags);
+ if (unp == NULL && error == 0)
+ error = ENOENT;
+ } else if (unionfs_forward_vop_finish(vp, targetvp, lkflags))
+ error = error ? error : ENOENT;
+
+ if (error != 0)
+ goto unionfs_open_abort;
+
+ if (targetvp == uvp) {
+ if (open_lvp) {
+ unp = NULL;
+ lvp = unionfs_lock_lvp(vp, &lkflags);
+ if (lvp == NULLVP) {
+ error = ENOENT;
+ goto unionfs_open_abort;
}
- unsp->uns_upper_opencnt++;
- } else {
+ /* open lower for readdir */
+ error = VOP_OPEN(lvp, FREAD, cred, td, NULL);
+ unp = unionfs_unlock_lvp(vp, lvp, lkflags);
+ if (unp == NULL) {
+ error = error ? error : ENOENT;
+ goto unionfs_open_abort;
+ }
+ if (error != 0) {
+ unionfs_forward_vop_start(uvp, &lkflags);
+ VOP_CLOSE(uvp, ap->a_mode, cred, td);
+ if (unionfs_forward_vop_finish(vp, uvp, lkflags))
+ unp = NULL;
+ goto unionfs_open_abort;
+ }
+ unsp->uns_node_flag |= UNS_OPENL_4_READDIR;
unsp->uns_lower_opencnt++;
- unsp->uns_lower_openmode = ap->a_mode;
}
- vp->v_object = targetvp->v_object;
+ unsp->uns_upper_opencnt++;
+ } else {
+ unsp->uns_lower_opencnt++;
+ unsp->uns_lower_openmode = ap->a_mode;
}
+ vp->v_object = targetvp->v_object;
unionfs_open_abort:
- if (error != 0)
+
+ if (error != 0 && unp != NULL)
unionfs_tryrem_node_status(unp, unsp);
unionfs_open_cleanup:
@@ -599,9 +764,13 @@
struct ucred *cred;
struct thread *td;
struct vnode *vp;
+ struct vnode *uvp;
+ struct vnode *lvp;
struct vnode *ovp;
int error;
+ int lkflags;
enum unionfs_lkupgrade lkstatus;
+ bool lock_lvp;
UNIONFS_INTERNAL_DEBUG("unionfs_close: enter\n");
@@ -611,6 +780,7 @@
cred = ap->a_cred;
td = ap->a_td;
error = 0;
+ lock_lvp = false;
/*
* If the vnode is reclaimed while upgrading, we can't safely use unp
@@ -621,44 +791,72 @@
goto unionfs_close_cleanup;
unp = VTOUNIONFS(vp);
+ lvp = unp->un_lowervp;
+ uvp = unp->un_uppervp;
unionfs_get_node_status(unp, td, &unsp);
if (unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0) {
-#ifdef DIAGNOSTIC
- printf("unionfs_close: warning: open count is 0\n");
-#endif
- if (unp->un_uppervp != NULLVP)
- ovp = unp->un_uppervp;
+ if (uvp != NULLVP)
+ ovp = uvp;
else
- ovp = unp->un_lowervp;
+ ovp = lvp;
} else if (unsp->uns_upper_opencnt > 0)
- ovp = unp->un_uppervp;
+ ovp = uvp;
else
- ovp = unp->un_lowervp;
+ ovp = lvp;
+
+ if (ovp == lvp && uvp != NULLVP) {
+ lock_lvp = true;
+ unp = NULL;
+ lvp = unionfs_lock_lvp(vp, &lkflags);
+ if (lvp == NULLVP) {
+ error = ENOENT;
+ goto unionfs_close_abort;
+ }
+ } else
+ unionfs_forward_vop_start(ovp, &lkflags);
error = VOP_CLOSE(ovp, ap->a_fflag, cred, td);
+ if (lock_lvp) {
+ unp = unionfs_unlock_lvp(vp, lvp, lkflags);
+ if (unp == NULL && error == 0)
+ error = ENOENT;
+ } else if (unionfs_forward_vop_finish(vp, ovp, lkflags))
+ error = error ? error : ENOENT;
+
if (error != 0)
goto unionfs_close_abort;
vp->v_object = ovp->v_object;
- if (ovp == unp->un_uppervp) {
- unsp->uns_upper_opencnt--;
- if (unsp->uns_upper_opencnt == 0) {
+ if (ovp == uvp) {
+ if (unsp != NULL && ((--unsp->uns_upper_opencnt) == 0)) {
if (unsp->uns_node_flag & UNS_OPENL_4_READDIR) {
- VOP_CLOSE(unp->un_lowervp, FREAD, cred, td);
+ unp = NULL;
+ lvp = unionfs_lock_lvp(vp, &lkflags);
+ if (lvp == NULLVP) {
+ error = ENOENT;
+ goto unionfs_close_abort;
+ }
+ VOP_CLOSE(lvp, FREAD, cred, td);
+ unp = unionfs_unlock_lvp(vp, lvp, lkflags);
+ if (unp == NULL) {
+ error = ENOENT;
+ goto unionfs_close_abort;
+ }
unsp->uns_node_flag &= ~UNS_OPENL_4_READDIR;
unsp->uns_lower_opencnt--;
}
if (unsp->uns_lower_opencnt > 0)
- vp->v_object = unp->un_lowervp->v_object;
+ vp->v_object = lvp->v_object;
}
- } else
+ } else if (unsp != NULL)
unsp->uns_lower_opencnt--;
unionfs_close_abort:
- unionfs_tryrem_node_status(unp, unsp);
+ if (unp != NULL && unsp != NULL)
+ unionfs_tryrem_node_status(unp, unsp);
unionfs_close_cleanup:
unionfs_downgrade_lock(vp, lkstatus);
@@ -883,7 +1081,7 @@
return (EROFS);
if (uvp == NULLVP && lvp->v_type == VREG) {
- error = unionfs_copyfile(unp, (vap->va_size != 0),
+ error = unionfs_copyfile(ap->a_vp, (vap->va_size != 0),
ap->a_cred, td);
if (error != 0)
return (error);
@@ -1078,8 +1276,10 @@
error = VOP_REMOVE(udvp, uvp, cnp);
unionfs_forward_vop_finish_pair(ap->a_dvp, udvp, udvp_lkflags,
ap->a_vp, uvp, uvp_lkflags);
- } else if (lvp != NULLVP)
- error = unionfs_mkwhiteout(ap->a_dvp, udvp, cnp, td, path, pathlen);
+ } else if (lvp != NULLVP) {
+ error = unionfs_mkwhiteout(ap->a_dvp, ap->a_vp, cnp, td,
+ path, pathlen);
+ }
UNIONFS_INTERNAL_DEBUG("unionfs_remove: leave (%d)\n", error);
@@ -1096,7 +1296,6 @@
struct componentname *cnp;
struct thread *td;
int error;
- int needrelookup;
UNIONFS_INTERNAL_DEBUG("unionfs_link: enter\n");
@@ -1104,7 +1303,6 @@
KASSERT_UNIONFS_VNODE(ap->a_vp);
error = 0;
- needrelookup = 0;
dunp = VTOUNIONFS(ap->a_tdvp);
unp = NULL;
udvp = dunp->un_uppervp;
@@ -1121,16 +1319,15 @@
if (ap->a_vp->v_type != VREG)
return (EOPNOTSUPP);
- error = unionfs_copyfile(unp, 1, cnp->cn_cred, td);
- if (error != 0)
- return (error);
- needrelookup = 1;
+ VOP_UNLOCK(ap->a_tdvp);
+ error = unionfs_copyfile(ap->a_vp, 1, cnp->cn_cred, td);
+ vn_lock(ap->a_tdvp, LK_EXCLUSIVE | LK_RETRY);
+ if (error == 0)
+ error = ERELOOKUP;
+ return (error);
}
uvp = unp->un_uppervp;
- if (needrelookup != 0)
- error = unionfs_relookup_for_create(ap->a_tdvp, cnp, td);
-
if (error == 0) {
int udvp_lkflags, uvp_lkflags;
unionfs_forward_vop_start_pair(udvp, &udvp_lkflags,
@@ -1154,8 +1351,6 @@
struct vnode *tdvp;
struct vnode *tvp;
struct componentname *tcnp;
- struct vnode *ltdvp;
- struct vnode *ltvp;
struct thread *td;
/* rename target vnodes */
@@ -1164,7 +1359,6 @@
struct vnode *rtdvp;
struct vnode *rtvp;
- struct unionfs_mount *ump;
struct unionfs_node *unp;
int error;
@@ -1177,8 +1371,6 @@
tdvp = ap->a_tdvp;
tvp = ap->a_tvp;
tcnp = ap->a_tcnp;
- ltdvp = NULLVP;
- ltvp = NULLVP;
td = curthread;
rfdvp = fdvp;
rfvp = fvp;
@@ -1238,7 +1430,6 @@
UNIONFS_INTERNAL_DEBUG("fvp=%p, ufvp=%p, lfvp=%p\n",
fvp, unp->un_uppervp, unp->un_lowervp);
#endif
- ump = MOUNTTOUNIONFSMOUNT(fvp->v_mount);
/*
* If we only have a lower vnode, copy the source file to the upper
* FS so that the rename operation can be issued against the upper FS.
@@ -1282,10 +1473,10 @@
else if (unp->un_uppervp == NULLVP) {
switch (fvp->v_type) {
case VREG:
- error = unionfs_copyfile(unp, 1, fcnp->cn_cred, td);
+ error = unionfs_copyfile(fvp, 1, fcnp->cn_cred, td);
break;
case VDIR:
- error = unionfs_mkshadowdir(ump, rfdvp, unp, fcnp, td);
+ error = unionfs_mkshadowdir(fdvp, fvp, fcnp, td);
break;
default:
error = ENODEV;
@@ -1327,7 +1518,6 @@
goto unionfs_rename_abort;
}
rtdvp = unp->un_uppervp;
- ltdvp = unp->un_lowervp;
vref(rtdvp);
if (tvp != NULLVP) {
@@ -1348,7 +1538,6 @@
goto unionfs_rename_abort;
}
rtvp = unp->un_uppervp;
- ltvp = unp->un_lowervp;
vref(rtvp);
}
}
@@ -1365,12 +1554,8 @@
cache_purge(fdvp);
}
- if (ltdvp != NULLVP)
- VOP_UNLOCK(ltdvp);
if (tdvp != rtdvp)
vrele(tdvp);
- if (ltvp != NULLVP)
- VOP_UNLOCK(ltvp);
if (tvp != rtvp && tvp != NULLVP) {
if (rtvp == NULLVP)
vput(tvp);
@@ -1504,43 +1689,55 @@
if (uvp != NULLVP) {
if (lvp != NULLVP) {
+ /*
+ * We need to keep dvp and vp's upper vnodes locked
+ * going into the VOP_RMDIR() call, but the empty
+ * directory check also requires the lower vnode lock.
+ * For this third, cross-filesystem lock we use a
+ * similar approach taken by various FS' VOP_RENAME
+ * implementations (which require 2-4 vnode locks).
+ * First we attempt a NOWAIT acquisition, then if
+ * that fails we drops the other two vnode locks,
+ * acquire lvp's lock in the normal fashion to reduce
+ * the likelihood of spinning on it in the future,
+ * then drop, reacquire the other locks, and return
+ * ERELOOKUP to re-drive the lookup in case the dvp->
+ * vp relationship has changed.
+ */
+ if (vn_lock(lvp, LK_SHARED | LK_NOWAIT) != 0) {
+ VOP_UNLOCK(ap->a_vp);
+ VOP_UNLOCK(ap->a_dvp);
+ vn_lock(lvp, LK_SHARED | LK_RETRY);
+ VOP_UNLOCK(lvp);
+ vn_lock(ap->a_dvp, LK_EXCLUSIVE | LK_RETRY);
+ vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
+ return (ERELOOKUP);
+ }
error = unionfs_check_rmdir(ap->a_vp, cnp->cn_cred, td);
+ /*
+ * It's possible for a direct operation on the lower FS
+ * to make the lower directory non-empty after we drop
+ * the lock, but it's also possible for the upper-layer
+ * VOP_RMDIR to relock udvp/uvp which would lead to
+ * LOR if we kept lvp locked across that call.
+ */
+ VOP_UNLOCK(lvp);
if (error != 0)
return (error);
}
ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount);
if (ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP)
cnp->cn_flags |= DOWHITEOUT;
- /*
- * The relookup path will need to relock the parent dvp and
- * possibly the vp as well. Locking is expected to be done
- * in parent->child order; drop the lock on vp to avoid LOR
- * and potential recursion on vp's lock.
- * vp is expected to remain referenced during VOP_RMDIR(),
- * so vref/vrele should not be necessary here.
- */
- VOP_UNLOCK(ap->a_vp);
- VNPASS(vrefcnt(ap->a_vp) > 0, ap->a_vp);
- error = unionfs_relookup_for_delete(ap->a_dvp, cnp, td);
- vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
- /*
- * VOP_RMDIR is dispatched against udvp, so if uvp became
- * doomed while the lock was dropped above the target
- * filesystem may not be able to cope.
- */
- if (error == 0 && VN_IS_DOOMED(uvp))
- error = ENOENT;
- if (error == 0) {
- int udvp_lkflags, uvp_lkflags;
- unionfs_forward_vop_start_pair(udvp, &udvp_lkflags,
- uvp, &uvp_lkflags);
- error = VOP_RMDIR(udvp, uvp, cnp);
- unionfs_forward_vop_finish_pair(ap->a_dvp, udvp, udvp_lkflags,
- ap->a_vp, uvp, uvp_lkflags);
- }
- } else if (lvp != NULLVP)
- error = unionfs_mkwhiteout(ap->a_dvp, udvp, cnp, td,
+ int udvp_lkflags, uvp_lkflags;
+ unionfs_forward_vop_start_pair(udvp, &udvp_lkflags,
+ uvp, &uvp_lkflags);
+ error = VOP_RMDIR(udvp, uvp, cnp);
+ unionfs_forward_vop_finish_pair(ap->a_dvp, udvp, udvp_lkflags,
+ ap->a_vp, uvp, uvp_lkflags);
+ } else if (lvp != NULLVP) {
+ error = unionfs_mkwhiteout(ap->a_dvp, ap->a_vp, cnp, td,
unp->un_path, unp->un_pathlen);
+ }
if (error == 0) {
cache_purge(ap->a_dvp);
@@ -1613,6 +1810,7 @@
uint64_t *cookies_bk;
int error;
int eofflag;
+ int lkflags;
int ncookies_bk;
int uio_offset_bk;
enum unionfs_lkupgrade lkstatus;
@@ -1668,18 +1866,26 @@
/* upper only */
if (uvp != NULLVP && lvp == NULLVP) {
+ unionfs_forward_vop_start(uvp, &lkflags);
error = VOP_READDIR(uvp, uio, ap->a_cred, ap->a_eofflag,
ap->a_ncookies, ap->a_cookies);
- unsp->uns_readdir_status = 0;
+ if (unionfs_forward_vop_finish(vp, uvp, lkflags))
+ error = error ? error : ENOENT;
+ else
+ unsp->uns_readdir_status = 0;
goto unionfs_readdir_exit;
}
/* lower only */
if (uvp == NULLVP && lvp != NULLVP) {
+ unionfs_forward_vop_start(lvp, &lkflags);
error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag,
ap->a_ncookies, ap->a_cookies);
- unsp->uns_readdir_status = 2;
+ if (unionfs_forward_vop_finish(vp, lvp, lkflags))
+ error = error ? error : ENOENT;
+ else
+ unsp->uns_readdir_status = 2;
goto unionfs_readdir_exit;
}
@@ -1689,14 +1895,17 @@
*/
KASSERT(uvp != NULLVP, ("unionfs_readdir: null upper vp"));
KASSERT(lvp != NULLVP, ("unionfs_readdir: null lower vp"));
+
if (uio->uio_offset == 0)
unsp->uns_readdir_status = 0;
if (unsp->uns_readdir_status == 0) {
/* read upper */
+ unionfs_forward_vop_start(uvp, &lkflags);
error = VOP_READDIR(uvp, uio, ap->a_cred, &eofflag,
ap->a_ncookies, ap->a_cookies);
-
+ if (unionfs_forward_vop_finish(vp, uvp, lkflags) && error == 0)
+ error = ENOENT;
if (error != 0 || eofflag == 0)
goto unionfs_readdir_exit;
unsp->uns_readdir_status = 1;
@@ -1735,14 +1944,22 @@
uio->uio_offset = 0;
}
- if (lvp == NULLVP) {
- error = EBADF;
+ lvp = unionfs_lock_lvp(vp, &lkflags);
+ if (lvp == NULL) {
+ error = ENOENT;
goto unionfs_readdir_exit;
}
+
/* read lower */
error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag,
ap->a_ncookies, ap->a_cookies);
+
+ unp = unionfs_unlock_lvp(vp, lvp, lkflags);
+ if (unp == NULL && error == 0)
+ error = ENOENT;
+
+
/*
* We can't return an uio_offset of 0: this would trigger an
* infinite loop, because the next call to unionfs_readdir would
@@ -1906,97 +2123,50 @@
return (0);
}
-static int
-unionfs_get_llt_revlock(struct vnode *vp, int flags)
-{
- int revlock;
-
- revlock = 0;
-
- switch (flags & LK_TYPE_MASK) {
- case LK_SHARED:
- if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
- revlock = LK_UPGRADE;
- else
- revlock = LK_RELEASE;
- break;
- case LK_EXCLUSIVE:
- case LK_UPGRADE:
- revlock = LK_RELEASE;
- break;
- case LK_DOWNGRADE:
- revlock = LK_UPGRADE;
- break;
- default:
- break;
- }
-
- return (revlock);
-}
-
-/*
- * The state of an acquired lock is adjusted similarly to
- * the time of error generating.
- * flags: LK_RELEASE or LK_UPGRADE
- */
-static void
-unionfs_revlock(struct vnode *vp, int flags)
-{
- if (flags & LK_RELEASE)
- VOP_UNLOCK_FLAGS(vp, flags);
- else {
- /* UPGRADE */
- if (vn_lock(vp, flags) != 0)
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- }
-}
-
static int
unionfs_lock(struct vop_lock1_args *ap)
{
struct unionfs_node *unp;
struct vnode *vp;
- struct vnode *uvp;
- struct vnode *lvp;
+ struct vnode *tvp;
int error;
int flags;
- int revlock;
- int interlock;
- int uhold;
+ bool lvp_locked;
- /*
- * TODO: rework the unionfs locking scheme.
- * It's not guaranteed to be safe to blindly lock two vnodes on
- * different mounts as is done here. Further, the entanglement
- * of locking both vnodes with the various options that can be
- * passed to VOP_LOCK() makes this code hard to reason about.
- * Instead, consider locking only the upper vnode, or the lower
- * vnode is the upper is not present, and taking separate measures
- * to lock both vnodes in the few cases when that is needed.
- */
error = 0;
- interlock = 1;
- uhold = 0;
flags = ap->a_flags;
vp = ap->a_vp;
if (LK_RELEASE == (flags & LK_TYPE_MASK) || !(flags & LK_TYPE_MASK))
return (VOP_UNLOCK_FLAGS(vp, flags | LK_RELEASE));
+unionfs_lock_restart:
+ /*
+ * We currently need the interlock here to ensure we can safely
+ * access the unionfs vnode's private data. We may be able to
+ * eliminate this extra locking by instead using vfs_smr_enter()
+ * and vn_load_v_data_smr() here in conjunction with an SMR UMA
+ * zone for unionfs nodes.
+ */
if ((flags & LK_INTERLOCK) == 0)
VI_LOCK(vp);
+ else
+ flags &= ~LK_INTERLOCK;
unp = VTOUNIONFS(vp);
- if (unp == NULL)
- goto unionfs_lock_null_vnode;
-
- KASSERT_UNIONFS_VNODE(ap->a_vp);
-
- lvp = unp->un_lowervp;
- uvp = unp->un_uppervp;
+ if (unp == NULL) {
+ VI_UNLOCK(vp);
+ ap->a_flags = flags;
+ return (vop_stdlock(ap));
+ }
- if ((revlock = unionfs_get_llt_revlock(vp, flags)) == 0)
- panic("unknown lock type: 0x%x", flags & LK_TYPE_MASK);
+ if (unp->un_uppervp != NULL) {
+ tvp = unp->un_uppervp;
+ lvp_locked = false;
+ } else {
+ tvp = unp->un_lowervp;
+ lvp_locked = true;
+ }
/*
* During unmount, the root vnode lock may be taken recursively,
@@ -2009,150 +2179,77 @@
(vp->v_vflag & VV_ROOT) != 0)
flags |= LK_CANRECURSE;
- if (lvp != NULLVP) {
- if (uvp != NULLVP && flags & LK_UPGRADE) {
+ vholdnz(tvp);
+ VI_UNLOCK(vp);
+ error = VOP_LOCK(tvp, flags);
+ vdrop(tvp);
+ if (error == 0 && (lvp_locked || VTOUNIONFS(vp) == NULL)) {
+ /*
+ * After dropping the interlock above, there exists a window
+ * in which another thread may acquire the lower vnode lock
+ * and then either doom the unionfs vnode or create an upper
+ * vnode. In either case, we will effectively be holding the
+ * wrong lock, so we must drop the lower vnode lock and
+ * restart the lock operation.
+ *
+ * If unp is not already NULL, we assume that we can safely
+ * access it because we currently hold lvp's lock.
+ * unionfs_noderem() acquires lvp's lock before freeing
+ * the vnode private data, ensuring it can't be concurrently
+ * freed while we are using it here. Likewise,
+ * unionfs_node_update() acquires lvp's lock before installing
+ * an upper vnode. Without those guarantees, we would need to
+ * reacquire the vnode interlock here.
+ * Note that unionfs_noderem() doesn't acquire lvp's lock if
+ * this is the root vnode, but the root vnode should always
+ * have an upper vnode and therefore we should never use its
+ * lower vnode lock here.
+ */
+ unp = VTOUNIONFS(vp);
+ if (unp == NULL || unp->un_uppervp != NULLVP) {
+ VOP_UNLOCK(tvp);
/*
- * Share Lock is once released and a deadlock is
- * avoided.
+ * If we previously held the lock, the upgrade may
+ * have temporarily dropped the lock, in which case
+ * concurrent dooming or copy-up will necessitate
+ * acquiring a different lock. Since we never held
+ * the new lock, LK_UPGRADE must be cleared here to
+ * avoid triggering a lockmgr panic.
*/
- vholdnz(uvp);
- uhold = 1;
- VOP_UNLOCK(uvp);
- }
- VI_LOCK_FLAGS(lvp, MTX_DUPOK);
- flags |= LK_INTERLOCK;
- vholdl(lvp);
-
- VI_UNLOCK(vp);
- ap->a_flags &= ~LK_INTERLOCK;
-
- error = VOP_LOCK(lvp, flags);
-
- VI_LOCK(vp);
- unp = VTOUNIONFS(vp);
- if (unp == NULL) {
- /* vnode is released. */
- VI_UNLOCK(vp);
- if (error == 0)
- VOP_UNLOCK(lvp);
- vdrop(lvp);
- if (uhold != 0)
- vdrop(uvp);
- goto unionfs_lock_fallback;
+ if (flags & LK_UPGRADE)
+ flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
+ VNASSERT((flags & LK_DOWNGRADE) == 0, vp,
+ ("%s: vnode doomed during downgrade", __func__));
+ goto unionfs_lock_restart;
}
}
- if (error == 0 && uvp != NULLVP) {
- if (uhold && flags & LK_UPGRADE) {
- flags &= ~LK_TYPE_MASK;
- flags |= LK_EXCLUSIVE;
- }
- VI_LOCK_FLAGS(uvp, MTX_DUPOK);
- flags |= LK_INTERLOCK;
- if (uhold == 0) {
- vholdl(uvp);
- uhold = 1;
- }
-
- VI_UNLOCK(vp);
- ap->a_flags &= ~LK_INTERLOCK;
-
- error = VOP_LOCK(uvp, flags);
-
- VI_LOCK(vp);
- unp = VTOUNIONFS(vp);
- if (unp == NULL) {
- /* vnode is released. */
- VI_UNLOCK(vp);
- if (error == 0)
- VOP_UNLOCK(uvp);
- vdrop(uvp);
- if (lvp != NULLVP) {
- VOP_UNLOCK(lvp);
- vdrop(lvp);
- }
- goto unionfs_lock_fallback;
- }
- if (error != 0 && lvp != NULLVP) {
- /* rollback */
- VI_UNLOCK(vp);
- unionfs_revlock(lvp, revlock);
- interlock = 0;
- }
- }
-
- if (interlock)
- VI_UNLOCK(vp);
- if (lvp != NULLVP)
- vdrop(lvp);
- if (uhold != 0)
- vdrop(uvp);
-
return (error);
-
-unionfs_lock_null_vnode:
- ap->a_flags |= LK_INTERLOCK;
- return (vop_stdlock(ap));
-
-unionfs_lock_fallback:
- /*
- * If we reach this point, we've discovered the unionfs vnode
- * has been reclaimed while the upper/lower vnode locks were
- * temporarily dropped. Such temporary droppage may happen
- * during the course of an LK_UPGRADE operation itself, and in
- * that case LK_UPGRADE must be cleared as the unionfs vnode's
- * lock has been reset to point to the standard v_lock field,
- * which has not previously been held.
- */
- if (flags & LK_UPGRADE) {
- ap->a_flags &= ~LK_TYPE_MASK;
- ap->a_flags |= LK_EXCLUSIVE;
- }
- return (vop_stdlock(ap));
}
static int
unionfs_unlock(struct vop_unlock_args *ap)
{
struct vnode *vp;
- struct vnode *lvp;
- struct vnode *uvp;
+ struct vnode *tvp;
struct unionfs_node *unp;
int error;
- int uhold;
KASSERT_UNIONFS_VNODE(ap->a_vp);
- error = 0;
- uhold = 0;
vp = ap->a_vp;
unp = VTOUNIONFS(vp);
if (unp == NULL)
- goto unionfs_unlock_null_vnode;
- lvp = unp->un_lowervp;
- uvp = unp->un_uppervp;
-
- if (lvp != NULLVP) {
- vholdnz(lvp);
- error = VOP_UNLOCK(lvp);
- }
-
- if (error == 0 && uvp != NULLVP) {
- vholdnz(uvp);
- uhold = 1;
- error = VOP_UNLOCK(uvp);
- }
+ return (vop_stdunlock(ap));
- if (lvp != NULLVP)
- vdrop(lvp);
- if (uhold != 0)
- vdrop(uvp);
+ tvp = (unp->un_uppervp != NULL ? unp->un_uppervp : unp->un_lowervp);
- return error;
+ vholdnz(tvp);
+ error = VOP_UNLOCK(tvp);
+ vdrop(tvp);
-unionfs_unlock_null_vnode:
- return (vop_stdunlock(ap));
+ return (error);
}
static int
@@ -2192,7 +2289,7 @@
uvp = unp->un_uppervp;
if (uvp == NULLVP) {
- error = unionfs_copyfile(unp, 1, td->td_ucred, td);
+ error = unionfs_copyfile(ap->a_vp, 1, td->td_ucred, td);
if (error != 0)
goto unionfs_advlock_abort;
uvp = unp->un_uppervp;
@@ -2294,7 +2391,7 @@
return (EROFS);
if (uvp == NULLVP && lvp->v_type == VREG) {
- if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0)
+ if ((error = unionfs_copyfile(ap->a_vp, 1, ap->a_cred, td)) != 0)
return (error);
uvp = unp->un_uppervp;
}
@@ -2467,9 +2564,10 @@
if (ovp == lvp && lvp->v_type == VREG) {
VOP_CLOSEEXTATTR(lvp, 0, cred, td);
if (uvp == NULLVP &&
- (error = unionfs_copyfile(unp, 1, cred, td)) != 0) {
+ (error = unionfs_copyfile(ap->a_vp, 1, cred, td)) != 0) {
unionfs_setextattr_reopen:
- if ((unp->un_flag & UNIONFS_OPENEXTL) &&
+ unp = VTOUNIONFS(ap->a_vp);
+ if (unp != NULL && (unp->un_flag & UNIONFS_OPENEXTL) &&
VOP_OPENEXTATTR(lvp, cred, td)) {
#ifdef DIAGNOSTIC
panic("unionfs: VOP_OPENEXTATTR failed");
@@ -2561,9 +2659,10 @@
if (ovp == lvp && lvp->v_type == VREG) {
VOP_CLOSEEXTATTR(lvp, 0, cred, td);
if (uvp == NULLVP &&
- (error = unionfs_copyfile(unp, 1, cred, td)) != 0) {
+ (error = unionfs_copyfile(ap->a_vp, 1, cred, td)) != 0) {
unionfs_deleteextattr_reopen:
- if ((unp->un_flag & UNIONFS_OPENEXTL) &&
+ unp = VTOUNIONFS(ap->a_vp);
+ if (unp != NULL && (unp->un_flag & UNIONFS_OPENEXTL) &&
VOP_OPENEXTATTR(lvp, cred, td)) {
#ifdef DIAGNOSTIC
panic("unionfs: VOP_OPENEXTATTR failed");
@@ -2613,7 +2712,7 @@
return (EROFS);
if (uvp == NULLVP && lvp->v_type == VREG) {
- if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0)
+ if ((error = unionfs_copyfile(ap->a_vp, 1, ap->a_cred, td)) != 0)
return (error);
uvp = unp->un_uppervp;
}
@@ -2665,7 +2764,7 @@
unionfs_vput_pair(struct vop_vput_pair_args *ap)
{
struct mount *mp;
- struct vnode *dvp, *vp, **vpp, *lvp, *ldvp, *uvp, *udvp, *tempvp;
+ struct vnode *dvp, *vp, **vpp, *lvp, *uvp, *tvp, *tdvp, *tempvp;
struct unionfs_node *dunp, *unp;
int error, res;
@@ -2674,11 +2773,14 @@
vp = NULLVP;
lvp = NULLVP;
uvp = NULLVP;
+ tvp = NULLVP;
unp = NULL;
dunp = VTOUNIONFS(dvp);
- udvp = dunp->un_uppervp;
- ldvp = dunp->un_lowervp;
+ if (dunp->un_uppervp != NULL)
+ tdvp = dunp->un_uppervp;
+ else
+ tdvp = dunp->un_lowervp;
/*
* Underlying vnodes should be locked because the encompassing unionfs
@@ -2686,10 +2788,7 @@
* only be on the unionfs node. Reference them now so that the vput()s
* performed by VOP_VPUT_PAIR() will have a reference to drop.
*/
- if (udvp != NULLVP)
- vref(udvp);
- if (ldvp != NULLVP)
- vref(ldvp);
+ vref(tdvp);
if (vpp != NULL)
vp = *vpp;
@@ -2699,9 +2798,10 @@
uvp = unp->un_uppervp;
lvp = unp->un_lowervp;
if (uvp != NULLVP)
- vref(uvp);
- if (lvp != NULLVP)
- vref(lvp);
+ tvp = uvp;
+ else
+ tvp = lvp;
+ vref(tvp);
/*
* If we're being asked to return a locked child vnode, then
@@ -2721,31 +2821,19 @@
}
}
- /*
- * TODO: Because unionfs_lock() locks both the lower and upper vnodes
- * (if available), we must also call VOP_VPUT_PAIR() on both the lower
- * and upper parent/child pairs. If unionfs_lock() is reworked to lock
- * only a single vnode, this code will need to change to also only
- * operate on one vnode pair.
- */
- ASSERT_VOP_LOCKED(ldvp, __func__);
- ASSERT_VOP_LOCKED(udvp, __func__);
- ASSERT_VOP_LOCKED(lvp, __func__);
- ASSERT_VOP_LOCKED(uvp, __func__);
-
- KASSERT(lvp == NULLVP || ldvp != NULLVP,
- ("%s: NULL ldvp with non-NULL lvp", __func__));
- if (ldvp != NULLVP)
- res = VOP_VPUT_PAIR(ldvp, lvp != NULLVP ? &lvp : NULL, true);
- KASSERT(uvp == NULLVP || udvp != NULLVP,
- ("%s: NULL udvp with non-NULL uvp", __func__));
- if (udvp != NULLVP)
- res = VOP_VPUT_PAIR(udvp, uvp != NULLVP ? &uvp : NULL, true);
-
- ASSERT_VOP_UNLOCKED(ldvp, __func__);
- ASSERT_VOP_UNLOCKED(udvp, __func__);
- ASSERT_VOP_UNLOCKED(lvp, __func__);
- ASSERT_VOP_UNLOCKED(uvp, __func__);
+ ASSERT_VOP_LOCKED(tdvp, __func__);
+ ASSERT_VOP_LOCKED(tvp, __func__);
+
+ if (tdvp == dunp->un_uppervp && tvp != NULLVP && tvp == lvp) {
+ vput(tvp);
+ vput(tdvp);
+ res = 0;
+ } else {
+ res = VOP_VPUT_PAIR(tdvp, tvp != NULLVP ? &tvp : NULL, true);
+ }
+
+ ASSERT_VOP_UNLOCKED(tdvp, __func__);
+ ASSERT_VOP_UNLOCKED(tvp, __func__);
/*
* VOP_VPUT_PAIR() dropped the references we added to the underlying
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sat, Jan 25, 5:56 AM (20 h, 10 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
16127209
Default Alt Text
D45398.diff (72 KB)
Attached To
Mode
D45398: unionfs: rework locking scheme to only lock a single vnode
Attached
Detach File
Event Timeline
Log In to Comment