Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F106992089
D41787.id128185.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
29 KB
Referenced Files
None
Subscribers
None
D41787.id128185.diff
View Options
diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c
--- a/sys/kern/kern_rangelock.c
+++ b/sys/kern/kern_rangelock.c
@@ -2,7 +2,11 @@
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2009 Konstantin Belousov <kib@FreeBSD.org>
- * All rights reserved.
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -26,306 +30,691 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
#include <sys/param.h>
+#include <sys/kassert.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rangelock.h>
-#include <sys/systm.h>
+#include <sys/sleepqueue.h>
+#include <sys/smr.h>
+#include <sys/sysctl.h>
#include <vm/uma.h>
+/*
+ * Immediately after initialization, and until a first conflicting
+ * pair of range locking requests is noted, range locks are served in
+ * the 'cheating' mode. There, rangelock becomes effectively sxlock,
+ * with each request covering whole range of the lock interval.
+ * Either incoming locking request can be granted immediately without
+ * blocking, or when a conflict is detected, current ranges are
+ * drained, and the lock is switched to full range-obeying lock.
+ *
+ * Normal sx implementation is not used to not bloat structures (most
+ * important, vnodes) which embeds rangelocks.
+ *
+ * The cheating greatly helps very common pattern where file is first
+ * written single-threaded, and then read by many processes.
+ *
+ * Lock is in cheat mode when RL_CHEAT_CHEATING bit is set in the
+ * lock->head. Special cookies are returned in this mode, and
+ * trylocks are same as normal locks but do not drain.
+ */
+
+static int rangelock_cheat = 1;
+SYSCTL_INT(_debug, OID_AUTO, rangelock_cheat, CTLFLAG_RWTUN,
+ &rangelock_cheat, 0,
+ "");
+
+#define RL_CHEAT_MASK 0x7
+#define RL_CHEAT_CHEATING 0x1
+/* #define RL_CHEAT_RLOCKED 0x0 */
+#define RL_CHEAT_WLOCKED 0x2
+#define RL_CHEAT_DRAINING 0x4
+
+#define RL_CHEAT_READER 0x8
+
+#define RL_RET_CHEAT_RLOCKED 0x1100
+#define RL_RET_CHEAT_WLOCKED 0x2200
+
+static bool
+rangelock_cheat_lock(struct rangelock *lock, int locktype, bool trylock,
+ void **cookie)
+{
+ uintptr_t v, x;
+
+ v = atomic_load_ptr(&lock->head);
+ if ((v & RL_CHEAT_CHEATING) == 0)
+ return (false);
+ if ((v & RL_CHEAT_DRAINING) != 0) {
+drain:
+ if (trylock) {
+ *cookie = NULL;
+ return (true);
+ }
+ sleepq_lock(&lock->head);
+drain1:
+ DROP_GIANT();
+ for (;;) {
+ v = atomic_load_ptr(&lock->head);
+ if ((v & RL_CHEAT_DRAINING) == 0)
+ break;
+ sleepq_add(&lock->head, NULL, "ranged1", 0, 0);
+ sleepq_wait(&lock->head, PRI_USER);
+ sleepq_lock(&lock->head);
+ }
+ sleepq_release(&lock->head);
+ PICKUP_GIANT();
+ return (false);
+ }
+
+ switch (locktype) {
+ case RL_LOCK_READ:
+ for (;;) {
+ if ((v & RL_CHEAT_WLOCKED) != 0) {
+ if (trylock) {
+ *cookie = NULL;
+ return (true);
+ }
+ x = v | RL_CHEAT_DRAINING;
+ sleepq_lock(&lock->head);
+ if (atomic_fcmpset_rel_ptr(&lock->head, &v,
+ x) != 0)
+ goto drain1;
+ sleepq_release(&lock->head);
+ /* Possibly forgive passed conflict */
+ continue;
+ }
+ x = (v & ~RL_CHEAT_MASK) + RL_CHEAT_READER;
+ x |= RL_CHEAT_CHEATING;
+ if (atomic_fcmpset_acq_ptr(&lock->head, &v, x) != 0)
+ break;
+ if ((v & RL_CHEAT_CHEATING) == 0)
+ return (false);
+ if ((v & RL_CHEAT_DRAINING) != 0)
+ goto drain;
+ }
+ *(uintptr_t *)cookie = RL_RET_CHEAT_RLOCKED;
+ break;
+ case RL_LOCK_WRITE:
+ for (;;) {
+ if ((v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER ||
+ (v & RL_CHEAT_WLOCKED) != 0) {
+ if (trylock) {
+ *cookie = NULL;
+ return (true);
+ }
+ x = v | RL_CHEAT_DRAINING;
+ sleepq_lock(&lock->head);
+ if (atomic_fcmpset_rel_ptr(&lock->head, &v,
+ x) != 0)
+ goto drain1;
+ sleepq_release(&lock->head);
+ /* Possibly forgive passed conflict */
+ continue;
+ }
+ x = RL_CHEAT_WLOCKED | RL_CHEAT_CHEATING;
+ if (atomic_fcmpset_acq_ptr(&lock->head, &v, x) != 0)
+ break;
+ if ((v & RL_CHEAT_CHEATING) == 0)
+ return (false);
+ if ((v & RL_CHEAT_DRAINING) != 0)
+ goto drain;
+ }
+ *(uintptr_t *)cookie = RL_RET_CHEAT_WLOCKED;
+ break;
+ default:
+ __assert_unreachable();
+ break;
+ }
+ return (true);
+}
+
+static bool
+rangelock_cheat_unlock(struct rangelock *lock, void *cookie)
+{
+ uintptr_t v, x;
+
+ v = atomic_load_ptr(&lock->head);
+ if ((v & RL_CHEAT_CHEATING) == 0)
+ return (false);
+
+ MPASS((uintptr_t)cookie == RL_RET_CHEAT_WLOCKED ||
+ (uintptr_t)cookie == RL_RET_CHEAT_RLOCKED);
+
+ switch ((uintptr_t)cookie) {
+ case RL_RET_CHEAT_RLOCKED:
+ for (;;) {
+ MPASS((v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER);
+ MPASS((v & RL_CHEAT_WLOCKED) == 0);
+ x = (v & ~RL_CHEAT_MASK) - RL_CHEAT_READER;
+ if ((v & RL_CHEAT_DRAINING) != 0) {
+ if (x != 0) {
+ x |= RL_CHEAT_DRAINING |
+ RL_CHEAT_CHEATING;
+ }
+ sleepq_lock(&lock->head);
+ if (atomic_fcmpset_rel_ptr(&lock->head, &v,
+ x) != 0) {
+ if (x == 0) {
+ sleepq_broadcast(&lock->head,
+ SLEEPQ_SLEEP, 0, 0);
+ }
+ sleepq_release(&lock->head);
+ break;
+ } else {
+ sleepq_release(&lock->head);
+ }
+ } else {
+ x |= RL_CHEAT_CHEATING;
+ if (atomic_fcmpset_rel_ptr(&lock->head, &v,
+ x) != 0)
+ break;
+ }
+ }
+ break;
+ case RL_RET_CHEAT_WLOCKED:
+ for (;;) {
+ MPASS((v & RL_CHEAT_WLOCKED) != 0);
+ if ((v & RL_CHEAT_DRAINING) != 0) {
+ sleepq_lock(&lock->head);
+ atomic_store_ptr(&lock->head, 0);
+ sleepq_broadcast(&lock->head, SLEEPQ_SLEEP,
+ 0, 0);
+ sleepq_release(&lock->head);
+ break;
+ } else {
+ if (atomic_fcmpset_ptr(&lock->head, &v,
+ RL_CHEAT_CHEATING) != 0)
+ break;
+ }
+ }
+ break;
+ default:
+ __assert_unreachable();
+ break;
+ }
+ return (true);
+}
+
+static bool
+rangelock_cheat_destroy(struct rangelock *lock)
+{
+ uintptr_t v;
+
+ v = atomic_load_ptr(&lock->head);
+ if ((v & RL_CHEAT_CHEATING) == 0)
+ return (false);
+ MPASS(v == RL_CHEAT_CHEATING);
+ return (true);
+}
+
+/*
+ * Implementation of range locks based on the paper
+ * https://doi.org/10.1145/3342195.3387533
+ * arXiv:2006.12144v1 [cs.OS] 22 Jun 2020
+ * Scalable Range Locks for Scalable Address Spaces and Beyond
+ * by Alex Kogan, Dave Dice, and Shady Issa
+ */
+
+static struct rl_q_entry *rl_e_unmark(const struct rl_q_entry *e);
+
+/*
+ * rl_q_next links all granted ranges in the lock. We cannot free an
+ * rl_q_entry while in the smr section, and cannot reuse rl_q_next
+ * linkage since other threads might follow it even after CAS removed
+ * the range. Use rl_q_free for local list of ranges to remove after
+ * the smr section is dropped.
+ */
struct rl_q_entry {
- TAILQ_ENTRY(rl_q_entry) rl_q_link;
+ struct rl_q_entry *rl_q_next;
+ struct rl_q_entry *rl_q_free;
off_t rl_q_start, rl_q_end;
int rl_q_flags;
+#ifdef INVARIANTS
+ struct thread *rl_q_owner;
+#endif
};
static uma_zone_t rl_entry_zone;
+static smr_t rl_smr;
static void
rangelock_sys_init(void)
{
-
+ /*
+ * The alignment requirement comes from the need for
+ * RL_CHEAT_* flags space.
+ */
rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ NULL, NULL, NULL, NULL, 8 - 1, UMA_ZONE_SMR);
+ rl_smr = uma_zone_get_smr(rl_entry_zone);
}
-SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);
+SYSINIT(rl, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);
static struct rl_q_entry *
-rlqentry_alloc(void)
+rlqentry_alloc(vm_ooffset_t start, vm_ooffset_t end, int flags)
{
+ struct rl_q_entry *e;
+ struct thread *td;
- return (uma_zalloc(rl_entry_zone, M_WAITOK));
+ td = curthread;
+ if (td->td_rlqe != NULL) {
+ e = td->td_rlqe;
+ td->td_rlqe = NULL;
+ } else {
+ e = uma_zalloc_smr(rl_entry_zone, M_WAITOK);
+ }
+ e->rl_q_next = NULL;
+ e->rl_q_free = NULL;
+ e->rl_q_start = start;
+ e->rl_q_end = end;
+ e->rl_q_flags = flags;
+#ifdef INVARIANTS
+ e->rl_q_owner = curthread;
+#endif
+ return (e);
}
void
-rlqentry_free(struct rl_q_entry *rleq)
+rangelock_entry_free(struct rl_q_entry *e)
{
-
- uma_zfree(rl_entry_zone, rleq);
+ uma_zfree_smr(rl_entry_zone, e);
}
void
rangelock_init(struct rangelock *lock)
{
-
- TAILQ_INIT(&lock->rl_waiters);
- lock->rl_currdep = NULL;
+ lock->sleepers = false;
+ atomic_store_ptr(&lock->head, rangelock_cheat ? RL_CHEAT_CHEATING : 0);
}
void
rangelock_destroy(struct rangelock *lock)
{
+ struct rl_q_entry *e, *ep;
+
+ MPASS(!lock->sleepers);
+ if (rangelock_cheat_destroy(lock))
+ return;
+ for (e = (struct rl_q_entry *)atomic_load_ptr(&lock->head);
+ e != NULL; e = rl_e_unmark(ep)) {
+ ep = atomic_load_ptr(&e->rl_q_next);
+ uma_zfree_smr(rl_entry_zone, e);
+ }
+}
- KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
+static bool
+rl_e_is_marked(const struct rl_q_entry *e)
+{
+ return (((uintptr_t)e & 1) != 0);
}
-/*
- * Two entries are compatible if their ranges do not overlap, or both
- * entries are for read.
- */
-static int
-ranges_overlap(const struct rl_q_entry *e1,
- const struct rl_q_entry *e2)
+static struct rl_q_entry *
+rl_e_unmark_unchecked(const struct rl_q_entry *e)
{
+ return ((struct rl_q_entry *)((uintptr_t)e & ~1));
+}
- if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start)
- return (1);
- return (0);
+static struct rl_q_entry *
+rl_e_unmark(const struct rl_q_entry *e)
+{
+ MPASS(rl_e_is_marked(e));
+ return (rl_e_unmark_unchecked(e));
}
-/*
- * Recalculate the lock->rl_currdep after an unlock.
- */
static void
-rangelock_calc_block(struct rangelock *lock)
+rl_e_mark(struct rl_q_entry *e)
{
- struct rl_q_entry *entry, *nextentry, *entry1;
-
- for (entry = lock->rl_currdep; entry != NULL; entry = nextentry) {
- nextentry = TAILQ_NEXT(entry, rl_q_link);
- if (entry->rl_q_flags & RL_LOCK_READ) {
- /* Reads must not overlap with granted writes. */
- for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
- !(entry1->rl_q_flags & RL_LOCK_READ);
- entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
- if (ranges_overlap(entry, entry1))
- goto out;
- }
- } else {
- /* Write must not overlap with any granted locks. */
- for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
- entry1 != entry;
- entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
- if (ranges_overlap(entry, entry1))
- goto out;
- }
+#ifdef INVARIANTS
+ int r = atomic_testandset_long((uintptr_t *)&e->rl_q_next, 0);
+ MPASS(r == 0);
+#else
+ atomic_set_ptr((uintptr_t *)&e->rl_q_next, 1);
+#endif
+}
- /* Move grantable write locks to the front. */
- TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
- TAILQ_INSERT_HEAD(&lock->rl_waiters, entry, rl_q_link);
- }
+static struct rl_q_entry *
+rl_q_load(struct rl_q_entry **p)
+{
+ return ((struct rl_q_entry *)atomic_load_acq_ptr((uintptr_t *)p));
+}
- /* Grant this lock. */
- entry->rl_q_flags |= RL_LOCK_GRANTED;
- wakeup(entry);
- }
-out:
- lock->rl_currdep = entry;
+static bool
+rl_e_is_rlock(const struct rl_q_entry *e)
+{
+ return ((e->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ);
}
static void
-rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry,
- struct mtx *ilk, bool do_calc_block)
+rangelock_unlock_int(struct rangelock *lock, struct rl_q_entry *e)
{
+ MPASS(lock != NULL && e != NULL);
+ MPASS(!rl_e_is_marked(rl_q_load(&e->rl_q_next)));
+ MPASS(e->rl_q_owner == curthread);
- MPASS(lock != NULL && entry != NULL && ilk != NULL);
- mtx_assert(ilk, MA_OWNED);
-
- if (!do_calc_block) {
- /*
- * This is the case where rangelock_enqueue() has been called
- * with trylock == true and just inserted this entry in the
- * queue.
- * If rl_currdep is this entry, rl_currdep needs to
- * be set to the next entry in the rl_waiters list.
- * However, since this entry is the last entry in the
- * list, the next entry is NULL.
- */
- if (lock->rl_currdep == entry) {
- KASSERT(TAILQ_NEXT(lock->rl_currdep, rl_q_link) == NULL,
- ("rangelock_enqueue: next entry not NULL"));
- lock->rl_currdep = NULL;
- }
- } else
- KASSERT(entry != lock->rl_currdep, ("stuck currdep"));
-
- TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
- if (do_calc_block)
- rangelock_calc_block(lock);
- mtx_unlock(ilk);
- if (curthread->td_rlqe == NULL)
- curthread->td_rlqe = entry;
- else
- rlqentry_free(entry);
+ rl_e_mark(e);
+ lock->sleepers = false;
+ sleepq_broadcast(&lock->sleepers, SLEEPQ_SLEEP, 0, 0);
}
void
-rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk)
+rangelock_unlock(struct rangelock *lock, void *cookie)
{
+ if (rangelock_cheat_unlock(lock, cookie))
+ return;
- MPASS(lock != NULL && cookie != NULL && ilk != NULL);
-
- mtx_lock(ilk);
- rangelock_unlock_locked(lock, cookie, ilk, true);
+ sleepq_lock(&lock->sleepers);
+ rangelock_unlock_int(lock, cookie);
+ sleepq_release(&lock->sleepers);
}
/*
- * Unlock the sub-range of granted lock.
+ * result: -1 if e1 before e2, or both locks are readers and e1
+ * starts before or at e2
+ * 1 if e1 after e2, or both locks are readers and e1
+ * starts after e2
+ * 0 if e1 and e2 overlap and at least one lock is writer
*/
-void *
-rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start,
- off_t end, struct mtx *ilk)
+static int
+rl_e_compare(const struct rl_q_entry *e1, const struct rl_q_entry *e2)
{
- struct rl_q_entry *entry;
-
- MPASS(lock != NULL && cookie != NULL && ilk != NULL);
- entry = cookie;
- KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED,
- ("Unlocking non-granted lock"));
- KASSERT(entry->rl_q_start == start, ("wrong start"));
- KASSERT(entry->rl_q_end >= end, ("wrong end"));
-
- mtx_lock(ilk);
- if (entry->rl_q_end == end) {
- rangelock_unlock_locked(lock, cookie, ilk, true);
- return (NULL);
- }
- entry->rl_q_end = end;
- rangelock_calc_block(lock);
- mtx_unlock(ilk);
- return (cookie);
+ bool rds;
+
+ if (e1 == NULL)
+ return (1);
+ if (e2->rl_q_start >= e1->rl_q_end)
+ return (-1);
+ rds = rl_e_is_rlock(e1) && rl_e_is_rlock(e2);
+ if (e2->rl_q_start >= e1->rl_q_start && rds)
+ return (-1);
+ if (e1->rl_q_start >= e2->rl_q_end)
+ return (1);
+ if (e1->rl_q_start >= e2->rl_q_start && rds)
+ return (1);
+ return (0);
}
-/*
- * Add the lock request to the queue of the pending requests for
- * rangelock. Sleep until the request can be granted unless trylock == true.
- */
-static void *
-rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode,
- struct mtx *ilk, bool trylock)
+static void
+rl_insert_sleep(struct rangelock *lock)
{
- struct rl_q_entry *entry;
- struct thread *td;
+ smr_exit(rl_smr);
+ DROP_GIANT();
+ lock->sleepers = true;
+ sleepq_add(&lock->sleepers, NULL, "rangelk", 0, 0);
+ sleepq_wait(&lock->sleepers, PRI_USER);
+ PICKUP_GIANT();
+ smr_enter(rl_smr);
+}
+
+static bool
+rl_q_cas(struct rl_q_entry **prev, struct rl_q_entry *old,
+ struct rl_q_entry *new)
+{
+ return (atomic_cmpset_rel_ptr((uintptr_t *)prev, (uintptr_t)old,
+ (uintptr_t)new) != 0);
+}
- MPASS(lock != NULL && ilk != NULL);
+enum RL_INSERT_RES {
+ RL_TRYLOCK_FAILED,
+ RL_LOCK_SUCCESS,
+ RL_LOCK_RETRY,
+};
- td = curthread;
- if (td->td_rlqe != NULL) {
- entry = td->td_rlqe;
- td->td_rlqe = NULL;
- } else
- entry = rlqentry_alloc();
- MPASS(entry != NULL);
- entry->rl_q_flags = mode;
- entry->rl_q_start = start;
- entry->rl_q_end = end;
-
- mtx_lock(ilk);
- /*
- * XXXKIB TODO. Check that a thread does not try to enqueue a
- * lock that is incompatible with another request from the same
- * thread.
- */
+static enum RL_INSERT_RES
+rl_r_validate(struct rangelock *lock, struct rl_q_entry *e, bool trylock,
+ struct rl_q_entry **free)
+{
+ struct rl_q_entry *cur, *next, **prev;
+
+ prev = &e->rl_q_next;
+ cur = rl_e_unmark_unchecked(rl_q_load(prev));
+ for (;;) {
+ if (cur == NULL || cur->rl_q_start > e->rl_q_end)
+ return (RL_LOCK_SUCCESS);
+ next = rl_q_load(&cur->rl_q_next);
+ if (rl_e_is_marked(next)) {
+ next = rl_e_unmark(next);
+ if (rl_q_cas(prev, cur, next)) {
+ cur->rl_q_free = *free;
+ *free = cur;
+ }
+ cur = next;
+ continue;
+ }
+ if (rl_e_is_rlock(cur)) {
+ prev = &cur->rl_q_next;
+ cur = rl_e_unmark_unchecked(rl_q_load(prev));
+ continue;
+ }
+ if (!rl_e_is_marked(rl_q_load(&cur->rl_q_next))) {
+ sleepq_lock(&lock->sleepers);
+ if (rl_e_is_marked(rl_q_load(&cur->rl_q_next))) {
+ sleepq_release(&lock->sleepers);
+ continue;
+ }
+ rangelock_unlock_int(lock, e);
+ if (trylock) {
+ sleepq_release(&lock->sleepers);
+ return (RL_TRYLOCK_FAILED);
+ }
+ rl_insert_sleep(lock);
+ return (RL_LOCK_RETRY);
+ }
+ }
+}
- TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link);
- /*
- * If rl_currdep == NULL, there is no entry waiting for a conflicting
- * range to be resolved, so set rl_currdep to this entry. If there is
- * no conflicting entry for this entry, rl_currdep will be set back to
- * NULL by rangelock_calc_block().
- */
- if (lock->rl_currdep == NULL)
- lock->rl_currdep = entry;
- rangelock_calc_block(lock);
- while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) {
+static enum RL_INSERT_RES
+rl_w_validate(struct rangelock *lock, struct rl_q_entry *e,
+ bool trylock, struct rl_q_entry **free)
+{
+ struct rl_q_entry *cur, *next, **prev;
+
+ prev = (struct rl_q_entry **)&lock->head;
+ cur = rl_e_unmark_unchecked(rl_q_load(prev));
+ for (;;) {
+ if (cur == e)
+ return (RL_LOCK_SUCCESS);
+ next = rl_q_load(&cur->rl_q_next);
+ if (rl_e_is_marked(next)) {
+ next = rl_e_unmark(next);
+ if (rl_q_cas(prev, cur, next)) {
+ cur->rl_q_next = *free;
+ *free = cur;
+ }
+ cur = next;
+ continue;
+ }
+ if (cur->rl_q_end <= e->rl_q_start) {
+ prev = &cur->rl_q_next;
+ cur = rl_e_unmark_unchecked(rl_q_load(prev));
+ continue;
+ }
+ sleepq_lock(&lock->sleepers);
+ rangelock_unlock_int(lock, e);
if (trylock) {
- /*
- * For this case, the range is not actually locked
- * yet, but removal from the list requires the same
- * steps, except for not doing a rangelock_calc_block()
- * call, since rangelock_calc_block() was called above.
- */
- rangelock_unlock_locked(lock, entry, ilk, false);
- return (NULL);
+ sleepq_release(&lock->sleepers);
+ return (RL_TRYLOCK_FAILED);
}
- msleep(entry, ilk, 0, "range", 0);
+ rl_insert_sleep(lock);
+ return (RL_LOCK_RETRY);
}
- mtx_unlock(ilk);
- return (entry);
}
-void *
-rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+static enum RL_INSERT_RES
+rl_insert(struct rangelock *lock, struct rl_q_entry *e, bool trylock,
+ struct rl_q_entry **free)
{
+ struct rl_q_entry *cur, *next, **prev;
+ int r;
+
+again:
+ prev = (struct rl_q_entry **)&lock->head;
+ if (rl_q_load(prev) == NULL && rl_q_cas(prev, NULL, e))
+ return (RL_LOCK_SUCCESS);
+
+ for (cur = rl_q_load(prev);;) {
+ if (rl_e_is_marked(cur))
+ goto again;
+
+ if (cur != NULL) {
+ next = rl_q_load(&cur->rl_q_next);
+ if (rl_e_is_marked(next)) {
+ next = rl_e_unmark(next);
+ if (rl_q_cas(prev, cur, next)) {
+#ifdef INVARIANTS
+ cur->rl_q_owner = NULL;
+#endif
+ cur->rl_q_free = *free;
+ *free = cur;
+ }
+ cur = next;
+ continue;
+ }
+ }
- return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk, false));
+ r = rl_e_compare(cur, e);
+ if (r == -1) {
+ prev = &cur->rl_q_next;
+ cur = rl_q_load(prev);
+ } else if (r == 0) {
+ sleepq_lock(&lock->sleepers);
+ if (__predict_false(rl_e_is_marked(rl_q_load(
+ &cur->rl_q_next)))) {
+ sleepq_release(&lock->sleepers);
+ continue;
+ }
+ if (trylock) {
+ sleepq_release(&lock->sleepers);
+ return (RL_TRYLOCK_FAILED);
+ }
+ rl_insert_sleep(lock);
+ /* e is still valid */
+ goto again;
+ } else /* r == 1 */ {
+ e->rl_q_next = cur;
+ if (rl_q_cas(prev, cur, e)) {
+ atomic_thread_fence_acq();
+ return (rl_e_is_rlock(e) ?
+ rl_r_validate(lock, e, trylock, free) :
+ rl_w_validate(lock, e, trylock, free));
+ }
+ /* Reset rl_q_next in case we hit fast path. */
+ e->rl_q_next = NULL;
+ cur = rl_q_load(prev);
+ }
+ }
}
-void *
-rangelock_tryrlock(struct rangelock *lock, off_t start, off_t end,
- struct mtx *ilk)
+static struct rl_q_entry *
+rangelock_lock_int(struct rangelock *lock, bool trylock, vm_ooffset_t start,
+ vm_ooffset_t end, int locktype)
{
+ struct rl_q_entry *e, *free, *x, *xp;
+ struct thread *td;
+ void *cookie;
+ enum RL_INSERT_RES res;
- return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk, true));
+ if (rangelock_cheat_lock(lock, locktype, trylock, &cookie))
+ return (cookie);
+ td = curthread;
+ for (res = RL_LOCK_RETRY; res == RL_LOCK_RETRY;) {
+ free = NULL;
+ e = rlqentry_alloc(start, end, locktype);
+ smr_enter(rl_smr);
+ res = rl_insert(lock, e, trylock, &free);
+ smr_exit(rl_smr);
+ if (res == RL_TRYLOCK_FAILED) {
+ MPASS(trylock);
+ e->rl_q_free = free;
+ free = e;
+ e = NULL;
+ }
+ for (x = free; x != NULL; x = xp) {
+ MPASS(!rl_e_is_marked(x));
+ xp = x->rl_q_free;
+ MPASS(!rl_e_is_marked(xp));
+ if (td->td_rlqe == NULL) {
+ smr_synchronize(rl_smr);
+ td->td_rlqe = x;
+ } else {
+ uma_zfree_smr(rl_entry_zone, x);
+ }
+ }
+ }
+ return (e);
}
void *
-rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+rangelock_rlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
{
+ return (rangelock_lock_int(lock, false, start, end, RL_LOCK_READ));
+}
- return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk, false));
+void *
+rangelock_tryrlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
+{
+ return (rangelock_lock_int(lock, true, start, end, RL_LOCK_READ));
}
void *
-rangelock_trywlock(struct rangelock *lock, off_t start, off_t end,
- struct mtx *ilk)
+rangelock_wlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
{
+ return (rangelock_lock_int(lock, false, start, end, RL_LOCK_WRITE));
+}
- return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk, true));
+void *
+rangelock_trywlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
+{
+ return (rangelock_lock_int(lock, true, start, end, RL_LOCK_WRITE));
}
#ifdef INVARIANT_SUPPORT
void
_rangelock_cookie_assert(void *cookie, int what, const char *file, int line)
{
- struct rl_q_entry *entry;
- int flags;
-
- MPASS(cookie != NULL);
- entry = cookie;
- flags = entry->rl_q_flags;
- switch (what) {
- case RCA_LOCKED:
- if ((flags & RL_LOCK_GRANTED) == 0)
- panic("rangelock not held @ %s:%d\n", file, line);
- break;
- case RCA_RLOCKED:
- if ((flags & (RL_LOCK_GRANTED | RL_LOCK_READ)) !=
- (RL_LOCK_GRANTED | RL_LOCK_READ))
- panic("rangelock not rlocked @ %s:%d\n", file, line);
- break;
- case RCA_WLOCKED:
- if ((flags & (RL_LOCK_GRANTED | RL_LOCK_WRITE)) !=
- (RL_LOCK_GRANTED | RL_LOCK_WRITE))
- panic("rangelock not wlocked @ %s:%d\n", file, line);
- break;
- default:
- panic("Unknown rangelock assertion: %d @ %s:%d", what, file,
- line);
- }
}
#endif /* INVARIANT_SUPPORT */
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(rangelock, db_show_rangelock)
+{
+ struct rangelock *lock;
+ struct rl_q_entry *e, *x;
+ uintptr_t v;
+
+ if (!have_addr) {
+ db_printf("show rangelock addr\n");
+ return;
+ }
+
+ lock = (struct rangelock *)addr;
+ db_printf("rangelock %p sleepers %d\n", lock, lock->sleepers);
+ v = lock->head;
+ if ((v & RL_CHEAT_CHEATING) != 0) {
+ db_printf(" cheating head %#jx\n", (uintmax_t)v);
+ return;
+ }
+ for (e = (struct rl_q_entry *)(lock->head);;) {
+ x = rl_e_is_marked(e) ? rl_e_unmark(e) : e;
+ if (x == NULL)
+ break;
+ db_printf(" entry %p marked %d %d start %#jx end %#jx "
+ "flags %x next %p",
+ e, rl_e_is_marked(e), rl_e_is_marked(x->rl_q_next),
+ x->rl_q_start, x->rl_q_end, x->rl_q_flags, x->rl_q_next);
+#ifdef INVARIANTS
+ db_printf(" owner %p (%d)", x->rl_q_owner,
+ x->rl_q_owner != NULL ? x->rl_q_owner->td_tid : -1);
+#endif
+ db_printf("\n");
+ e = x->rl_q_next;
+ }
+}
+
+#endif /* DDB */
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -479,9 +479,9 @@
td = (struct thread *)mem;
EVENTHANDLER_DIRECT_INVOKE(thread_fini, td);
- rlqentry_free(td->td_rlqe);
turnstile_free(td->td_turnstile);
sleepq_free(td->td_sleepqueue);
+ rangelock_entry_free(td->td_rlqe);
umtx_thread_fini(td);
MPASS(td->td_sel == NULL);
}
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -183,13 +183,13 @@
"Number of contig reclaims before giving up for default alloc policy");
#define shm_rangelock_unlock(shmfd, cookie) \
- rangelock_unlock(&(shmfd)->shm_rl, (cookie), &(shmfd)->shm_mtx)
+ rangelock_unlock(&(shmfd)->shm_rl, (cookie))
#define shm_rangelock_rlock(shmfd, start, end) \
- rangelock_rlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx)
+ rangelock_rlock(&(shmfd)->shm_rl, (start), (end))
#define shm_rangelock_tryrlock(shmfd, start, end) \
- rangelock_tryrlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx)
+ rangelock_tryrlock(&(shmfd)->shm_rl, (start), (end))
#define shm_rangelock_wlock(shmfd, start, end) \
- rangelock_wlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx)
+ rangelock_wlock(&(shmfd)->shm_rl, (start), (end))
static int
uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -2013,8 +2013,6 @@
VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
("dirty blk trie not empty"));
- VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
- ("Dangling rangelock waiters"));
VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp,
("Leaked inactivation"));
VI_UNLOCK(vp);
diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h
--- a/sys/sys/rangelock.h
+++ b/sys/sys/rangelock.h
@@ -29,12 +29,14 @@
#ifndef _SYS_RANGELOCK_H
#define _SYS_RANGELOCK_H
-#include <sys/queue.h>
+#include <sys/types.h>
+#ifndef _KERNEL
+#include <stdbool.h>
+#endif
#define RL_LOCK_READ 0x0001
#define RL_LOCK_WRITE 0x0002
#define RL_LOCK_TYPE_MASK 0x0003
-#define RL_LOCK_GRANTED 0x0004
struct rl_q_entry;
@@ -44,42 +46,26 @@
* all existing lock owners are compatible with the request. Two lock
* owners are compatible if their ranges do not overlap, or both
* owners are for read.
- *
- * Access to the structure itself is synchronized with the externally
- * supplied mutex.
- *
- * rl_waiters is the queue containing in order (a) granted write lock
- * requests, (b) granted read lock requests, and (c) in order of arrival,
- * lock requests which cannot be granted yet.
- *
- * rl_currdep is the first lock request that cannot be granted now due
- * to the preceding requests conflicting with it (i.e., it points to
- * position (c) in the list above).
*/
struct rangelock {
- TAILQ_HEAD(, rl_q_entry) rl_waiters;
- struct rl_q_entry *rl_currdep;
+ uintptr_t head;
+ bool sleepers;
};
#ifdef _KERNEL
-struct mtx;
-
void rangelock_init(struct rangelock *lock);
void rangelock_destroy(struct rangelock *lock);
-void rangelock_unlock(struct rangelock *lock, void *cookie,
- struct mtx *ilk);
-void *rangelock_unlock_range(struct rangelock *lock, void *cookie,
- off_t start, off_t end, struct mtx *ilk);
-void *rangelock_rlock(struct rangelock *lock, off_t start, off_t end,
- struct mtx *ilk);
-void *rangelock_tryrlock(struct rangelock *lock, off_t start, off_t end,
- struct mtx *ilk);
-void *rangelock_wlock(struct rangelock *lock, off_t start, off_t end,
- struct mtx *ilk);
-void *rangelock_trywlock(struct rangelock *lock, off_t start, off_t end,
- struct mtx *ilk);
-void rlqentry_free(struct rl_q_entry *rlqe);
+void rangelock_unlock(struct rangelock *lock, void *cookie);
+void *rangelock_rlock(struct rangelock *lock, vm_ooffset_t start,
+ vm_ooffset_t end);
+void *rangelock_tryrlock(struct rangelock *lock, vm_ooffset_t start,
+ vm_ooffset_t end);
+void *rangelock_wlock(struct rangelock *lock, vm_ooffset_t start,
+ vm_ooffset_t end);
+void *rangelock_trywlock(struct rangelock *lock, vm_ooffset_t start,
+ vm_ooffset_t end);
+void rangelock_entry_free(struct rl_q_entry *e);
#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
void _rangelock_cookie_assert(void *cookie, int what, const char *file,
int line);
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -834,18 +834,15 @@
#define vn_seqc_consistent(vp, seq) seqc_consistent(&(vp)->v_seqc, seq)
#define vn_rangelock_unlock(vp, cookie) \
- rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp))
-#define vn_rangelock_unlock_range(vp, cookie, start, end) \
- rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \
- VI_MTX(vp))
+ rangelock_unlock(&(vp)->v_rl, (cookie))
#define vn_rangelock_rlock(vp, start, end) \
- rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+ rangelock_rlock(&(vp)->v_rl, (start), (end))
#define vn_rangelock_tryrlock(vp, start, end) \
- rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+ rangelock_tryrlock(&(vp)->v_rl, (start), (end))
#define vn_rangelock_wlock(vp, start, end) \
- rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+ rangelock_wlock(&(vp)->v_rl, (start), (end))
#define vn_rangelock_trywlock(vp, start, end) \
- rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+ rangelock_trywlock(&(vp)->v_rl, (start), (end))
#define vn_irflag_read(vp) atomic_load_short(&(vp)->v_irflag)
void vn_irflag_set_locked(struct vnode *vp, short toset);
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 9, 4:06 PM (6 h, 57 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
15733171
Default Alt Text
D41787.id128185.diff (29 KB)
Attached To
Mode
D41787: Reimplement rangelocks
Attached
Detach File
Event Timeline
Log In to Comment