Page MenuHomeFreeBSD

D41787.id128185.diff
No OneTemporary

D41787.id128185.diff

diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c
--- a/sys/kern/kern_rangelock.c
+++ b/sys/kern/kern_rangelock.c
@@ -2,7 +2,11 @@
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2009 Konstantin Belousov <kib@FreeBSD.org>
- * All rights reserved.
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -26,306 +30,691 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
#include <sys/param.h>
+#include <sys/kassert.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rangelock.h>
-#include <sys/systm.h>
+#include <sys/sleepqueue.h>
+#include <sys/smr.h>
+#include <sys/sysctl.h>
#include <vm/uma.h>
+/*
+ * Immediately after initialization, and until a first conflicting
+ * pair of range locking requests is noted, range locks are served in
+ * the 'cheating' mode. There, rangelock becomes effectively sxlock,
+ * with each request covering whole range of the lock interval.
+ * Either incoming locking request can be granted immediately without
+ * blocking, or when a conflict is detected, current ranges are
+ * drained, and the lock is switched to full range-obeying lock.
+ *
+ * Normal sx implementation is not used to not bloat structures (most
+ * important, vnodes) which embeds rangelocks.
+ *
+ * The cheating greatly helps very common pattern where file is first
+ * written single-threaded, and then read by many processes.
+ *
+ * Lock is in cheat mode when RL_CHEAT_CHEATING bit is set in the
+ * lock->head. Special cookies are returned in this mode, and
+ * trylocks are same as normal locks but do not drain.
+ */
+
+static int rangelock_cheat = 1;
+SYSCTL_INT(_debug, OID_AUTO, rangelock_cheat, CTLFLAG_RWTUN,
+ &rangelock_cheat, 0,
+ "");
+
+#define RL_CHEAT_MASK 0x7
+#define RL_CHEAT_CHEATING 0x1
+/* #define RL_CHEAT_RLOCKED 0x0 */
+#define RL_CHEAT_WLOCKED 0x2
+#define RL_CHEAT_DRAINING 0x4
+
+#define RL_CHEAT_READER 0x8
+
+#define RL_RET_CHEAT_RLOCKED 0x1100
+#define RL_RET_CHEAT_WLOCKED 0x2200
+
+static bool
+rangelock_cheat_lock(struct rangelock *lock, int locktype, bool trylock,
+ void **cookie)
+{
+ uintptr_t v, x;
+
+ v = atomic_load_ptr(&lock->head);
+ if ((v & RL_CHEAT_CHEATING) == 0)
+ return (false);
+ if ((v & RL_CHEAT_DRAINING) != 0) {
+drain:
+ if (trylock) {
+ *cookie = NULL;
+ return (true);
+ }
+ sleepq_lock(&lock->head);
+drain1:
+ DROP_GIANT();
+ for (;;) {
+ v = atomic_load_ptr(&lock->head);
+ if ((v & RL_CHEAT_DRAINING) == 0)
+ break;
+ sleepq_add(&lock->head, NULL, "ranged1", 0, 0);
+ sleepq_wait(&lock->head, PRI_USER);
+ sleepq_lock(&lock->head);
+ }
+ sleepq_release(&lock->head);
+ PICKUP_GIANT();
+ return (false);
+ }
+
+ switch (locktype) {
+ case RL_LOCK_READ:
+ for (;;) {
+ if ((v & RL_CHEAT_WLOCKED) != 0) {
+ if (trylock) {
+ *cookie = NULL;
+ return (true);
+ }
+ x = v | RL_CHEAT_DRAINING;
+ sleepq_lock(&lock->head);
+ if (atomic_fcmpset_rel_ptr(&lock->head, &v,
+ x) != 0)
+ goto drain1;
+ sleepq_release(&lock->head);
+ /* Possibly forgive passed conflict */
+ continue;
+ }
+ x = (v & ~RL_CHEAT_MASK) + RL_CHEAT_READER;
+ x |= RL_CHEAT_CHEATING;
+ if (atomic_fcmpset_acq_ptr(&lock->head, &v, x) != 0)
+ break;
+ if ((v & RL_CHEAT_CHEATING) == 0)
+ return (false);
+ if ((v & RL_CHEAT_DRAINING) != 0)
+ goto drain;
+ }
+ *(uintptr_t *)cookie = RL_RET_CHEAT_RLOCKED;
+ break;
+ case RL_LOCK_WRITE:
+ for (;;) {
+ if ((v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER ||
+ (v & RL_CHEAT_WLOCKED) != 0) {
+ if (trylock) {
+ *cookie = NULL;
+ return (true);
+ }
+ x = v | RL_CHEAT_DRAINING;
+ sleepq_lock(&lock->head);
+ if (atomic_fcmpset_rel_ptr(&lock->head, &v,
+ x) != 0)
+ goto drain1;
+ sleepq_release(&lock->head);
+ /* Possibly forgive passed conflict */
+ continue;
+ }
+ x = RL_CHEAT_WLOCKED | RL_CHEAT_CHEATING;
+ if (atomic_fcmpset_acq_ptr(&lock->head, &v, x) != 0)
+ break;
+ if ((v & RL_CHEAT_CHEATING) == 0)
+ return (false);
+ if ((v & RL_CHEAT_DRAINING) != 0)
+ goto drain;
+ }
+ *(uintptr_t *)cookie = RL_RET_CHEAT_WLOCKED;
+ break;
+ default:
+ __assert_unreachable();
+ break;
+ }
+ return (true);
+}
+
+static bool
+rangelock_cheat_unlock(struct rangelock *lock, void *cookie)
+{
+ uintptr_t v, x;
+
+ v = atomic_load_ptr(&lock->head);
+ if ((v & RL_CHEAT_CHEATING) == 0)
+ return (false);
+
+ MPASS((uintptr_t)cookie == RL_RET_CHEAT_WLOCKED ||
+ (uintptr_t)cookie == RL_RET_CHEAT_RLOCKED);
+
+ switch ((uintptr_t)cookie) {
+ case RL_RET_CHEAT_RLOCKED:
+ for (;;) {
+ MPASS((v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER);
+ MPASS((v & RL_CHEAT_WLOCKED) == 0);
+ x = (v & ~RL_CHEAT_MASK) - RL_CHEAT_READER;
+ if ((v & RL_CHEAT_DRAINING) != 0) {
+ if (x != 0) {
+ x |= RL_CHEAT_DRAINING |
+ RL_CHEAT_CHEATING;
+ }
+ sleepq_lock(&lock->head);
+ if (atomic_fcmpset_rel_ptr(&lock->head, &v,
+ x) != 0) {
+ if (x == 0) {
+ sleepq_broadcast(&lock->head,
+ SLEEPQ_SLEEP, 0, 0);
+ }
+ sleepq_release(&lock->head);
+ break;
+ } else {
+ sleepq_release(&lock->head);
+ }
+ } else {
+ x |= RL_CHEAT_CHEATING;
+ if (atomic_fcmpset_rel_ptr(&lock->head, &v,
+ x) != 0)
+ break;
+ }
+ }
+ break;
+ case RL_RET_CHEAT_WLOCKED:
+ for (;;) {
+ MPASS((v & RL_CHEAT_WLOCKED) != 0);
+ if ((v & RL_CHEAT_DRAINING) != 0) {
+ sleepq_lock(&lock->head);
+ atomic_store_ptr(&lock->head, 0);
+ sleepq_broadcast(&lock->head, SLEEPQ_SLEEP,
+ 0, 0);
+ sleepq_release(&lock->head);
+ break;
+ } else {
+ if (atomic_fcmpset_ptr(&lock->head, &v,
+ RL_CHEAT_CHEATING) != 0)
+ break;
+ }
+ }
+ break;
+ default:
+ __assert_unreachable();
+ break;
+ }
+ return (true);
+}
+
+static bool
+rangelock_cheat_destroy(struct rangelock *lock)
+{
+ uintptr_t v;
+
+ v = atomic_load_ptr(&lock->head);
+ if ((v & RL_CHEAT_CHEATING) == 0)
+ return (false);
+ MPASS(v == RL_CHEAT_CHEATING);
+ return (true);
+}
+
+/*
+ * Implementation of range locks based on the paper
+ * https://doi.org/10.1145/3342195.3387533
+ * arXiv:2006.12144v1 [cs.OS] 22 Jun 2020
+ * Scalable Range Locks for Scalable Address Spaces and Beyond
+ * by Alex Kogan, Dave Dice, and Shady Issa
+ */
+
+static struct rl_q_entry *rl_e_unmark(const struct rl_q_entry *e);
+
+/*
+ * rl_q_next links all granted ranges in the lock. We cannot free an
+ * rl_q_entry while in the smr section, and cannot reuse rl_q_next
+ * linkage since other threads might follow it even after CAS removed
+ * the range. Use rl_q_free for local list of ranges to remove after
+ * the smr section is dropped.
+ */
struct rl_q_entry {
- TAILQ_ENTRY(rl_q_entry) rl_q_link;
+ struct rl_q_entry *rl_q_next;
+ struct rl_q_entry *rl_q_free;
off_t rl_q_start, rl_q_end;
int rl_q_flags;
+#ifdef INVARIANTS
+ struct thread *rl_q_owner;
+#endif
};
static uma_zone_t rl_entry_zone;
+static smr_t rl_smr;
static void
rangelock_sys_init(void)
{
-
+ /*
+ * The alignment requirement comes from the need for
+ * RL_CHEAT_* flags space.
+ */
rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ NULL, NULL, NULL, NULL, 8 - 1, UMA_ZONE_SMR);
+ rl_smr = uma_zone_get_smr(rl_entry_zone);
}
-SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);
+SYSINIT(rl, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);
static struct rl_q_entry *
-rlqentry_alloc(void)
+rlqentry_alloc(vm_ooffset_t start, vm_ooffset_t end, int flags)
{
+ struct rl_q_entry *e;
+ struct thread *td;
- return (uma_zalloc(rl_entry_zone, M_WAITOK));
+ td = curthread;
+ if (td->td_rlqe != NULL) {
+ e = td->td_rlqe;
+ td->td_rlqe = NULL;
+ } else {
+ e = uma_zalloc_smr(rl_entry_zone, M_WAITOK);
+ }
+ e->rl_q_next = NULL;
+ e->rl_q_free = NULL;
+ e->rl_q_start = start;
+ e->rl_q_end = end;
+ e->rl_q_flags = flags;
+#ifdef INVARIANTS
+ e->rl_q_owner = curthread;
+#endif
+ return (e);
}
void
-rlqentry_free(struct rl_q_entry *rleq)
+rangelock_entry_free(struct rl_q_entry *e)
{
-
- uma_zfree(rl_entry_zone, rleq);
+ uma_zfree_smr(rl_entry_zone, e);
}
void
rangelock_init(struct rangelock *lock)
{
-
- TAILQ_INIT(&lock->rl_waiters);
- lock->rl_currdep = NULL;
+ lock->sleepers = false;
+ atomic_store_ptr(&lock->head, rangelock_cheat ? RL_CHEAT_CHEATING : 0);
}
void
rangelock_destroy(struct rangelock *lock)
{
+ struct rl_q_entry *e, *ep;
+
+ MPASS(!lock->sleepers);
+ if (rangelock_cheat_destroy(lock))
+ return;
+ for (e = (struct rl_q_entry *)atomic_load_ptr(&lock->head);
+ e != NULL; e = rl_e_unmark(ep)) {
+ ep = atomic_load_ptr(&e->rl_q_next);
+ uma_zfree_smr(rl_entry_zone, e);
+ }
+}
- KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
+static bool
+rl_e_is_marked(const struct rl_q_entry *e)
+{
+ return (((uintptr_t)e & 1) != 0);
}
-/*
- * Two entries are compatible if their ranges do not overlap, or both
- * entries are for read.
- */
-static int
-ranges_overlap(const struct rl_q_entry *e1,
- const struct rl_q_entry *e2)
+static struct rl_q_entry *
+rl_e_unmark_unchecked(const struct rl_q_entry *e)
{
+ return ((struct rl_q_entry *)((uintptr_t)e & ~1));
+}
- if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start)
- return (1);
- return (0);
+static struct rl_q_entry *
+rl_e_unmark(const struct rl_q_entry *e)
+{
+ MPASS(rl_e_is_marked(e));
+ return (rl_e_unmark_unchecked(e));
}
-/*
- * Recalculate the lock->rl_currdep after an unlock.
- */
static void
-rangelock_calc_block(struct rangelock *lock)
+rl_e_mark(struct rl_q_entry *e)
{
- struct rl_q_entry *entry, *nextentry, *entry1;
-
- for (entry = lock->rl_currdep; entry != NULL; entry = nextentry) {
- nextentry = TAILQ_NEXT(entry, rl_q_link);
- if (entry->rl_q_flags & RL_LOCK_READ) {
- /* Reads must not overlap with granted writes. */
- for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
- !(entry1->rl_q_flags & RL_LOCK_READ);
- entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
- if (ranges_overlap(entry, entry1))
- goto out;
- }
- } else {
- /* Write must not overlap with any granted locks. */
- for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
- entry1 != entry;
- entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
- if (ranges_overlap(entry, entry1))
- goto out;
- }
+#ifdef INVARIANTS
+ int r = atomic_testandset_long((uintptr_t *)&e->rl_q_next, 0);
+ MPASS(r == 0);
+#else
+ atomic_set_ptr((uintptr_t *)&e->rl_q_next, 1);
+#endif
+}
- /* Move grantable write locks to the front. */
- TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
- TAILQ_INSERT_HEAD(&lock->rl_waiters, entry, rl_q_link);
- }
+static struct rl_q_entry *
+rl_q_load(struct rl_q_entry **p)
+{
+ return ((struct rl_q_entry *)atomic_load_acq_ptr((uintptr_t *)p));
+}
- /* Grant this lock. */
- entry->rl_q_flags |= RL_LOCK_GRANTED;
- wakeup(entry);
- }
-out:
- lock->rl_currdep = entry;
+static bool
+rl_e_is_rlock(const struct rl_q_entry *e)
+{
+ return ((e->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ);
}
static void
-rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry,
- struct mtx *ilk, bool do_calc_block)
+rangelock_unlock_int(struct rangelock *lock, struct rl_q_entry *e)
{
+ MPASS(lock != NULL && e != NULL);
+ MPASS(!rl_e_is_marked(rl_q_load(&e->rl_q_next)));
+ MPASS(e->rl_q_owner == curthread);
- MPASS(lock != NULL && entry != NULL && ilk != NULL);
- mtx_assert(ilk, MA_OWNED);
-
- if (!do_calc_block) {
- /*
- * This is the case where rangelock_enqueue() has been called
- * with trylock == true and just inserted this entry in the
- * queue.
- * If rl_currdep is this entry, rl_currdep needs to
- * be set to the next entry in the rl_waiters list.
- * However, since this entry is the last entry in the
- * list, the next entry is NULL.
- */
- if (lock->rl_currdep == entry) {
- KASSERT(TAILQ_NEXT(lock->rl_currdep, rl_q_link) == NULL,
- ("rangelock_enqueue: next entry not NULL"));
- lock->rl_currdep = NULL;
- }
- } else
- KASSERT(entry != lock->rl_currdep, ("stuck currdep"));
-
- TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
- if (do_calc_block)
- rangelock_calc_block(lock);
- mtx_unlock(ilk);
- if (curthread->td_rlqe == NULL)
- curthread->td_rlqe = entry;
- else
- rlqentry_free(entry);
+ rl_e_mark(e);
+ lock->sleepers = false;
+ sleepq_broadcast(&lock->sleepers, SLEEPQ_SLEEP, 0, 0);
}
void
-rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk)
+rangelock_unlock(struct rangelock *lock, void *cookie)
{
+ if (rangelock_cheat_unlock(lock, cookie))
+ return;
- MPASS(lock != NULL && cookie != NULL && ilk != NULL);
-
- mtx_lock(ilk);
- rangelock_unlock_locked(lock, cookie, ilk, true);
+ sleepq_lock(&lock->sleepers);
+ rangelock_unlock_int(lock, cookie);
+ sleepq_release(&lock->sleepers);
}
/*
- * Unlock the sub-range of granted lock.
+ * result: -1 if e1 before e2, or both locks are readers and e1
+ * starts before or at e2
+ * 1 if e1 after e2, or both locks are readers and e1
+ * starts after e2
+ * 0 if e1 and e2 overlap and at least one lock is writer
*/
-void *
-rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start,
- off_t end, struct mtx *ilk)
+static int
+rl_e_compare(const struct rl_q_entry *e1, const struct rl_q_entry *e2)
{
- struct rl_q_entry *entry;
-
- MPASS(lock != NULL && cookie != NULL && ilk != NULL);
- entry = cookie;
- KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED,
- ("Unlocking non-granted lock"));
- KASSERT(entry->rl_q_start == start, ("wrong start"));
- KASSERT(entry->rl_q_end >= end, ("wrong end"));
-
- mtx_lock(ilk);
- if (entry->rl_q_end == end) {
- rangelock_unlock_locked(lock, cookie, ilk, true);
- return (NULL);
- }
- entry->rl_q_end = end;
- rangelock_calc_block(lock);
- mtx_unlock(ilk);
- return (cookie);
+ bool rds;
+
+ if (e1 == NULL)
+ return (1);
+ if (e2->rl_q_start >= e1->rl_q_end)
+ return (-1);
+ rds = rl_e_is_rlock(e1) && rl_e_is_rlock(e2);
+ if (e2->rl_q_start >= e1->rl_q_start && rds)
+ return (-1);
+ if (e1->rl_q_start >= e2->rl_q_end)
+ return (1);
+ if (e1->rl_q_start >= e2->rl_q_start && rds)
+ return (1);
+ return (0);
}
-/*
- * Add the lock request to the queue of the pending requests for
- * rangelock. Sleep until the request can be granted unless trylock == true.
- */
-static void *
-rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode,
- struct mtx *ilk, bool trylock)
+static void
+rl_insert_sleep(struct rangelock *lock)
{
- struct rl_q_entry *entry;
- struct thread *td;
+ smr_exit(rl_smr);
+ DROP_GIANT();
+ lock->sleepers = true;
+ sleepq_add(&lock->sleepers, NULL, "rangelk", 0, 0);
+ sleepq_wait(&lock->sleepers, PRI_USER);
+ PICKUP_GIANT();
+ smr_enter(rl_smr);
+}
+
+static bool
+rl_q_cas(struct rl_q_entry **prev, struct rl_q_entry *old,
+ struct rl_q_entry *new)
+{
+ return (atomic_cmpset_rel_ptr((uintptr_t *)prev, (uintptr_t)old,
+ (uintptr_t)new) != 0);
+}
- MPASS(lock != NULL && ilk != NULL);
+enum RL_INSERT_RES {
+ RL_TRYLOCK_FAILED,
+ RL_LOCK_SUCCESS,
+ RL_LOCK_RETRY,
+};
- td = curthread;
- if (td->td_rlqe != NULL) {
- entry = td->td_rlqe;
- td->td_rlqe = NULL;
- } else
- entry = rlqentry_alloc();
- MPASS(entry != NULL);
- entry->rl_q_flags = mode;
- entry->rl_q_start = start;
- entry->rl_q_end = end;
-
- mtx_lock(ilk);
- /*
- * XXXKIB TODO. Check that a thread does not try to enqueue a
- * lock that is incompatible with another request from the same
- * thread.
- */
+static enum RL_INSERT_RES
+rl_r_validate(struct rangelock *lock, struct rl_q_entry *e, bool trylock,
+ struct rl_q_entry **free)
+{
+ struct rl_q_entry *cur, *next, **prev;
+
+ prev = &e->rl_q_next;
+ cur = rl_e_unmark_unchecked(rl_q_load(prev));
+ for (;;) {
+ if (cur == NULL || cur->rl_q_start > e->rl_q_end)
+ return (RL_LOCK_SUCCESS);
+ next = rl_q_load(&cur->rl_q_next);
+ if (rl_e_is_marked(next)) {
+ next = rl_e_unmark(next);
+ if (rl_q_cas(prev, cur, next)) {
+ cur->rl_q_free = *free;
+ *free = cur;
+ }
+ cur = next;
+ continue;
+ }
+ if (rl_e_is_rlock(cur)) {
+ prev = &cur->rl_q_next;
+ cur = rl_e_unmark_unchecked(rl_q_load(prev));
+ continue;
+ }
+ if (!rl_e_is_marked(rl_q_load(&cur->rl_q_next))) {
+ sleepq_lock(&lock->sleepers);
+ if (rl_e_is_marked(rl_q_load(&cur->rl_q_next))) {
+ sleepq_release(&lock->sleepers);
+ continue;
+ }
+ rangelock_unlock_int(lock, e);
+ if (trylock) {
+ sleepq_release(&lock->sleepers);
+ return (RL_TRYLOCK_FAILED);
+ }
+ rl_insert_sleep(lock);
+ return (RL_LOCK_RETRY);
+ }
+ }
+}
- TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link);
- /*
- * If rl_currdep == NULL, there is no entry waiting for a conflicting
- * range to be resolved, so set rl_currdep to this entry. If there is
- * no conflicting entry for this entry, rl_currdep will be set back to
- * NULL by rangelock_calc_block().
- */
- if (lock->rl_currdep == NULL)
- lock->rl_currdep = entry;
- rangelock_calc_block(lock);
- while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) {
+static enum RL_INSERT_RES
+rl_w_validate(struct rangelock *lock, struct rl_q_entry *e,
+ bool trylock, struct rl_q_entry **free)
+{
+ struct rl_q_entry *cur, *next, **prev;
+
+ prev = (struct rl_q_entry **)&lock->head;
+ cur = rl_e_unmark_unchecked(rl_q_load(prev));
+ for (;;) {
+ if (cur == e)
+ return (RL_LOCK_SUCCESS);
+ next = rl_q_load(&cur->rl_q_next);
+ if (rl_e_is_marked(next)) {
+ next = rl_e_unmark(next);
+ if (rl_q_cas(prev, cur, next)) {
+ cur->rl_q_next = *free;
+ *free = cur;
+ }
+ cur = next;
+ continue;
+ }
+ if (cur->rl_q_end <= e->rl_q_start) {
+ prev = &cur->rl_q_next;
+ cur = rl_e_unmark_unchecked(rl_q_load(prev));
+ continue;
+ }
+ sleepq_lock(&lock->sleepers);
+ rangelock_unlock_int(lock, e);
if (trylock) {
- /*
- * For this case, the range is not actually locked
- * yet, but removal from the list requires the same
- * steps, except for not doing a rangelock_calc_block()
- * call, since rangelock_calc_block() was called above.
- */
- rangelock_unlock_locked(lock, entry, ilk, false);
- return (NULL);
+ sleepq_release(&lock->sleepers);
+ return (RL_TRYLOCK_FAILED);
}
- msleep(entry, ilk, 0, "range", 0);
+ rl_insert_sleep(lock);
+ return (RL_LOCK_RETRY);
}
- mtx_unlock(ilk);
- return (entry);
}
-void *
-rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+static enum RL_INSERT_RES
+rl_insert(struct rangelock *lock, struct rl_q_entry *e, bool trylock,
+ struct rl_q_entry **free)
{
+ struct rl_q_entry *cur, *next, **prev;
+ int r;
+
+again:
+ prev = (struct rl_q_entry **)&lock->head;
+ if (rl_q_load(prev) == NULL && rl_q_cas(prev, NULL, e))
+ return (RL_LOCK_SUCCESS);
+
+ for (cur = rl_q_load(prev);;) {
+ if (rl_e_is_marked(cur))
+ goto again;
+
+ if (cur != NULL) {
+ next = rl_q_load(&cur->rl_q_next);
+ if (rl_e_is_marked(next)) {
+ next = rl_e_unmark(next);
+ if (rl_q_cas(prev, cur, next)) {
+#ifdef INVARIANTS
+ cur->rl_q_owner = NULL;
+#endif
+ cur->rl_q_free = *free;
+ *free = cur;
+ }
+ cur = next;
+ continue;
+ }
+ }
- return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk, false));
+ r = rl_e_compare(cur, e);
+ if (r == -1) {
+ prev = &cur->rl_q_next;
+ cur = rl_q_load(prev);
+ } else if (r == 0) {
+ sleepq_lock(&lock->sleepers);
+ if (__predict_false(rl_e_is_marked(rl_q_load(
+ &cur->rl_q_next)))) {
+ sleepq_release(&lock->sleepers);
+ continue;
+ }
+ if (trylock) {
+ sleepq_release(&lock->sleepers);
+ return (RL_TRYLOCK_FAILED);
+ }
+ rl_insert_sleep(lock);
+ /* e is still valid */
+ goto again;
+ } else /* r == 1 */ {
+ e->rl_q_next = cur;
+ if (rl_q_cas(prev, cur, e)) {
+ atomic_thread_fence_acq();
+ return (rl_e_is_rlock(e) ?
+ rl_r_validate(lock, e, trylock, free) :
+ rl_w_validate(lock, e, trylock, free));
+ }
+ /* Reset rl_q_next in case we hit fast path. */
+ e->rl_q_next = NULL;
+ cur = rl_q_load(prev);
+ }
+ }
}
-void *
-rangelock_tryrlock(struct rangelock *lock, off_t start, off_t end,
- struct mtx *ilk)
+static struct rl_q_entry *
+rangelock_lock_int(struct rangelock *lock, bool trylock, vm_ooffset_t start,
+ vm_ooffset_t end, int locktype)
{
+ struct rl_q_entry *e, *free, *x, *xp;
+ struct thread *td;
+ void *cookie;
+ enum RL_INSERT_RES res;
- return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk, true));
+ if (rangelock_cheat_lock(lock, locktype, trylock, &cookie))
+ return (cookie);
+ td = curthread;
+ for (res = RL_LOCK_RETRY; res == RL_LOCK_RETRY;) {
+ free = NULL;
+ e = rlqentry_alloc(start, end, locktype);
+ smr_enter(rl_smr);
+ res = rl_insert(lock, e, trylock, &free);
+ smr_exit(rl_smr);
+ if (res == RL_TRYLOCK_FAILED) {
+ MPASS(trylock);
+ e->rl_q_free = free;
+ free = e;
+ e = NULL;
+ }
+ for (x = free; x != NULL; x = xp) {
+ MPASS(!rl_e_is_marked(x));
+ xp = x->rl_q_free;
+ MPASS(!rl_e_is_marked(xp));
+ if (td->td_rlqe == NULL) {
+ smr_synchronize(rl_smr);
+ td->td_rlqe = x;
+ } else {
+ uma_zfree_smr(rl_entry_zone, x);
+ }
+ }
+ }
+ return (e);
}
void *
-rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+rangelock_rlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
{
+ return (rangelock_lock_int(lock, false, start, end, RL_LOCK_READ));
+}
- return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk, false));
+void *
+rangelock_tryrlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
+{
+ return (rangelock_lock_int(lock, true, start, end, RL_LOCK_READ));
}
void *
-rangelock_trywlock(struct rangelock *lock, off_t start, off_t end,
- struct mtx *ilk)
+rangelock_wlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
{
+ return (rangelock_lock_int(lock, false, start, end, RL_LOCK_WRITE));
+}
- return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk, true));
+void *
+rangelock_trywlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
+{
+ return (rangelock_lock_int(lock, true, start, end, RL_LOCK_WRITE));
}
#ifdef INVARIANT_SUPPORT
void
_rangelock_cookie_assert(void *cookie, int what, const char *file, int line)
{
- struct rl_q_entry *entry;
- int flags;
-
- MPASS(cookie != NULL);
- entry = cookie;
- flags = entry->rl_q_flags;
- switch (what) {
- case RCA_LOCKED:
- if ((flags & RL_LOCK_GRANTED) == 0)
- panic("rangelock not held @ %s:%d\n", file, line);
- break;
- case RCA_RLOCKED:
- if ((flags & (RL_LOCK_GRANTED | RL_LOCK_READ)) !=
- (RL_LOCK_GRANTED | RL_LOCK_READ))
- panic("rangelock not rlocked @ %s:%d\n", file, line);
- break;
- case RCA_WLOCKED:
- if ((flags & (RL_LOCK_GRANTED | RL_LOCK_WRITE)) !=
- (RL_LOCK_GRANTED | RL_LOCK_WRITE))
- panic("rangelock not wlocked @ %s:%d\n", file, line);
- break;
- default:
- panic("Unknown rangelock assertion: %d @ %s:%d", what, file,
- line);
- }
}
#endif /* INVARIANT_SUPPORT */
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(rangelock, db_show_rangelock)
+{
+ struct rangelock *lock;
+ struct rl_q_entry *e, *x;
+ uintptr_t v;
+
+ if (!have_addr) {
+ db_printf("show rangelock addr\n");
+ return;
+ }
+
+ lock = (struct rangelock *)addr;
+ db_printf("rangelock %p sleepers %d\n", lock, lock->sleepers);
+ v = lock->head;
+ if ((v & RL_CHEAT_CHEATING) != 0) {
+ db_printf(" cheating head %#jx\n", (uintmax_t)v);
+ return;
+ }
+ for (e = (struct rl_q_entry *)(lock->head);;) {
+ x = rl_e_is_marked(e) ? rl_e_unmark(e) : e;
+ if (x == NULL)
+ break;
+ db_printf(" entry %p marked %d %d start %#jx end %#jx "
+ "flags %x next %p",
+ e, rl_e_is_marked(e), rl_e_is_marked(x->rl_q_next),
+ x->rl_q_start, x->rl_q_end, x->rl_q_flags, x->rl_q_next);
+#ifdef INVARIANTS
+ db_printf(" owner %p (%d)", x->rl_q_owner,
+ x->rl_q_owner != NULL ? x->rl_q_owner->td_tid : -1);
+#endif
+ db_printf("\n");
+ e = x->rl_q_next;
+ }
+}
+
+#endif /* DDB */
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -479,9 +479,9 @@
td = (struct thread *)mem;
EVENTHANDLER_DIRECT_INVOKE(thread_fini, td);
- rlqentry_free(td->td_rlqe);
turnstile_free(td->td_turnstile);
sleepq_free(td->td_sleepqueue);
+ rangelock_entry_free(td->td_rlqe);
umtx_thread_fini(td);
MPASS(td->td_sel == NULL);
}
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -183,13 +183,13 @@
"Number of contig reclaims before giving up for default alloc policy");
#define shm_rangelock_unlock(shmfd, cookie) \
- rangelock_unlock(&(shmfd)->shm_rl, (cookie), &(shmfd)->shm_mtx)
+ rangelock_unlock(&(shmfd)->shm_rl, (cookie))
#define shm_rangelock_rlock(shmfd, start, end) \
- rangelock_rlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx)
+ rangelock_rlock(&(shmfd)->shm_rl, (start), (end))
#define shm_rangelock_tryrlock(shmfd, start, end) \
- rangelock_tryrlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx)
+ rangelock_tryrlock(&(shmfd)->shm_rl, (start), (end))
#define shm_rangelock_wlock(shmfd, start, end) \
- rangelock_wlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx)
+ rangelock_wlock(&(shmfd)->shm_rl, (start), (end))
static int
uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -2013,8 +2013,6 @@
VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
("dirty blk trie not empty"));
- VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
- ("Dangling rangelock waiters"));
VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp,
("Leaked inactivation"));
VI_UNLOCK(vp);
diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h
--- a/sys/sys/rangelock.h
+++ b/sys/sys/rangelock.h
@@ -29,12 +29,14 @@
#ifndef _SYS_RANGELOCK_H
#define _SYS_RANGELOCK_H
-#include <sys/queue.h>
+#include <sys/types.h>
+#ifndef _KERNEL
+#include <stdbool.h>
+#endif
#define RL_LOCK_READ 0x0001
#define RL_LOCK_WRITE 0x0002
#define RL_LOCK_TYPE_MASK 0x0003
-#define RL_LOCK_GRANTED 0x0004
struct rl_q_entry;
@@ -44,42 +46,26 @@
* all existing lock owners are compatible with the request. Two lock
* owners are compatible if their ranges do not overlap, or both
* owners are for read.
- *
- * Access to the structure itself is synchronized with the externally
- * supplied mutex.
- *
- * rl_waiters is the queue containing in order (a) granted write lock
- * requests, (b) granted read lock requests, and (c) in order of arrival,
- * lock requests which cannot be granted yet.
- *
- * rl_currdep is the first lock request that cannot be granted now due
- * to the preceding requests conflicting with it (i.e., it points to
- * position (c) in the list above).
*/
struct rangelock {
- TAILQ_HEAD(, rl_q_entry) rl_waiters;
- struct rl_q_entry *rl_currdep;
+ uintptr_t head;
+ bool sleepers;
};
#ifdef _KERNEL
-struct mtx;
-
void rangelock_init(struct rangelock *lock);
void rangelock_destroy(struct rangelock *lock);
-void rangelock_unlock(struct rangelock *lock, void *cookie,
- struct mtx *ilk);
-void *rangelock_unlock_range(struct rangelock *lock, void *cookie,
- off_t start, off_t end, struct mtx *ilk);
-void *rangelock_rlock(struct rangelock *lock, off_t start, off_t end,
- struct mtx *ilk);
-void *rangelock_tryrlock(struct rangelock *lock, off_t start, off_t end,
- struct mtx *ilk);
-void *rangelock_wlock(struct rangelock *lock, off_t start, off_t end,
- struct mtx *ilk);
-void *rangelock_trywlock(struct rangelock *lock, off_t start, off_t end,
- struct mtx *ilk);
-void rlqentry_free(struct rl_q_entry *rlqe);
+void rangelock_unlock(struct rangelock *lock, void *cookie);
+void *rangelock_rlock(struct rangelock *lock, vm_ooffset_t start,
+ vm_ooffset_t end);
+void *rangelock_tryrlock(struct rangelock *lock, vm_ooffset_t start,
+ vm_ooffset_t end);
+void *rangelock_wlock(struct rangelock *lock, vm_ooffset_t start,
+ vm_ooffset_t end);
+void *rangelock_trywlock(struct rangelock *lock, vm_ooffset_t start,
+ vm_ooffset_t end);
+void rangelock_entry_free(struct rl_q_entry *e);
#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
void _rangelock_cookie_assert(void *cookie, int what, const char *file,
int line);
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -834,18 +834,15 @@
#define vn_seqc_consistent(vp, seq) seqc_consistent(&(vp)->v_seqc, seq)
#define vn_rangelock_unlock(vp, cookie) \
- rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp))
-#define vn_rangelock_unlock_range(vp, cookie, start, end) \
- rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \
- VI_MTX(vp))
+ rangelock_unlock(&(vp)->v_rl, (cookie))
#define vn_rangelock_rlock(vp, start, end) \
- rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+ rangelock_rlock(&(vp)->v_rl, (start), (end))
#define vn_rangelock_tryrlock(vp, start, end) \
- rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+ rangelock_tryrlock(&(vp)->v_rl, (start), (end))
#define vn_rangelock_wlock(vp, start, end) \
- rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+ rangelock_wlock(&(vp)->v_rl, (start), (end))
#define vn_rangelock_trywlock(vp, start, end) \
- rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+ rangelock_trywlock(&(vp)->v_rl, (start), (end))
#define vn_irflag_read(vp) atomic_load_short(&(vp)->v_irflag)
void vn_irflag_set_locked(struct vnode *vp, short toset);

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 9, 4:06 PM (6 h, 57 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
15733171
Default Alt Text
D41787.id128185.diff (29 KB)

Event Timeline