D30376.diff

diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile
--- a/share/man/man9/Makefile
+++ b/share/man/man9/Makefile
@@ -1027,6 +1027,12 @@
epoch.9 epoch_enter.9 \
epoch.9 epoch_exit.9 \
epoch.9 epoch_wait.9 \
+ epoch.9 epoch_enter_preempt.9 \
+ epoch.9 epoch_exit_preempt.9 \
+ epoch.9 epoch_wait_preempt.9 \
+ epoch.9 epoch_enter_sleepable.9 \
+ epoch.9 epoch_exit_sleepable.9 \
+ epoch.9 epoch_wait_sleepable.9 \
epoch.9 epoch_call.9 \
epoch.9 epoch_drain_callbacks.9 \
epoch.9 in_epoch.9
diff --git a/share/man/man9/epoch.9 b/share/man/man9/epoch.9
--- a/share/man/man9/epoch.9
+++ b/share/man/man9/epoch.9
@@ -26,7 +26,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd April 30, 2020
+.Dd May 21, 2021
.Dt EPOCH 9
.Os
.Sh NAME
@@ -40,6 +40,9 @@
.Nm epoch_enter_preempt ,
.Nm epoch_exit_preempt ,
.Nm epoch_wait_preempt ,
+.Nm epoch_enter_sleepable ,
+.Nm epoch_exit_sleepable ,
+.Nm epoch_wait_sleepable ,
.Nm epoch_call ,
.Nm epoch_drain_callbacks ,
.Nm in_epoch ,
@@ -83,6 +86,12 @@
.Ft void
.Fn epoch_wait_preempt "epoch_t epoch"
.Ft void
+.Fn epoch_enter_sleepable "epoch_t epoch" "epoch_tracker_t et"
+.Ft void
+.Fn epoch_exit_sleepable "epoch_t epoch" "epoch_tracker_t et"
+.Ft void
+.Fn epoch_wait_sleepable "epoch_t epoch"
+.Ft void
.Fn epoch_call "epoch_t epoch" "epoch_callback_t callback" "epoch_context_t ctx"
.Ft void
.Fn epoch_drain_callbacks "epoch_t epoch"
@@ -105,7 +114,9 @@
kernel option is configured.
By default, epochs do not allow preemption during sections.
By default mutexes cannot be held across
-.Fn epoch_wait_preempt .
+.Fn epoch_wait_preempt
+and
+.Fn epoch_wait_sleepable .
The
.Fa flags
specified are formed by
@@ -114,11 +125,17 @@
.Bl -tag -offset indent -width Ds
.It Dv EPOCH_LOCKED
Permit holding mutexes across
-.Fn epoch_wait_preempt
-(requires
-.Dv EPOCH_PREEMPT ) .
+.Fn epoch_wait_preempt .
+Requires
+.Dv EPOCH_PREEMPT
+or
+.Dv EPOCH_SLEEPABLE .
When doing this one must be cautious of creating a situation where a deadlock
is possible.
+.It Dv EPOCH_CRITICAL
+The default non-preemptible
+.Vt epoch
+type.
.It Dv EPOCH_PREEMPT
The
.Vt epoch
@@ -135,6 +152,21 @@
and
.Fn epoch_wait ,
respectively.
+.It Dv EPOCH_SLEEPABLE
+The
+.Vt epoch
+will allow preemption and sleeping during sections.
+The functions
+.Fn epoch_enter_sleepable ,
+.Fn epoch_exit_sleepable ,
+and
+.Fn epoch_wait_sleepable
+must be used in place of
+.Fn epoch_enter ,
+.Fn epoch_exit ,
+and
+.Fn epoch_wait ,
+respectively.
.El
.Pp
.Vt epoch Ns s
@@ -142,23 +174,26 @@
.Fn epoch_free .
.Pp
Threads indicate the start of an epoch critical section by calling
-.Fn epoch_enter
-(or
+.Fn epoch_enter ,
.Fn epoch_enter_preempt
-for preemptible epochs).
+for preemptible epochs or
+.Fn epoch_enter_sleepable
+for sleepable epochs.
Threads call
-.Fn epoch_exit
-(or
+.Fn epoch_exit ,
.Fn epoch_exit_preempt
-for preemptible epochs)
-to indicate the end of a critical section.
+for preemptible epochs or
+.Fn epoch_exit_sleepable
+for sleepable epochs, to indicate the end of a critical section.
.Vt struct epoch_tracker Ns s
are stack objects whose pointers are passed to
-.Fn epoch_enter_preempt
+.Fn epoch_enter_preempt ,
+.Fn epoch_exit_preempt ,
+.Fn epoch_enter_sleepable
and
-.Fn epoch_exit_preempt
-(much like
-.Vt struct rm_priotracker ) .
+.Fn epoch_exit_sleepable ,
+much like the
+.Vt struct rm_priotracker .
.Pp
Threads can defer work until a grace period has expired since any thread has
entered the epoch either synchronously or asynchronously.
@@ -166,19 +201,25 @@
defers work asynchronously by invoking the provided
.Fa callback
at a later time.
-.Fn epoch_wait
-(or
-.Fn epoch_wait_preempt )
+.Fn epoch_wait ,
+.Fn epoch_wait_preempt
+or
+.Fn epoch_wait_sleepable
blocks the current thread until the grace period has expired and the work can be
done safely.
.Pp
-Default, non-preemptible epoch wait
-.Fn ( epoch_wait )
+Default, non-preemptible epoch wait,
+.Fn epoch_wait ,
+is guaranteed to have much shorter completion times relative to
+preemptible epoch wait,
+.Fn epoch_wait_preempt .
+In turn, the preemptible epoch wait,
+.Fn epoch_wait_preempt ,
is guaranteed to have much shorter completion times relative to
-preemptible epoch wait
-.Fn ( epoch_wait_preempt ) .
-(In the default type, none of the threads in an epoch section will be preempted
-before completing its section.)
+sleepable epoch wait,
+.Fn epoch_wait_sleepable .
+In the default type, none of the threads in an epoch section will be preempted
+before completing its section.
.Pp
INVARIANTS can assert that a thread is in an epoch by using
.Fn in_epoch .
@@ -191,9 +232,11 @@
.Fn in_epoch_verbose "epoch" "1"
provides additional verbose debugging information.
.Pp
-The epoch API currently does not support sleeping in epoch_preempt sections.
A caller should never call
-.Fn epoch_wait
+.Fn epoch_wait ,
+.Fn epoch_wait_preempt
+or
+.Fn epoch_wait_sleepable
in the middle of an epoch section for the same epoch as this will lead to a deadlock.
.Pp
The
@@ -282,7 +325,9 @@
.Fx 11.0 .
.Sh CAVEATS
One must be cautious when using
-.Fn epoch_wait_preempt .
+.Fn epoch_wait_preempt
+and
+.Fn epoch_wait_sleepable .
Threads are pinned during epoch sections, so if a thread in a section is then
preempted by a higher priority compute bound thread on that CPU, it can be
prevented from leaving the section indefinitely.
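
As a rough illustration of the epoch.9 text above, a minimal consumer of the new sleepable interface might look like the sketch below. This is not part of the change: the epoch name and helper functions are invented for the example, and only the epoch(9) calls themselves come from this patch.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/epoch.h>

static epoch_t example_epoch;	/* hypothetical */

static void
example_init(void)
{
	/* A sleepable epoch: sections may be preempted and may sleep. */
	example_epoch = epoch_alloc("example sleepable", EPOCH_SLEEPABLE);
}

static void
example_reader(void)
{
	struct epoch_tracker et;	/* must live on the caller's stack */

	epoch_enter_sleepable(example_epoch, &et);
	/* ... read epoch-protected data; sleeping here is permitted ... */
	epoch_exit_sleepable(example_epoch, &et);
}

static void
example_writer(void)
{
	/* ... unlink an element from the shared structure ... */

	/* Block until every section entered before this point has exited. */
	epoch_wait_sleepable(example_epoch);

	/* ... the unlinked element may now be freed safely ... */
}

Note that, as the CAVEATS section warns, sleepable sections still pin the thread to its CPU, so a reader that sleeps for a long time inside a section will correspondingly delay epoch_wait_sleepable() on the writer side.
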
diff --git a/sys/kern/subr_epoch.c b/sys/kern/subr_epoch.c
--- a/sys/kern/subr_epoch.c
+++ b/sys/kern/subr_epoch.c
@@ -2,6 +2,7 @@
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ * Copyright (c) 2017-2021, Hans Petter Selasky <hselasky@freebsd.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -138,6 +139,7 @@
static __read_mostly int inited;
__read_mostly epoch_t global_epoch;
__read_mostly epoch_t global_epoch_preempt;
+__read_mostly epoch_t global_epoch_sleepable;
static void epoch_call_task(void *context __unused);
static uma_zone_t pcpu_zone_record;
@@ -291,8 +293,9 @@
#endif
sx_init(&epoch_sx, "epoch-sx");
inited = 1;
- global_epoch = epoch_alloc("Global", 0);
+ global_epoch = epoch_alloc("Global critical", EPOCH_CRITICAL);
global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT);
+ global_epoch_sleepable = epoch_alloc("Global sleepable", EPOCH_SLEEPABLE);
}
SYSINIT(epoch, SI_SUB_EPOCH, SI_ORDER_FIRST, epoch_init, NULL);
@@ -338,6 +341,7 @@
int i;
MPASS(name != NULL);
+ MPASS((flags & EPOCH_TYPE_MASK) != EPOCH_RESERVED);
if (__predict_false(!inited))
panic("%s called too early in boot", __func__);
@@ -446,9 +450,8 @@
MPASS((vm_offset_t)et >= td->td_kstack &&
(vm_offset_t)et + sizeof(struct epoch_tracker) <=
td->td_kstack + td->td_kstack_pages * PAGE_SIZE);
-
INIT_CHECK(epoch);
- MPASS(epoch->e_flags & EPOCH_PREEMPT);
+ MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_PREEMPT);
#ifdef EPOCH_TRACE
epoch_trace_enter(td, epoch, et, file, line);
@@ -466,6 +469,37 @@
critical_exit();
}
+void
+_epoch_enter_sleepable(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
+{
+ struct epoch_record *er;
+ struct thread *td;
+
+ MPASS(cold || epoch != NULL);
+ td = curthread;
+ MPASS((vm_offset_t)et >= td->td_kstack &&
+ (vm_offset_t)et + sizeof(struct epoch_tracker) <=
+ td->td_kstack + td->td_kstack_pages * PAGE_SIZE);
+
+ INIT_CHECK(epoch);
+ MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_SLEEPABLE);
+
+#ifdef EPOCH_TRACE
+ epoch_trace_enter(td, epoch, et, file, line);
+#endif
+ et->et_td = td;
+ et->et_old_priority = 0; /* not used */
+
+ critical_enter();
+ sched_pin();
+ er = epoch_currecord(epoch);
+ /* Record-level tracking is reserved for non-preemptible epochs. */
+ MPASS(er->er_td == NULL);
+ TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
+ ck_epoch_begin(&er->er_record, &et->et_section);
+ critical_exit();
+}
+
void
epoch_enter(epoch_t epoch)
{
@@ -499,7 +533,7 @@
sched_unpin();
THREAD_SLEEPING_OK();
er = epoch_currecord(epoch);
- MPASS(epoch->e_flags & EPOCH_PREEMPT);
+ MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_PREEMPT);
MPASS(et != NULL);
MPASS(et->et_td == td);
#ifdef INVARIANTS
@@ -518,6 +552,35 @@
#endif
}
+void
+_epoch_exit_sleepable(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
+{
+ struct epoch_record *er;
+ struct thread *td;
+
+ INIT_CHECK(epoch);
+ td = curthread;
+ critical_enter();
+ sched_unpin();
+ er = epoch_currecord(epoch);
+ MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_SLEEPABLE);
+ MPASS(et != NULL);
+ MPASS(et->et_td == td);
+#ifdef INVARIANTS
+ et->et_td = (void*)0xDEADBEEF;
+ MPASS(et->et_old_priority == 0);
+ /* Record-level tracking is reserved for non-preemptible epochs. */
+ MPASS(er->er_td == NULL);
+#endif
+ ck_epoch_end(&er->er_record, &et->et_section);
+ TAILQ_REMOVE(&er->er_tdlist, et, et_link);
+ er->er_gen++;
+ critical_exit();
+#ifdef EPOCH_TRACE
+ epoch_trace_exit(td, epoch, et, file, line);
+#endif
+}
+
void
epoch_exit(epoch_t epoch)
{
@@ -691,7 +754,7 @@
td = curthread;
#ifdef INVARIANTS
locks = curthread->td_locks;
- MPASS(epoch->e_flags & EPOCH_PREEMPT);
+ MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_PREEMPT);
if ((epoch->e_flags & EPOCH_LOCKED) == 0)
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"epoch_wait() can be long running");
@@ -732,6 +795,133 @@
("%d residual locks held", td->td_locks - locks));
}
+/*
+ * epoch_block_handler_sleepable() is a callback from the CK code when another
+ * thread is currently in an epoch section.
+ */
+static void
+epoch_block_handler_sleepable(struct ck_epoch *global __unused,
+ ck_epoch_record_t *cr, void *arg __unused)
+{
+ epoch_record_t record;
+ struct thread *td;
+ struct epoch_tracker *tdwait;
+
+ record = __containerof(cr, struct epoch_record, er_record);
+ td = curthread;
+ counter_u64_add(block_count, 1);
+
+ /*
+ * We lost a race and there are no longer any threads
+ * on the CPU in an epoch section.
+ */
+ if (TAILQ_EMPTY(&record->er_tdlist))
+ return;
+
+ if (record->er_cpuid == curcpu) {
+ bool is_sleeping = 0;
+ uint8_t prio = 0;
+
+ /*
+ * Find the lowest priority or sleeping thread which
+ * is blocking synchronization on this CPU core. All
+ * the threads in the queue are CPU-pinned and cannot
+ * go anywhere while the current thread is locked.
+ */
+ TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) {
+ if (tdwait->et_td->td_priority > prio)
+ prio = tdwait->et_td->td_priority;
+ is_sleeping |= (tdwait->et_td->td_inhibitors != 0);
+ }
+
+ if (is_sleeping) {
+ /*
+ * Wait one tick. Performance is not critical
+ * for sleepable EPOCHs.
+ */
+ thread_unlock(td);
+ pause("W", 1);
+ thread_lock(td);
+ } else {
+ /* set new thread priority */
+ sched_prio(td, prio);
+ /* task switch */
+ mi_switch(SW_VOL | SWT_RELINQUISH);
+ /*
+ * It is important that the thread lock is dropped
+ * while yielding to allow other threads to
+ * acquire the lock pointed to by
+ * TDQ_LOCKPTR(td). Currently mi_switch() will
+ * unlock the thread lock before
+ * returning. Otherwise a deadlock-like situation
+ * might happen.
+ */
+ thread_lock(td);
+ }
+ } else {
+ /*
+ * To avoid spinning, move execution to the other CPU
+ * which is blocking synchronization. Set highest
+ * thread priority so that code gets run. The thread
+ * priority will be restored later.
+ */
+ sched_prio(td, 0);
+ sched_bind(td, record->er_cpuid);
+ }
+}
+
+void
+epoch_wait_sleepable(epoch_t epoch)
+{
+ struct thread *td;
+ int was_bound;
+ int old_cpu;
+ int old_pinned;
+ u_char old_prio;
+
+ MPASS(cold || epoch != NULL);
+ INIT_CHECK(epoch);
+ td = curthread;
+#ifdef INVARIANTS
+ MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_SLEEPABLE);
+ if ((epoch->e_flags & EPOCH_LOCKED) == 0)
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "epoch_wait() can be long running");
+ KASSERT(!in_epoch(epoch), ("epoch_wait_sleepable() called in the middle "
+ "of an epoch section of the same epoch"));
+#endif
+ DROP_GIANT();
+ thread_lock(td);
+
+ old_cpu = PCPU_GET(cpuid);
+ old_pinned = td->td_pinned;
+ old_prio = td->td_priority;
+ was_bound = sched_is_bound(td);
+ sched_unbind(td);
+ td->td_pinned = 0;
+ sched_bind(td, old_cpu);
+
+ ck_epoch_synchronize_wait(&epoch->e_epoch,
+ epoch_block_handler_sleepable, NULL);
+
+ /* restore CPU binding, if any */
+ if (was_bound != 0) {
+ sched_bind(td, old_cpu);
+ } else {
+ /* get thread back to initial CPU, if any */
+ if (old_pinned != 0)
+ sched_bind(td, old_cpu);
+ sched_unbind(td);
+ }
+ /* restore pinned after bind */
+ td->td_pinned = old_pinned;
+
+ /* restore thread priority */
+ sched_prio(td, old_prio);
+ thread_unlock(td);
+ PICKUP_GIANT();
+}
+
static void
epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused,
void *arg __unused)
@@ -828,7 +1018,7 @@
struct thread *td;
MPASS(epoch != NULL);
- MPASS((epoch->e_flags & EPOCH_PREEMPT) != 0);
+ MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_PREEMPT);
td = curthread;
if (THREAD_CAN_SLEEP())
return (0);
@@ -852,6 +1042,37 @@
return (0);
}
+static int
+in_epoch_verbose_sleepable(epoch_t epoch, int dump_onfail)
+{
+ epoch_record_t er;
+ struct epoch_tracker *tdwait;
+ struct thread *td;
+
+ MPASS(epoch != NULL);
+ MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_SLEEPABLE);
+ td = curthread;
+ critical_enter();
+ er = epoch_currecord(epoch);
+ TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link) {
+ if (tdwait->et_td != td)
+ continue;
+ critical_exit();
+ return (1);
+ }
+#ifdef INVARIANTS
+ if (dump_onfail) {
+ MPASS(td->td_pinned);
+ printf("cpu: %d id: %d\n", curcpu, td->td_tid);
+ TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
+ printf("td_tid: %d ", tdwait->et_td->td_tid);
+ printf("\n");
+ }
+#endif
+ critical_exit();
+ return (0);
+}
+
#ifdef INVARIANTS
static void
epoch_assert_nocpu(epoch_t epoch, struct thread *td)
@@ -880,10 +1101,19 @@
epoch_record_t er;
struct thread *td;
- if (__predict_false((epoch) == NULL))
+ if (__predict_false(epoch == NULL))
return (0);
- if ((epoch->e_flags & EPOCH_PREEMPT) != 0)
+
+ switch (epoch->e_flags & EPOCH_TYPE_MASK) {
+ case EPOCH_CRITICAL:
+ break;
+ case EPOCH_PREEMPT:
return (in_epoch_verbose_preempt(epoch, dump_onfail));
+ case EPOCH_SLEEPABLE:
+ return (in_epoch_verbose_sleepable(epoch, dump_onfail));
+ default:
+ panic("in_epoch_verbose: Invalid EPOCH type.");
+ }
/*
* The thread being in a critical section is a necessary
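
As an aside on the in_epoch() dispatch added above: on an INVARIANTS kernel a consumer can assert that its caller is inside a section of a sleepable epoch, which now resolves through in_epoch_verbose_sleepable(). A minimal sketch, reusing the hypothetical example_epoch from the earlier note:

static void *
example_lookup(void)
{
	/*
	 * in_epoch() returns non-zero when the current thread is inside
	 * a section of the given epoch; with this change it also
	 * understands EPOCH_SLEEPABLE epochs.
	 */
	MPASS(in_epoch(example_epoch));

	/* ... walk the epoch-protected structure and return a match ... */
	return (NULL);
}
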
diff --git a/sys/sys/epoch.h b/sys/sys/epoch.h
--- a/sys/sys/epoch.h
+++ b/sys/sys/epoch.h
@@ -45,11 +45,16 @@
struct epoch;
typedef struct epoch *epoch_t;
-#define EPOCH_PREEMPT 0x1
-#define EPOCH_LOCKED 0x2
+#define EPOCH_TYPE_MASK (EPOCH_PREEMPT | EPOCH_SLEEPABLE)
+#define EPOCH_CRITICAL 0x0
+#define EPOCH_PREEMPT 0x1
+#define EPOCH_LOCKED 0x2
+#define EPOCH_SLEEPABLE 0x4
+#define EPOCH_RESERVED 0x5
extern epoch_t global_epoch;
extern epoch_t global_epoch_preempt;
+extern epoch_t global_epoch_sleepable;
struct epoch_tracker {
TAILQ_ENTRY(epoch_tracker) et_link;
@@ -69,10 +74,11 @@
void epoch_free(epoch_t epoch);
void epoch_wait(epoch_t epoch);
void epoch_wait_preempt(epoch_t epoch);
+void epoch_wait_sleepable(epoch_t epoch);
void epoch_drain_callbacks(epoch_t epoch);
void epoch_call(epoch_t epoch, epoch_callback_t cb, epoch_context_t ctx);
int in_epoch(epoch_t epoch);
-int in_epoch_verbose(epoch_t epoch, int dump_onfail);
+int in_epoch_verbose(epoch_t epoch, int dump_onfail);
DPCPU_DECLARE(int, epoch_cb_count);
DPCPU_DECLARE(struct grouptask, epoch_cb_task);
@@ -84,14 +90,25 @@
void _epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE);
void _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE);
+
+void _epoch_enter_sleepable(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE);
+void _epoch_exit_sleepable(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE);
+
#ifdef EPOCH_TRACE
void epoch_trace_list(struct thread *);
#define epoch_enter_preempt(epoch, et) _epoch_enter_preempt(epoch, et, __FILE__, __LINE__)
#define epoch_exit_preempt(epoch, et) _epoch_exit_preempt(epoch, et, __FILE__, __LINE__)
+
+#define epoch_enter_sleepable(epoch, et) _epoch_enter_sleepable(epoch, et, __FILE__, __LINE__)
+#define epoch_exit_sleepable(epoch, et) _epoch_exit_sleepable(epoch, et, __FILE__, __LINE__)
#else
#define epoch_enter_preempt(epoch, et) _epoch_enter_preempt(epoch, et)
#define epoch_exit_preempt(epoch, et) _epoch_exit_preempt(epoch, et)
+
+#define epoch_enter_sleepable(epoch, et) _epoch_enter_sleepable(epoch, et)
+#define epoch_exit_sleepable(epoch, et) _epoch_exit_sleepable(epoch, et)
#endif
+
void epoch_enter(epoch_t epoch);
void epoch_exit(epoch_t epoch);
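
For reference, a hedged sketch of how the new flag bits above compose when allocating an epoch. The names are illustrative, but the semantics (EPOCH_LOCKED requiring a preemptible or sleepable type, and EPOCH_PREEMPT | EPOCH_SLEEPABLE being the reserved combination rejected by the new MPASS() in epoch_alloc()) come from this change.

static void
example_alloc(void)
{
	epoch_t e;

	/*
	 * EPOCH_TYPE_MASK (EPOCH_PREEMPT | EPOCH_SLEEPABLE) selects the
	 * epoch type; EPOCH_CRITICAL (0x0) is the default non-preemptible
	 * type.  EPOCH_LOCKED may be combined with EPOCH_PREEMPT or
	 * EPOCH_SLEEPABLE to permit holding mutexes across the
	 * corresponding wait function.
	 */
	e = epoch_alloc("example locked sleepable",
	    EPOCH_SLEEPABLE | EPOCH_LOCKED);

	/*
	 * EPOCH_PREEMPT | EPOCH_SLEEPABLE equals EPOCH_RESERVED (0x5);
	 * epoch_alloc() now asserts that the type bits never take this
	 * value, so the two type flags must not be combined.
	 */

	epoch_free(e);
}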
