D41452.id125990.diff

diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -425,13 +425,35 @@
TSENTER();
+ /*
+ * Make sure that all the ISRs are done before proceeding with the reset
+ * (and also keep any stray interrupts that happen during this process
+ * from racing it). For startup, this is a nop, since the hardware is
+ * in a good state. But for recovery, where we reset the hardware at an
+ * arbitrary time, this ensures that we're not racing the ISRs.
+ */
+ mtx_lock_spin(&ctrlr->adminq.isr_spin);
+ for (int i = 0; i < ctrlr->num_io_queues; i++) {
+ mtx_lock_spin(&ctrlr->ioq[i].isr_spin);
+ }
+
nvme_ctrlr_disable_qpairs(ctrlr);
err = nvme_ctrlr_disable(ctrlr);
if (err != 0)
- return err;
+ goto out;
err = nvme_ctrlr_enable(ctrlr);
+out:
+
+ /*
+ * Reset complete, unblock ISRs
+ */
+ mtx_unlock_spin(&ctrlr->adminq.isr_spin);
+ for (int i = 0; i < ctrlr->num_io_queues; i++) {
+ mtx_unlock_spin(&ctrlr->ioq[i].isr_spin);
+ }
+
TSEXIT();
return (err);
}
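
The interlock above relies on a simple invariant: once the reset path holds the isr_spin of every queue, no ISR can be midway through its completion loop, and any interrupt that fires afterwards will fail its trylock and bail. A minimal sketch of the reset-side pattern, using simplified stand-in structs rather than the driver's real nvme_controller and nvme_qpair:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>

/* Simplified stand-ins for the driver's controller and qpair. */
struct qpair_stub {
	struct mtx isr_spin;
};

struct ctrlr_stub {
	struct qpair_stub adminq;
	struct qpair_stub *ioq;
	int num_io_queues;
};

/*
 * Quiesce: once every isr_spin is held, no ISR is running its
 * completion loop and none can start until the locks are dropped.
 */
static void
ctrlr_quiesce_isrs(struct ctrlr_stub *c)
{
	mtx_lock_spin(&c->adminq.isr_spin);
	for (int i = 0; i < c->num_io_queues; i++)
		mtx_lock_spin(&c->ioq[i].isr_spin);
}

static void
ctrlr_release_isrs(struct ctrlr_stub *c)
{
	mtx_unlock_spin(&c->adminq.isr_spin);
	for (int i = 0; i < c->num_io_queues; i++)
		mtx_unlock_spin(&c->ioq[i].isr_spin);
}

The reset itself (the disable/enable sequence) then runs between the two calls, exactly as the hunk above does inline.
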
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -184,6 +184,7 @@
int64_t num_retries;
int64_t num_failures;
int64_t num_ignored;
+ int64_t num_spinlock_fail;
struct nvme_command *cmd;
struct nvme_completion *cpl;
@@ -201,8 +202,9 @@
struct nvme_tracker **act_tr;
- struct mtx lock __aligned(CACHE_LINE_SIZE);
+ struct mtx isr_spin __aligned(CACHE_LINE_SIZE);
+ struct mtx lock __aligned(CACHE_LINE_SIZE);
} __aligned(CACHE_LINE_SIZE);
struct nvme_namespace {
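
On the layout change: giving isr_spin its own __aligned(CACHE_LINE_SIZE), ahead of the existing qpair lock, keeps the ISR-side spin lock and the submission-side lock on separate cache lines, so the two hot paths do not false-share. A reduced illustration (hypothetical struct name, trimmed to just the two locks):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>

struct qpair_locks_sketch {
	struct mtx isr_spin __aligned(CACHE_LINE_SIZE);	/* ISR side */
	struct mtx lock __aligned(CACHE_LINE_SIZE);	/* submission side */
} __aligned(CACHE_LINE_SIZE);

/* The per-member alignment guarantees the locks never share a line. */
CTASSERT(__offsetof(struct qpair_locks_sketch, lock) >= CACHE_LINE_SIZE);
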
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -538,6 +538,17 @@
int done = 0;
bool in_panic = dumping || SCHEDULER_STOPPED();
+ /*
+ * Interlock with reset / recovery code. This is a usually uncontended
+ * spinlock to make sure that we drain out of the ISRs before we reset
+ * the card and to prevent races with the recovery process called from
+ * a timeout context.
+ */
+ if (!mtx_trylock_spin(&qpair->isr_spin)) {
+ qpair->num_spinlock_fail++;
+ return (false);
+ }
+
/*
* qpair is not enabled, likely because a controller reset is in
* progress. Ignore the interrupt - any I/O that was associated with
@@ -548,6 +559,7 @@
*/
if (qpair->recovery_state != RECOVERY_NONE) {
qpair->num_ignored++;
+ mtx_unlock_spin(&qpair->isr_spin);
return (false);
}
@@ -673,6 +685,7 @@
qpair->cq_hdbl_off, qpair->cq_head);
}
+ mtx_unlock_spin(&qpair->isr_spin);
return (done != 0);
}
@@ -1007,10 +1020,14 @@
/*
* Next, check to see if we have any completions. If we do,
* we've likely missed an interrupt, but the card is otherwise
- * fine. This will also catch all the commands that are about
- * to timeout (but there's still a tiny race). Since the timeout
- * is long relative to the race between here and the check below,
+ * fine. This will also catch all the commands that are about to
+ * time out (but there's still a tiny race). Since the timeout is
+ * long relative to the race between here and the check below,
* this is still a win.
+ *
+ * It will also catch misconfigured interrupts. In addition, the
+ * isr_spin lock taken out in nvme_qpair_process_completions
+ * protects against races here, no matter how small.
*/
mtx_unlock(&qpair->lock);
nvme_qpair_process_completions(qpair);
@@ -1019,7 +1036,8 @@
/*
* Somebody else adjusted recovery state while unlocked,
* we should bail. Unlock the qpair and return without
- * doing anything else.
+ * doing anything else. That somebody else has stepped
+ * the state machine into the next stage.
*/
mtx_unlock(&qpair->lock);
return;
@@ -1032,6 +1050,10 @@
now = getsbinuptime();
needs_reset = false;
TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
+ /*
+ * Skip async commands, they are posted to the card for
+ * an indefinite amount of time and have no deadline.
+ */
if (tr->deadline == SBT_MAX)
continue;
if (now > tr->deadline) {
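
The ISR-side half of the interlock is the mtx_trylock_spin(9) at the top of nvme_qpair_process_completions above: the ISR never blocks, and if reset/recovery owns the lock the interrupt is simply counted and dropped, leaving the queue to be drained once the reset path releases the lock. A self-contained sketch of that shape (stub type and field names are illustrative, not the driver's):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>

struct qpair_isr_stub {
	struct mtx isr_spin;
	int64_t num_spinlock_fail;
};

static bool
qpair_isr_sketch(struct qpair_isr_stub *q)
{
	/* Never spin in the ISR: a held lock means reset is in progress. */
	if (!mtx_trylock_spin(&q->isr_spin)) {
		q->num_spinlock_fail++;	/* surfaced via the sysctls below */
		return (false);
	}
	/* ... normal completion-queue processing would run here ... */
	mtx_unlock_spin(&q->isr_spin);
	return (true);
}
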
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
--- a/sys/dev/nvme/nvme_sysctl.c
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -165,6 +165,7 @@
qpair->num_retries = 0;
qpair->num_failures = 0;
qpair->num_ignored = 0;
+ qpair->num_spinlock_fail = 0;
}
static int
@@ -242,6 +243,21 @@
return (sysctl_handle_64(oidp, &num_ignored, 0, req));
}
+static int
+nvme_sysctl_num_spinlock_fail(SYSCTL_HANDLER_ARGS)
+{
+ struct nvme_controller *ctrlr = arg1;
+ int64_t num_spinfail = 0;
+ int i;
+
+ num_spinfail = ctrlr->adminq.num_spinlock_fail;
+
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num_spinfail += ctrlr->ioq[i].num_spinlock_fail;
+
+ return (sysctl_handle_64(oidp, &num_spinfail, 0, req));
+}
+
static int
nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
{
@@ -300,6 +316,9 @@
SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_ignored",
CTLFLAG_RD, &qpair->num_ignored,
"Number of interrupts posted, but were administratively ignored");
+ SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_spinlock_fail",
+ CTLFLAG_RD, &qpair->num_spinlock_fail,
+ "Number of times that we can't get a spin lock in the ISR");
SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
"dump_debug", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
@@ -368,6 +387,11 @@
ctrlr, 0, nvme_sysctl_num_ignored, "IU",
"Number of interrupts ignored administratively");
+ SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
+ "num_spinlock_fail", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ctrlr, 0, nvme_sysctl_num_spinlock_fail, "IU",
+ "Number of times that we can't get a spin lock in the ISR");
+
SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
"reset_stats", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, ctrlr,
0, nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");
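
With the diff applied, the new counter sits alongside the existing per-controller and per-queue statistics. A hypothetical userland reader, assuming the aggregate OID follows the same dev.nvme.<unit> path as the existing counters such as num_ignored:

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t fails;
	size_t len = sizeof(fails);

	/* Aggregate across the admin queue and all I/O queues. */
	if (sysctlbyname("dev.nvme.0.num_spinlock_fail", &fails, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("ISR spin lock misses: %jd\n", (intmax_t)fails);
	return (0);
}

A nonzero value is expected only around controller resets; a steadily climbing one would suggest the recovery path is being entered repeatedly.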
