Page MenuHomeFreeBSD

D41452.diff
No OneTemporary

D41452.diff

diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -416,6 +416,34 @@
}
}
+static void
+nvme_pre_reset(struct nvme_controller *ctrlr)
+{
+ /*
+ * Make sure that all the ISRs are done before proceeding with the reset
+ * (and also keep any stray interrupts that happen during this process
+ * from racing this process). For startup, this is a nop, since the
+ * hardware is in a good state. But for recovery, where we randomly
+ * reset the hardware, this ensures that we're not racing the ISRs.
+ */
+ mtx_lock(&ctrlr->adminq.recovery);
+ for (int i = 0; i < ctrlr->num_io_queues; i++) {
+ mtx_lock(&ctrlr->ioq[i].recovery);
+ }
+}
+
+static void
+nvme_post_reset(struct nvme_controller *ctrlr)
+{
+ /*
+ * Reset complete, unblock ISRs
+ */
+ mtx_unlock(&ctrlr->adminq.recovery);
+ for (int i = 0; i < ctrlr->num_io_queues; i++) {
+ mtx_unlock(&ctrlr->ioq[i].recovery);
+ }
+}
+
static int
nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
{
@@ -427,9 +455,11 @@
err = nvme_ctrlr_disable(ctrlr);
if (err != 0)
- return err;
+ goto out;
err = nvme_ctrlr_enable(ctrlr);
+out:
+
TSEXIT();
return (err);
}
@@ -1157,6 +1187,11 @@
TSENTER();
+ /*
+ * Don't call pre/post reset here. We've not yet created the qpairs,
+ * haven't set up the ISRs, so there's no need to 'drain' them or
+ * 'exclude' them.
+ */
if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
fail:
nvme_ctrlr_fail(ctrlr);
@@ -1201,16 +1236,9 @@
int status;
nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
+ nvme_pre_reset(ctrlr);
status = nvme_ctrlr_hw_reset(ctrlr);
- /*
- * Use pause instead of DELAY, so that we yield to any nvme interrupt
- * handlers on this CPU that were blocked on a qpair lock. We want
- * all nvme interrupts completed before proceeding with restarting the
- * controller.
- *
- * XXX - any way to guarantee the interrupt handlers have quiesced?
- */
- pause("nvmereset", hz / 10);
+ nvme_post_reset(ctrlr);
if (status == 0)
nvme_ctrlr_start(ctrlr, true);
else
@@ -1697,6 +1725,7 @@
if (ctrlr->is_failed)
return (0);
+ nvme_pre_reset(ctrlr);
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
#ifdef NVME_2X_RESET
@@ -1708,6 +1737,7 @@
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
#endif
+ nvme_post_reset(ctrlr);
/*
* Now that we've reset the hardware, we can restart the controller. Any
@@ -1724,6 +1754,7 @@
* the controller. However, we have to return success for the resume
* itself, due to questionable APIs.
*/
+ nvme_post_reset(ctrlr);
nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
nvme_ctrlr_fail(ctrlr);
(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -162,10 +162,9 @@
struct resource *res;
void *tag;
- struct callout timer;
- sbintime_t deadline;
- bool timer_armed;
- enum nvme_recovery recovery_state;
+ struct callout timer; /* recovery lock */
+ bool timer_armed; /* recovery lock */
+ enum nvme_recovery recovery_state; /* recovery lock */
uint32_t num_entries;
uint32_t num_trackers;
@@ -182,6 +181,7 @@
int64_t num_retries;
int64_t num_failures;
int64_t num_ignored;
+ int64_t num_recovery_nolock;
struct nvme_command *cmd;
struct nvme_completion *cpl;
@@ -200,7 +200,7 @@
struct nvme_tracker **act_tr;
struct mtx_padalign lock;
-
+ struct mtx_padalign recovery;
} __aligned(CACHE_LINE_SIZE);
struct nvme_namespace {
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -528,14 +528,17 @@
nvme_free_request(req);
}
-bool
-nvme_qpair_process_completions(struct nvme_qpair *qpair)
+/* Locked version of completion processor */
+static bool
+_nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
struct nvme_tracker *tr;
struct nvme_completion cpl;
- int done = 0;
+ bool done = false;
bool in_panic = dumping || SCHEDULER_STOPPED();
+ mtx_assert(&qpair->recovery, MA_OWNED);
+
/*
* qpair is not enabled, likely because a controller reset is in
* progress. Ignore the interrupt - any I/O that was associated with
@@ -629,7 +632,7 @@
else
tr = NULL;
- done++;
+ done = true;
if (tr != NULL) {
nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
qpair->sq_head = cpl.sqhd;
@@ -666,12 +669,35 @@
}
}
- if (done != 0) {
+ if (done) {
bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
qpair->cq_hdbl_off, qpair->cq_head);
}
- return (done != 0);
+ return (done);
+}
+
+bool
+nvme_qpair_process_completions(struct nvme_qpair *qpair)
+{
+ bool done;
+
+ /*
+ * Interlock with reset / recovery code. This lock is usually uncontended.
+ * It makes sure that we drain out of the ISRs before we reset the card
+ * and prevents races with the recovery process called from a timeout
+ * context.
+ */
+ if (!mtx_trylock(&qpair->recovery)) {
+ qpair->num_recovery_nolock++;
+ return (false);
+ }
+
+ done = _nvme_qpair_process_completions(qpair);
+
+ mtx_unlock(&qpair->recovery);
+
+ return (done);
}
static void
@@ -699,6 +725,7 @@
qpair->ctrlr = ctrlr;
mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
+ mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);
/* Note: NVMe PRP format is restricted to 4-byte alignment. */
err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
@@ -765,7 +792,7 @@
qpair->cpl_bus_addr = queuemem_phys + cmdsz;
prpmem_phys = queuemem_phys + cmdsz + cplsz;
- callout_init(&qpair->timer, 1);
+ callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
qpair->timer_armed = false;
qpair->recovery_state = RECOVERY_WAITING;
@@ -903,6 +930,8 @@
if (mtx_initialized(&qpair->lock))
mtx_destroy(&qpair->lock);
+ if (mtx_initialized(&qpair->recovery))
+ mtx_destroy(&qpair->recovery);
if (qpair->res) {
bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
@@ -975,14 +1004,12 @@
struct nvme_controller *ctrlr = qpair->ctrlr;
struct nvme_tracker *tr;
sbintime_t now;
- bool idle;
+ bool idle = false;
bool needs_reset;
uint32_t csts;
uint8_t cfs;
-
- mtx_lock(&qpair->lock);
- idle = TAILQ_EMPTY(&qpair->outstanding_tr);
+ mtx_assert(&qpair->recovery, MA_OWNED);
switch (qpair->recovery_state) {
case RECOVERY_NONE:
@@ -1003,25 +1030,10 @@
goto do_reset;
/*
- * Next, check to see if we have any completions. If we do,
- * we've likely missed an interrupt, but the card is otherwise
- * fine. This will also catch all the commands that are about
- * to timeout (but there's still a tiny race). Since the timeout
- * is long relative to the race between here and the check below,
- * this is still a win.
+ * Process completions. We already have the recovery lock, so
+ * call the locked version.
*/
- mtx_unlock(&qpair->lock);
- nvme_qpair_process_completions(qpair);
- mtx_lock(&qpair->lock);
- if (qpair->recovery_state != RECOVERY_NONE) {
- /*
- * Somebody else adjusted recovery state while unlocked,
- * we should bail. Unlock the qpair and return without
- * doing anything else.
- */
- mtx_unlock(&qpair->lock);
- return;
- }
+ _nvme_qpair_process_completions(qpair);
/*
* Check to see if we need to timeout any commands. If we do, then
@@ -1029,7 +1041,13 @@
*/
now = getsbinuptime();
needs_reset = false;
+ idle = true;
+ mtx_lock(&qpair->lock);
TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
+ /*
+ * Skip async commands, they are posted to the card for
+ * an indefinite amount of time and have no deadline.
+ */
if (tr->deadline == SBT_MAX)
continue;
if (now > tr->deadline) {
@@ -1055,6 +1073,7 @@
idle = false;
}
}
+ mtx_unlock(&qpair->lock);
if (!needs_reset)
break;
@@ -1076,6 +1095,7 @@
break;
case RECOVERY_WAITING:
nvme_printf(ctrlr, "waiting for reset to complete\n");
+ idle = false; /* We want to keep polling */
break;
}
@@ -1087,7 +1107,6 @@
} else {
qpair->timer_armed = false;
}
- mtx_unlock(&qpair->lock);
}
/*
@@ -1196,9 +1215,12 @@
if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
/*
- * No tracker is available, or the qpair is disabled due to
- * an in-progress controller-level reset or controller
- * failure.
+ * No tracker is available, or the qpair is disabled due to an
+ * in-progress controller-level reset or controller failure. If
+ * we lose the race with recovery_state, then we may add an
+ * extra request to the queue which will be resubmitted later.
+ * We only set recovery_state to NONE with qpair->lock also
+ * held.
*/
if (qpair->ctrlr->is_failed) {
@@ -1260,7 +1282,10 @@
static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
- mtx_assert(&qpair->lock, MA_OWNED);
+ if (mtx_initialized(&qpair->recovery))
+ mtx_assert(&qpair->recovery, MA_OWNED);
+ if (mtx_initialized(&qpair->lock))
+ mtx_assert(&qpair->lock, MA_OWNED);
qpair->recovery_state = RECOVERY_NONE;
}
@@ -1311,9 +1336,11 @@
nvme_printf(qpair->ctrlr,
"done aborting outstanding admin\n");
+ mtx_lock(&qpair->recovery);
mtx_lock(&qpair->lock);
nvme_qpair_enable(qpair);
mtx_unlock(&qpair->lock);
+ mtx_unlock(&qpair->recovery);
}
void
@@ -1340,8 +1367,8 @@
if (report)
nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");
+ mtx_lock(&qpair->recovery);
mtx_lock(&qpair->lock);
-
nvme_qpair_enable(qpair);
STAILQ_INIT(&temp);
@@ -1360,6 +1387,7 @@
nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");
mtx_unlock(&qpair->lock);
+ mtx_unlock(&qpair->recovery);
}
static void
@@ -1367,27 +1395,40 @@
{
struct nvme_tracker *tr, *tr_temp;
- mtx_lock(&qpair->lock);
+ if (mtx_initialized(&qpair->recovery))
+ mtx_assert(&qpair->recovery, MA_OWNED);
+ if (mtx_initialized(&qpair->lock))
+ mtx_assert(&qpair->lock, MA_OWNED);
+
qpair->recovery_state = RECOVERY_WAITING;
TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
tr->deadline = SBT_MAX;
}
- mtx_unlock(&qpair->lock);
}
void
nvme_admin_qpair_disable(struct nvme_qpair *qpair)
{
+ mtx_lock(&qpair->recovery);
+ mtx_lock(&qpair->lock);
nvme_qpair_disable(qpair);
nvme_admin_qpair_abort_aers(qpair);
+
+ mtx_unlock(&qpair->lock);
+ mtx_unlock(&qpair->recovery);
}
void
nvme_io_qpair_disable(struct nvme_qpair *qpair)
{
+ mtx_lock(&qpair->recovery);
+ mtx_lock(&qpair->lock);
nvme_qpair_disable(qpair);
+
+ mtx_unlock(&qpair->lock);
+ mtx_unlock(&qpair->recovery);
}
void
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
--- a/sys/dev/nvme/nvme_sysctl.c
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -163,6 +163,7 @@
qpair->num_retries = 0;
qpair->num_failures = 0;
qpair->num_ignored = 0;
+ qpair->num_recovery_nolock = 0;
}
static int
@@ -240,6 +241,21 @@
return (sysctl_handle_64(oidp, &num_ignored, 0, req));
}
+static int
+nvme_sysctl_num_recovery_nolock(SYSCTL_HANDLER_ARGS)
+{
+ struct nvme_controller *ctrlr = arg1;
+ int64_t num;
+ int i;
+
+ num = ctrlr->adminq.num_recovery_nolock;
+
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num += ctrlr->ioq[i].num_recovery_nolock;
+
+ return (sysctl_handle_64(oidp, &num, 0, req));
+}
+
static int
nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
{
@@ -298,6 +314,9 @@
SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_ignored",
CTLFLAG_RD, &qpair->num_ignored,
"Number of interrupts posted, but were administratively ignored");
+ SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_recovery_nolock",
+ CTLFLAG_RD, &qpair->num_recovery_nolock,
+ "Number of times that we failed to lock recovery in the ISR");
SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
"dump_debug", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
@@ -366,6 +385,11 @@
ctrlr, 0, nvme_sysctl_num_ignored, "IU",
"Number of interrupts ignored administratively");
+ SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
+ "num_recovery_nolock", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ctrlr, 0, nvme_sysctl_num_recovery_nolock, "IU",
+ "Number of times that we failed to lock recovery in the ISR");
+
SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
"reset_stats", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, ctrlr,
0, nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");

File Metadata

Mime Type
text/plain
Expires
Thu, Nov 7, 7:36 PM (22 h, 9 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14519398
Default Alt Text
D41452.diff (12 KB)

Event Timeline