Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F102036159
D41452.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
12 KB
Referenced Files
None
Subscribers
None
D41452.diff
View Options
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -416,6 +416,34 @@
}
}
+static void
+nvme_pre_reset(struct nvme_controller *ctrlr)
+{
+ /*
+ * Make sure that all the ISRs are done before proceeding with the reset
+ * (and also keep any stray interrupts that happen during this process
+ * from racing this process). For startup, this is a nop, since the
+ * hardware is in a good state. But for recovery, where we randomly
+ * reset the hardware, this ensure that we're not racing the ISRs.
+ */
+ mtx_lock(&ctrlr->adminq.recovery);
+ for (int i = 0; i < ctrlr->num_io_queues; i++) {
+ mtx_lock(&ctrlr->ioq[i].recovery);
+ }
+}
+
+static void
+nvme_post_reset(struct nvme_controller *ctrlr)
+{
+ /*
+ * Reset complete, unblock ISRs
+ */
+ mtx_unlock(&ctrlr->adminq.recovery);
+ for (int i = 0; i < ctrlr->num_io_queues; i++) {
+ mtx_unlock(&ctrlr->ioq[i].recovery);
+ }
+}
+
static int
nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
{
@@ -427,9 +455,11 @@
err = nvme_ctrlr_disable(ctrlr);
if (err != 0)
- return err;
+ goto out;
err = nvme_ctrlr_enable(ctrlr);
+out:
+
TSEXIT();
return (err);
}
@@ -1157,6 +1187,11 @@
TSENTER();
+ /*
+ * Don't call pre/post reset here. We've not yet created the qpairs,
+ * haven't setup the ISRs, so there's no need to 'drain' them or
+ * 'exclude' them.
+ */
if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
fail:
nvme_ctrlr_fail(ctrlr);
@@ -1201,16 +1236,9 @@
int status;
nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
+ nvme_pre_reset(ctrlr);
status = nvme_ctrlr_hw_reset(ctrlr);
- /*
- * Use pause instead of DELAY, so that we yield to any nvme interrupt
- * handlers on this CPU that were blocked on a qpair lock. We want
- * all nvme interrupts completed before proceeding with restarting the
- * controller.
- *
- * XXX - any way to guarantee the interrupt handlers have quiesced?
- */
- pause("nvmereset", hz / 10);
+ nvme_post_reset(ctrlr);
if (status == 0)
nvme_ctrlr_start(ctrlr, true);
else
@@ -1697,6 +1725,7 @@
if (ctrlr->is_failed)
return (0);
+ nvme_pre_reset(ctrlr);
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
#ifdef NVME_2X_RESET
@@ -1708,6 +1737,7 @@
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
#endif
+ nvme_post_reset(ctrlr);
/*
* Now that we've reset the hardware, we can restart the controller. Any
@@ -1724,6 +1754,7 @@
* the controller. However, we have to return success for the resume
* itself, due to questionable APIs.
*/
+ nvme_post_reset(ctrlr);
nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
nvme_ctrlr_fail(ctrlr);
(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -162,10 +162,9 @@
struct resource *res;
void *tag;
- struct callout timer;
- sbintime_t deadline;
- bool timer_armed;
- enum nvme_recovery recovery_state;
+ struct callout timer; /* recovery lock */
+ bool timer_armed; /* recovery lock */
+ enum nvme_recovery recovery_state; /* recovery lock */
uint32_t num_entries;
uint32_t num_trackers;
@@ -182,6 +181,7 @@
int64_t num_retries;
int64_t num_failures;
int64_t num_ignored;
+ int64_t num_recovery_nolock;
struct nvme_command *cmd;
struct nvme_completion *cpl;
@@ -200,7 +200,7 @@
struct nvme_tracker **act_tr;
struct mtx_padalign lock;
-
+ struct mtx_padalign recovery;
} __aligned(CACHE_LINE_SIZE);
struct nvme_namespace {
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -528,14 +528,17 @@
nvme_free_request(req);
}
-bool
-nvme_qpair_process_completions(struct nvme_qpair *qpair)
+/* Locked version of completion processor */
+static bool
+_nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
struct nvme_tracker *tr;
struct nvme_completion cpl;
- int done = 0;
+ bool done = false;
bool in_panic = dumping || SCHEDULER_STOPPED();
+ mtx_assert(&qpair->recovery, MA_OWNED);
+
/*
* qpair is not enabled, likely because a controller reset is in
* progress. Ignore the interrupt - any I/O that was associated with
@@ -629,7 +632,7 @@
else
tr = NULL;
- done++;
+ done = true;
if (tr != NULL) {
nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
qpair->sq_head = cpl.sqhd;
@@ -666,12 +669,35 @@
}
}
- if (done != 0) {
+ if (done) {
bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
qpair->cq_hdbl_off, qpair->cq_head);
}
- return (done != 0);
+ return (done);
+}
+
+bool
+nvme_qpair_process_completions(struct nvme_qpair *qpair)
+{
+ bool done;
+
+ /*
+ * Interlock with reset / recovery code. This is an usually uncontended
+ * to make sure that we drain out of the ISRs before we reset the card
+ * and to prevent races with the recovery process called from a timeout
+ * context.
+ */
+ if (!mtx_trylock(&qpair->recovery)) {
+ qpair->num_recovery_nolock++;
+ return (false);
+ }
+
+ done = _nvme_qpair_process_completions(qpair);
+
+ mtx_unlock(&qpair->recovery);
+
+ return (done);
}
static void
@@ -699,6 +725,7 @@
qpair->ctrlr = ctrlr;
mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
+ mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);
/* Note: NVMe PRP format is restricted to 4-byte alignment. */
err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
@@ -765,7 +792,7 @@
qpair->cpl_bus_addr = queuemem_phys + cmdsz;
prpmem_phys = queuemem_phys + cmdsz + cplsz;
- callout_init(&qpair->timer, 1);
+ callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
qpair->timer_armed = false;
qpair->recovery_state = RECOVERY_WAITING;
@@ -903,6 +930,8 @@
if (mtx_initialized(&qpair->lock))
mtx_destroy(&qpair->lock);
+ if (mtx_initialized(&qpair->recovery))
+ mtx_destroy(&qpair->recovery);
if (qpair->res) {
bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
@@ -975,14 +1004,12 @@
struct nvme_controller *ctrlr = qpair->ctrlr;
struct nvme_tracker *tr;
sbintime_t now;
- bool idle;
+ bool idle = false;
bool needs_reset;
uint32_t csts;
uint8_t cfs;
-
- mtx_lock(&qpair->lock);
- idle = TAILQ_EMPTY(&qpair->outstanding_tr);
+ mtx_assert(&qpair->recovery, MA_OWNED);
switch (qpair->recovery_state) {
case RECOVERY_NONE:
@@ -1003,25 +1030,10 @@
goto do_reset;
/*
- * Next, check to see if we have any completions. If we do,
- * we've likely missed an interrupt, but the card is otherwise
- * fine. This will also catch all the commands that are about
- * to timeout (but there's still a tiny race). Since the timeout
- * is long relative to the race between here and the check below,
- * this is still a win.
+ * Process completions. We already have the recovery lock, so
+ * call the locked version.
*/
- mtx_unlock(&qpair->lock);
- nvme_qpair_process_completions(qpair);
- mtx_lock(&qpair->lock);
- if (qpair->recovery_state != RECOVERY_NONE) {
- /*
- * Somebody else adjusted recovery state while unlocked,
- * we should bail. Unlock the qpair and return without
- * doing anything else.
- */
- mtx_unlock(&qpair->lock);
- return;
- }
+ _nvme_qpair_process_completions(qpair);
/*
* Check to see if we need to timeout any commands. If we do, then
@@ -1029,7 +1041,13 @@
*/
now = getsbinuptime();
needs_reset = false;
+ idle = true;
+ mtx_lock(&qpair->lock);
TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
+ /*
+ * Skip async commands, they are posted to the card for
+ * an indefinite amount of time and have no deadline.
+ */
if (tr->deadline == SBT_MAX)
continue;
if (now > tr->deadline) {
@@ -1055,6 +1073,7 @@
idle = false;
}
}
+ mtx_unlock(&qpair->lock);
if (!needs_reset)
break;
@@ -1076,6 +1095,7 @@
break;
case RECOVERY_WAITING:
nvme_printf(ctrlr, "waiting for reset to complete\n");
+ idle = false; /* We want to keep polling */
break;
}
@@ -1087,7 +1107,6 @@
} else {
qpair->timer_armed = false;
}
- mtx_unlock(&qpair->lock);
}
/*
@@ -1196,9 +1215,12 @@
if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
/*
- * No tracker is available, or the qpair is disabled due to
- * an in-progress controller-level reset or controller
- * failure.
+ * No tracker is available, or the qpair is disabled due to an
+ * in-progress controller-level reset or controller failure. If
+ * we lose the race with recovery_state, then we may add an
+ * extra request to the queue which will be resubmitted later.
+ * We only set recovery_state to NONE with qpair->lock also
+ * held.
*/
if (qpair->ctrlr->is_failed) {
@@ -1260,7 +1282,10 @@
static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
- mtx_assert(&qpair->lock, MA_OWNED);
+ if (mtx_initialized(&qpair->recovery))
+ mtx_assert(&qpair->recovery, MA_OWNED);
+ if (mtx_initialized(&qpair->lock))
+ mtx_assert(&qpair->lock, MA_OWNED);
qpair->recovery_state = RECOVERY_NONE;
}
@@ -1311,9 +1336,11 @@
nvme_printf(qpair->ctrlr,
"done aborting outstanding admin\n");
+ mtx_lock(&qpair->recovery);
mtx_lock(&qpair->lock);
nvme_qpair_enable(qpair);
mtx_unlock(&qpair->lock);
+ mtx_unlock(&qpair->recovery);
}
void
@@ -1340,8 +1367,8 @@
if (report)
nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");
+ mtx_lock(&qpair->recovery);
mtx_lock(&qpair->lock);
-
nvme_qpair_enable(qpair);
STAILQ_INIT(&temp);
@@ -1360,6 +1387,7 @@
nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");
mtx_unlock(&qpair->lock);
+ mtx_unlock(&qpair->recovery);
}
static void
@@ -1367,27 +1395,40 @@
{
struct nvme_tracker *tr, *tr_temp;
- mtx_lock(&qpair->lock);
+ if (mtx_initialized(&qpair->recovery))
+ mtx_assert(&qpair->recovery, MA_OWNED);
+ if (mtx_initialized(&qpair->lock))
+ mtx_assert(&qpair->lock, MA_OWNED);
+
qpair->recovery_state = RECOVERY_WAITING;
TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
tr->deadline = SBT_MAX;
}
- mtx_unlock(&qpair->lock);
}
void
nvme_admin_qpair_disable(struct nvme_qpair *qpair)
{
+ mtx_lock(&qpair->recovery);
+ mtx_lock(&qpair->lock);
nvme_qpair_disable(qpair);
nvme_admin_qpair_abort_aers(qpair);
+
+ mtx_unlock(&qpair->lock);
+ mtx_unlock(&qpair->recovery);
}
void
nvme_io_qpair_disable(struct nvme_qpair *qpair)
{
+ mtx_lock(&qpair->recovery);
+ mtx_lock(&qpair->lock);
nvme_qpair_disable(qpair);
+
+ mtx_unlock(&qpair->lock);
+ mtx_unlock(&qpair->recovery);
}
void
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
--- a/sys/dev/nvme/nvme_sysctl.c
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -163,6 +163,7 @@
qpair->num_retries = 0;
qpair->num_failures = 0;
qpair->num_ignored = 0;
+ qpair->num_recovery_nolock = 0;
}
static int
@@ -240,6 +241,21 @@
return (sysctl_handle_64(oidp, &num_ignored, 0, req));
}
+static int
+nvme_sysctl_num_recovery_nolock(SYSCTL_HANDLER_ARGS)
+{
+ struct nvme_controller *ctrlr = arg1;
+ int64_t num;
+ int i;
+
+ num = ctrlr->adminq.num_recovery_nolock;
+
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num += ctrlr->ioq[i].num_recovery_nolock;
+
+ return (sysctl_handle_64(oidp, &num, 0, req));
+}
+
static int
nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
{
@@ -298,6 +314,9 @@
SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_ignored",
CTLFLAG_RD, &qpair->num_ignored,
"Number of interrupts posted, but were administratively ignored");
+ SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_recovery_nolock",
+ CTLFLAG_RD, &qpair->num_recovery_nolock,
+ "Number of times that we failed to lock recovery in the ISR");
SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
"dump_debug", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
@@ -366,6 +385,11 @@
ctrlr, 0, nvme_sysctl_num_ignored, "IU",
"Number of interrupts ignored administratively");
+ SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
+ "num_recovery_nolock", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ctrlr, 0, nvme_sysctl_num_recovery_nolock, "IU",
+ "Number of times that we failed to lock recovery in the ISR");
+
SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
"reset_stats", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, ctrlr,
0, nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Nov 7, 7:36 PM (22 h, 9 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14519398
Default Alt Text
D41452.diff (12 KB)
Attached To
Mode
D41452: nvme: Add exclusion for ISR
Attached
Detach File
Event Timeline
Log In to Comment