Page MenuHomeFreeBSD

D41452.diff
No OneTemporary

D41452.diff

diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -416,6 +416,34 @@
}
}
+static void
+nvme_pre_reset(struct nvme_controller *ctrlr)
+{
+ /*
+ * Make sure that all the ISRs are done before proceeding with the reset
+ * (and also keep any stray interrupts that happen during this process
+ * from racing this process). For startup, this is a nop, since the
+ * hardware is in a good state. But for recovery, where we randomly
+ * reset the hardware, this ensures that we're not racing the ISRs.
+ */
+ mtx_lock(&ctrlr->adminq.recovery);
+ for (int i = 0; i < ctrlr->num_io_queues; i++) {
+ mtx_lock(&ctrlr->ioq[i].recovery);
+ }
+}
+
+static void
+nvme_post_reset(struct nvme_controller *ctrlr)
+{
+ /*
+ * Reset complete, unblock ISRs
+ */
+ mtx_unlock(&ctrlr->adminq.recovery);
+ for (int i = 0; i < ctrlr->num_io_queues; i++) {
+ mtx_unlock(&ctrlr->ioq[i].recovery);
+ }
+}
+
static int
nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
{
@@ -427,9 +455,11 @@
err = nvme_ctrlr_disable(ctrlr);
if (err != 0)
- return err;
+ goto out;
err = nvme_ctrlr_enable(ctrlr);
+out:
+
TSEXIT();
return (err);
}
@@ -1157,6 +1187,11 @@
TSENTER();
+ /*
+ * Don't call pre/post reset here. We've not yet created the qpairs,
+ * haven't set up the ISRs, so there's no need to 'drain' them or
+ * 'exclude' them.
+ */
if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
fail:
nvme_ctrlr_fail(ctrlr);
@@ -1201,16 +1236,9 @@
int status;
nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
+ nvme_pre_reset(ctrlr);
status = nvme_ctrlr_hw_reset(ctrlr);
- /*
- * Use pause instead of DELAY, so that we yield to any nvme interrupt
- * handlers on this CPU that were blocked on a qpair lock. We want
- * all nvme interrupts completed before proceeding with restarting the
- * controller.
- *
- * XXX - any way to guarantee the interrupt handlers have quiesced?
- */
- pause("nvmereset", hz / 10);
+ nvme_post_reset(ctrlr);
if (status == 0)
nvme_ctrlr_start(ctrlr, true);
else
@@ -1697,6 +1725,7 @@
if (ctrlr->is_failed)
return (0);
+ nvme_pre_reset(ctrlr);
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
#ifdef NVME_2X_RESET
@@ -1708,6 +1737,7 @@
if (nvme_ctrlr_hw_reset(ctrlr) != 0)
goto fail;
#endif
+ nvme_post_reset(ctrlr);
/*
* Now that we've reset the hardware, we can restart the controller. Any
@@ -1724,6 +1754,7 @@
* the controller. However, we have to return success for the resume
* itself, due to questionable APIs.
*/
+ nvme_post_reset(ctrlr);
nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
nvme_ctrlr_fail(ctrlr);
(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -162,10 +162,9 @@
struct resource *res;
void *tag;
- struct callout timer;
- sbintime_t deadline;
- bool timer_armed;
- enum nvme_recovery recovery_state;
+ struct callout timer; /* recovery lock */
+ bool timer_armed; /* recovery lock */
+ enum nvme_recovery recovery_state; /* recovery lock */
uint32_t num_entries;
uint32_t num_trackers;
@@ -182,6 +181,7 @@
int64_t num_retries;
int64_t num_failures;
int64_t num_ignored;
+ int64_t num_recovery_nolock;
struct nvme_command *cmd;
struct nvme_completion *cpl;
@@ -200,7 +200,7 @@
struct nvme_tracker **act_tr;
struct mtx_padalign lock;
-
+ struct mtx_padalign recovery;
} __aligned(CACHE_LINE_SIZE);
struct nvme_namespace {
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -528,14 +528,17 @@
nvme_free_request(req);
}
-bool
-nvme_qpair_process_completions(struct nvme_qpair *qpair)
+/* Locked version of completion processor */
+static bool
+_nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
struct nvme_tracker *tr;
struct nvme_completion cpl;
- int done = 0;
+ bool done = false;
bool in_panic = dumping || SCHEDULER_STOPPED();
+ mtx_assert(&qpair->recovery, MA_OWNED);
+
/*
* qpair is not enabled, likely because a controller reset is in
* progress. Ignore the interrupt - any I/O that was associated with
@@ -629,7 +632,7 @@
else
tr = NULL;
- done++;
+ done = true;
if (tr != NULL) {
nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
qpair->sq_head = cpl.sqhd;
@@ -666,12 +669,35 @@
}
}
- if (done != 0) {
+ if (done) {
bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
qpair->cq_hdbl_off, qpair->cq_head);
}
- return (done != 0);
+ return (done);
+}
+
+bool
+nvme_qpair_process_completions(struct nvme_qpair *qpair)
+{
+ bool done;
+
+ /*
+ * Interlock with reset / recovery code. This lock is usually uncontended.
+ * It makes sure that we drain out of the ISRs before we reset the card
+ * and prevents races with the recovery process called from a timeout
+ * context.
+ */
+ if (!mtx_trylock(&qpair->recovery)) {
+ qpair->num_recovery_nolock++;
+ return (false);
+ }
+
+ done = _nvme_qpair_process_completions(qpair);
+
+ mtx_unlock(&qpair->recovery);
+
+ return (done);
}
static void
@@ -699,6 +725,7 @@
qpair->ctrlr = ctrlr;
mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
+ mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);
/* Note: NVMe PRP format is restricted to 4-byte alignment. */
err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
@@ -765,7 +792,7 @@
qpair->cpl_bus_addr = queuemem_phys + cmdsz;
prpmem_phys = queuemem_phys + cmdsz + cplsz;
- callout_init(&qpair->timer, 1);
+ callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
qpair->timer_armed = false;
qpair->recovery_state = RECOVERY_WAITING;
@@ -903,6 +930,8 @@
if (mtx_initialized(&qpair->lock))
mtx_destroy(&qpair->lock);
+ if (mtx_initialized(&qpair->recovery))
+ mtx_destroy(&qpair->recovery);
if (qpair->res) {
bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
@@ -975,14 +1004,12 @@
struct nvme_controller *ctrlr = qpair->ctrlr;
struct nvme_tracker *tr;
sbintime_t now;
- bool idle;
+ bool idle = false;
bool needs_reset;
uint32_t csts;
uint8_t cfs;
-
- mtx_lock(&qpair->lock);
- idle = TAILQ_EMPTY(&qpair->outstanding_tr);
+ mtx_assert(&qpair->recovery, MA_OWNED);
switch (qpair->recovery_state) {
case RECOVERY_NONE:
@@ -1003,25 +1030,10 @@
goto do_reset;
/*
- * Next, check to see if we have any completions. If we do,
- * we've likely missed an interrupt, but the card is otherwise
- * fine. This will also catch all the commands that are about
- * to timeout (but there's still a tiny race). Since the timeout
- * is long relative to the race between here and the check below,
- * this is still a win.
+ * Process completions. We already have the recovery lock, so
+ * call the locked version.
*/
- mtx_unlock(&qpair->lock);
- nvme_qpair_process_completions(qpair);
- mtx_lock(&qpair->lock);
- if (qpair->recovery_state != RECOVERY_NONE) {
- /*
- * Somebody else adjusted recovery state while unlocked,
- * we should bail. Unlock the qpair and return without
- * doing anything else.
- */
- mtx_unlock(&qpair->lock);
- return;
- }
+ _nvme_qpair_process_completions(qpair);
/*
* Check to see if we need to timeout any commands. If we do, then
@@ -1029,7 +1041,13 @@
*/
now = getsbinuptime();
needs_reset = false;
+ idle = true;
+ mtx_lock(&qpair->lock);
TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
+ /*
+ * Skip async commands, they are posted to the card for
+ * an indefinite amount of time and have no deadline.
+ */
if (tr->deadline == SBT_MAX)
continue;
if (now > tr->deadline) {
@@ -1055,6 +1073,7 @@
idle = false;
}
}
+ mtx_unlock(&qpair->lock);
if (!needs_reset)
break;
@@ -1076,6 +1095,7 @@
break;
case RECOVERY_WAITING:
nvme_printf(ctrlr, "waiting for reset to complete\n");
+ idle = false; /* We want to keep polling */
break;
}
@@ -1087,7 +1107,6 @@
} else {
qpair->timer_armed = false;
}
- mtx_unlock(&qpair->lock);
}
/*
@@ -1196,9 +1215,12 @@
if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
/*
- * No tracker is available, or the qpair is disabled due to
- * an in-progress controller-level reset or controller
- * failure.
+ * No tracker is available, or the qpair is disabled due to an
+ * in-progress controller-level reset or controller failure. If
+ * we lose the race with recovery_state, then we may add an
+ * extra request to the queue which will be resubmitted later.
+ * We only set recovery_state to NONE with qpair->lock also
+ * held.
*/
if (qpair->ctrlr->is_failed) {
@@ -1260,7 +1282,10 @@
static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
- mtx_assert(&qpair->lock, MA_OWNED);
+ if (mtx_initialized(&qpair->recovery))
+ mtx_assert(&qpair->recovery, MA_OWNED);
+ if (mtx_initialized(&qpair->lock))
+ mtx_assert(&qpair->lock, MA_OWNED);
qpair->recovery_state = RECOVERY_NONE;
}
@@ -1311,9 +1336,11 @@
nvme_printf(qpair->ctrlr,
"done aborting outstanding admin\n");
+ mtx_lock(&qpair->recovery);
mtx_lock(&qpair->lock);
nvme_qpair_enable(qpair);
mtx_unlock(&qpair->lock);
+ mtx_unlock(&qpair->recovery);
}
void
@@ -1340,8 +1367,8 @@
if (report)
nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");
+ mtx_lock(&qpair->recovery);
mtx_lock(&qpair->lock);
-
nvme_qpair_enable(qpair);
STAILQ_INIT(&temp);
@@ -1360,6 +1387,7 @@
nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");
mtx_unlock(&qpair->lock);
+ mtx_unlock(&qpair->recovery);
}
static void
@@ -1367,27 +1395,40 @@
{
struct nvme_tracker *tr, *tr_temp;
- mtx_lock(&qpair->lock);
+ if (mtx_initialized(&qpair->recovery))
+ mtx_assert(&qpair->recovery, MA_OWNED);
+ if (mtx_initialized(&qpair->lock))
+ mtx_assert(&qpair->lock, MA_OWNED);
+
qpair->recovery_state = RECOVERY_WAITING;
TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
tr->deadline = SBT_MAX;
}
- mtx_unlock(&qpair->lock);
}
void
nvme_admin_qpair_disable(struct nvme_qpair *qpair)
{
+ mtx_lock(&qpair->recovery);
+ mtx_lock(&qpair->lock);
nvme_qpair_disable(qpair);
nvme_admin_qpair_abort_aers(qpair);
+
+ mtx_unlock(&qpair->lock);
+ mtx_unlock(&qpair->recovery);
}
void
nvme_io_qpair_disable(struct nvme_qpair *qpair)
{
+ mtx_lock(&qpair->recovery);
+ mtx_lock(&qpair->lock);
nvme_qpair_disable(qpair);
+
+ mtx_unlock(&qpair->lock);
+ mtx_unlock(&qpair->recovery);
}
void
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
--- a/sys/dev/nvme/nvme_sysctl.c
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -163,6 +163,7 @@
qpair->num_retries = 0;
qpair->num_failures = 0;
qpair->num_ignored = 0;
+ qpair->num_recovery_nolock = 0;
}
static int
@@ -240,6 +241,21 @@
return (sysctl_handle_64(oidp, &num_ignored, 0, req));
}
+static int
+nvme_sysctl_num_recovery_nolock(SYSCTL_HANDLER_ARGS)
+{
+ struct nvme_controller *ctrlr = arg1;
+ int64_t num;
+ int i;
+
+ num = ctrlr->adminq.num_recovery_nolock;
+
+ for (i = 0; i < ctrlr->num_io_queues; i++)
+ num += ctrlr->ioq[i].num_recovery_nolock;
+
+ return (sysctl_handle_64(oidp, &num, 0, req));
+}
+
static int
nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
{
@@ -298,6 +314,9 @@
SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_ignored",
CTLFLAG_RD, &qpair->num_ignored,
"Number of interrupts posted, but were administratively ignored");
+ SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_recovery_nolock",
+ CTLFLAG_RD, &qpair->num_recovery_nolock,
+ "Number of times that we failed to lock recovery in the ISR");
SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
"dump_debug", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
@@ -366,6 +385,11 @@
ctrlr, 0, nvme_sysctl_num_ignored, "IU",
"Number of interrupts ignored administratively");
+ SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
+ "num_recovery_nolock", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ctrlr, 0, nvme_sysctl_num_recovery_nolock, "IU",
+ "Number of times that we failed to lock recovery in the ISR");
+
SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
"reset_stats", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, ctrlr,
0, nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");

File Metadata

Mime Type
text/plain
Expires
Thu, Nov 7, 7:36 PM (22 h, 9 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14519398
Default Alt Text
D41452.diff (12 KB)

Event Timeline