D41452.id125990.diff
No OneTemporary
Actions

Size

5 KB

Referenced Files

None

Subscribers

None

D41452.id125990.diff
View Options

	diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
	--- a/sys/dev/nvme/nvme_ctrlr.c
	+++ b/sys/dev/nvme/nvme_ctrlr.c
	@@ -425,13 +425,35 @@

	TSENTER();

	+ /*
	+ * Make sure that all the ISRs are done before proceeding with the reset
	+ * (and also keep any stray interrupts that happen during this process
	+ * from racing this process). For startup, this is a nop, since the
	+ * hardware is in a good state. But for recovery, where we randomly
	+ * reset the hardware, this ensure that we're not racing the ISRs.
	+ */
	+ mtx_lock_spin(&ctrlr->adminq.isr_spin);
	+ for (int i = 0; i < ctrlr->num_io_queues; i++) {
	+ mtx_lock_spin(&ctrlr->ioq[i].isr_spin);
	+ }
	+
	nvme_ctrlr_disable_qpairs(ctrlr);

	err = nvme_ctrlr_disable(ctrlr);
	if (err != 0)
	- return err;
	+ goto out;

	err = nvme_ctrlr_enable(ctrlr);
	+out:
	+
	+ /*
	+ * Reset complete, unblock ISRs
	+ */
	+ mtx_unlock_spin(&ctrlr->adminq.isr_spin);
	+ for (int i = 0; i < ctrlr->num_io_queues; i++) {
	+ mtx_unlock_spin(&ctrlr->ioq[i].isr_spin);
	+ }
	+
	TSEXIT();
	return (err);
	}
	diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
	--- a/sys/dev/nvme/nvme_private.h
	+++ b/sys/dev/nvme/nvme_private.h
	@@ -184,6 +184,7 @@
	int64_t num_retries;
	int64_t num_failures;
	int64_t num_ignored;
	+ int64_t num_spinlock_fail;

	struct nvme_command *cmd;
	struct nvme_completion *cpl;
	@@ -201,8 +202,9 @@

	struct nvme_tracker **act_tr;

	- struct mtx lock __aligned(CACHE_LINE_SIZE);
	+ struct mtx isr_spin __aligned(CACHE_LINE_SIZE);

	+ struct mtx lock __aligned(CACHE_LINE_SIZE);
	} __aligned(CACHE_LINE_SIZE);

	struct nvme_namespace {
	diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
	--- a/sys/dev/nvme/nvme_qpair.c
	+++ b/sys/dev/nvme/nvme_qpair.c
	@@ -538,6 +538,17 @@
	int done = 0;
	bool in_panic = dumping \|\| SCHEDULER_STOPPED();

	+ /*
	+ * Interlock with reset / recovery code. This is an usually uncontended
	+ * spinlock to make sure that we drain out of the ISRs before we reset
	+ * the card and to prevent races with the recovery process called from
	+ * a timeout context.
	+ */
	+ if (!mtx_trylock_spin(&qpair->isr_spin)) {
	+ qpair->num_spinlock_fail++;
	+ return (false);
	+ }
	+
	/*
	* qpair is not enabled, likely because a controller reset is in
	* progress. Ignore the interrupt - any I/O that was associated with
	@@ -548,6 +559,7 @@
	*/
	if (qpair->recovery_state != RECOVERY_NONE) {
	qpair->num_ignored++;
	+ mtx_unlock_spin(&qpair->isr_spin);
	return (false);
	}

	@@ -673,6 +685,7 @@
	qpair->cq_hdbl_off, qpair->cq_head);
	}

	+ mtx_unlock_spin(&qpair->isr_spin);
	return (done != 0);
	}

	@@ -1007,10 +1020,14 @@
	/*
	* Next, check to see if we have any completions. If we do,
	* we've likely missed an interrupt, but the card is otherwise
	- * fine. This will also catch all the commands that are about
	- * to timeout (but there's still a tiny race). Since the timeout
	- * is long relative to the race between here and the check below,
	+ * fine. This will also catch all the commands that are about to
	+ * timeout (but there's still a tiny race). Since the timeout is
	+ * long relative to the race between here and the check below,
	* this is still a win.
	+ *
	+ * It will also catch misconfigured interrupts. In addition, the
	+ * isr_spin lock taken out in nvme_qpair_process_completions
	+ * protects against races here, no matter how small.
	*/
	mtx_unlock(&qpair->lock);
	nvme_qpair_process_completions(qpair);
	@@ -1019,7 +1036,8 @@
	/*
	* Somebody else adjusted recovery state while unlocked,
	* we should bail. Unlock the qpair and return without
	- * doing anything else.
	+ * doing anything else. That somebody else has stepped
	+ * the state machine into the next stage.
	*/
	mtx_unlock(&qpair->lock);
	return;
	@@ -1032,6 +1050,10 @@
	now = getsbinuptime();
	needs_reset = false;
	TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
	+ /*
	+ * Skip async commands, they are posted to the card for
	+ * an indefinite amount of time and have no deadline.
	+ */
	if (tr->deadline == SBT_MAX)
	continue;
	if (now > tr->deadline) {
	diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
	--- a/sys/dev/nvme/nvme_sysctl.c
	+++ b/sys/dev/nvme/nvme_sysctl.c
	@@ -165,6 +165,7 @@
	qpair->num_retries = 0;
	qpair->num_failures = 0;
	qpair->num_ignored = 0;
	+ qpair->num_spinlock_fail = 0;
	}

	static int
	@@ -242,6 +243,21 @@
	return (sysctl_handle_64(oidp, &num_ignored, 0, req));
	}

	+static int
	+nvme_sysctl_num_spinlock_fail(SYSCTL_HANDLER_ARGS)
	+{
	+ struct nvme_controller *ctrlr = arg1;
	+ int64_t num_spinfail = 0;
	+ int i;
	+
	+ num_spinfail = ctrlr->adminq.num_spinlock_fail;
	+
	+ for (i = 0; i < ctrlr->num_io_queues; i++)
	+ num_spinfail += ctrlr->ioq[i].num_spinlock_fail;
	+
	+ return (sysctl_handle_64(oidp, &num_spinfail, 0, req));
	+}
	+
	static int
	nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
	{
	@@ -300,6 +316,9 @@
	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_ignored",
	CTLFLAG_RD, &qpair->num_ignored,
	"Number of interrupts posted, but were administratively ignored");
	+ SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_spinlock_fail",
	+ CTLFLAG_RD, &qpair->num_spinlock_fail,
	+ "Number of times that we can't get a spin lock in the ISR");

	SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
	"dump_debug", CTLTYPE_UINT \| CTLFLAG_RW \| CTLFLAG_MPSAFE,
	@@ -368,6 +387,11 @@
	ctrlr, 0, nvme_sysctl_num_ignored, "IU",
	"Number of interrupts ignored administratively");

	+ SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	+ "num_spinlock_fail", CTLTYPE_S64 \| CTLFLAG_RD \| CTLFLAG_MPSAFE,
	+ ctrlr, 0, nvme_sysctl_num_spinlock_fail, "IU",
	+ "Number of times that we can't get a spin lock in the ISR");
	+
	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	"reset_stats", CTLTYPE_UINT \| CTLFLAG_RW \| CTLFLAG_MPSAFE, ctrlr,
	0, nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");

File Metadata

Mime Type: text/plain
Expires: Mon, Jan 27, 6:04 PM (4 h, 7 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 16202844
Default Alt Text: D41452.id125990.diff (5 KB)

D41452.id125990.diffNo OneTemporaryActions

D41452.id125990.diffView Options

File Metadata

Event Timeline

D41452.id125990.diff
No OneTemporary
Actions

D41452.id125990.diff
View Options