D42051.id128114.diff
No OneTemporary
Actions

Size

3 KB

Referenced Files

None

Subscribers

None

D42051.id128114.diff
View Options

	diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
	--- a/sys/dev/nvme/nvme_private.h
	+++ b/sys/dev/nvme/nvme_private.h
	@@ -150,7 +150,6 @@
	enum nvme_recovery {
	RECOVERY_NONE = 0, /* Normal operations */
	RECOVERY_WAITING, /* waiting for the reset to complete */
	- RECOVERY_FAILED, /* We have failed, no more I/O */
	};
	struct nvme_qpair {
	struct nvme_controller *ctrlr;
	diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
	--- a/sys/dev/nvme/nvme_qpair.c
	+++ b/sys/dev/nvme/nvme_qpair.c
	@@ -1027,6 +1027,17 @@

	mtx_assert(&qpair->recovery, MA_OWNED);

	+ /*
	+ * If the controller is failed, then stop polling. This ensures that any
	+ * failure processing that races with the qpair timeout will fail
	+ * safely.
	+ */
	+ if (qpair->ctrlr->is_failed) {
	+ nvme_printf(qpair->ctrlr,
	+ "Failed controller, stopping watchdog timeout.\n");
	+ return;
	+ }
	+
	switch (qpair->recovery_state) {
	case RECOVERY_NONE:
	/*
	@@ -1120,11 +1131,6 @@
	nvme_printf(ctrlr, "Waiting for reset to complete\n");
	idle = false; /* We want to keep polling */
	break;
	- case RECOVERY_FAILED:
	- KASSERT(qpair->ctrlr->is_failed,
	- ("Recovery state failed w/o failed controller\n"));
	- idle = true; /* nothing to monitor */
	- break;
	}

	/*
	@@ -1244,11 +1250,21 @@
	if (tr == NULL \|\| qpair->recovery_state != RECOVERY_NONE) {
	/*
	* No tracker is available, or the qpair is disabled due to an
	- * in-progress controller-level reset or controller failure. If
	- * we lose the race with recovery_state, then we may add an
	- * extra request to the queue which will be resubmitted later.
	- * We only set recovery_state to NONE with qpair->lock also
	- * held.
	+ * in-progress controller-level reset. If we lose the race with
	+ * recovery_state, then we may add an extra request to the queue
	+ * which will be resubmitted later. We only set recovery_state
	+ * to NONE with qpair->lock also held, so if we observe that the
	+ * state is not NONE, we know it can't transition to NONE below
	+ * when we've submitted the request to hardware.
	+ *
	+ * Also, as part of the failure process, we set recovery_state
	+ * to RECOVERY_WAITING, so we check here to see if we've failed
	+ * the controller. We set it before we call the qpair_fail
	+ * functions, which take out the lock lock before messing with
	+ * queued_req. Since we hold that lock, we know it's safe to
	+ * either fail directly, or queue the failure should is_failed
	+ * be stale. If we lose the race reading is_failed, then
	+ * nvme_qpair_fail will fail the queued request.
	*/

	if (qpair->ctrlr->is_failed) {
	@@ -1314,7 +1330,7 @@
	mtx_assert(&qpair->recovery, MA_OWNED);
	if (mtx_initialized(&qpair->lock))
	mtx_assert(&qpair->lock, MA_OWNED);
	- KASSERT(qpair->recovery_state != RECOVERY_FAILED,
	+ KASSERT(!qpair->ctrlr->is_failed,
	("Enabling a failed qpair\n"));

	qpair->recovery_state = RECOVERY_NONE;
	@@ -1471,10 +1487,6 @@
	if (!mtx_initialized(&qpair->lock))
	return;

	- mtx_lock(&qpair->recovery);
	- qpair->recovery_state = RECOVERY_FAILED;
	- mtx_unlock(&qpair->recovery);
	-
	mtx_lock(&qpair->lock);

	if (!STAILQ_EMPTY(&qpair->queued_req)) {

File Metadata

Mime Type: text/plain
Expires: Mon, Nov 18, 6:32 PM (40 m, 54 s)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 14702728
Default Alt Text: D42051.id128114.diff (3 KB)

D42051.id128114.diffNo OneTemporaryActions

D42051.id128114.diffView Options

File Metadata

Event Timeline

D42051.id128114.diff
No OneTemporary
Actions

D42051.id128114.diff
View Options