Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F108570871
D44714.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
77 KB
Referenced Files
None
Subscribers
None
D44714.diff
View Options
diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -408,6 +408,7 @@
nvd.4 \
${_nvdimm.4} \
nvme.4 \
+ nvmf.4 \
nvmf_tcp.4 \
${_nvram.4} \
oce.4 \
diff --git a/share/man/man4/nvmf.4 b/share/man/man4/nvmf.4
new file mode 100644
--- /dev/null
+++ b/share/man/man4/nvmf.4
@@ -0,0 +1,87 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2024 Chelsio Communications, Inc.
+.\"
+.Dd May 2, 2024
+.Dt NVMF 4
+.Os
+.Sh NAME
+.Nm nvmf
+.Nd "NVM Express over Fabrics host driver"
+.Sh SYNOPSIS
+To compile the driver into the kernel,
+place the following line in the
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device nvmf"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+nvmf_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver provides the kernel component of an NVM Express over Fabrics
+host.
+The NVMeoF host is the client which provides local access to
+namespaces exported by a remote controller.
+.Pp
+Associations between the local host and remote controllers are managed
+using
+.Xr nvmecontrol 8 .
+New associations are created via the
+.Cm connect
+command and destroyed via the
+.Cm disconnect
+command.
+If an association's connection is interrupted,
+the
+.Cm reconnect
+command creates a new association to replace the interrupted association.
+.Pp
+Similar to
+.Xr nvme 4 ,
+.Nm
+creates controller device nodes using the format
+.Pa /dev/nvmeX
+and namespace device nodes using the format
+.Pa /dev/nvmeXnsY .
+.Nm
+also exports remote namespaces via the CAM
+.Xr nda 4
+peripheral driver.
+Unlike
+.Xr nvme 4 ,
+.Nm
+does not support the
+.Xr nvd 4
+disk driver.
+.Pp
+Associations require a supported transport such as
+.Xr nvmf_tcp 4
+for associations using TCP/IP.
+.Sh SEE ALSO
+.Xr nda 4 ,
+.Xr nvme 4 ,
+.Xr nvmf_tcp 4 ,
+.Xr nvmft 4 ,
+.Xr nvmecontrol 8
+.Sh HISTORY
+The
+.Nm
+module first appeared in
+.Fx 15.0 .
+.Sh AUTHORS
+The
+.Nm
+driver was developed by
+.An John Baldwin Aq Mt jhb@FreeBSD.org
+under sponsorship from Chelsio Communications, Inc.
+.Sh BUGS
+.Nm
+only supports a single I/O queue pair per association.
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1676,12 +1676,14 @@
# NVM Express
#
# nvme: PCI-express NVM Express host controllers
+# nvmf: NVM Express over Fabrics host
# nvmf_tcp: TCP transport for NVM Express over Fabrics
# nda: CAM NVMe disk driver
# nvd: non-CAM NVMe disk driver
-device nvme # base NVMe driver
+device nvme # PCI-express NVMe host driver
options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver
+device nvmf # NVMeoF host driver
device nvmf_tcp # NVMeoF TCP transport
device nda # NVMe direct access devices (aka disks)
device nvd # expose NVMe namespaces as disks, depends on nvme
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2533,7 +2533,15 @@
dev/nvme/nvme_util.c optional nvme
dev/nvmem/nvmem.c optional nvmem fdt
dev/nvmem/nvmem_if.m optional nvmem
+dev/nvmf/host/nvmf.c optional nvmf
+dev/nvmf/host/nvmf_aer.c optional nvmf
+dev/nvmf/host/nvmf_cmd.c optional nvmf
+dev/nvmf/host/nvmf_ctldev.c optional nvmf
+dev/nvmf/host/nvmf_ns.c optional nvmf
+dev/nvmf/host/nvmf_qpair.c optional nvmf
+dev/nvmf/host/nvmf_sim.c optional nvmf
dev/nvmf/nvmf_tcp.c optional nvmf_tcp
+dev/nvmf/nvmf_transport.c optional nvmf
dev/oce/oce_hw.c optional oce pci
dev/oce/oce_if.c optional oce pci
dev/oce/oce_mbox.c optional oce pci
diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf.c
@@ -0,0 +1,939 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+static struct cdevsw nvmf_cdevsw;
+
+MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
+
+static void nvmf_disconnect_task(void *arg, int pending);
+
+void
+nvmf_complete(void *arg, const struct nvme_completion *cqe)
+{
+ struct nvmf_completion_status *status = arg;
+ struct mtx *mtx;
+
+ status->cqe = *cqe;
+ mtx = mtx_pool_find(mtxpool_sleep, status);
+ mtx_lock(mtx);
+ status->done = true;
+ mtx_unlock(mtx);
+ wakeup(status);
+}
+
+void
+nvmf_io_complete(void *arg, size_t xfered, int error)
+{
+ struct nvmf_completion_status *status = arg;
+ struct mtx *mtx;
+
+ status->io_error = error;
+ mtx = mtx_pool_find(mtxpool_sleep, status);
+ mtx_lock(mtx);
+ status->io_done = true;
+ mtx_unlock(mtx);
+ wakeup(status);
+}
+
+void
+nvmf_wait_for_reply(struct nvmf_completion_status *status)
+{
+ struct mtx *mtx;
+
+ mtx = mtx_pool_find(mtxpool_sleep, status);
+ mtx_lock(mtx);
+ while (!status->done || !status->io_done)
+ mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
+ mtx_unlock(mtx);
+}
+
+static int
+nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+ uint64_t *value)
+{
+ const struct nvmf_fabric_prop_get_rsp *rsp;
+ struct nvmf_completion_status status;
+
+ nvmf_status_init(&status);
+ if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
+ M_WAITOK))
+ return (ECONNABORTED);
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
+ le16toh(status.cqe.status));
+ return (EIO);
+ }
+
+ rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
+ if (size == 8)
+ *value = le64toh(rsp->value.u64);
+ else
+ *value = le32toh(rsp->value.u32.low);
+ return (0);
+}
+
+static int
+nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+ uint64_t value)
+{
+ struct nvmf_completion_status status;
+
+ nvmf_status_init(&status);
+ if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
+ M_WAITOK))
+ return (ECONNABORTED);
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
+ le16toh(status.cqe.status));
+ return (EIO);
+ }
+ return (0);
+}
+
+static void
+nvmf_shutdown_controller(struct nvmf_softc *sc)
+{
+ uint64_t cc;
+ int error;
+
+ error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
+ if (error != 0) {
+ device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
+ return;
+ }
+
+ cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);
+
+ error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
+ if (error != 0)
+ device_printf(sc->dev,
+ "Failed to set CC to trigger shutdown\n");
+}
+
+static void
+nvmf_check_keep_alive(void *arg)
+{
+ struct nvmf_softc *sc = arg;
+ int traffic;
+
+ traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
+ if (traffic == 0) {
+ device_printf(sc->dev,
+ "disconnecting due to KeepAlive timeout\n");
+ nvmf_disconnect(sc);
+ return;
+ }
+
+ callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
+}
+
+static void
+nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
+{
+ struct nvmf_softc *sc = arg;
+
+ atomic_store_int(&sc->ka_active_rx_traffic, 1);
+ if (cqe->status != 0) {
+ device_printf(sc->dev,
+ "KeepAlive response reported status %#x\n",
+ le16toh(cqe->status));
+ }
+}
+
+/*
+ * Callout handler for ka_tx_timer.
+ *
+ * Sends a KeepAlive command unless ka_active_tx_traffic is non-zero,
+ * then clears that flag and re-arms the timer for another ka_tx_sbt
+ * interval.  'arg' is the softc registered with the callout.
+ */
+static void
+nvmf_send_keep_alive(void *arg)
+{
+ struct nvmf_softc *sc = arg;
+ int traffic;
+
+ /*
+ * Don't bother sending a KeepAlive command if TBKAS (traffic-based
+ * keep alive) is active and another command has been sent during
+ * the interval.
+ */
+ traffic = atomic_load_int(&sc->ka_active_tx_traffic);
+ if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
+ sc, M_NOWAIT))
+ device_printf(sc->dev,
+ "Failed to allocate KeepAlive command\n");
+
+ /* Clear ka_active_tx_traffic after sending the keep alive command. */
+ atomic_store_int(&sc->ka_active_tx_traffic, 0);
+
+ callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
+}
+
+/*
+ * Populate 'ivars' from the handoff request 'hh'.
+ *
+ * Copies in the controller data and the I/O queue parameters that 'hh'
+ * points at.  The request is rejected with EINVAL if its admin queue is
+ * not flagged as an admin queue, if it carries no I/O queues, if any I/O
+ * queue is flagged as an admin queue, or if the I/O queues are not all
+ * the same size.  A copyin() failure is returned as-is.
+ *
+ * On success 0 is returned and the caller releases the buffers with
+ * nvmf_free_ivars().  On failure 'ivars' owns nothing: both buffer
+ * pointers are reset to NULL so that a later nvmf_free_ivars() on the
+ * same structure cannot free them a second time.
+ */
+int
+nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
+{
+	size_t len;
+	u_int i;
+	int error;
+
+	memset(ivars, 0, sizeof(*ivars));
+
+	if (!hh->admin.admin || hh->num_io_queues < 1)
+		return (EINVAL);
+
+	ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
+	error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
+	if (error != 0)
+		goto out;
+	nvme_controller_data_swapbytes(ivars->cdata);
+
+	len = hh->num_io_queues * sizeof(*ivars->io_params);
+	ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
+	error = copyin(hh->io, ivars->io_params, len);
+	if (error != 0)
+		goto out;
+	for (i = 0; i < hh->num_io_queues; i++) {
+		if (ivars->io_params[i].admin) {
+			error = EINVAL;
+			goto out;
+		}
+
+		/* Require all I/O queues to be the same size. */
+		if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
+			error = EINVAL;
+			goto out;
+		}
+	}
+
+	ivars->hh = hh;
+	return (0);
+
+out:
+	/* Do not leave freed pointers behind in the caller's structure. */
+	free(ivars->io_params, M_NVMF);
+	ivars->io_params = NULL;
+	free(ivars->cdata, M_NVMF);
+	ivars->cdata = NULL;
+	return (error);
+}
+
+void
+nvmf_free_ivars(struct nvmf_ivars *ivars)
+{
+ free(ivars->io_params, M_NVMF);
+ free(ivars->cdata, M_NVMF);
+}
+
+static int
+nvmf_probe(device_t dev)
+{
+ struct nvmf_ivars *ivars = device_get_ivars(dev);
+ char desc[260];
+
+ if (ivars == NULL)
+ return (ENXIO);
+
+ snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn);
+ device_set_desc_copy(dev, desc);
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
+{
+ char name[16];
+
+ /* Setup the admin queue. */
+ sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin,
+ "admin queue");
+ if (sc->admin == NULL) {
+ device_printf(sc->dev, "Failed to setup admin queue\n");
+ return (ENXIO);
+ }
+
+ /* Setup I/O queues. */
+ sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF,
+ M_WAITOK | M_ZERO);
+ sc->num_io_queues = ivars->hh->num_io_queues;
+ for (u_int i = 0; i < sc->num_io_queues; i++) {
+ snprintf(name, sizeof(name), "I/O queue %u", i);
+ sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype,
+ &ivars->io_params[i], name);
+ if (sc->io[i] == NULL) {
+ device_printf(sc->dev, "Failed to setup I/O queue %u\n",
+ i + 1);
+ return (ENXIO);
+ }
+ }
+
+ /* Start KeepAlive timers. */
+ if (ivars->hh->kato != 0) {
+ sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
+ sc->cdata->ctratt) != 0;
+ sc->ka_rx_sbt = mstosbt(ivars->hh->kato);
+ sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
+ callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
+ nvmf_check_keep_alive, sc, C_HARDCLOCK);
+ callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
+ nvmf_send_keep_alive, sc, C_HARDCLOCK);
+ }
+
+ return (0);
+}
+
+/*
+ * Fetch one page of the active namespace list and instantiate the
+ * namespaces it names.
+ *
+ * '*nsidp' is passed to the IDENTIFY active namespaces command as the
+ * starting point of the page.  'nslist' and 'data' are caller-provided
+ * scratch buffers for the list page and for each namespace's IDENTIFY
+ * data.  For every non-zero entry the namespace is identified, entries
+ * reporting a size of zero are skipped, and the rest are stored in
+ * sc->ns[] and announced via nvmf_sim_rescan_ns().
+ *
+ * On success returns true and updates '*nsidp': 0 means the scan is
+ * complete, any other value is the starting point for the next page.
+ * Returns false if a command cannot be sent or fails, or if the list
+ * contains a duplicate or out-of-range namespace ID.
+ */
+static bool
+nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
+    struct nvme_namespace_data *data, uint32_t *nsidp)
+{
+	struct nvmf_completion_status status;
+	uint32_t nsid;
+
+	nvmf_status_init(&status);
+	nvmf_status_wait_io(&status);
+	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
+	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
+		device_printf(sc->dev,
+		    "failed to send IDENTIFY active namespaces command\n");
+		return (false);
+	}
+	nvmf_wait_for_reply(&status);
+
+	if (status.cqe.status != 0) {
+		device_printf(sc->dev,
+		    "IDENTIFY active namespaces failed, status %#x\n",
+		    le16toh(status.cqe.status));
+		return (false);
+	}
+
+	if (status.io_error != 0) {
+		device_printf(sc->dev,
+		    "IDENTIFY active namespaces failed with I/O error %d\n",
+		    status.io_error);
+		return (false);
+	}
+
+	for (u_int i = 0; i < nitems(nslist->ns); i++) {
+		/*
+		 * NOTE(review): the changed-namespace AER path converts
+		 * list entries with le32toh(); confirm whether this list
+		 * is already in host byte order at this point.
+		 */
+		nsid = nslist->ns[i];
+		if (nsid == 0) {
+			/* A zero entry terminates the list. */
+			*nsidp = 0;
+			return (true);
+		}
+
+		/*
+		 * The list comes from the remote controller and sc->ns[]
+		 * only has cdata->nn entries, so never index it with an
+		 * ID the controller did not advertise room for.
+		 */
+		if (nsid > sc->cdata->nn) {
+			device_printf(sc->dev,
+			    "invalid namespace %u in active namespace list\n",
+			    nsid);
+			return (false);
+		}
+
+		if (sc->ns[nsid - 1] != NULL) {
+			device_printf(sc->dev,
+			    "duplicate namespace %u in active namespace list\n",
+			    nsid);
+			return (false);
+		}
+
+		nvmf_status_init(&status);
+		nvmf_status_wait_io(&status);
+		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
+		    &status, nvmf_io_complete, &status, M_WAITOK)) {
+			device_printf(sc->dev,
+			    "failed to send IDENTIFY namespace %u command\n",
+			    nsid);
+			return (false);
+		}
+		nvmf_wait_for_reply(&status);
+
+		if (status.cqe.status != 0) {
+			device_printf(sc->dev,
+			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
+			    le16toh(status.cqe.status));
+			return (false);
+		}
+
+		if (status.io_error != 0) {
+			device_printf(sc->dev,
+			    "IDENTIFY namespace %u failed with I/O error %d\n",
+			    nsid, status.io_error);
+			return (false);
+		}
+
+		/*
+		 * As in nvme_ns_construct, a size of zero indicates an
+		 * invalid namespace.
+		 */
+		nvme_namespace_data_swapbytes(data);
+		if (data->nsze == 0) {
+			device_printf(sc->dev,
+			    "ignoring active namespace %u with zero size\n",
+			    nsid);
+			continue;
+		}
+
+		sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
+
+		nvmf_sim_rescan_ns(sc, nsid);
+	}
+
+	/* The page was full, so 'nsid' is its last (non-zero) entry. */
+	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);
+
+	/* Continue after the last entry unless no valid ID can follow it. */
+	if (nsid >= 0xfffffffd)
+		*nsidp = 0;
+	else
+		*nsidp = nsid + 1;
+	return (true);
+}
+
+static bool
+nvmf_add_namespaces(struct nvmf_softc *sc)
+{
+ struct nvme_namespace_data *data;
+ struct nvme_ns_list *nslist;
+ uint32_t nsid;
+ bool retval;
+
+ sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
+ M_WAITOK | M_ZERO);
+ nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
+ data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
+
+ nsid = 0;
+ retval = true;
+ for (;;) {
+ if (!nvmf_scan_nslist(sc, nslist, data, &nsid)) {
+ retval = false;
+ break;
+ }
+ if (nsid == 0)
+ break;
+ }
+
+ free(data, M_NVMF);
+ free(nslist, M_NVMF);
+ return (retval);
+}
+
+/*
+ * Device attach method.
+ *
+ * Takes ownership of the controller data held in the ivars, sets up the
+ * admin and I/O queue pairs, reads the CAP and VS properties, derives
+ * the maximum transfer size, and then brings up the SIM, AER handling,
+ * the namespaces and the control device node.
+ *
+ * Returns 0 on success.  On any failure everything set up so far is
+ * released and a non-zero errno value is returned.
+ */
+static int
+nvmf_attach(device_t dev)
+{
+	struct make_dev_args mda;
+	struct nvmf_softc *sc = device_get_softc(dev);
+	struct nvmf_ivars *ivars = device_get_ivars(dev);
+	uint64_t val;
+	u_int i, mdts_shift;
+	int error;
+
+	if (ivars == NULL)
+		return (ENXIO);
+
+	sc->dev = dev;
+	sc->trtype = ivars->hh->trtype;
+	callout_init(&sc->ka_rx_timer, 1);
+	callout_init(&sc->ka_tx_timer, 1);
+	sx_init(&sc->connection_lock, "nvmf connection");
+	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
+
+	/* Claim the cdata pointer from ivars. */
+	sc->cdata = ivars->cdata;
+	ivars->cdata = NULL;
+
+	nvmf_init_aer(sc);
+
+	/* TODO: Multiqueue support. */
+	sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */;
+
+	error = nvmf_establish_connection(sc, ivars);
+	if (error != 0)
+		goto out;
+
+	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
+	if (error != 0) {
+		device_printf(sc->dev, "Failed to fetch CAP\n");
+		error = ENXIO;
+		goto out;
+	}
+
+	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
+	if (error != 0) {
+		device_printf(sc->dev, "Failed to fetch VS\n");
+		error = ENXIO;
+		goto out;
+	}
+	sc->vs = val;
+
+	/* Honor MDTS if it is set. */
+	sc->max_xfer_size = maxphys;
+	if (sc->cdata->mdts != 0) {
+		/*
+		 * The shift count is built from values reported by the
+		 * remote controller, so it can exceed the width of the
+		 * type being shifted.  Shift a u_long, and skip limits too
+		 * large to represent: they cannot be below maxphys.
+		 */
+		mdts_shift = sc->cdata->mdts + NVME_MPS_SHIFT +
+		    NVME_CAP_HI_MPSMIN(sc->cap >> 32);
+		if (mdts_shift < sizeof(u_long) * NBBY)
+			sc->max_xfer_size = ulmin(sc->max_xfer_size,
+			    1ul << mdts_shift);
+	}
+
+	error = nvmf_init_sim(sc);
+	if (error != 0)
+		goto out;
+
+	error = nvmf_start_aer(sc);
+	if (error != 0) {
+		nvmf_destroy_sim(sc);
+		goto out;
+	}
+
+	if (!nvmf_add_namespaces(sc)) {
+		/*
+		 * nvmf_add_namespaces() only reports success or failure
+		 * and 'error' is still 0 from nvmf_start_aer(); set it so
+		 * the torn-down device is not reported as attached.
+		 */
+		error = ENXIO;
+		nvmf_destroy_sim(sc);
+		goto out;
+	}
+
+	make_dev_args_init(&mda);
+	mda.mda_devsw = &nvmf_cdevsw;
+	mda.mda_uid = UID_ROOT;
+	mda.mda_gid = GID_WHEEL;
+	mda.mda_mode = 0600;
+	mda.mda_si_drv1 = sc;
+	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
+	if (error != 0) {
+		nvmf_destroy_sim(sc);
+		goto out;
+	}
+
+	return (0);
+out:
+	/* Release whatever was set up before the failure. */
+	if (sc->ns != NULL) {
+		for (i = 0; i < sc->cdata->nn; i++) {
+			if (sc->ns[i] != NULL)
+				nvmf_destroy_ns(sc->ns[i]);
+		}
+		free(sc->ns, M_NVMF);
+	}
+
+	callout_drain(&sc->ka_tx_timer);
+	callout_drain(&sc->ka_rx_timer);
+
+	if (sc->admin != NULL)
+		nvmf_shutdown_controller(sc);
+
+	for (i = 0; i < sc->num_io_queues; i++) {
+		if (sc->io[i] != NULL)
+			nvmf_destroy_qp(sc->io[i]);
+	}
+	free(sc->io, M_NVMF);
+	if (sc->admin != NULL)
+		nvmf_destroy_qp(sc->admin);
+
+	nvmf_destroy_aer(sc);
+
+	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+	sx_destroy(&sc->connection_lock);
+	free(sc->cdata, M_NVMF);
+	return (error);
+}
+
+void
+nvmf_disconnect(struct nvmf_softc *sc)
+{
+ taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
+}
+
+/*
+ * Taskqueue handler queued by nvmf_disconnect().
+ *
+ * Runs with connection_lock held exclusively.  With no admin queue there
+ * is nothing to do.  While the device is detaching, or before the control
+ * device node exists (attach still in progress), only the admin queue is
+ * shut down.  Otherwise the KeepAlive timers are drained, the SIM and
+ * namespaces are disconnected, and all queue pairs are destroyed.
+ */
+static void
+nvmf_disconnect_task(void *arg, int pending __unused)
+{
+	struct nvmf_softc *sc = arg;
+	u_int idx;
+
+	sx_xlock(&sc->connection_lock);
+
+	/* Ignore transport errors if there is no active association. */
+	if (sc->admin == NULL)
+		goto done;
+
+	/*
+	 * A transport error during detach, or during attach
+	 * (nvmf_add_namespaces) before the cdev exists, only shuts down
+	 * the admin queue.  In the detach case this unsticks the detach
+	 * process.
+	 */
+	if (sc->detaching || sc->cdev == NULL) {
+		nvmf_shutdown_qp(sc->admin);
+		goto done;
+	}
+
+	callout_drain(&sc->ka_tx_timer);
+	callout_drain(&sc->ka_rx_timer);
+	sc->ka_traffic = false;
+
+	/* Quiesce namespace consumers. */
+	nvmf_disconnect_sim(sc);
+	for (idx = 0; idx < sc->cdata->nn; idx++) {
+		if (sc->ns[idx] == NULL)
+			continue;
+		nvmf_disconnect_ns(sc->ns[idx]);
+	}
+
+	/* Shutdown the existing qpairs, I/O queues first. */
+	for (idx = 0; idx < sc->num_io_queues; idx++)
+		nvmf_destroy_qp(sc->io[idx]);
+	free(sc->io, M_NVMF);
+	sc->io = NULL;
+	sc->num_io_queues = 0;
+
+	nvmf_destroy_qp(sc->admin);
+	sc->admin = NULL;
+
+done:
+	sx_xunlock(&sc->connection_lock);
+}
+
+/*
+ * Establish a new association for an existing device from the handoff
+ * request 'hh' (NVMF_RECONNECT_HOST ioctl).
+ *
+ * Returns EINVAL if the transport type or subsystem NQN does not match
+ * the device, EBUSY if an admin queue still exists or the device is
+ * detaching, or the error from nvmf_init_ivars(),
+ * nvmf_establish_connection() or nvmf_start_aer().  On success the
+ * namespaces and the SIM are reconnected and 0 is returned.
+ *
+ * NOTE(review): the failure paths after nvmf_establish_connection() only
+ * unlock and free the ivars; any queue pairs already created stay in the
+ * softc.  With sc->admin left non-NULL a later call would take the EBUSY
+ * path above -- confirm whether teardown is expected here.
+ */
+static int
+nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
+{
+ struct nvmf_ivars ivars;
+ u_int i;
+ int error;
+
+ /* XXX: Should we permit changing the transport type? */
+ if (sc->trtype != hh->trtype) {
+ device_printf(sc->dev,
+ "transport type mismatch on reconnect\n");
+ return (EINVAL);
+ }
+
+ error = nvmf_init_ivars(&ivars, hh);
+ if (error != 0)
+ return (error);
+
+ sx_xlock(&sc->connection_lock);
+ if (sc->admin != NULL || sc->detaching) {
+ error = EBUSY;
+ goto out;
+ }
+
+ /*
+ * Ensure this is for the same controller. Note that the
+ * controller ID can vary across associations if the remote
+ * system is using the dynamic controller model. This merely
+ * ensures the new association is connected to the same NVMe
+ * subsystem.
+ */
+ if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
+ sizeof(ivars.cdata->subnqn)) != 0) {
+ device_printf(sc->dev,
+ "controller subsystem NQN mismatch on reconnect\n");
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * XXX: Require same number and size of I/O queues so that
+ * max_pending_io is still correct?
+ */
+
+ error = nvmf_establish_connection(sc, &ivars);
+ if (error != 0)
+ goto out;
+
+ error = nvmf_start_aer(sc);
+ if (error != 0)
+ goto out;
+
+ device_printf(sc->dev,
+ "established new association with %u I/O queues\n",
+ sc->num_io_queues);
+
+ /* Restart namespace consumers. */
+ for (i = 0; i < sc->cdata->nn; i++) {
+ if (sc->ns[i] != NULL)
+ nvmf_reconnect_ns(sc->ns[i]);
+ }
+ nvmf_reconnect_sim(sc);
+out:
+ sx_xunlock(&sc->connection_lock);
+ nvmf_free_ivars(&ivars);
+ return (error);
+}
+
+static int
+nvmf_detach(device_t dev)
+{
+ struct nvmf_softc *sc = device_get_softc(dev);
+ u_int i;
+
+ destroy_dev(sc->cdev);
+
+ sx_xlock(&sc->connection_lock);
+ sc->detaching = true;
+ sx_xunlock(&sc->connection_lock);
+
+ nvmf_destroy_sim(sc);
+ for (i = 0; i < sc->cdata->nn; i++) {
+ if (sc->ns[i] != NULL)
+ nvmf_destroy_ns(sc->ns[i]);
+ }
+ free(sc->ns, M_NVMF);
+
+ callout_drain(&sc->ka_tx_timer);
+ callout_drain(&sc->ka_rx_timer);
+
+ if (sc->admin != NULL)
+ nvmf_shutdown_controller(sc);
+
+ for (i = 0; i < sc->num_io_queues; i++) {
+ nvmf_destroy_qp(sc->io[i]);
+ }
+ free(sc->io, M_NVMF);
+
+ taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+
+ if (sc->admin != NULL)
+ nvmf_destroy_qp(sc->admin);
+
+ nvmf_destroy_aer(sc);
+
+ sx_destroy(&sc->connection_lock);
+ free(sc->cdata, M_NVMF);
+ return (0);
+}
+
+/*
+ * Re-identify namespace 'nsid' and bring sc->ns[] in line with the
+ * result.
+ *
+ * A namespace that now reports a size of zero is destroyed, a namespace
+ * that was not previously present is created, and an existing one is
+ * updated in place (and destroyed if nvmf_update_ns() rejects the new
+ * data).  The SIM is then asked to rescan the namespace.
+ *
+ * Namespace IDs outside 1..cdata->nn are ignored, and nothing is changed
+ * if the IDENTIFY command cannot be sent or fails.
+ */
+void
+nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
+{
+	struct nvmf_completion_status status;
+	struct nvme_namespace_data *data;
+	struct nvmf_namespace *ns;
+
+	/*
+	 * 'nsid' can originate from a log page supplied by the remote
+	 * controller and sc->ns[] only has cdata->nn entries, so refuse
+	 * IDs that would index outside of it.
+	 */
+	if (nsid == 0 || nsid > sc->cdata->nn) {
+		device_printf(sc->dev,
+		    "ignoring rescan of invalid namespace %u\n", nsid);
+		return;
+	}
+
+	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
+
+	nvmf_status_init(&status);
+	nvmf_status_wait_io(&status);
+	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
+	    &status, nvmf_io_complete, &status, M_WAITOK)) {
+		device_printf(sc->dev,
+		    "failed to send IDENTIFY namespace %u command\n", nsid);
+		goto out;
+	}
+	nvmf_wait_for_reply(&status);
+
+	if (status.cqe.status != 0) {
+		device_printf(sc->dev,
+		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
+		    le16toh(status.cqe.status));
+		goto out;
+	}
+
+	if (status.io_error != 0) {
+		device_printf(sc->dev,
+		    "IDENTIFY namespace %u failed with I/O error %d\n",
+		    nsid, status.io_error);
+		goto out;
+	}
+
+	nvme_namespace_data_swapbytes(data);
+
+	/* XXX: Needs locking around sc->ns[]. */
+	ns = sc->ns[nsid - 1];
+	if (data->nsze == 0) {
+		/* XXX: Needs locking */
+		if (ns != NULL) {
+			nvmf_destroy_ns(ns);
+			sc->ns[nsid - 1] = NULL;
+		}
+	} else {
+		/* XXX: Needs locking */
+		if (ns == NULL) {
+			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
+		} else {
+			if (!nvmf_update_ns(ns, data)) {
+				nvmf_destroy_ns(ns);
+				sc->ns[nsid - 1] = NULL;
+			}
+		}
+	}
+
+	free(data, M_NVMF);
+
+	nvmf_sim_rescan_ns(sc, nsid);
+	return;
+
+out:
+	/* Failure: release the scratch buffer without notifying the SIM. */
+	free(data, M_NVMF);
+}
+
+int
+nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
+ bool admin)
+{
+ struct nvmf_completion_status status;
+ struct nvme_command cmd;
+ struct memdesc mem;
+ struct nvmf_host_qpair *qp;
+ struct nvmf_request *req;
+ void *buf;
+ int error;
+
+ if (pt->len > sc->max_xfer_size)
+ return (EINVAL);
+
+ buf = NULL;
+ if (pt->len != 0) {
+ /*
+ * XXX: Depending on the size we may want to pin the
+ * user pages and use a memdesc with vm_page_t's
+ * instead.
+ */
+ buf = malloc(pt->len, M_NVMF, M_WAITOK);
+ if (pt->is_read == 0) {
+ error = copyin(pt->buf, buf, pt->len);
+ if (error != 0) {
+ free(buf, M_NVMF);
+ return (error);
+ }
+ } else {
+ /* Ensure no kernel data is leaked to userland. */
+ memset(buf, 0, pt->len);
+ }
+ }
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = pt->cmd.opc;
+ cmd.fuse = pt->cmd.fuse;
+ cmd.nsid = pt->cmd.nsid;
+ cmd.cdw10 = pt->cmd.cdw10;
+ cmd.cdw11 = pt->cmd.cdw11;
+ cmd.cdw12 = pt->cmd.cdw12;
+ cmd.cdw13 = pt->cmd.cdw13;
+ cmd.cdw14 = pt->cmd.cdw14;
+ cmd.cdw15 = pt->cmd.cdw15;
+
+ if (admin)
+ qp = sc->admin;
+ else
+ qp = nvmf_select_io_queue(sc);
+ nvmf_status_init(&status);
+ req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
+ if (req == NULL) {
+ device_printf(sc->dev, "failed to send passthrough command\n");
+ error = ECONNABORTED;
+ goto error;
+ }
+
+ if (pt->len != 0) {
+ mem = memdesc_vaddr(buf, pt->len);
+ nvmf_capsule_append_data(req->nc, &mem, pt->len,
+ pt->is_read == 0, nvmf_io_complete, &status);
+ nvmf_status_wait_io(&status);
+ }
+
+ nvmf_submit_request(req);
+ nvmf_wait_for_reply(&status);
+
+ memset(&pt->cpl, 0, sizeof(pt->cpl));
+ pt->cpl.cdw0 = status.cqe.cdw0;
+ pt->cpl.status = status.cqe.status;
+
+ error = status.io_error;
+ if (error == 0 && pt->len != 0 && pt->is_read != 0)
+ error = copyout(buf, pt->buf, pt->len);
+error:
+ free(buf, M_NVMF);
+ return (error);
+}
+
+static int
+nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ struct nvmf_softc *sc = cdev->si_drv1;
+ struct nvme_get_nsid *gnsid;
+ struct nvme_pt_command *pt;
+ struct nvmf_reconnect_params *rp;
+ struct nvmf_handoff_host *hh;
+
+ switch (cmd) {
+ case NVME_PASSTHROUGH_CMD:
+ pt = (struct nvme_pt_command *)arg;
+ return (nvmf_passthrough_cmd(sc, pt, true));
+ case NVME_GET_NSID:
+ gnsid = (struct nvme_get_nsid *)arg;
+ strncpy(gnsid->cdev, device_get_nameunit(sc->dev),
+ sizeof(gnsid->cdev));
+ gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
+ gnsid->nsid = 0;
+ return (0);
+ case NVME_GET_MAX_XFER_SIZE:
+ *(uint64_t *)arg = sc->max_xfer_size;
+ return (0);
+ case NVMF_RECONNECT_PARAMS:
+ rp = (struct nvmf_reconnect_params *)arg;
+ if ((sc->cdata->fcatt & 1) == 0)
+ rp->cntlid = NVMF_CNTLID_DYNAMIC;
+ else
+ rp->cntlid = sc->cdata->ctrlr_id;
+ memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn));
+ return (0);
+ case NVMF_RECONNECT_HOST:
+ hh = (struct nvmf_handoff_host *)arg;
+ return (nvmf_reconnect_host(sc, hh));
+ default:
+ return (ENOTTY);
+ }
+}
+
+static struct cdevsw nvmf_cdevsw = {
+ .d_version = D_VERSION,
+ .d_ioctl = nvmf_ioctl
+};
+
+static int
+nvmf_modevent(module_t mod, int what, void *arg)
+{
+ switch (what) {
+ case MOD_LOAD:
+ return (nvmf_ctl_load());
+ case MOD_QUIESCE:
+ return (0);
+ case MOD_UNLOAD:
+ nvmf_ctl_unload();
+ destroy_dev_drain(&nvmf_cdevsw);
+ return (0);
+ default:
+ return (EOPNOTSUPP);
+ }
+}
+
+static device_method_t nvmf_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, nvmf_probe),
+ DEVMETHOD(device_attach, nvmf_attach),
+ DEVMETHOD(device_detach, nvmf_detach),
+#if 0
+ DEVMETHOD(device_shutdown, nvmf_shutdown),
+#endif
+ DEVMETHOD_END
+};
+
+driver_t nvme_nvmf_driver = {
+ "nvme",
+ nvmf_methods,
+ sizeof(struct nvmf_softc),
+};
+
+DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
+MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);
diff --git a/sys/dev/nvmf/host/nvmf_aer.c b/sys/dev/nvmf/host/nvmf_aer.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_aer.c
@@ -0,0 +1,290 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/taskqueue.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+struct nvmf_aer {
+ struct nvmf_softc *sc;
+ uint8_t log_page_id;
+ uint8_t info;
+ uint8_t type;
+
+ u_int page_len;
+ void *page;
+
+ int error;
+ uint16_t status;
+ int pending;
+ struct mtx *lock;
+ struct task complete_task;
+ struct task finish_page_task;
+};
+
+#define MAX_LOG_PAGE_SIZE 4096
+
+static void nvmf_complete_aer(void *arg, const struct nvme_completion *cqe);
+
+static void
+nvmf_submit_aer(struct nvmf_softc *sc, struct nvmf_aer *aer)
+{
+ struct nvmf_request *req;
+ struct nvme_command cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
+
+ req = nvmf_allocate_request(sc->admin, &cmd, nvmf_complete_aer, aer,
+ M_WAITOK);
+ if (req == NULL)
+ return;
+ req->aer = true;
+ nvmf_submit_request(req);
+}
+
+static void
+nvmf_handle_changed_namespaces(struct nvmf_softc *sc,
+ struct nvme_ns_list *ns_list)
+{
+ uint32_t nsid;
+
+ /*
+ * If more than 1024 namespaces have changed, we should
+ * probably just rescan the entire set of namespaces.
+ */
+ if (ns_list->ns[0] == 0xffffffff) {
+ device_printf(sc->dev, "too many changed namespaces\n");
+ return;
+ }
+
+ for (u_int i = 0; i < nitems(ns_list->ns); i++) {
+ if (ns_list->ns[i] == 0)
+ break;
+
+ nsid = le32toh(ns_list->ns[i]);
+ nvmf_rescan_ns(sc, nsid);
+ }
+}
+
+static void
+nvmf_finish_aer_page(struct nvmf_softc *sc, struct nvmf_aer *aer)
+{
+ /* If an error occurred fetching the page, just bail. */
+ if (aer->error != 0 || aer->status != 0)
+ return;
+
+ taskqueue_enqueue(taskqueue_thread, &aer->finish_page_task);
+}
+
+static void
+nvmf_finish_aer_page_task(void *arg, int pending)
+{
+ struct nvmf_aer *aer = arg;
+ struct nvmf_softc *sc = aer->sc;
+
+ switch (aer->log_page_id) {
+ case NVME_LOG_ERROR:
+ /* TODO: Should we log these? */
+ break;
+ case NVME_LOG_CHANGED_NAMESPACE:
+ nvmf_handle_changed_namespaces(sc, aer->page);
+ break;
+ }
+
+ /* Resubmit this AER command. */
+ nvmf_submit_aer(sc, aer);
+}
+
+static void
+nvmf_io_complete_aer_page(void *arg, size_t xfered, int error)
+{
+ struct nvmf_aer *aer = arg;
+ struct nvmf_softc *sc = aer->sc;
+
+ mtx_lock(aer->lock);
+ aer->error = error;
+ aer->pending--;
+ if (aer->pending == 0) {
+ mtx_unlock(aer->lock);
+ nvmf_finish_aer_page(sc, aer);
+ } else
+ mtx_unlock(aer->lock);
+}
+
+static void
+nvmf_complete_aer_page(void *arg, const struct nvme_completion *cqe)
+{
+ struct nvmf_aer *aer = arg;
+ struct nvmf_softc *sc = aer->sc;
+
+ mtx_lock(aer->lock);
+ aer->status = cqe->status;
+ aer->pending--;
+ if (aer->pending == 0) {
+ mtx_unlock(aer->lock);
+ nvmf_finish_aer_page(sc, aer);
+ } else
+ mtx_unlock(aer->lock);
+}
+
+static u_int
+nvmf_log_page_size(struct nvmf_softc *sc, uint8_t log_page_id)
+{
+ switch (log_page_id) {
+ case NVME_LOG_ERROR:
+ return ((sc->cdata->elpe + 1) *
+ sizeof(struct nvme_error_information_entry));
+ case NVME_LOG_CHANGED_NAMESPACE:
+ return (sizeof(struct nvme_ns_list));
+ default:
+ return (0);
+ }
+}
+
+static void
+nvmf_complete_aer(void *arg, const struct nvme_completion *cqe)
+{
+ struct nvmf_aer *aer = arg;
+ struct nvmf_softc *sc = aer->sc;
+ uint32_t cdw0;
+
+ /*
+ * The only error defined for AER is an abort due to
+ * submitting too many AER commands. Just discard this AER
+ * without resubmitting if we get an error.
+ *
+ * NB: Pending AER commands are aborted during controller
+ * shutdown, so discard aborted commands silently.
+ */
+ if (cqe->status != 0) {
+ if (!nvmf_cqe_aborted(cqe))
+ device_printf(sc->dev, "Ignoring error %#x for AER\n",
+ le16toh(cqe->status));
+ return;
+ }
+
+ cdw0 = le32toh(cqe->cdw0);
+ aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cdw0);
+ aer->info = NVMEV(NVME_ASYNC_EVENT_INFO, cdw0);
+ aer->type = NVMEV(NVME_ASYNC_EVENT_TYPE, cdw0);
+
+ device_printf(sc->dev, "AER type %u, info %#x, page %#x\n",
+ aer->type, aer->info, aer->log_page_id);
+
+ aer->page_len = nvmf_log_page_size(sc, aer->log_page_id);
+ taskqueue_enqueue(taskqueue_thread, &aer->complete_task);
+}
+
+static void
+nvmf_complete_aer_task(void *arg, int pending)
+{
+ struct nvmf_aer *aer = arg;
+ struct nvmf_softc *sc = aer->sc;
+
+ if (aer->page_len != 0) {
+ /* Read the associated log page. */
+ aer->page_len = MIN(aer->page_len, MAX_LOG_PAGE_SIZE);
+ aer->pending = 2;
+ (void) nvmf_cmd_get_log_page(sc, NVME_GLOBAL_NAMESPACE_TAG,
+ aer->log_page_id, 0, aer->page, aer->page_len,
+ nvmf_complete_aer_page, aer, nvmf_io_complete_aer_page,
+ aer, M_WAITOK);
+ } else {
+ /* Resubmit this AER command. */
+ nvmf_submit_aer(sc, aer);
+ }
+}
+
+static int
+nvmf_set_async_event_config(struct nvmf_softc *sc, uint32_t config)
+{
+ struct nvme_command cmd;
+ struct nvmf_completion_status status;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_SET_FEATURES;
+ cmd.cdw10 = htole32(NVME_FEAT_ASYNC_EVENT_CONFIGURATION);
+ cmd.cdw11 = htole32(config);
+
+ nvmf_status_init(&status);
+ req = nvmf_allocate_request(sc->admin, &cmd, nvmf_complete, &status,
+ M_WAITOK);
+ if (req == NULL) {
+ device_printf(sc->dev,
+ "failed to allocate SET_FEATURES (ASYNC_EVENT_CONFIGURATION) command\n");
+ return (ECONNABORTED);
+ }
+ nvmf_submit_request(req);
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev,
+ "SET_FEATURES (ASYNC_EVENT_CONFIGURATION) failed, status %#x\n",
+ le16toh(status.cqe.status));
+ return (EIO);
+ }
+
+ return (0);
+}
+
+void
+nvmf_init_aer(struct nvmf_softc *sc)
+{
+ /* 8 matches NVME_MAX_ASYNC_EVENTS */
+ sc->num_aer = min(8, sc->cdata->aerl + 1);
+ sc->aer = mallocarray(sc->num_aer, sizeof(*sc->aer), M_NVMF,
+ M_WAITOK | M_ZERO);
+ for (u_int i = 0; i < sc->num_aer; i++) {
+ sc->aer[i].sc = sc;
+ sc->aer[i].page = malloc(MAX_LOG_PAGE_SIZE, M_NVMF, M_WAITOK);
+ sc->aer[i].lock = mtx_pool_find(mtxpool_sleep, &sc->aer[i]);
+ TASK_INIT(&sc->aer[i].complete_task, 0, nvmf_complete_aer_task,
+ &sc->aer[i]);
+ TASK_INIT(&sc->aer[i].finish_page_task, 0,
+ nvmf_finish_aer_page_task, &sc->aer[i]);
+ }
+}
+
+int
+nvmf_start_aer(struct nvmf_softc *sc)
+{
+ uint32_t async_event_config;
+ int error;
+
+ async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE |
+ NVME_CRIT_WARN_ST_DEVICE_RELIABILITY |
+ NVME_CRIT_WARN_ST_READ_ONLY |
+ NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP;
+ if (sc->cdata->ver >= NVME_REV(1, 2))
+ async_event_config |=
+ sc->cdata->oaes & NVME_ASYNC_EVENT_NS_ATTRIBUTE;
+ error = nvmf_set_async_event_config(sc, async_event_config);
+ if (error != 0)
+ return (error);
+
+ for (u_int i = 0; i < sc->num_aer; i++)
+ nvmf_submit_aer(sc, &sc->aer[i]);
+
+ return (0);
+}
+
+void
+nvmf_destroy_aer(struct nvmf_softc *sc)
+{
+ for (u_int i = 0; i < sc->num_aer; i++) {
+ taskqueue_drain(taskqueue_thread, &sc->aer[i].complete_task);
+ taskqueue_drain(taskqueue_thread, &sc->aer[i].finish_page_task);
+ free(sc->aer[i].page, M_NVMF);
+ }
+ free(sc->aer, M_NVMF);
+}
diff --git a/sys/dev/nvmf/host/nvmf_cmd.c b/sys/dev/nvmf/host/nvmf_cmd.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_cmd.c
@@ -0,0 +1,171 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/memdesc.h>
+#include <sys/systm.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_proto.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+bool
+nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+ nvmf_request_complete_t *cb, void *cb_arg, int how)
+{
+ struct nvmf_fabric_prop_get_cmd cmd;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opcode = NVME_OPC_FABRICS_COMMANDS;
+ cmd.fctype = NVMF_FABRIC_COMMAND_PROPERTY_GET;
+ switch (size) {
+ case 4:
+ cmd.attrib.size = NVMF_PROP_SIZE_4;
+ break;
+ case 8:
+ cmd.attrib.size = NVMF_PROP_SIZE_8;
+ break;
+ default:
+ panic("Invalid property size");
+ }
+ cmd.ofst = htole32(offset);
+
+ req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how);
+ if (req != NULL)
+ nvmf_submit_request(req);
+ return (req != NULL);
+}
+
+bool
+nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+ uint64_t value, nvmf_request_complete_t *cb, void *cb_arg, int how)
+{
+ struct nvmf_fabric_prop_set_cmd cmd;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opcode = NVME_OPC_FABRICS_COMMANDS;
+ cmd.fctype = NVMF_FABRIC_COMMAND_PROPERTY_SET;
+ switch (size) {
+ case 4:
+ cmd.attrib.size = NVMF_PROP_SIZE_4;
+ cmd.value.u32.low = htole32(value);
+ break;
+ case 8:
+ cmd.attrib.size = NVMF_PROP_SIZE_8;
+ cmd.value.u64 = htole64(value);
+ break;
+ default:
+ panic("Invalid property size");
+ }
+ cmd.ofst = htole32(offset);
+
+ req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how);
+ if (req != NULL)
+ nvmf_submit_request(req);
+ return (req != NULL);
+}
+
+bool
+nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb,
+ void *cb_arg, int how)
+{
+ struct nvme_command cmd;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_KEEP_ALIVE;
+
+ req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how);
+ if (req != NULL)
+ nvmf_submit_request(req);
+ return (req != NULL);
+}
+
+bool
+nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how)
+{
+ struct nvme_command cmd;
+ struct memdesc mem;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_IDENTIFY;
+
+ /* 5.15.1 Use CNS of 0x02 for the active namespace ID list. */
+ cmd.cdw10 = htole32(2);
+ cmd.nsid = htole32(id);
+
+ req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how);
+ if (req == NULL)
+ return (false);
+ mem = memdesc_vaddr(nslist, sizeof(*nslist));
+ nvmf_capsule_append_data(req->nc, &mem, sizeof(*nslist), false,
+ io_cb, io_cb_arg);
+ nvmf_submit_request(req);
+ return (true);
+}
+
+bool
+nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how)
+{
+ struct nvme_command cmd;
+ struct memdesc mem;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_IDENTIFY;
+
+ /* 5.15.1 Use CNS of 0x00 for namespace data. */
+ cmd.cdw10 = htole32(0);
+ cmd.nsid = htole32(id);
+
+ req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how);
+ if (req == NULL)
+ return (false);
+ mem = memdesc_vaddr(nsdata, sizeof(*nsdata));
+ nvmf_capsule_append_data(req->nc, &mem, sizeof(*nsdata), false,
+ io_cb, io_cb_arg);
+ nvmf_submit_request(req);
+ return (true);
+}
+
+bool
+nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid,
+ uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how)
+{
+ struct nvme_command cmd;
+ struct memdesc mem;
+ struct nvmf_request *req;
+ size_t numd;
+
+ MPASS(len != 0 && len % 4 == 0);
+ MPASS(offset % 4 == 0);
+
+ numd = (len / 4) - 1;
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_GET_LOG_PAGE;
+ cmd.nsid = htole32(nsid);
+ cmd.cdw10 = htole32(numd << 16 | lid);
+ cmd.cdw11 = htole32(numd >> 16);
+ cmd.cdw12 = htole32(offset);
+ cmd.cdw13 = htole32(offset >> 32);
+
+ req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how);
+ if (req == NULL)
+ return (false);
+ mem = memdesc_vaddr(buf, len);
+ nvmf_capsule_append_data(req->nc, &mem, len, false, io_cb, io_cb_arg);
+ nvmf_submit_request(req);
+ return (true);
+}
diff --git a/sys/dev/nvmf/host/nvmf_ctldev.c b/sys/dev/nvmf/host/nvmf_ctldev.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_ctldev.c
@@ -0,0 +1,159 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/malloc.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+static struct cdev *nvmf_cdev;
+
+static int
+nvmf_handoff_host(struct nvmf_handoff_host *hh)
+{
+ struct nvmf_ivars ivars;
+ device_t dev;
+ int error;
+
+ error = nvmf_init_ivars(&ivars, hh);
+ if (error != 0)
+ return (error);
+
+ bus_topo_lock();
+ dev = device_add_child(root_bus, "nvme", -1);
+ if (dev == NULL) {
+ bus_topo_unlock();
+ error = ENXIO;
+ goto out;
+ }
+
+ device_set_ivars(dev, &ivars);
+ error = device_probe_and_attach(dev);
+ device_set_ivars(dev, NULL);
+ if (error != 0)
+ device_delete_child(root_bus, dev);
+ bus_topo_unlock();
+
+out:
+ nvmf_free_ivars(&ivars);
+ return (error);
+}
+
+static bool
+nvmf_matches(device_t dev, char *name)
+{
+ struct nvmf_softc *sc = device_get_softc(dev);
+
+ if (strcmp(device_get_nameunit(dev), name) == 0)
+ return (true);
+ if (strcmp(sc->cdata->subnqn, name) == 0)
+ return (true);
+ return (false);
+}
+
+static int
+nvmf_disconnect_by_name(char *name)
+{
+ devclass_t dc;
+ device_t dev;
+ int error, unit;
+ bool found;
+
+ found = false;
+ error = 0;
+ bus_topo_lock();
+ dc = devclass_find("nvme");
+ if (dc == NULL)
+ goto out;
+
+ for (unit = 0; unit < devclass_get_maxunit(dc); unit++) {
+ dev = devclass_get_device(dc, unit);
+ if (dev == NULL)
+ continue;
+ if (device_get_driver(dev) != &nvme_nvmf_driver)
+ continue;
+ if (device_get_parent(dev) != root_bus)
+ continue;
+ if (name != NULL && !nvmf_matches(dev, name))
+ continue;
+
+ error = device_delete_child(root_bus, dev);
+ if (error != 0)
+ break;
+ found = true;
+ }
+out:
+ bus_topo_unlock();
+ if (error == 0 && !found)
+ error = ENOENT;
+ return (error);
+}
+
+static int
+nvmf_disconnect_host(const char **namep)
+{
+ char *name;
+ int error;
+
+ name = malloc(PATH_MAX, M_NVMF, M_WAITOK);
+ error = copyinstr(*namep, name, PATH_MAX, NULL);
+ if (error == 0)
+ error = nvmf_disconnect_by_name(name);
+ free(name, M_NVMF);
+ return (error);
+}
+
+static int
+nvmf_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ switch (cmd) {
+ case NVMF_HANDOFF_HOST:
+ return (nvmf_handoff_host((struct nvmf_handoff_host *)arg));
+ case NVMF_DISCONNECT_HOST:
+ return (nvmf_disconnect_host((const char **)arg));
+ case NVMF_DISCONNECT_ALL:
+ return (nvmf_disconnect_by_name(NULL));
+ default:
+ return (ENOTTY);
+ }
+}
+
+static struct cdevsw nvmf_ctl_cdevsw = {
+ .d_version = D_VERSION,
+ .d_ioctl = nvmf_ctl_ioctl
+};
+
+int
+nvmf_ctl_load(void)
+{
+ struct make_dev_args mda;
+ int error;
+
+ make_dev_args_init(&mda);
+ mda.mda_devsw = &nvmf_ctl_cdevsw;
+ mda.mda_uid = UID_ROOT;
+ mda.mda_gid = GID_WHEEL;
+ mda.mda_mode = 0600;
+ error = make_dev_s(&mda, &nvmf_cdev, "nvmf");
+ if (error != 0)
+ nvmf_cdev = NULL;
+ return (error);
+}
+
+void
+nvmf_ctl_unload(void)
+{
+ if (nvmf_cdev != NULL) {
+ destroy_dev(nvmf_cdev);
+ nvmf_cdev = NULL;
+ }
+}
diff --git a/sys/dev/nvmf/host/nvmf_ns.c b/sys/dev/nvmf/host/nvmf_ns.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_ns.c
@@ -0,0 +1,483 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/refcount.h>
+#include <sys/sbuf.h>
+#include <machine/stdarg.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+struct nvmf_namespace {
+ struct nvmf_softc *sc;
+ uint64_t size;
+ uint32_t id;
+ u_int flags;
+ uint32_t lba_size;
+ bool disconnected;
+
+ TAILQ_HEAD(, bio) pending_bios;
+ struct mtx lock;
+ volatile u_int active_bios;
+
+ struct cdev *cdev;
+};
+
+static void nvmf_ns_strategy(struct bio *bio);
+
+static void
+ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
+{
+ char buf[128];
+ struct sbuf sb;
+ va_list ap;
+
+ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+ sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
+
+ sbuf_printf(&sb, "%sns%u: ", device_get_nameunit(ns->sc->dev),
+ ns->id);
+
+ va_start(ap, fmt);
+ sbuf_vprintf(&sb, fmt, ap);
+ va_end(ap);
+
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+}
+
+/*
+ * The I/O completion may trigger after the received CQE if the I/O
+ * used a zero-copy mbuf that isn't harvested until after the NIC
+ * driver processes TX completions. Abuse bio_driver1 as a refcount.
+ * Store I/O errors in bio_driver2.
+ */
+static __inline u_int *
+bio_refs(struct bio *bio)
+{
+ return ((u_int *)&bio->bio_driver1);
+}
+
+static void
+nvmf_ns_biodone(struct bio *bio)
+{
+ struct nvmf_namespace *ns;
+ int error;
+
+ if (!refcount_release(bio_refs(bio)))
+ return;
+
+ ns = bio->bio_dev->si_drv1;
+
+ /* If a request is aborted, resubmit or queue it for resubmission. */
+ if (bio->bio_error == ECONNABORTED) {
+ bio->bio_error = 0;
+ bio->bio_driver2 = 0;
+ mtx_lock(&ns->lock);
+ if (ns->disconnected) {
+ TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
+ mtx_unlock(&ns->lock);
+ } else {
+ mtx_unlock(&ns->lock);
+ nvmf_ns_strategy(bio);
+ }
+ } else {
+ /*
+ * I/O errors take precedence over generic EIO from
+ * CQE errors.
+ */
+ error = (intptr_t)bio->bio_driver2;
+ if (error != 0)
+ bio->bio_error = error;
+ if (bio->bio_error != 0)
+ bio->bio_flags |= BIO_ERROR;
+ biodone(bio);
+ }
+
+ if (refcount_release(&ns->active_bios))
+ wakeup(ns);
+}
+
+static void
+nvmf_ns_io_complete(void *arg, size_t xfered, int error)
+{
+ struct bio *bio = arg;
+
+ KASSERT(xfered <= bio->bio_bcount,
+ ("%s: xfered > bio_bcount", __func__));
+
+ bio->bio_driver2 = (void *)(intptr_t)error;
+ bio->bio_resid = bio->bio_bcount - xfered;
+
+ nvmf_ns_biodone(bio);
+}
+
+static void
+nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
+{
+ struct bio *bio = arg;
+
+ if (error != 0)
+ bio->bio_resid = bio->bio_bcount;
+ else
+ bio->bio_resid = 0;
+
+ free(bio->bio_driver2, M_NVMF);
+ bio->bio_driver2 = (void *)(intptr_t)error;
+
+ nvmf_ns_biodone(bio);
+}
+
+static void
+nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
+{
+ struct bio *bio = arg;
+
+ if (nvmf_cqe_aborted(cqe))
+ bio->bio_error = ECONNABORTED;
+ else if (cqe->status != 0)
+ bio->bio_error = EIO;
+
+ nvmf_ns_biodone(bio);
+}
+
+static int
+nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
+{
+ struct nvme_command cmd;
+ struct nvmf_request *req;
+ struct nvme_dsm_range *dsm_range;
+ struct memdesc mem;
+ uint64_t lba, lba_count;
+
+ dsm_range = NULL;
+ memset(&cmd, 0, sizeof(cmd));
+ switch (bio->bio_cmd) {
+ case BIO_READ:
+ lba = bio->bio_offset / ns->lba_size;
+ lba_count = bio->bio_bcount / ns->lba_size;
+ nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
+ break;
+ case BIO_WRITE:
+ lba = bio->bio_offset / ns->lba_size;
+ lba_count = bio->bio_bcount / ns->lba_size;
+ nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
+ break;
+ case BIO_FLUSH:
+ nvme_ns_flush_cmd(&cmd, ns->id);
+ break;
+ case BIO_DELETE:
+ dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
+ M_ZERO);
+ if (dsm_range == NULL)
+ return (ENOMEM);
+ lba = bio->bio_offset / ns->lba_size;
+ lba_count = bio->bio_bcount / ns->lba_size;
+ dsm_range->starting_lba = htole64(lba);
+ dsm_range->length = htole32(lba_count);
+
+ cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
+ cmd.nsid = htole32(ns->id);
+ cmd.cdw10 = htole32(0); /* 1 range */
+ cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+
+ mtx_lock(&ns->lock);
+ if (ns->disconnected) {
+ TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
+ mtx_unlock(&ns->lock);
+ free(dsm_range, M_NVMF);
+ return (0);
+ }
+
+ req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
+ nvmf_ns_bio_complete, bio, M_NOWAIT);
+ if (req == NULL) {
+ mtx_unlock(&ns->lock);
+ free(dsm_range, M_NVMF);
+ return (ENOMEM);
+ }
+
+ switch (bio->bio_cmd) {
+ case BIO_READ:
+ case BIO_WRITE:
+ refcount_init(bio_refs(bio), 2);
+ mem = memdesc_bio(bio);
+ nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
+ bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
+ break;
+ case BIO_DELETE:
+ refcount_init(bio_refs(bio), 2);
+ mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
+ nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
+ true, nvmf_ns_delete_complete, bio);
+ bio->bio_driver2 = dsm_range;
+ break;
+ default:
+ refcount_init(bio_refs(bio), 1);
+ KASSERT(bio->bio_resid == 0,
+ ("%s: input bio_resid != 0", __func__));
+ break;
+ }
+
+ refcount_acquire(&ns->active_bios);
+ nvmf_submit_request(req);
+ mtx_unlock(&ns->lock);
+ return (0);
+}
+
+static int
+nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ struct nvmf_namespace *ns = dev->si_drv1;
+ struct nvme_get_nsid *gnsid;
+ struct nvme_pt_command *pt;
+
+ switch (cmd) {
+ case NVME_PASSTHROUGH_CMD:
+ pt = (struct nvme_pt_command *)arg;
+ pt->cmd.nsid = htole32(ns->id);
+ return (nvmf_passthrough_cmd(ns->sc, pt, false));
+ case NVME_GET_NSID:
+ gnsid = (struct nvme_get_nsid *)arg;
+ strncpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
+ sizeof(gnsid->cdev));
+ gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
+ gnsid->nsid = ns->id;
+ return (0);
+ case DIOCGMEDIASIZE:
+ *(off_t *)arg = ns->size;
+ return (0);
+ case DIOCGSECTORSIZE:
+ *(u_int *)arg = ns->lba_size;
+ return (0);
+ default:
+ return (ENOTTY);
+ }
+}
+
+static int
+nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ int error;
+
+ error = 0;
+ if ((oflags & FWRITE) != 0)
+ error = securelevel_gt(td->td_ucred, 0);
+ return (error);
+}
+
+void
+nvmf_ns_strategy(struct bio *bio)
+{
+ struct nvmf_namespace *ns;
+ int error;
+
+ ns = bio->bio_dev->si_drv1;
+
+ error = nvmf_ns_submit_bio(ns, bio);
+ if (error != 0) {
+ bio->bio_error = error;
+ bio->bio_flags |= BIO_ERROR;
+ bio->bio_resid = bio->bio_bcount;
+ biodone(bio);
+ }
+}
+
+static struct cdevsw nvmf_ns_cdevsw = {
+ .d_version = D_VERSION,
+ .d_flags = D_DISK,
+ .d_open = nvmf_ns_open,
+ .d_read = physread,
+ .d_write = physwrite,
+ .d_strategy = nvmf_ns_strategy,
+ .d_ioctl = nvmf_ns_ioctl
+};
+
+struct nvmf_namespace *
+nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_namespace_data *data)
+{
+ struct make_dev_args mda;
+ struct nvmf_namespace *ns;
+ int error;
+ uint8_t lbads, lbaf;
+
+ ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
+ ns->sc = sc;
+ ns->id = id;
+ TAILQ_INIT(&ns->pending_bios);
+ mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);
+
+ /* One dummy bio avoids dropping to 0 until destroy. */
+ refcount_init(&ns->active_bios, 1);
+
+ if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
+ ns_printf(ns, "End-to-end data protection not supported\n");
+ goto fail;
+ }
+
+ lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
+ if (lbaf > data->nlbaf) {
+ ns_printf(ns, "Invalid LBA format index\n");
+ goto fail;
+ }
+
+ if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
+ ns_printf(ns, "Namespaces with metadata are not supported\n");
+ goto fail;
+ }
+
+ lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
+ if (lbads == 0 || lbads > 31) {
+ ns_printf(ns, "Invalid LBA format index\n");
+ goto fail;
+ }
+ /* lbads is 1..31 here, so the unsigned shift fits in lba_size. */
+ ns->lba_size = 1U << lbads;
+ ns->size = data->nsze * ns->lba_size;
+
+ if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
+ ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;
+
+ if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
+ ns->flags |= NVME_NS_FLUSH_SUPPORTED;
+
+ /*
+ * XXX: Does any of the boundary splitting for NOIOB make any
+ * sense for Fabrics?
+ */
+
+ make_dev_args_init(&mda);
+ mda.mda_devsw = &nvmf_ns_cdevsw;
+ mda.mda_uid = UID_ROOT;
+ mda.mda_gid = GID_WHEEL;
+ mda.mda_mode = 0600;
+ mda.mda_si_drv1 = ns;
+ error = make_dev_s(&mda, &ns->cdev, "%sns%u",
+ device_get_nameunit(sc->dev), id);
+ if (error != 0)
+ goto fail;
+
+ ns->cdev->si_flags |= SI_UNMAPPED;
+
+ return (ns);
+fail:
+ mtx_destroy(&ns->lock);
+ free(ns, M_NVMF);
+ return (NULL);
+}
+
+void
+nvmf_disconnect_ns(struct nvmf_namespace *ns)
+{
+ mtx_lock(&ns->lock);
+ ns->disconnected = true;
+ mtx_unlock(&ns->lock);
+}
+
+void
+nvmf_reconnect_ns(struct nvmf_namespace *ns)
+{
+ TAILQ_HEAD(, bio) bios;
+ struct bio *bio;
+
+ mtx_lock(&ns->lock);
+ ns->disconnected = false;
+ TAILQ_INIT(&bios);
+ TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
+ mtx_unlock(&ns->lock);
+
+ while (!TAILQ_EMPTY(&bios)) {
+ bio = TAILQ_FIRST(&bios);
+ TAILQ_REMOVE(&bios, bio, bio_queue);
+ nvmf_ns_strategy(bio);
+ }
+}
+
+void
+nvmf_destroy_ns(struct nvmf_namespace *ns)
+{
+ TAILQ_HEAD(, bio) bios;
+ struct bio *bio;
+
+ destroy_dev(ns->cdev);
+
+ /*
+ * Wait for active I/O requests to drain. The release drops
+ * the reference on the "dummy bio" taken when the namespace
+ * was created.
+ */
+ mtx_lock(&ns->lock);
+ if (!refcount_release(&ns->active_bios)) {
+ while (ns->active_bios != 0)
+ mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
+ }
+
+ /* Abort any pending I/O requests. */
+ TAILQ_INIT(&bios);
+ TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
+ mtx_unlock(&ns->lock);
+
+ while (!TAILQ_EMPTY(&bios)) {
+ bio = TAILQ_FIRST(&bios);
+ TAILQ_REMOVE(&bios, bio, bio_queue);
+ bio->bio_error = ECONNABORTED;
+ bio->bio_flags |= BIO_ERROR;
+ bio->bio_resid = bio->bio_bcount;
+ biodone(bio);
+ }
+
+ mtx_destroy(&ns->lock);
+ free(ns, M_NVMF);
+}
+
+bool
+nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data)
+{
+ uint8_t lbads, lbaf;
+
+ if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
+ ns_printf(ns, "End-to-end data protection not supported\n");
+ return (false);
+ }
+
+ lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
+ if (lbaf > data->nlbaf) {
+ ns_printf(ns, "Invalid LBA format index\n");
+ return (false);
+ }
+
+ if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
+ ns_printf(ns, "Namespaces with metadata are not supported\n");
+ return (false);
+ }
+
+ lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
+ if (lbads == 0 || lbads > 31) {
+ ns_printf(ns, "Invalid LBA format index\n");
+ return (false);
+ }
+ /* lbads is 1..31 here, so the unsigned shift fits in lba_size. */
+ ns->lba_size = 1U << lbads;
+ ns->size = data->nsze * ns->lba_size;
+ return (true);
+}
diff --git a/sys/dev/nvmf/host/nvmf_qpair.c b/sys/dev/nvmf/host/nvmf_qpair.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_qpair.c
@@ -0,0 +1,386 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+struct nvmf_host_command {
+ struct nvmf_request *req;
+ TAILQ_ENTRY(nvmf_host_command) link;
+ uint16_t cid;
+};
+
+struct nvmf_host_qpair {
+ struct nvmf_softc *sc;
+ struct nvmf_qpair *qp;
+
+ bool sq_flow_control;
+ bool shutting_down;
+ u_int allocating;
+ u_int num_commands;
+ uint16_t sqhd;
+ uint16_t sqtail;
+
+ struct mtx lock;
+
+ TAILQ_HEAD(, nvmf_host_command) free_commands;
+ STAILQ_HEAD(, nvmf_request) pending_requests;
+
+ /* Indexed by cid. */
+ struct nvmf_host_command **active_commands;
+
+ char name[16];
+};
+
+struct nvmf_request *
+nvmf_allocate_request(struct nvmf_host_qpair *qp, void *sqe,
+ nvmf_request_complete_t *cb, void *cb_arg, int how)
+{
+ struct nvmf_request *req;
+ struct nvmf_qpair *nq;
+
+ KASSERT(how == M_WAITOK || how == M_NOWAIT,
+ ("%s: invalid how", __func__));
+
+ req = malloc(sizeof(*req), M_NVMF, how | M_ZERO);
+ if (req == NULL)
+ return (NULL);
+
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ if (nq == NULL) {
+ mtx_unlock(&qp->lock);
+ free(req, M_NVMF);
+ return (NULL);
+ }
+ qp->allocating++;
+ MPASS(qp->allocating != 0);
+ mtx_unlock(&qp->lock);
+
+ req->qp = qp;
+ req->cb = cb;
+ req->cb_arg = cb_arg;
+ req->nc = nvmf_allocate_command(nq, sqe, how);
+ if (req->nc == NULL) {
+ free(req, M_NVMF);
+ req = NULL;
+ }
+
+ mtx_lock(&qp->lock);
+ qp->allocating--;
+ if (qp->allocating == 0 && qp->shutting_down)
+ wakeup(qp);
+ mtx_unlock(&qp->lock);
+
+ return (req);
+}
+
+static void
+nvmf_abort_request(struct nvmf_request *req, uint16_t cid)
+{
+ struct nvme_completion cqe;
+
+ memset(&cqe, 0, sizeof(cqe));
+ cqe.cid = cid;
+ cqe.status = htole16(NVMEF(NVME_STATUS_SCT, NVME_SCT_PATH_RELATED) |
+ NVMEF(NVME_STATUS_SC, NVME_SC_COMMAND_ABORTED_BY_HOST));
+ req->cb(req->cb_arg, &cqe);
+}
+
+void
+nvmf_free_request(struct nvmf_request *req)
+{
+ if (req->nc != NULL)
+ nvmf_free_capsule(req->nc);
+ free(req, M_NVMF);
+}
+
+static void
+nvmf_dispatch_command(struct nvmf_host_qpair *qp, struct nvmf_host_command *cmd)
+{
+ struct nvmf_softc *sc = qp->sc;
+ struct nvme_command *sqe;
+ struct nvmf_capsule *nc;
+ int error;
+
+ nc = cmd->req->nc;
+ sqe = nvmf_capsule_sqe(nc);
+
+ /*
+ * NB: Don't bother byte-swapping the cid so that receive
+ * doesn't have to swap.
+ */
+ sqe->cid = cmd->cid;
+
+ error = nvmf_transmit_capsule(nc);
+ if (error != 0) {
+ device_printf(sc->dev,
+ "failed to transmit capsule: %d, disconnecting\n", error);
+ nvmf_disconnect(sc);
+ return;
+ }
+
+ if (sc->ka_traffic)
+ atomic_store_int(&sc->ka_active_tx_traffic, 1);
+}
+
+static void
+nvmf_qp_error(void *arg, int error)
+{
+ struct nvmf_host_qpair *qp = arg;
+ struct nvmf_softc *sc = qp->sc;
+
+ /* Ignore simple close of queue pairs during shutdown. */
+ if (!(sc->detaching && error == 0))
+ device_printf(sc->dev, "error %d on %s, disconnecting\n", error,
+ qp->name);
+ nvmf_disconnect(sc);
+}
+
+static void
+nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc)
+{
+ struct nvmf_host_qpair *qp = arg;
+ struct nvmf_softc *sc = qp->sc;
+ struct nvmf_host_command *cmd;
+ struct nvmf_request *req;
+ const struct nvme_completion *cqe;
+ uint16_t cid;
+
+ cqe = nvmf_capsule_cqe(nc);
+
+ if (sc->ka_traffic)
+ atomic_store_int(&sc->ka_active_rx_traffic, 1);
+
+ /*
+ * NB: Don't bother byte-swapping the cid as transmit doesn't
+ * swap either.
+ */
+ cid = cqe->cid;
+ /* Valid CIDs index active_commands[]: 0 .. num_commands - 1. */
+ if (cid >= qp->num_commands) {
+ device_printf(sc->dev,
+ "received invalid CID %u, disconnecting\n", cid);
+ nvmf_disconnect(sc);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ /*
+ * If the queue has been shutdown due to an error, silently
+ * drop the response.
+ */
+ mtx_lock(&qp->lock);
+ if (qp->qp == NULL) {
+ device_printf(sc->dev,
+ "received completion for CID %u on shutdown %s\n", cid,
+ qp->name);
+ mtx_unlock(&qp->lock);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ cmd = qp->active_commands[cid];
+ if (cmd == NULL) {
+ mtx_unlock(&qp->lock);
+ device_printf(sc->dev,
+ "received completion for inactive CID %u, disconnecting\n",
+ cid);
+ nvmf_disconnect(sc);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ KASSERT(cmd->cid == cid, ("%s: CID mismatch", __func__));
+ req = cmd->req;
+ cmd->req = NULL;
+ if (STAILQ_EMPTY(&qp->pending_requests)) {
+ qp->active_commands[cid] = NULL;
+ TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link);
+ mtx_unlock(&qp->lock);
+ } else {
+ cmd->req = STAILQ_FIRST(&qp->pending_requests);
+ STAILQ_REMOVE_HEAD(&qp->pending_requests, link);
+ mtx_unlock(&qp->lock);
+ nvmf_dispatch_command(qp, cmd);
+ }
+
+ req->cb(req->cb_arg, cqe);
+ nvmf_free_capsule(nc);
+ nvmf_free_request(req);
+}
+
+struct nvmf_host_qpair *
+nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype,
+ struct nvmf_handoff_qpair_params *handoff, const char *name)
+{
+ struct nvmf_host_command *cmd, *ncmd;
+ struct nvmf_host_qpair *qp;
+ u_int i;
+
+ qp = malloc(sizeof(*qp), M_NVMF, M_WAITOK | M_ZERO);
+ qp->sc = sc;
+ qp->sq_flow_control = handoff->sq_flow_control;
+ qp->sqhd = handoff->sqhd;
+ qp->sqtail = handoff->sqtail;
+ strlcpy(qp->name, name, sizeof(qp->name));
+ mtx_init(&qp->lock, "nvmf qp", NULL, MTX_DEF);
+
+ /*
+ * Allocate a spare command slot for each pending AER command
+ * on the admin queue.
+ */
+ qp->num_commands = handoff->qsize - 1;
+ if (handoff->admin)
+ qp->num_commands += sc->num_aer;
+
+ qp->active_commands = malloc(sizeof(*qp->active_commands) *
+ qp->num_commands, M_NVMF, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&qp->free_commands);
+ for (i = 0; i < qp->num_commands; i++) {
+ cmd = malloc(sizeof(*cmd), M_NVMF, M_WAITOK | M_ZERO);
+ cmd->cid = i;
+ TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link);
+ }
+ STAILQ_INIT(&qp->pending_requests);
+
+ qp->qp = nvmf_allocate_qpair(trtype, false, handoff, nvmf_qp_error,
+ qp, nvmf_receive_capsule, qp);
+ if (qp->qp == NULL) {
+ TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) {
+ TAILQ_REMOVE(&qp->free_commands, cmd, link);
+ free(cmd, M_NVMF);
+ }
+ free(qp->active_commands, M_NVMF);
+ mtx_destroy(&qp->lock);
+ free(qp, M_NVMF);
+ return (NULL);
+ }
+
+ return (qp);
+}
+
+void
+nvmf_shutdown_qp(struct nvmf_host_qpair *qp)
+{
+ struct nvmf_host_command *cmd;
+ struct nvmf_request *req;
+ struct nvmf_qpair *nq;
+
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ qp->qp = NULL;
+
+ if (nq == NULL) {
+ while (qp->shutting_down)
+ mtx_sleep(qp, &qp->lock, 0, "nvmfqpsh", 0);
+ mtx_unlock(&qp->lock);
+ return;
+ }
+ qp->shutting_down = true;
+ while (qp->allocating != 0)
+ mtx_sleep(qp, &qp->lock, 0, "nvmfqpqu", 0);
+ mtx_unlock(&qp->lock);
+
+ nvmf_free_qpair(nq);
+
+ /*
+ * Abort outstanding requests. Active requests will have
+ * their I/O completions invoked and associated capsules freed
+ * by the transport layer via nvmf_free_qpair. Pending
+ * requests must have their I/O completion invoked via
+ * nvmf_abort_capsule_data.
+ */
+ for (u_int i = 0; i < qp->num_commands; i++) {
+ cmd = qp->active_commands[i];
+ if (cmd != NULL) {
+ if (!cmd->req->aer)
+ printf("%s: aborted active command %p (CID %u)\n",
+ __func__, cmd->req, cmd->cid);
+
+ /* This was freed by nvmf_free_qpair. */
+ cmd->req->nc = NULL;
+ nvmf_abort_request(cmd->req, cmd->cid);
+ nvmf_free_request(cmd->req);
+ free(cmd, M_NVMF);
+ }
+ }
+ while (!STAILQ_EMPTY(&qp->pending_requests)) {
+ req = STAILQ_FIRST(&qp->pending_requests);
+ STAILQ_REMOVE_HEAD(&qp->pending_requests, link);
+ if (!req->aer)
+ printf("%s: aborted pending command %p\n", __func__,
+ req);
+ nvmf_abort_capsule_data(req->nc, ECONNABORTED);
+ nvmf_abort_request(req, 0);
+ nvmf_free_request(req);
+ }
+
+ mtx_lock(&qp->lock);
+ qp->shutting_down = false;
+ mtx_unlock(&qp->lock);
+ wakeup(qp);
+}
+
+void
+nvmf_destroy_qp(struct nvmf_host_qpair *qp)
+{
+ struct nvmf_host_command *cmd, *ncmd;
+
+ nvmf_shutdown_qp(qp);
+
+ TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) {
+ TAILQ_REMOVE(&qp->free_commands, cmd, link);
+ free(cmd, M_NVMF);
+ }
+ free(qp->active_commands, M_NVMF);
+ mtx_destroy(&qp->lock);
+ free(qp, M_NVMF);
+}
+
+void
+nvmf_submit_request(struct nvmf_request *req)
+{
+ struct nvmf_host_qpair *qp;
+ struct nvmf_host_command *cmd;
+
+ qp = req->qp;
+ mtx_lock(&qp->lock);
+ if (qp->qp == NULL) {
+ mtx_unlock(&qp->lock);
+ printf("%s: aborted pending command %p\n", __func__, req);
+ nvmf_abort_capsule_data(req->nc, ECONNABORTED);
+ nvmf_abort_request(req, 0);
+ nvmf_free_request(req);
+ return;
+ }
+ cmd = TAILQ_FIRST(&qp->free_commands);
+ if (cmd == NULL) {
+ /*
+ * Queue this request. Will be sent after enough
+ * in-flight requests have completed.
+ */
+ STAILQ_INSERT_TAIL(&qp->pending_requests, req, link);
+ mtx_unlock(&qp->lock);
+ return;
+ }
+
+ TAILQ_REMOVE(&qp->free_commands, cmd, link);
+ KASSERT(qp->active_commands[cmd->cid] == NULL,
+ ("%s: CID already busy", __func__));
+ qp->active_commands[cmd->cid] = cmd;
+ cmd->req = req;
+ mtx_unlock(&qp->lock);
+ nvmf_dispatch_command(qp, cmd);
+}
diff --git a/sys/dev/nvmf/host/nvmf_sim.c b/sys/dev/nvmf/host/nvmf_sim.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_sim.c
@@ -0,0 +1,332 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/refcount.h>
+
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+#include <cam/cam_sim.h>
+#include <cam/cam_xpt_sim.h>
+#include <cam/cam_debug.h>
+
+#include <dev/nvmf/host/nvmf_var.h>
+
+/*
+ * The I/O completion may trigger after the received CQE if the I/O
+ * used a zero-copy mbuf that isn't harvested until after the NIC
+ * driver processes TX completions. Use spriv_field0 as a refcount.
+ *
+ * Store any I/O error returned in spriv_field1.
+ */
+static __inline u_int *
+ccb_refs(union ccb *ccb)
+{
+ return ((u_int *)&ccb->ccb_h.spriv_field0); /* spriv_field0 is reused as the CCB's reference count */
+}
+/* Alias for spriv_field1, which holds the error passed to the data-transfer completion callback. */
+#define spriv_ioerror spriv_field1
+/* Drop one CCB reference; on the final release, map the CQE / I/O error to a CAM status and complete the CCB. */
+static void
+nvmf_ccb_done(union ccb *ccb)
+{
+ if (!refcount_release(ccb_refs(ccb)))
+ return; /* another completion (CQE or data transfer) is still outstanding */
+ /* Precedence: host abort, then non-zero CQE status, then data-transfer error, else success. */
+ if (nvmf_cqe_aborted(&ccb->nvmeio.cpl)) {
+ ccb->ccb_h.status = CAM_REQUEUE_REQ; /* host-aborted command: ask CAM to requeue it */
+ xpt_done(ccb);
+ } else if (ccb->nvmeio.cpl.status != 0) {
+ ccb->ccb_h.status = CAM_NVME_STATUS_ERROR;
+ xpt_done(ccb);
+ } else if (ccb->ccb_h.spriv_ioerror != 0) {
+ KASSERT(ccb->ccb_h.spriv_ioerror != EJUSTRETURN, /* EJUSTRETURN marker is set by nvmf_ccb_io_complete() */
+ ("%s: zero sized transfer without CQE error", __func__));
+ ccb->ccb_h.status = CAM_REQ_CMP_ERR;
+ xpt_done(ccb);
+ } else {
+ ccb->ccb_h.status = CAM_REQ_CMP;
+ xpt_done_direct(ccb); /* only the success path uses xpt_done_direct() */
+ }
+}
+/* Data-transfer completion callback for a CCB (arg): record the error in spriv_ioerror and drop one CCB reference. */
+static void
+nvmf_ccb_io_complete(void *arg, size_t xfered, int error)
+{
+ union ccb *ccb = arg;
+
+ /*
+ * TODO: Reporting partial completions requires extending
+ * nvmeio to support resid and updating nda to handle partial
+ * reads, either by returning partial success (or an error) to
+ * the caller, or retrying all or part of the request.
+ */
+ ccb->ccb_h.spriv_ioerror = error; /* may be replaced by the EJUSTRETURN marker below (INVARIANTS only) */
+ if (error == 0) {
+ if (xfered == 0) {
+#ifdef INVARIANTS
+ /*
+ * If the request fails with an error in the CQE
+ * there will be no data transferred but also no
+ * I/O error.
+ */
+ ccb->ccb_h.spriv_ioerror = EJUSTRETURN;
+#endif
+ } else
+ KASSERT(xfered == ccb->nvmeio.dxfer_len, /* a successful transfer must cover the whole buffer */
+ ("%s: partial CCB completion", __func__));
+ }
+
+ nvmf_ccb_done(ccb);
+}
+/* Request completion callback for a CCB (arg): save the CQE in the CCB and drop one CCB reference. */
+static void
+nvmf_ccb_complete(void *arg, const struct nvme_completion *cqe)
+{
+ union ccb *ccb = arg;
+
+ ccb->nvmeio.cpl = *cqe; /* copied by value; nvmf_ccb_done() reads it from the CCB */
+ nvmf_ccb_done(ccb);
+}
+/* Build an nvmf request from an XPT_NVME_IO or XPT_NVME_ADMIN CCB and submit it; failure paths complete the CCB here. */
+static void
+nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb)
+{
+ struct ccb_nvmeio *nvmeio = &ccb->nvmeio;
+ struct memdesc mem;
+ struct nvmf_request *req;
+ struct nvmf_host_qpair *qp;
+ /* sim_mtx is held until the request has been submitted; sim_disconnected is only changed under it. */
+ mtx_lock(&sc->sim_mtx);
+ if (sc->sim_disconnected) {
+ mtx_unlock(&sc->sim_mtx);
+ nvmeio->ccb_h.status = CAM_REQUEUE_REQ; /* disconnected: hand the CCB back to CAM for requeueing */
+ xpt_done(ccb);
+ return;
+ }
+ if (nvmeio->ccb_h.func_code == XPT_NVME_IO)
+ qp = nvmf_select_io_queue(sc); /* I/O commands go to an I/O queue */
+ else
+ qp = sc->admin; /* everything else (XPT_NVME_ADMIN) goes to the admin queue */
+ req = nvmf_allocate_request(qp, &nvmeio->cmd, nvmf_ccb_complete,
+ ccb, M_NOWAIT);
+ if (req == NULL) {
+ mtx_unlock(&sc->sim_mtx);
+ nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL;
+ xpt_done(ccb);
+ return;
+ }
+ /* With data, both nvmf_ccb_complete and nvmf_ccb_io_complete will run, so two references; otherwise one. */
+ if (nvmeio->dxfer_len != 0) {
+ refcount_init(ccb_refs(ccb), 2);
+ mem = memdesc_ccb(ccb);
+ nvmf_capsule_append_data(req->nc, &mem, nvmeio->dxfer_len,
+ (ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT, /* true when the CCB direction is CAM_DIR_OUT */
+ nvmf_ccb_io_complete, ccb);
+ } else
+ refcount_init(ccb_refs(ccb), 1);
+
+ /*
+ * Clear spriv_ioerror as it can hold an earlier error if this
+ * CCB was aborted and has been retried.
+ */
+ ccb->ccb_h.spriv_ioerror = 0;
+ KASSERT(ccb->ccb_h.status == CAM_REQ_INPROG,
+ ("%s: incoming CCB is not in-progress", __func__));
+ ccb->ccb_h.status |= CAM_SIM_QUEUED;
+ nvmf_submit_request(req); /* called with sim_mtx still held */
+ mtx_unlock(&sc->sim_mtx);
+}
+/* CAM SIM action entry point: answer path/transport queries inline and pass NVMe I/O and admin CCBs to nvmf_sim_io(). */
+static void
+nvmf_sim_action(struct cam_sim *sim, union ccb *ccb)
+{
+ struct nvmf_softc *sc = cam_sim_softc(sim);
+
+ CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
+ ("nvmf_sim_action: func= %#x\n",
+ ccb->ccb_h.func_code));
+
+ switch (ccb->ccb_h.func_code) {
+ case XPT_PATH_INQ: /* Path routing inquiry */
+ {
+ struct ccb_pathinq *cpi = &ccb->cpi;
+
+ cpi->version_num = 1;
+ cpi->hba_inquiry = 0;
+ cpi->target_sprt = 0;
+ cpi->hba_misc = PIM_UNMAPPED | PIM_NOSCAN;
+ cpi->hba_eng_cnt = 0;
+ cpi->max_target = 0;
+ cpi->max_lun = sc->cdata->nn; /* LUNs are namespace IDs, bounded by cdata->nn */
+ cpi->async_flags = 0;
+ cpi->hpath_id = 0;
+ cpi->initiator_id = 0;
+ strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
+ strlcpy(cpi->hba_vid, "NVMeoF", HBA_IDLEN);
+ strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
+ cpi->unit_number = cam_sim_unit(sim);
+ cpi->bus_id = 0;
+
+ /* XXX: Same as iSCSI. */
+ cpi->base_transfer_speed = 150000;
+ cpi->protocol = PROTO_NVME;
+ cpi->protocol_version = sc->vs;
+ cpi->transport = XPORT_NVMF;
+ cpi->transport_version = sc->vs;
+ cpi->xport_specific.nvmf.nsid =
+ xpt_path_lun_id(ccb->ccb_h.path);
+ cpi->xport_specific.nvmf.trtype = sc->trtype;
+ strlcpy(cpi->xport_specific.nvmf.dev_name, /* strlcpy, as for the names above: always NUL-terminated */
+ device_get_nameunit(sc->dev),
+ sizeof(cpi->xport_specific.nvmf.dev_name));
+ cpi->maxio = sc->max_xfer_size;
+ cpi->hba_vendor = 0;
+ cpi->hba_device = 0;
+ cpi->hba_subvendor = 0;
+ cpi->hba_subdevice = 0;
+ cpi->ccb_h.status = CAM_REQ_CMP;
+ break;
+ }
+ case XPT_GET_TRAN_SETTINGS: /* Get transport settings */
+ {
+ struct ccb_trans_settings *cts = &ccb->cts;
+ struct ccb_trans_settings_nvme *nvme;
+ struct ccb_trans_settings_nvmf *nvmf;
+
+ cts->protocol = PROTO_NVME;
+ cts->protocol_version = sc->vs;
+ cts->transport = XPORT_NVMF;
+ cts->transport_version = sc->vs;
+
+ nvme = &cts->proto_specific.nvme;
+ nvme->valid = CTS_NVME_VALID_SPEC;
+ nvme->spec = sc->vs;
+
+ nvmf = &cts->xport_specific.nvmf;
+ nvmf->valid = CTS_NVMF_VALID_TRTYPE;
+ nvmf->trtype = sc->trtype;
+ cts->ccb_h.status = CAM_REQ_CMP;
+ break;
+ }
+ case XPT_SET_TRAN_SETTINGS: /* Set transport settings */
+ /*
+ * No transfer settings can be set, but nvme_xpt sends
+ * this anyway.
+ */
+ ccb->ccb_h.status = CAM_REQ_CMP;
+ break;
+ case XPT_NVME_IO: /* Execute the requested I/O */
+ case XPT_NVME_ADMIN: /* or Admin operation */
+ nvmf_sim_io(sc, ccb);
+ return; /* nvmf_sim_io() owns completion of the CCB; skip the xpt_done() below */
+ default:
+ /* XXX */
+ device_printf(sc->dev, "unhandled sim function %#x\n",
+ ccb->ccb_h.func_code);
+ ccb->ccb_h.status = CAM_REQ_INVALID;
+ break;
+ }
+ xpt_done(ccb); /* all inline cases complete the CCB here */
+}
+/* Create the CAM SIM: allocate simq and SIM, register the bus, create a wildcard path. Returns 0, ENOMEM or ENXIO. */
+int
+nvmf_init_sim(struct nvmf_softc *sc)
+{
+ struct cam_devq *devq;
+ int max_trans;
+
+ max_trans = sc->max_pending_io * 3 / 4; /* simq and SIM are sized at 3/4 of max_pending_io */
+ devq = cam_simq_alloc(max_trans);
+ if (devq == NULL) {
+ device_printf(sc->dev, "Failed to allocate CAM simq\n");
+ return (ENOMEM);
+ }
+ /* Each failure path below undoes the steps completed so far, including destroying sim_mtx. */
+ mtx_init(&sc->sim_mtx, "nvmf sim", NULL, MTX_DEF);
+ sc->sim = cam_sim_alloc(nvmf_sim_action, NULL, "nvme", sc,
+ device_get_unit(sc->dev), NULL, max_trans, max_trans, devq);
+ if (sc->sim == NULL) {
+ device_printf(sc->dev, "Failed to allocate CAM sim\n");
+ cam_simq_free(devq); /* no SIM was created, so devq is freed directly */
+ mtx_destroy(&sc->sim_mtx);
+ return (ENXIO);
+ }
+ if (xpt_bus_register(sc->sim, sc->dev, 0) != CAM_SUCCESS) {
+ device_printf(sc->dev, "Failed to create CAM bus\n");
+ cam_sim_free(sc->sim, TRUE); /* TRUE: presumably also releases devq - confirm against cam_sim_free() */
+ mtx_destroy(&sc->sim_mtx);
+ return (ENXIO);
+ }
+ if (xpt_create_path(&sc->path, NULL, cam_sim_path(sc->sim),
+ CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
+ device_printf(sc->dev, "Failed to create CAM path\n");
+ xpt_bus_deregister(cam_sim_path(sc->sim));
+ cam_sim_free(sc->sim, TRUE);
+ mtx_destroy(&sc->sim_mtx);
+ return (ENXIO);
+ }
+ return (0);
+}
+/* Request a CAM rescan of the LUN matching namespace 'id'; allocation or path failures are only logged. */
+void
+nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id)
+{
+ union ccb *ccb;
+
+ ccb = xpt_alloc_ccb_nowait();
+ if (ccb == NULL) {
+ device_printf(sc->dev,
+ "unable to alloc CCB for rescan of namespace %u\n", id);
+ return;
+ }
+
+ /*
+ * As with nvme_sim, map NVMe namespace IDs onto CAM unit
+ * LUNs.
+ */
+ if (xpt_create_path(&ccb->ccb_h.path, NULL, cam_sim_path(sc->sim), 0,
+ id) != CAM_REQ_CMP) {
+ device_printf(sc->dev,
+ "Unable to create path for rescan of namespace %u\n", id);
+ xpt_free_ccb(ccb);
+ return;
+ }
+ xpt_rescan(ccb); /* the CCB is handed to xpt_rescan(); it is not freed here */
+}
+/* Mark the SIM disconnected and freeze its queue; nvmf_sim_io() then completes new CCBs with CAM_REQUEUE_REQ. */
+void
+nvmf_disconnect_sim(struct nvmf_softc *sc)
+{
+ mtx_lock(&sc->sim_mtx);
+ sc->sim_disconnected = true;
+ xpt_freeze_simq(sc->sim, 1); /* frozen while sim_mtx is still held */
+ mtx_unlock(&sc->sim_mtx);
+}
+/* Clear the disconnected flag and release one simq freeze (the count nvmf_disconnect_sim() takes). */
+void
+nvmf_reconnect_sim(struct nvmf_softc *sc)
+{
+ mtx_lock(&sc->sim_mtx);
+ sc->sim_disconnected = false;
+ mtx_unlock(&sc->sim_mtx);
+ xpt_release_simq(sc->sim, 1); /* released after sim_mtx is dropped */
+}
+/* Tear down the SIM: post AC_LOST_DEVICE, undo any freeze, then free the path, bus registration, SIM and mutex. */
+void
+nvmf_destroy_sim(struct nvmf_softc *sc)
+{
+ xpt_async(AC_LOST_DEVICE, sc->path, NULL);
+ if (sc->sim_disconnected) /* read without sim_mtx */
+ xpt_release_simq(sc->sim, 1); /* balances the freeze from nvmf_disconnect_sim() */
+ xpt_free_path(sc->path);
+ xpt_bus_deregister(cam_sim_path(sc->sim));
+ cam_sim_free(sc->sim, TRUE);
+ mtx_destroy(&sc->sim_mtx);
+}
diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_var.h
@@ -0,0 +1,208 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __NVMF_VAR_H__
+#define __NVMF_VAR_H__
+
+#include <sys/_callout.h>
+#include <sys/_lock.h>
+#include <sys/_mutex.h>
+#include <sys/_sx.h>
+#include <sys/_task.h>
+#include <sys/queue.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf_transport.h>
+
+struct nvmf_aer;
+struct nvmf_capsule;
+struct nvmf_host_qpair;
+struct nvmf_namespace;
+
+typedef void nvmf_request_complete_t(void *, const struct nvme_completion *);
+/* Bundle of host handoff data, I/O queue parameters and controller data; see nvmf_init_ivars()/nvmf_free_ivars(). */
+struct nvmf_ivars {
+ struct nvmf_handoff_host *hh;
+ struct nvmf_handoff_qpair_params *io_params;
+ struct nvme_controller_data *cdata;
+};
+/* Per-device softc for the nvmf host driver; it is the softc pointer registered with the CAM SIM. */
+struct nvmf_softc {
+ device_t dev;
+
+ struct nvmf_host_qpair *admin; /* queue used for XPT_NVME_ADMIN CCBs */
+ struct nvmf_host_qpair **io; /* I/O queues; nvmf_select_io_queue() currently always uses io[0] */
+ u_int num_io_queues; /* presumably the number of entries in io[] - confirm */
+ enum nvmf_trtype trtype; /* transport type, reported to CAM */
+
+ struct cam_sim *sim;
+ struct cam_path *path; /* wildcard path created by nvmf_init_sim() */
+ struct mtx sim_mtx; /* held while sim_disconnected is changed and across CCB submission */
+ bool sim_disconnected; /* set by nvmf_disconnect_sim(), cleared by nvmf_reconnect_sim() */
+
+ struct nvmf_namespace **ns;
+
+ struct nvme_controller_data *cdata; /* cdata->nn is reported to CAM as max_lun */
+ uint64_t cap;
+ uint32_t vs; /* reported to CAM as protocol and transport version */
+ u_int max_pending_io; /* the CAM simq is sized at 3/4 of this */
+ u_long max_xfer_size; /* reported to CAM as maxio */
+
+ struct cdev *cdev;
+
+ /*
+ * Keep Alive support depends on two timers. The 'tx' timer
+ * is responsible for sending KeepAlive commands and runs at
+ * half the timeout interval. The 'rx' timer is responsible
+ * for detecting an actual timeout.
+ *
+ * For efficient support of TKAS, the host does not reschedule
+ * these timers every time new commands are scheduled.
+ * Instead, the host sets the *_traffic flags when commands
+ * are sent and received. The timeout handlers check and
+ * clear these flags. This does mean it can take up to twice
+ * the timeout time to detect an AWOL controller.
+ */
+ bool ka_traffic; /* Using TKAS? */
+
+ volatile int ka_active_tx_traffic;
+ struct callout ka_tx_timer;
+ sbintime_t ka_tx_sbt;
+
+ volatile int ka_active_rx_traffic;
+ struct callout ka_rx_timer;
+ sbintime_t ka_rx_sbt;
+
+ struct sx connection_lock;
+ struct task disconnect_task;
+ bool detaching;
+
+ u_int num_aer;
+ struct nvmf_aer *aer;
+};
+/* One command submitted on a host queue pair: its capsule plus the completion callback and argument. */
+struct nvmf_request {
+ struct nvmf_host_qpair *qp; /* queue pair nvmf_submit_request() submits on */
+ struct nvmf_capsule *nc;
+ nvmf_request_complete_t *cb;
+ void *cb_arg;
+ bool aer; /* when set, aborting the request while pending is not logged */
+
+ STAILQ_ENTRY(nvmf_request) link; /* linkage on the queue pair's pending_requests list */
+};
+/* Completion state of one command; presumably what nvmf_wait_for_reply() waits on - confirm in nvmf.c. */
+struct nvmf_completion_status {
+ struct nvme_completion cqe;
+ bool done; /* initialised to false by nvmf_status_init() */
+ bool io_done; /* true unless nvmf_status_wait_io() was called */
+ int io_error;
+};
+/* Choose the I/O queue pair for a new request; currently always the first entry of sc->io. */
+static __inline struct nvmf_host_qpair *
+nvmf_select_io_queue(struct nvmf_softc *sc)
+{
+ /* TODO: Support multiple queues? */
+ return (sc->io[0]);
+}
+/* Return true when the CQE status is path-related (SCT) with status code "command aborted by host". */
+static __inline bool
+nvmf_cqe_aborted(const struct nvme_completion *cqe)
+{
+ uint16_t status;
+
+ status = le16toh(cqe->status); /* convert from little-endian before extracting the fields */
+ return (NVME_STATUS_GET_SCT(status) == NVME_SCT_PATH_RELATED &&
+ NVME_STATUS_GET_SC(status) == NVME_SC_COMMAND_ABORTED_BY_HOST);
+}
+/* Reset a completion status: command not done, no data-transfer completion awaited, no I/O error. */
+static __inline void
+nvmf_status_init(struct nvmf_completion_status *status)
+{
+ status->done = false;
+ status->io_done = true; /* nvmf_status_wait_io() flips this to false when a data completion is expected */
+ status->io_error = 0;
+}
+/* Mark that a data-transfer completion is still awaited, by clearing io_done. */
+static __inline void
+nvmf_status_wait_io(struct nvmf_completion_status *status)
+{
+ status->io_done = false;
+}
+
+#ifdef DRIVER_MODULE
+extern driver_t nvme_nvmf_driver;
+#endif
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_NVMF);
+#endif
+
+/* nvmf.c */
+void nvmf_complete(void *arg, const struct nvme_completion *cqe);
+void nvmf_io_complete(void *arg, size_t xfered, int error);
+void nvmf_wait_for_reply(struct nvmf_completion_status *status);
+int nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh);
+void nvmf_free_ivars(struct nvmf_ivars *ivars);
+void nvmf_disconnect(struct nvmf_softc *sc);
+void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid);
+int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
+ bool admin);
+
+/* nvmf_aer.c */
+void nvmf_init_aer(struct nvmf_softc *sc);
+int nvmf_start_aer(struct nvmf_softc *sc);
+void nvmf_destroy_aer(struct nvmf_softc *sc);
+
+/* nvmf_cmd.c */
+bool nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset,
+ uint8_t size, nvmf_request_complete_t *cb, void *cb_arg, int how);
+bool nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset,
+ uint8_t size, uint64_t value, nvmf_request_complete_t *cb, void *cb_arg,
+ int how);
+bool nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb,
+ void *cb_arg, int how);
+bool nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how);
+bool nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how);
+bool nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid,
+ uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how);
+
+/* nvmf_ctldev.c */
+int nvmf_ctl_load(void);
+void nvmf_ctl_unload(void);
+
+/* nvmf_ns.c */
+struct nvmf_namespace *nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_namespace_data *data);
+void nvmf_disconnect_ns(struct nvmf_namespace *ns);
+void nvmf_reconnect_ns(struct nvmf_namespace *ns);
+void nvmf_destroy_ns(struct nvmf_namespace *ns);
+bool nvmf_update_ns(struct nvmf_namespace *ns,
+ struct nvme_namespace_data *data);
+
+/* nvmf_qpair.c */
+struct nvmf_host_qpair *nvmf_init_qp(struct nvmf_softc *sc,
+ enum nvmf_trtype trtype, struct nvmf_handoff_qpair_params *handoff,
+ const char *name);
+void nvmf_shutdown_qp(struct nvmf_host_qpair *qp);
+void nvmf_destroy_qp(struct nvmf_host_qpair *qp);
+struct nvmf_request *nvmf_allocate_request(struct nvmf_host_qpair *qp,
+ void *sqe, nvmf_request_complete_t *cb, void *cb_arg, int how);
+void nvmf_submit_request(struct nvmf_request *req);
+void nvmf_free_request(struct nvmf_request *req);
+
+/* nvmf_sim.c */
+int nvmf_init_sim(struct nvmf_softc *sc);
+void nvmf_disconnect_sim(struct nvmf_softc *sc);
+void nvmf_reconnect_sim(struct nvmf_softc *sc);
+void nvmf_destroy_sim(struct nvmf_softc *sc);
+void nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id);
+
+#endif /* !__NVMF_VAR_H__ */
diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile
--- a/sys/modules/nvmf/Makefile
+++ b/sys/modules/nvmf/Makefile
@@ -1,4 +1,5 @@
-SUBDIR= nvmf_tcp \
+SUBDIR= nvmf \
+ nvmf_tcp \
nvmf_transport
.include <bsd.subdir.mk>
diff --git a/sys/modules/nvmf/nvmf/Makefile b/sys/modules/nvmf/nvmf/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/nvmf/nvmf/Makefile
@@ -0,0 +1,13 @@
+.PATH: ${SRCTOP}/sys/dev/nvmf/host
+
+KMOD= nvmf
+
+SRCS= nvmf.c \
+ nvmf_aer.c \
+ nvmf_cmd.c \
+ nvmf_ctldev.c \
+ nvmf_ns.c \
+ nvmf_qpair.c \
+ nvmf_sim.c
+
+.include <bsd.kmod.mk>
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Jan 27, 11:48 AM (4 h, 10 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
16195313
Default Alt Text
D44714.diff (77 KB)
Attached To
Mode
D44714: nvmf: The in-kernel NVMe over Fabrics host
Attached
Detach File
Event Timeline
Log In to Comment