Page MenuHomeFreeBSD

D44714.diff
No OneTemporary

D44714.diff

diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -408,6 +408,7 @@
nvd.4 \
${_nvdimm.4} \
nvme.4 \
+ nvmf.4 \
nvmf_tcp.4 \
${_nvram.4} \
oce.4 \
diff --git a/share/man/man4/nvmf.4 b/share/man/man4/nvmf.4
new file mode 100644
--- /dev/null
+++ b/share/man/man4/nvmf.4
@@ -0,0 +1,87 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2024 Chelsio Communications, Inc.
+.\"
+.Dd May 2, 2024
+.Dt NVMF 4
+.Os
+.Sh NAME
+.Nm nvmf
+.Nd "NVM Express over Fabrics host driver"
+.Sh SYNOPSIS
+To compile the driver into the kernel,
+place the following line in the
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device nvmf"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+nvmf_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver provides the kernel component of an NVM Express over Fabrics
+host.
+The NVMeoF host is the client which provides local access to
+namespaces exported by a remote controller.
+.Pp
+Associations between the local host and remote controllers are managed
+using
+.Xr nvmecontrol 8 .
+New associations are created via the
+.Cm connect
+command and destroyed via the
+.Cm disconnect
+command.
+If an association's connection is interrupted,
+the
+.Cm reconnect
+command creates a new association to replace the interrupted association.
+.Pp
+Similar to
+.Xr nvme 4 ,
+.Nm
+creates controller device nodes using the format
+.Pa /dev/nvmeX
+and namespace device nodes using the format
+.Pa /dev/nvmeXnsY .
+.Nm
+also exports remote namespaces via the CAM
+.Xr nda 4
+peripheral driver.
+Unlike
+.Xr nvme 4 ,
+.Nm
+does not support the
+.Xr nvd 4
+disk driver.
+.Pp
+Associations require a supported transport such as
+.Xr nvmf_tcp 4
+for associations using TCP/IP.
+.Sh SEE ALSO
+.Xr nda 4 ,
+.Xr nvme 4 ,
+.Xr nvmf_tcp 4 ,
+.Xr nvmft 4 ,
+.Xr nvmecontrol 8
+.Sh HISTORY
+The
+.Nm
+module first appeared in
+.Fx 15.0 .
+.Sh AUTHORS
+The
+.Nm
+driver was developed by
+.An John Baldwin Aq Mt jhb@FreeBSD.org
+under sponsorship from Chelsio Communications, Inc.
+.Sh BUGS
+.Nm
+only supports a single I/O queue pair per association.
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1676,12 +1676,14 @@
# NVM Express
#
# nvme: PCI-express NVM Express host controllers
+# nvmf: NVM Express over Fabrics host
# nvmf_tcp: TCP transport for NVM Express over Fabrics
# nda: CAM NVMe disk driver
# nvd: non-CAM NVMe disk driver
-device nvme # base NVMe driver
+device nvme # PCI-express NVMe host driver
options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver
+device nvmf # NVMeoF host driver
device nvmf_tcp # NVMeoF TCP transport
device nda # NVMe direct access devices (aka disks)
device nvd # expose NVMe namespaces as disks, depends on nvme
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2533,7 +2533,15 @@
dev/nvme/nvme_util.c optional nvme
dev/nvmem/nvmem.c optional nvmem fdt
dev/nvmem/nvmem_if.m optional nvmem
+dev/nvmf/host/nvmf.c optional nvmf
+dev/nvmf/host/nvmf_aer.c optional nvmf
+dev/nvmf/host/nvmf_cmd.c optional nvmf
+dev/nvmf/host/nvmf_ctldev.c optional nvmf
+dev/nvmf/host/nvmf_ns.c optional nvmf
+dev/nvmf/host/nvmf_qpair.c optional nvmf
+dev/nvmf/host/nvmf_sim.c optional nvmf
dev/nvmf/nvmf_tcp.c optional nvmf_tcp
+dev/nvmf/nvmf_transport.c optional nvmf
dev/oce/oce_hw.c optional oce pci
dev/oce/oce_if.c optional oce pci
dev/oce/oce_mbox.c optional oce pci
diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf.c
@@ -0,0 +1,939 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+static struct cdevsw nvmf_cdevsw;
+
+MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
+
+static void nvmf_disconnect_task(void *arg, int pending);
+
+/*
+ * Completion callback for synchronous commands.  Saves the completion
+ * queue entry in the caller's nvmf_completion_status, marks the command
+ * as done under the pool mutex associated with the status, and wakes up
+ * any thread sleeping on the status (see nvmf_wait_for_reply()).
+ */
+void
+nvmf_complete(void *arg, const struct nvme_completion *cqe)
+{
+	struct nvmf_completion_status *st;
+	struct mtx *pool_mtx;
+
+	st = arg;
+	pool_mtx = mtx_pool_find(mtxpool_sleep, st);
+
+	st->cqe = *cqe;
+	mtx_lock(pool_mtx);
+	st->done = true;
+	mtx_unlock(pool_mtx);
+	wakeup(st);
+}
+
+/*
+ * Data-transfer completion callback for synchronous commands.  Records
+ * the transfer error in the caller's nvmf_completion_status, marks the
+ * I/O as done under the pool mutex, and wakes up any waiter.  The number
+ * of bytes transferred is not used.
+ */
+void
+nvmf_io_complete(void *arg, size_t xfered, int error)
+{
+	struct nvmf_completion_status *st;
+	struct mtx *pool_mtx;
+
+	st = arg;
+	pool_mtx = mtx_pool_find(mtxpool_sleep, st);
+
+	st->io_error = error;
+	mtx_lock(pool_mtx);
+	st->io_done = true;
+	mtx_unlock(pool_mtx);
+	wakeup(st);
+}
+
+/*
+ * Sleep until both the command completion (done) and the data transfer
+ * completion (io_done) have been reported for 'status'.  The sleep uses
+ * the same pool mutex that nvmf_complete() and nvmf_io_complete() take
+ * before setting those flags.
+ */
+void
+nvmf_wait_for_reply(struct nvmf_completion_status *status)
+{
+	struct mtx *pool_mtx;
+
+	pool_mtx = mtx_pool_find(mtxpool_sleep, status);
+	mtx_lock(pool_mtx);
+	while (!(status->done && status->io_done))
+		mtx_sleep(status, pool_mtx, 0, "nvmfcmd", 0);
+	mtx_unlock(pool_mtx);
+}
+
+/*
+ * Synchronously read a controller property with a Fabrics PROPERTY_GET
+ * command.
+ *
+ * 'size' is the property width in bytes (4 or 8).  On success the value
+ * is converted to host byte order and stored in *value.
+ *
+ * Returns 0 on success, ECONNABORTED if the command could not be sent,
+ * or EIO if the controller reported a non-zero status.
+ */
+static int
+nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+    uint64_t *value)
+{
+	struct nvmf_completion_status st;
+	const struct nvmf_fabric_prop_get_rsp *reply;
+	bool sent;
+
+	nvmf_status_init(&st);
+	sent = nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &st,
+	    M_WAITOK);
+	if (!sent)
+		return (ECONNABORTED);
+	nvmf_wait_for_reply(&st);
+
+	if (st.cqe.status != 0) {
+		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
+		    le16toh(st.cqe.status));
+		return (EIO);
+	}
+
+	/* The property value is carried in the completion entry itself. */
+	reply = (const struct nvmf_fabric_prop_get_rsp *)&st.cqe;
+	*value = (size == 8) ? le64toh(reply->value.u64) :
+	    le32toh(reply->value.u32.low);
+	return (0);
+}
+
+/*
+ * Synchronously write a controller property with a Fabrics PROPERTY_SET
+ * command.  'size' is the property width in bytes (4 or 8).
+ *
+ * Returns 0 on success, ECONNABORTED if the command could not be sent,
+ * or EIO if the controller reported a non-zero status.
+ */
+static int
+nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+    uint64_t value)
+{
+	struct nvmf_completion_status st;
+	bool sent;
+
+	nvmf_status_init(&st);
+	sent = nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete,
+	    &st, M_WAITOK);
+	if (!sent)
+		return (ECONNABORTED);
+	nvmf_wait_for_reply(&st);
+
+	if (st.cqe.status == 0)
+		return (0);
+
+	device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
+	    le16toh(st.cqe.status));
+	return (EIO);
+}
+
+/*
+ * Ask the remote controller to shut down normally by reading the CC
+ * property, setting CC.SHN to NVME_SHN_NORMAL, and writing it back.
+ * Failures are logged and otherwise ignored.
+ */
+static void
+nvmf_shutdown_controller(struct nvmf_softc *sc)
+{
+	uint64_t cc;
+
+	if (nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc) != 0) {
+		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
+		return;
+	}
+
+	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);
+
+	if (nvmf_write_property(sc, NVMF_PROP_CC, 4, cc) != 0) {
+		device_printf(sc->dev,
+		    "Failed to set CC to trigger shutdown\n");
+	}
+}
+
+/*
+ * Receive-side KeepAlive timer.  If the ka_active_rx_traffic flag was
+ * set since the last run, clear it and re-arm the timer; otherwise
+ * treat the association as timed out and schedule a disconnect.
+ */
+static void
+nvmf_check_keep_alive(void *arg)
+{
+	struct nvmf_softc *sc = arg;
+
+	if (atomic_readandclear_int(&sc->ka_active_rx_traffic) != 0) {
+		callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
+		    C_HARDCLOCK);
+		return;
+	}
+
+	device_printf(sc->dev, "disconnecting due to KeepAlive timeout\n");
+	nvmf_disconnect(sc);
+}
+
+/*
+ * Completion callback for KeepAlive commands.  Any response counts as
+ * received traffic for nvmf_check_keep_alive(); a non-zero status is
+ * only logged.
+ */
+static void
+nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
+{
+	struct nvmf_softc *sc = arg;
+
+	atomic_store_int(&sc->ka_active_rx_traffic, 1);
+	if (cqe->status == 0)
+		return;
+
+	device_printf(sc->dev, "KeepAlive response reported status %#x\n",
+	    le16toh(cqe->status));
+}
+
+/*
+ * Transmit-side KeepAlive timer.  Sends a KeepAlive command unless
+ * ka_active_tx_traffic shows that another command was sent during the
+ * interval, then clears the flag and re-arms the timer.
+ */
+static void
+nvmf_send_keep_alive(void *arg)
+{
+	struct nvmf_softc *sc = arg;
+	bool sent;
+
+	/*
+	 * Don't bother sending a KeepAlive command if TBKAS is active
+	 * and another command has been sent during the interval.
+	 */
+	if (atomic_load_int(&sc->ka_active_tx_traffic) == 0) {
+		sent = nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete, sc,
+		    M_NOWAIT);
+		if (!sent)
+			device_printf(sc->dev,
+			    "Failed to allocate KeepAlive command\n");
+	}
+
+	/* Clear ka_active_tx_traffic after sending the keep alive command. */
+	atomic_store_int(&sc->ka_active_tx_traffic, 0);
+
+	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
+}
+
+/*
+ * Populate 'ivars' from a handoff request supplied by userland.
+ *
+ * Copies in the controller data and the I/O queue parameters referenced
+ * by 'hh' and validates them: the admin queue must be flagged as an
+ * admin queue, at least one I/O queue is required, no I/O queue may be
+ * flagged as an admin queue, and all I/O queues must share one size.
+ *
+ * Returns 0 on success, in which case the caller releases the copies
+ * with nvmf_free_ivars().  On failure an errno value is returned and
+ * 'ivars' holds no allocations (its pointers are reset to NULL so that
+ * a later nvmf_free_ivars() is harmless).
+ */
+int
+nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
+{
+	size_t len;
+	u_int i;
+	int error;
+
+	memset(ivars, 0, sizeof(*ivars));
+
+	if (!hh->admin.admin || hh->num_io_queues < 1)
+		return (EINVAL);
+
+	ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
+	error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
+	if (error != 0)
+		goto out;
+	nvme_controller_data_swapbytes(ivars->cdata);
+
+	len = hh->num_io_queues * sizeof(*ivars->io_params);
+	ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
+	error = copyin(hh->io, ivars->io_params, len);
+	if (error != 0)
+		goto out;
+	for (i = 0; i < hh->num_io_queues; i++) {
+		if (ivars->io_params[i].admin) {
+			error = EINVAL;
+			goto out;
+		}
+
+		/* Require all I/O queues to be the same size. */
+		if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
+			error = EINVAL;
+			goto out;
+		}
+	}
+
+	ivars->hh = hh;
+	return (0);
+
+out:
+	/* Don't leave dangling pointers behind in the caller's ivars. */
+	free(ivars->io_params, M_NVMF);
+	ivars->io_params = NULL;
+	free(ivars->cdata, M_NVMF);
+	ivars->cdata = NULL;
+	return (error);
+}
+
+/*
+ * Release the buffers allocated by nvmf_init_ivars().  Either pointer
+ * may be NULL (for example after nvmf_attach() claimed cdata).
+ */
+void
+nvmf_free_ivars(struct nvmf_ivars *ivars)
+{
+	free(ivars->cdata, M_NVMF);
+	free(ivars->io_params, M_NVMF);
+}
+
+/*
+ * Newbus probe method.  Only devices that carry nvmf ivars are claimed;
+ * the device description is built from the subsystem NQN in the
+ * controller data.
+ */
+static int
+nvmf_probe(device_t dev)
+{
+	struct nvmf_ivars *ivars;
+	char descr[260];
+
+	ivars = device_get_ivars(dev);
+	if (ivars == NULL)
+		return (ENXIO);
+
+	snprintf(descr, sizeof(descr), "Fabrics: %.256s",
+	    ivars->cdata->subnqn);
+	device_set_desc_copy(dev, descr);
+	return (BUS_PROBE_DEFAULT);
+}
+
+/*
+ * Create the admin and I/O queue pairs described by 'ivars' and, when
+ * the handoff supplies a non-zero KATO, start the KeepAlive timers.
+ *
+ * Returns 0 on success or ENXIO if a queue pair could not be set up.
+ * On failure any queue pairs created so far are left in the softc for
+ * the caller to clean up.
+ */
+static int
+nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
+{
+	struct nvmf_handoff_host *hh;
+	char qname[16];
+	u_int qidx;
+
+	hh = ivars->hh;
+
+	/* Setup the admin queue. */
+	sc->admin = nvmf_init_qp(sc, hh->trtype, &hh->admin, "admin queue");
+	if (sc->admin == NULL) {
+		device_printf(sc->dev, "Failed to setup admin queue\n");
+		return (ENXIO);
+	}
+
+	/* Setup I/O queues. */
+	sc->io = malloc(hh->num_io_queues * sizeof(*sc->io), M_NVMF,
+	    M_WAITOK | M_ZERO);
+	sc->num_io_queues = hh->num_io_queues;
+	for (qidx = 0; qidx < sc->num_io_queues; qidx++) {
+		snprintf(qname, sizeof(qname), "I/O queue %u", qidx);
+		sc->io[qidx] = nvmf_init_qp(sc, hh->trtype,
+		    &ivars->io_params[qidx], qname);
+		if (sc->io[qidx] != NULL)
+			continue;
+
+		/*
+		 * NOTE(review): the queue is named with a zero-based
+		 * index above but reported one-based here -- confirm
+		 * which numbering is intended.
+		 */
+		device_printf(sc->dev, "Failed to setup I/O queue %u\n",
+		    qidx + 1);
+		return (ENXIO);
+	}
+
+	/* Without a KeepAlive timeout there are no timers to start. */
+	if (hh->kato == 0)
+		return (0);
+
+	/*
+	 * Start KeepAlive timers.  The transmit timer runs at half the
+	 * receive (timeout) interval.
+	 */
+	sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
+	    sc->cdata->ctratt) != 0;
+	sc->ka_rx_sbt = mstosbt(hh->kato);
+	sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
+	callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
+	    nvmf_check_keep_alive, sc, C_HARDCLOCK);
+	callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
+	    nvmf_send_keep_alive, sc, C_HARDCLOCK);
+
+	return (0);
+}
+
+/*
+ * Fetch one page of the active namespace ID list and create a namespace
+ * for every entry on it.
+ *
+ * On entry *nsidp is the NSID to pass to IDENTIFY (the controller
+ * returns active NSIDs greater than this value).  On return *nsidp is
+ * either 0, meaning the scan is complete, or the NSID to pass to the
+ * next call.  'nslist' and 'data' are caller-provided scratch buffers.
+ *
+ * Returns false if a command fails or the controller reports an NSID
+ * that is duplicated or outside the range 1..NN.
+ */
+static bool
+nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
+    struct nvme_namespace_data *data, uint32_t *nsidp)
+{
+	struct nvmf_completion_status status;
+	uint32_t nsid;
+
+	nvmf_status_init(&status);
+	nvmf_status_wait_io(&status);
+	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
+	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
+		device_printf(sc->dev,
+		    "failed to send IDENTIFY active namespaces command\n");
+		return (false);
+	}
+	nvmf_wait_for_reply(&status);
+
+	if (status.cqe.status != 0) {
+		device_printf(sc->dev,
+		    "IDENTIFY active namespaces failed, status %#x\n",
+		    le16toh(status.cqe.status));
+		return (false);
+	}
+
+	if (status.io_error != 0) {
+		device_printf(sc->dev,
+		    "IDENTIFY active namespaces failed with I/O error %d\n",
+		    status.io_error);
+		return (false);
+	}
+
+	for (u_int i = 0; i < nitems(nslist->ns); i++) {
+		/* List entries are little-endian on the wire. */
+		nsid = le32toh(nslist->ns[i]);
+		if (nsid == 0) {
+			*nsidp = 0;
+			return (true);
+		}
+
+		/*
+		 * The NSID comes from the remote controller and is used
+		 * to index sc->ns[], which has cdata->nn entries.
+		 */
+		if (nsid > sc->cdata->nn) {
+			device_printf(sc->dev,
+			    "namespace %u in active namespace list exceeds NN (%u)\n",
+			    nsid, sc->cdata->nn);
+			return (false);
+		}
+
+		if (sc->ns[nsid - 1] != NULL) {
+			device_printf(sc->dev,
+			    "duplicate namespace %u in active namespace list\n",
+			    nsid);
+			return (false);
+		}
+
+		nvmf_status_init(&status);
+		nvmf_status_wait_io(&status);
+		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
+		    &status, nvmf_io_complete, &status, M_WAITOK)) {
+			device_printf(sc->dev,
+			    "failed to send IDENTIFY namespace %u command\n",
+			    nsid);
+			return (false);
+		}
+		nvmf_wait_for_reply(&status);
+
+		if (status.cqe.status != 0) {
+			device_printf(sc->dev,
+			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
+			    le16toh(status.cqe.status));
+			return (false);
+		}
+
+		if (status.io_error != 0) {
+			device_printf(sc->dev,
+			    "IDENTIFY namespace %u failed with I/O error %d\n",
+			    nsid, status.io_error);
+			return (false);
+		}
+
+		/*
+		 * As in nvme_ns_construct, a size of zero indicates an
+		 * invalid namespace.
+		 */
+		nvme_namespace_data_swapbytes(data);
+		if (data->nsze == 0) {
+			device_printf(sc->dev,
+			    "ignoring active namespace %u with zero size\n",
+			    nsid);
+			continue;
+		}
+
+		sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
+
+		nvmf_sim_rescan_ns(sc, nsid);
+	}
+
+	/* The page was full; nsid is its last (and largest) entry. */
+	MPASS(nsid == le32toh(nslist->ns[nitems(nslist->ns) - 1]) &&
+	    nsid != 0);
+
+	/*
+	 * The active namespace list returns NSIDs strictly greater than
+	 * the NSID in the command, so continue from the last NSID seen
+	 * rather than from nsid + 1 (which would skip namespace nsid + 1).
+	 * 0xfffffffe is the largest valid NSID, so nothing can follow it.
+	 */
+	if (nsid >= 0xfffffffe)
+		*nsidp = 0;
+	else
+		*nsidp = nsid;
+	return (true);
+}
+
+/*
+ * Allocate the sc->ns[] table (cdata->nn entries) and populate it by
+ * walking the controller's active namespace list one page at a time.
+ *
+ * Returns false if any page of the scan failed.  The sc->ns[] table is
+ * left allocated either way; the caller owns its cleanup.
+ */
+static bool
+nvmf_add_namespaces(struct nvmf_softc *sc)
+{
+	struct nvme_ns_list *list_buf;
+	struct nvme_namespace_data *id_buf;
+	uint32_t next_nsid;
+	bool ok;
+
+	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
+	    M_WAITOK | M_ZERO);
+	list_buf = malloc(sizeof(*list_buf), M_NVMF, M_WAITOK);
+	id_buf = malloc(sizeof(*id_buf), M_NVMF, M_WAITOK);
+
+	/* A returned NSID of zero means the list has been exhausted. */
+	next_nsid = 0;
+	do {
+		ok = nvmf_scan_nslist(sc, list_buf, id_buf, &next_nsid);
+	} while (ok && next_nsid != 0);
+
+	free(id_buf, M_NVMF);
+	free(list_buf, M_NVMF);
+	return (ok);
+}
+
+/*
+ * Newbus attach method.  Builds a new association from the handoff data
+ * in the device's ivars: creates the queue pairs, reads CAP and VS,
+ * computes the maximum transfer size, creates the CAM SIM, starts
+ * asynchronous event reporting, scans namespaces and finally creates
+ * the controller character device.
+ *
+ * Returns 0 on success.  On failure everything set up so far is torn
+ * down and a non-zero errno value is returned.
+ */
+static int
+nvmf_attach(device_t dev)
+{
+	struct make_dev_args mda;
+	struct nvmf_softc *sc = device_get_softc(dev);
+	struct nvmf_ivars *ivars = device_get_ivars(dev);
+	uint64_t val;
+	u_int i, mdts_shift;
+	int error;
+
+	if (ivars == NULL)
+		return (ENXIO);
+
+	sc->dev = dev;
+	sc->trtype = ivars->hh->trtype;
+	callout_init(&sc->ka_rx_timer, 1);
+	callout_init(&sc->ka_tx_timer, 1);
+	sx_init(&sc->connection_lock, "nvmf connection");
+	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
+
+	/* Claim the cdata pointer from ivars. */
+	sc->cdata = ivars->cdata;
+	ivars->cdata = NULL;
+
+	nvmf_init_aer(sc);
+
+	/* TODO: Multiqueue support. */
+	sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */;
+
+	error = nvmf_establish_connection(sc, ivars);
+	if (error != 0)
+		goto out;
+
+	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
+	if (error != 0) {
+		device_printf(sc->dev, "Failed to fetch CAP\n");
+		error = ENXIO;
+		goto out;
+	}
+
+	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
+	if (error != 0) {
+		device_printf(sc->dev, "Failed to fetch VS\n");
+		error = ENXIO;
+		goto out;
+	}
+	sc->vs = val;
+
+	/*
+	 * Honor MDTS if it is set.  MDTS comes from the remote
+	 * controller, so compute the shift separately and only apply
+	 * it when it fits in a u_long; a larger limit can never be
+	 * smaller than maxphys anyway.
+	 */
+	sc->max_xfer_size = maxphys;
+	if (sc->cdata->mdts != 0) {
+		mdts_shift = sc->cdata->mdts + NVME_MPS_SHIFT +
+		    NVME_CAP_HI_MPSMIN(sc->cap >> 32);
+		if (mdts_shift < sizeof(u_long) * NBBY)
+			sc->max_xfer_size = ulmin(sc->max_xfer_size,
+			    1ul << mdts_shift);
+	}
+
+	error = nvmf_init_sim(sc);
+	if (error != 0)
+		goto out;
+
+	error = nvmf_start_aer(sc);
+	if (error != 0) {
+		nvmf_destroy_sim(sc);
+		goto out;
+	}
+
+	if (!nvmf_add_namespaces(sc)) {
+		nvmf_destroy_sim(sc);
+		/* 'error' is still 0 here; report the failure. */
+		error = ENXIO;
+		goto out;
+	}
+
+	make_dev_args_init(&mda);
+	mda.mda_devsw = &nvmf_cdevsw;
+	mda.mda_uid = UID_ROOT;
+	mda.mda_gid = GID_WHEEL;
+	mda.mda_mode = 0600;
+	mda.mda_si_drv1 = sc;
+	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
+	if (error != 0) {
+		nvmf_destroy_sim(sc);
+		goto out;
+	}
+
+	return (0);
+out:
+	if (sc->ns != NULL) {
+		for (i = 0; i < sc->cdata->nn; i++) {
+			if (sc->ns[i] != NULL)
+				nvmf_destroy_ns(sc->ns[i]);
+		}
+		free(sc->ns, M_NVMF);
+	}
+
+	callout_drain(&sc->ka_tx_timer);
+	callout_drain(&sc->ka_rx_timer);
+
+	if (sc->admin != NULL)
+		nvmf_shutdown_controller(sc);
+
+	/* I/O queue slots may be NULL if queue setup failed part-way. */
+	for (i = 0; i < sc->num_io_queues; i++) {
+		if (sc->io[i] != NULL)
+			nvmf_destroy_qp(sc->io[i]);
+	}
+	free(sc->io, M_NVMF);
+	if (sc->admin != NULL)
+		nvmf_destroy_qp(sc->admin);
+
+	nvmf_destroy_aer(sc);
+
+	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+	sx_destroy(&sc->connection_lock);
+	free(sc->cdata, M_NVMF);
+	return (error);
+}
+
+/*
+ * Request an asynchronous teardown of the current association.  The
+ * actual work is done by nvmf_disconnect_task() on taskqueue_thread.
+ */
+void
+nvmf_disconnect(struct nvmf_softc *sc)
+{
+	struct task *tsk = &sc->disconnect_task;
+
+	taskqueue_enqueue(taskqueue_thread, tsk);
+}
+
+/*
+ * Task handler that tears down the current association while holding
+ * the connection lock exclusively.
+ *
+ * Nothing is done when no association is active.  During attach or
+ * detach only the admin queue is shut down, leaving the full teardown
+ * to the attach/detach path.  Otherwise the KeepAlive timers are
+ * stopped, namespace consumers are quiesced and all queue pairs are
+ * destroyed, leaving the softc ready for a later reconnect.
+ */
+static void
+nvmf_disconnect_task(void *arg, int pending __unused)
+{
+	struct nvmf_softc *sc = arg;
+	u_int idx;
+
+	sx_xlock(&sc->connection_lock);
+
+	/*
+	 * Ignore transport errors if there is no active
+	 * association.
+	 */
+	if (sc->admin == NULL)
+		goto done;
+
+	/*
+	 * A transport error during detach, or during attach before the
+	 * control device exists (nvmf_add_namespaces), only shuts down
+	 * the admin queue.  For detach this unsticks the detach
+	 * process.
+	 */
+	if (sc->detaching || sc->cdev == NULL) {
+		nvmf_shutdown_qp(sc->admin);
+		goto done;
+	}
+
+	callout_drain(&sc->ka_tx_timer);
+	callout_drain(&sc->ka_rx_timer);
+	sc->ka_traffic = false;
+
+	/* Quiesce namespace consumers. */
+	nvmf_disconnect_sim(sc);
+	for (idx = 0; idx < sc->cdata->nn; idx++) {
+		if (sc->ns[idx] != NULL)
+			nvmf_disconnect_ns(sc->ns[idx]);
+	}
+
+	/* Shutdown the existing qpairs. */
+	for (idx = 0; idx < sc->num_io_queues; idx++)
+		nvmf_destroy_qp(sc->io[idx]);
+	free(sc->io, M_NVMF);
+	sc->io = NULL;
+	sc->num_io_queues = 0;
+	nvmf_destroy_qp(sc->admin);
+	sc->admin = NULL;
+
+done:
+	sx_xunlock(&sc->connection_lock);
+}
+
+/*
+ * Undo a partially established association after a failed reconnect so
+ * that the softc is back in the disconnected state (no queue pairs, no
+ * KeepAlive timers).  Mirrors the queue teardown done by
+ * nvmf_disconnect_task(), but tolerates I/O queue slots that were never
+ * created.  The connection lock must be held exclusively.
+ */
+static void
+nvmf_abort_reconnect(struct nvmf_softc *sc)
+{
+	u_int i;
+
+	sx_assert(&sc->connection_lock, SA_XLOCKED);
+
+	callout_drain(&sc->ka_tx_timer);
+	callout_drain(&sc->ka_rx_timer);
+	sc->ka_traffic = false;
+
+	if (sc->io != NULL) {
+		for (i = 0; i < sc->num_io_queues; i++) {
+			if (sc->io[i] != NULL)
+				nvmf_destroy_qp(sc->io[i]);
+		}
+		free(sc->io, M_NVMF);
+		sc->io = NULL;
+	}
+	sc->num_io_queues = 0;
+
+	if (sc->admin != NULL) {
+		nvmf_destroy_qp(sc->admin);
+		sc->admin = NULL;
+	}
+}
+
+/*
+ * Replace a torn-down association with a new one described by 'hh'.
+ *
+ * The new association must use the same transport type and connect to
+ * the same NVMe subsystem (matching subsystem NQN) as the original.
+ *
+ * Returns 0 on success, EINVAL for a mismatched or invalid handoff,
+ * EBUSY if an association is still active or the device is detaching,
+ * or the error from establishing the connection.  On failure the
+ * device is left disconnected so that another reconnect can be tried.
+ */
+static int
+nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
+{
+	struct nvmf_ivars ivars;
+	u_int i;
+	int error;
+
+	/* XXX: Should we permit changing the transport type? */
+	if (sc->trtype != hh->trtype) {
+		device_printf(sc->dev,
+		    "transport type mismatch on reconnect\n");
+		return (EINVAL);
+	}
+
+	error = nvmf_init_ivars(&ivars, hh);
+	if (error != 0)
+		return (error);
+
+	sx_xlock(&sc->connection_lock);
+	if (sc->admin != NULL || sc->detaching) {
+		error = EBUSY;
+		goto out;
+	}
+
+	/*
+	 * Ensure this is for the same controller.  Note that the
+	 * controller ID can vary across associations if the remote
+	 * system is using the dynamic controller model.  This merely
+	 * ensures the new association is connected to the same NVMe
+	 * subsystem.
+	 */
+	if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
+	    sizeof(ivars.cdata->subnqn)) != 0) {
+		device_printf(sc->dev,
+		    "controller subsystem NQN mismatch on reconnect\n");
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * XXX: Require same number and size of I/O queues so that
+	 * max_pending_io is still correct?
+	 */
+
+	error = nvmf_establish_connection(sc, &ivars);
+	if (error != 0) {
+		/* Don't leave a half-built association behind. */
+		nvmf_abort_reconnect(sc);
+		goto out;
+	}
+
+	error = nvmf_start_aer(sc);
+	if (error != 0) {
+		nvmf_abort_reconnect(sc);
+		goto out;
+	}
+
+	device_printf(sc->dev,
+	    "established new association with %u I/O queues\n",
+	    sc->num_io_queues);
+
+	/* Restart namespace consumers. */
+	for (i = 0; i < sc->cdata->nn; i++) {
+		if (sc->ns[i] != NULL)
+			nvmf_reconnect_ns(sc->ns[i]);
+	}
+	nvmf_reconnect_sim(sc);
+out:
+	sx_xunlock(&sc->connection_lock);
+	nvmf_free_ivars(&ivars);
+	return (error);
+}
+
+/*
+ * Newbus detach method.  Removes the control device, marks the softc as
+ * detaching, destroys the SIM and namespaces, stops the KeepAlive
+ * timers, asks the controller to shut down if an association is still
+ * active, and releases all queue pairs and softc resources.
+ *
+ * Always returns 0.
+ */
+static int
+nvmf_detach(device_t dev)
+{
+	struct nvmf_softc *sc;
+	u_int idx;
+
+	sc = device_get_softc(dev);
+
+	destroy_dev(sc->cdev);
+
+	/* From here on nvmf_disconnect_task() only shuts down the admin qp. */
+	sx_xlock(&sc->connection_lock);
+	sc->detaching = true;
+	sx_xunlock(&sc->connection_lock);
+
+	nvmf_destroy_sim(sc);
+	for (idx = 0; idx < sc->cdata->nn; idx++) {
+		if (sc->ns[idx] == NULL)
+			continue;
+		nvmf_destroy_ns(sc->ns[idx]);
+	}
+	free(sc->ns, M_NVMF);
+
+	callout_drain(&sc->ka_tx_timer);
+	callout_drain(&sc->ka_rx_timer);
+
+	if (sc->admin != NULL)
+		nvmf_shutdown_controller(sc);
+
+	for (idx = 0; idx < sc->num_io_queues; idx++)
+		nvmf_destroy_qp(sc->io[idx]);
+	free(sc->io, M_NVMF);
+
+	/* Wait for any in-flight disconnect task before freeing the admin qp. */
+	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+
+	if (sc->admin != NULL)
+		nvmf_destroy_qp(sc->admin);
+
+	nvmf_destroy_aer(sc);
+
+	sx_destroy(&sc->connection_lock);
+	free(sc->cdata, M_NVMF);
+	return (0);
+}
+
+/*
+ * Re-identify namespace 'nsid' and bring sc->ns[] in line with the
+ * result: a namespace that now reports zero size is destroyed, a new
+ * namespace is created, and an existing one is updated (or destroyed if
+ * the update is rejected).  The SIM is asked to rescan the namespace
+ * afterwards.
+ *
+ * NSIDs outside the range 1..NN are rejected, since the value may come
+ * from a log page supplied by the remote controller and is used to
+ * index sc->ns[].  Failures are logged and otherwise ignored.
+ */
+void
+nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
+{
+	struct nvmf_completion_status status;
+	struct nvme_namespace_data *data;
+	struct nvmf_namespace *ns;
+
+	if (nsid == 0 || nsid > sc->cdata->nn) {
+		device_printf(sc->dev,
+		    "ignoring rescan of invalid namespace %u\n", nsid);
+		return;
+	}
+
+	data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
+
+	nvmf_status_init(&status);
+	nvmf_status_wait_io(&status);
+	if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
+	    &status, nvmf_io_complete, &status, M_WAITOK)) {
+		device_printf(sc->dev,
+		    "failed to send IDENTIFY namespace %u command\n", nsid);
+		goto out;
+	}
+	nvmf_wait_for_reply(&status);
+
+	if (status.cqe.status != 0) {
+		device_printf(sc->dev,
+		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
+		    le16toh(status.cqe.status));
+		goto out;
+	}
+
+	if (status.io_error != 0) {
+		device_printf(sc->dev,
+		    "IDENTIFY namespace %u failed with I/O error %d\n",
+		    nsid, status.io_error);
+		goto out;
+	}
+
+	nvme_namespace_data_swapbytes(data);
+
+	/* XXX: Needs locking around sc->ns[]. */
+	ns = sc->ns[nsid - 1];
+	if (data->nsze == 0) {
+		/* XXX: Needs locking */
+		if (ns != NULL) {
+			nvmf_destroy_ns(ns);
+			sc->ns[nsid - 1] = NULL;
+		}
+	} else {
+		/* XXX: Needs locking */
+		if (ns == NULL) {
+			sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
+		} else {
+			if (!nvmf_update_ns(ns, data)) {
+				nvmf_destroy_ns(ns);
+				sc->ns[nsid - 1] = NULL;
+			}
+		}
+	}
+
+	free(data, M_NVMF);
+
+	nvmf_sim_rescan_ns(sc, nsid);
+	return;
+out:
+	free(data, M_NVMF);
+}
+
+/*
+ * Execute a userland passthrough command on the admin queue ('admin'
+ * true) or on an I/O queue chosen by nvmf_select_io_queue().
+ *
+ * Data is staged through a kernel bounce buffer: it is copied in from
+ * pt->buf before submission for writes and copied out afterwards for
+ * reads.  The completion's cdw0 and status are stored in pt->cpl.
+ *
+ * Returns 0 on success, EINVAL if pt->len exceeds the maximum transfer
+ * size, ECONNABORTED if the request could not be allocated, the data
+ * transfer error, or a copyin/copyout error.
+ *
+ * NOTE(review): the queue pointers are read without taking
+ * connection_lock here -- confirm callers cannot race a disconnect.
+ */
+int
+nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
+    bool admin)
+{
+	struct nvmf_completion_status st;
+	struct nvme_command sqe;
+	struct memdesc md;
+	struct nvmf_host_qpair *qp;
+	struct nvmf_request *req;
+	void *kbuf;
+	bool has_data, to_controller;
+	int error;
+
+	if (pt->len > sc->max_xfer_size)
+		return (EINVAL);
+
+	has_data = (pt->len != 0);
+	to_controller = (pt->is_read == 0);
+
+	kbuf = NULL;
+	if (has_data) {
+		/*
+		 * XXX: Depending on the size we may want to pin the
+		 * user pages and use a memdesc with vm_page_t's
+		 * instead.
+		 */
+		kbuf = malloc(pt->len, M_NVMF, M_WAITOK);
+		if (to_controller) {
+			error = copyin(pt->buf, kbuf, pt->len);
+			if (error != 0) {
+				free(kbuf, M_NVMF);
+				return (error);
+			}
+		} else {
+			/* Ensure no kernel data is leaked to userland. */
+			memset(kbuf, 0, pt->len);
+		}
+	}
+
+	/* Only these fields of the user's command are passed through. */
+	memset(&sqe, 0, sizeof(sqe));
+	sqe.opc = pt->cmd.opc;
+	sqe.fuse = pt->cmd.fuse;
+	sqe.nsid = pt->cmd.nsid;
+	sqe.cdw10 = pt->cmd.cdw10;
+	sqe.cdw11 = pt->cmd.cdw11;
+	sqe.cdw12 = pt->cmd.cdw12;
+	sqe.cdw13 = pt->cmd.cdw13;
+	sqe.cdw14 = pt->cmd.cdw14;
+	sqe.cdw15 = pt->cmd.cdw15;
+
+	qp = admin ? sc->admin : nvmf_select_io_queue(sc);
+	nvmf_status_init(&st);
+	req = nvmf_allocate_request(qp, &sqe, nvmf_complete, &st, M_WAITOK);
+	if (req == NULL) {
+		device_printf(sc->dev, "failed to send passthrough command\n");
+		error = ECONNABORTED;
+		goto done;
+	}
+
+	if (has_data) {
+		md = memdesc_vaddr(kbuf, pt->len);
+		nvmf_capsule_append_data(req->nc, &md, pt->len,
+		    to_controller, nvmf_io_complete, &st);
+		/* Also wait for the data transfer, not just the CQE. */
+		nvmf_status_wait_io(&st);
+	}
+
+	nvmf_submit_request(req);
+	nvmf_wait_for_reply(&st);
+
+	memset(&pt->cpl, 0, sizeof(pt->cpl));
+	pt->cpl.cdw0 = st.cqe.cdw0;
+	pt->cpl.status = st.cqe.status;
+
+	error = st.io_error;
+	if (error == 0 && has_data && !to_controller)
+		error = copyout(kbuf, pt->buf, pt->len);
+done:
+	free(kbuf, M_NVMF);
+	return (error);
+}
+
+/*
+ * ioctl handler for the controller character device.  Supports admin
+ * passthrough commands, the NVME_GET_NSID and NVME_GET_MAX_XFER_SIZE
+ * queries, and the nvmf-specific requests used to fetch reconnect
+ * parameters and to hand off a new association.  Unknown commands
+ * return ENOTTY.
+ */
+static int
+nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
+    struct thread *td)
+{
+	struct nvmf_softc *sc = cdev->si_drv1;
+
+	switch (cmd) {
+	case NVME_PASSTHROUGH_CMD:
+		return (nvmf_passthrough_cmd(sc,
+		    (struct nvme_pt_command *)arg, true));
+	case NVME_GET_NSID: {
+		struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
+
+		/* The controller device reports itself with an NSID of 0. */
+		strncpy(gnsid->cdev, device_get_nameunit(sc->dev),
+		    sizeof(gnsid->cdev));
+		gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
+		gnsid->nsid = 0;
+		return (0);
+	}
+	case NVME_GET_MAX_XFER_SIZE:
+		*(uint64_t *)arg = sc->max_xfer_size;
+		return (0);
+	case NVMF_RECONNECT_PARAMS: {
+		struct nvmf_reconnect_params *rp =
+		    (struct nvmf_reconnect_params *)arg;
+
+		/* Bit 0 of FCATT clear: report a dynamic controller ID. */
+		if ((sc->cdata->fcatt & 1) != 0)
+			rp->cntlid = sc->cdata->ctrlr_id;
+		else
+			rp->cntlid = NVMF_CNTLID_DYNAMIC;
+		memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn));
+		return (0);
+	}
+	case NVMF_RECONNECT_HOST:
+		return (nvmf_reconnect_host(sc,
+		    (struct nvmf_handoff_host *)arg));
+	default:
+		return (ENOTTY);
+	}
+}
+
+static struct cdevsw nvmf_cdevsw = { /* controller device switch; ioctl is the only entry point */
+ .d_version = D_VERSION,
+ .d_ioctl = nvmf_ioctl
+};
+
+/*
+ * Module event handler.  Load runs nvmf_ctl_load(); unload runs
+ * nvmf_ctl_unload() and then drains any devices still being destroyed
+ * for nvmf_cdevsw.  Quiesce always succeeds; other events are not
+ * supported.
+ */
+static int
+nvmf_modevent(module_t mod, int what, void *arg)
+{
+	if (what == MOD_LOAD)
+		return (nvmf_ctl_load());
+
+	if (what == MOD_QUIESCE)
+		return (0);
+
+	if (what == MOD_UNLOAD) {
+		nvmf_ctl_unload();
+		destroy_dev_drain(&nvmf_cdevsw);
+		return (0);
+	}
+
+	return (EOPNOTSUPP);
+}
+
+static device_method_t nvmf_methods[] = { /* newbus methods implemented by this driver */
+ /* Device interface */
+ DEVMETHOD(device_probe, nvmf_probe),
+ DEVMETHOD(device_attach, nvmf_attach),
+ DEVMETHOD(device_detach, nvmf_detach),
+#if 0 /* disabled: no nvmf_shutdown is visible in this file */
+ DEVMETHOD(device_shutdown, nvmf_shutdown),
+#endif
+ DEVMETHOD_END
+};
+
+driver_t nvme_nvmf_driver = {
+ "nvme", /* driver name; the control device created in nvmf_attach() uses the resulting nameunit */
+ nvmf_methods,
+ sizeof(struct nvmf_softc),
+};
+
+DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL); /* registered on the root bus; nvmf_modevent handles module events */
+MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1); /* requires the nvmf_transport module */
diff --git a/sys/dev/nvmf/host/nvmf_aer.c b/sys/dev/nvmf/host/nvmf_aer.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_aer.c
@@ -0,0 +1,290 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/taskqueue.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+struct nvmf_aer { /* state for one outstanding Asynchronous Event Request */
+ struct nvmf_softc *sc; /* owning softc, set in nvmf_init_aer() */
+ uint8_t log_page_id; /* log page ID decoded from the AER completion's cdw0 */
+ uint8_t info; /* event info decoded from cdw0 */
+ uint8_t type; /* event type decoded from cdw0 */
+
+ u_int page_len; /* bytes of the log page to fetch; 0 if none */
+ void *page; /* MAX_LOG_PAGE_SIZE buffer for the log page */
+
+ int error; /* data transfer error from the log page fetch */
+ uint16_t status; /* completion status from the log page fetch */
+ int pending; /* completions still expected for the log page fetch */
+ struct mtx *lock; /* pool mutex protecting error/status/pending */
+ struct task complete_task; /* runs nvmf_complete_aer_task() */
+ struct task finish_page_task; /* runs nvmf_finish_aer_page_task() */
+};
+
+#define MAX_LOG_PAGE_SIZE 4096
+
+static void nvmf_complete_aer(void *arg, const struct nvme_completion *cqe);
+
+/*
+ * Submit an Asynchronous Event Request on the admin queue for 'aer'.
+ * nvmf_complete_aer() runs when the controller reports an event.  If
+ * no request can be allocated the AER is silently dropped.
+ */
+static void
+nvmf_submit_aer(struct nvmf_softc *sc, struct nvmf_aer *aer)
+{
+	struct nvme_command sqe;
+	struct nvmf_request *req;
+
+	memset(&sqe, 0, sizeof(sqe));
+	sqe.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
+
+	req = nvmf_allocate_request(sc->admin, &sqe, nvmf_complete_aer, aer,
+	    M_WAITOK);
+	if (req != NULL) {
+		/* Flag the request as an AER before handing it off. */
+		req->aer = true;
+		nvmf_submit_request(req);
+	}
+}
+
+/*
+ * Process a Changed Namespace List log page by rescanning every
+ * namespace listed in it.  The list ends at the first zero entry.
+ */
+static void
+nvmf_handle_changed_namespaces(struct nvmf_softc *sc,
+    struct nvme_ns_list *ns_list)
+{
+	u_int idx;
+
+	/*
+	 * If more than 1024 namespaces have changed, we should
+	 * probably just rescan the entire set of namespaces.
+	 */
+	if (ns_list->ns[0] == 0xffffffff) {
+		device_printf(sc->dev, "too many changed namespaces\n");
+		return;
+	}
+
+	for (idx = 0; idx < nitems(ns_list->ns) && ns_list->ns[idx] != 0;
+	    idx++)
+		nvmf_rescan_ns(sc, le32toh(ns_list->ns[idx]));
+}
+
+/*
+ * Called once both completions for a log page fetch have arrived.  If
+ * the fetch succeeded, defer processing of the page to
+ * nvmf_finish_aer_page_task(); on any error just bail.
+ */
+static void
+nvmf_finish_aer_page(struct nvmf_softc *sc, struct nvmf_aer *aer)
+{
+	if (aer->error == 0 && aer->status == 0)
+		taskqueue_enqueue(taskqueue_thread, &aer->finish_page_task);
+}
+
+/*
+ * Task handler that acts on a fetched log page and then resubmits the
+ * AER.  Only the Changed Namespace List page triggers any work; error
+ * log pages are currently ignored.
+ */
+static void
+nvmf_finish_aer_page_task(void *arg, int pending)
+{
+	struct nvmf_aer *aer = arg;
+	struct nvmf_softc *sc = aer->sc;
+
+	if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE) {
+		nvmf_handle_changed_namespaces(sc, aer->page);
+	} else if (aer->log_page_id == NVME_LOG_ERROR) {
+		/* TODO: Should we log these? */
+	}
+
+	/* Resubmit this AER command. */
+	nvmf_submit_aer(sc, aer);
+}
+
+/*
+ * Data-transfer completion callback for a log page fetch.  Records the
+ * transfer error and, if this was the last outstanding completion,
+ * finishes the page with the lock dropped.
+ */
+static void
+nvmf_io_complete_aer_page(void *arg, size_t xfered, int error)
+{
+	struct nvmf_aer *aer = arg;
+	bool last;
+
+	mtx_lock(aer->lock);
+	aer->error = error;
+	aer->pending--;
+	last = (aer->pending == 0);
+	mtx_unlock(aer->lock);
+
+	if (last)
+		nvmf_finish_aer_page(aer->sc, aer);
+}
+
+/*
+ * Command completion callback for a log page fetch.  Records the
+ * completion status and, if this was the last outstanding completion,
+ * finishes the page with the lock dropped.
+ */
+static void
+nvmf_complete_aer_page(void *arg, const struct nvme_completion *cqe)
+{
+	struct nvmf_aer *aer = arg;
+	bool last;
+
+	mtx_lock(aer->lock);
+	aer->status = cqe->status;
+	aer->pending--;
+	last = (aer->pending == 0);
+	mtx_unlock(aer->lock);
+
+	if (last)
+		nvmf_finish_aer_page(aer->sc, aer);
+}
+
+/*
+ * Return the number of bytes to fetch for the given log page, or 0 for
+ * log pages this driver does not read in response to an AER.
+ */
+static u_int
+nvmf_log_page_size(struct nvmf_softc *sc, uint8_t log_page_id)
+{
+	if (log_page_id == NVME_LOG_ERROR) {
+		/* One entry per error log slot; ELPE is zero-based. */
+		return ((sc->cdata->elpe + 1) *
+		    sizeof(struct nvme_error_information_entry));
+	}
+	if (log_page_id == NVME_LOG_CHANGED_NAMESPACE)
+		return (sizeof(struct nvme_ns_list));
+	return (0);
+}
+
+/*
+ * Completion callback for an Asynchronous Event Request.  Decodes the
+ * event from cdw0, logs it, works out how much of the associated log
+ * page to read, and defers the rest to nvmf_complete_aer_task().
+ */
+static void
+nvmf_complete_aer(void *arg, const struct nvme_completion *cqe)
+{
+	struct nvmf_aer *aer = arg;
+	struct nvmf_softc *sc = aer->sc;
+	uint32_t dw0;
+
+	/*
+	 * The only error defined for AER is an abort due to
+	 * submitting too many AER commands.  Just discard this AER
+	 * without resubmitting if we get an error.
+	 *
+	 * NB: Pending AER commands are aborted during controller
+	 * shutdown, so discard aborted commands silently.
+	 */
+	if (cqe->status != 0) {
+		if (nvmf_cqe_aborted(cqe))
+			return;
+		device_printf(sc->dev, "Ignoring error %#x for AER\n",
+		    le16toh(cqe->status));
+		return;
+	}
+
+	dw0 = le32toh(cqe->cdw0);
+	aer->type = NVMEV(NVME_ASYNC_EVENT_TYPE, dw0);
+	aer->info = NVMEV(NVME_ASYNC_EVENT_INFO, dw0);
+	aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, dw0);
+
+	device_printf(sc->dev, "AER type %u, info %#x, page %#x\n",
+	    aer->type, aer->info, aer->log_page_id);
+
+	aer->page_len = nvmf_log_page_size(sc, aer->log_page_id);
+	taskqueue_enqueue(taskqueue_thread, &aer->complete_task);
+}
+
+/*
+ * Task handler run after an AER completes.  If the event has a log
+ * page this driver reads, fetch it (clamped to MAX_LOG_PAGE_SIZE);
+ * otherwise resubmit the AER right away.
+ */
+static void
+nvmf_complete_aer_task(void *arg, int pending)
+{
+	struct nvmf_aer *aer = arg;
+	struct nvmf_softc *sc = aer->sc;
+
+	if (aer->page_len == 0) {
+		/* Resubmit this AER command. */
+		nvmf_submit_aer(sc, aer);
+		return;
+	}
+
+	/*
+	 * Read the associated log page.  Two completions are expected:
+	 * one for the command and one for the data transfer.
+	 */
+	aer->page_len = MIN(aer->page_len, MAX_LOG_PAGE_SIZE);
+	aer->pending = 2;
+	(void) nvmf_cmd_get_log_page(sc, NVME_GLOBAL_NAMESPACE_TAG,
+	    aer->log_page_id, 0, aer->page, aer->page_len,
+	    nvmf_complete_aer_page, aer, nvmf_io_complete_aer_page,
+	    aer, M_WAITOK);
+}
+
+/*
+ * Synchronously issue SET_FEATURES (Asynchronous Event Configuration)
+ * with the given event mask.
+ *
+ * Returns 0 on success, ECONNABORTED if the request could not be
+ * allocated, or EIO if the controller reported a non-zero status.
+ */
+static int
+nvmf_set_async_event_config(struct nvmf_softc *sc, uint32_t config)
+{
+	struct nvmf_completion_status st;
+	struct nvme_command sqe;
+	struct nvmf_request *req;
+
+	memset(&sqe, 0, sizeof(sqe));
+	sqe.opc = NVME_OPC_SET_FEATURES;
+	sqe.cdw10 = htole32(NVME_FEAT_ASYNC_EVENT_CONFIGURATION);
+	sqe.cdw11 = htole32(config);
+
+	nvmf_status_init(&st);
+	req = nvmf_allocate_request(sc->admin, &sqe, nvmf_complete, &st,
+	    M_WAITOK);
+	if (req == NULL) {
+		device_printf(sc->dev,
+		    "failed to allocate SET_FEATURES (ASYNC_EVENT_CONFIGURATION) command\n");
+		return (ECONNABORTED);
+	}
+	nvmf_submit_request(req);
+	nvmf_wait_for_reply(&st);
+
+	if (st.cqe.status == 0)
+		return (0);
+
+	device_printf(sc->dev,
+	    "SET_FEATURES (ASYNC_EVENT_CONFIGURATION) failed, status %#x\n",
+	    le16toh(st.cqe.status));
+	return (EIO);
+}
+
+/*
+ * Allocate and initialize the per-AER state.  The number of AERs is the
+ * controller's limit (AERL is zero-based) capped at 8; each gets its
+ * own log page buffer, pool mutex and pair of tasks.
+ */
+void
+nvmf_init_aer(struct nvmf_softc *sc)
+{
+	struct nvmf_aer *aer;
+	u_int idx;
+
+	/* 8 matches NVME_MAX_ASYNC_EVENTS */
+	sc->num_aer = min(8, sc->cdata->aerl + 1);
+	sc->aer = mallocarray(sc->num_aer, sizeof(*sc->aer), M_NVMF,
+	    M_WAITOK | M_ZERO);
+	for (idx = 0; idx < sc->num_aer; idx++) {
+		aer = &sc->aer[idx];
+		aer->sc = sc;
+		aer->page = malloc(MAX_LOG_PAGE_SIZE, M_NVMF, M_WAITOK);
+		aer->lock = mtx_pool_find(mtxpool_sleep, aer);
+		TASK_INIT(&aer->complete_task, 0, nvmf_complete_aer_task,
+		    aer);
+		TASK_INIT(&aer->finish_page_task, 0,
+		    nvmf_finish_aer_page_task, aer);
+	}
+}
+
+/*
+ * Enable asynchronous event reporting on the controller and submit all
+ * AER commands.  Critical warnings are always requested; for
+ * controllers at version 1.2 or later the namespace attribute notice is
+ * requested as well if the controller advertises it in OAES.
+ *
+ * Returns 0 on success or the error from configuring the feature.
+ */
+int
+nvmf_start_aer(struct nvmf_softc *sc)
+{
+	uint32_t events;
+	u_int idx;
+	int error;
+
+	events = NVME_CRIT_WARN_ST_AVAILABLE_SPARE |
+	    NVME_CRIT_WARN_ST_DEVICE_RELIABILITY |
+	    NVME_CRIT_WARN_ST_READ_ONLY |
+	    NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP;
+	if (sc->cdata->ver >= NVME_REV(1, 2))
+		events |= sc->cdata->oaes & NVME_ASYNC_EVENT_NS_ATTRIBUTE;
+
+	error = nvmf_set_async_event_config(sc, events);
+	if (error != 0)
+		return (error);
+
+	for (idx = 0; idx < sc->num_aer; idx++)
+		nvmf_submit_aer(sc, &sc->aer[idx]);
+
+	return (0);
+}
+
+/*
+ * Release the AER state allocated by nvmf_init_aer().  Both tasks of
+ * each AER are drained before its log page buffer is freed.
+ */
+void
+nvmf_destroy_aer(struct nvmf_softc *sc)
+{
+	struct nvmf_aer *aer;
+	u_int idx;
+
+	for (idx = 0; idx < sc->num_aer; idx++) {
+		aer = &sc->aer[idx];
+		taskqueue_drain(taskqueue_thread, &aer->complete_task);
+		taskqueue_drain(taskqueue_thread, &aer->finish_page_task);
+		free(aer->page, M_NVMF);
+	}
+	free(sc->aer, M_NVMF);
+}
diff --git a/sys/dev/nvmf/host/nvmf_cmd.c b/sys/dev/nvmf/host/nvmf_cmd.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_cmd.c
@@ -0,0 +1,171 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/memdesc.h>
+#include <sys/systm.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_proto.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+/*
+ * Build and submit a Fabrics PROPERTY_GET command on the admin queue.
+ *
+ * 'size' must be 4 or 8 (anything else panics). 'cb'/'cb_arg' are
+ * passed to nvmf_allocate_request() as the completion callback and
+ * 'how' as its allocation flag.
+ *
+ * Returns true if the request was allocated and submitted, false if
+ * the request could not be allocated.
+ */
+bool
+nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+    nvmf_request_complete_t *cb, void *cb_arg, int how)
+{
+	struct nvmf_fabric_prop_get_cmd sqe;
+	struct nvmf_request *req;
+
+	memset(&sqe, 0, sizeof(sqe));
+	sqe.opcode = NVME_OPC_FABRICS_COMMANDS;
+	sqe.fctype = NVMF_FABRIC_COMMAND_PROPERTY_GET;
+	sqe.ofst = htole32(offset);
+
+	if (size == 4)
+		sqe.attrib.size = NVMF_PROP_SIZE_4;
+	else if (size == 8)
+		sqe.attrib.size = NVMF_PROP_SIZE_8;
+	else
+		panic("Invalid property size");
+
+	req = nvmf_allocate_request(sc->admin, &sqe, cb, cb_arg, how);
+	if (req == NULL)
+		return (false);
+
+	nvmf_submit_request(req);
+	return (true);
+}
+
+/*
+ * Build and submit a Fabrics PROPERTY_SET command on the admin queue.
+ *
+ * 'size' must be 4 or 8 (anything else panics); 'value' is stored
+ * little-endian in the 32-bit or 64-bit value field accordingly.
+ *
+ * Returns true if the request was allocated and submitted, false if
+ * the request could not be allocated.
+ */
+bool
+nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+    uint64_t value, nvmf_request_complete_t *cb, void *cb_arg, int how)
+{
+	struct nvmf_fabric_prop_set_cmd sqe;
+	struct nvmf_request *req;
+
+	memset(&sqe, 0, sizeof(sqe));
+	sqe.opcode = NVME_OPC_FABRICS_COMMANDS;
+	sqe.fctype = NVMF_FABRIC_COMMAND_PROPERTY_SET;
+	sqe.ofst = htole32(offset);
+
+	if (size == 4) {
+		sqe.attrib.size = NVMF_PROP_SIZE_4;
+		sqe.value.u32.low = htole32(value);
+	} else if (size == 8) {
+		sqe.attrib.size = NVMF_PROP_SIZE_8;
+		sqe.value.u64 = htole64(value);
+	} else
+		panic("Invalid property size");
+
+	req = nvmf_allocate_request(sc->admin, &sqe, cb, cb_arg, how);
+	if (req == NULL)
+		return (false);
+
+	nvmf_submit_request(req);
+	return (true);
+}
+
+/*
+ * Submit a KEEP_ALIVE command on the admin queue.
+ *
+ * Returns true if the request was allocated and submitted, false if
+ * the request could not be allocated.
+ */
+bool
+nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb,
+    void *cb_arg, int how)
+{
+	struct nvme_command sqe;
+	struct nvmf_request *req;
+
+	memset(&sqe, 0, sizeof(sqe));
+	sqe.opc = NVME_OPC_KEEP_ALIVE;
+
+	req = nvmf_allocate_request(sc->admin, &sqe, cb, cb_arg, how);
+	if (req == NULL)
+		return (false);
+
+	nvmf_submit_request(req);
+	return (true);
+}
+
+/*
+ * Submit an IDENTIFY command on the admin queue that reads a namespace
+ * list into 'nslist'. 'id' is placed in the command's NSID field.
+ *
+ * 'req_cb' is the request completion callback and 'io_cb' the data
+ * transfer completion callback. Returns false if the request could not
+ * be allocated, true once it has been submitted.
+ */
+bool
+nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how)
+{
+ struct nvme_command cmd;
+ struct memdesc mem;
+ struct nvmf_request *req;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_IDENTIFY;
+
+ /* 5.15.1 Use CNS of 0x02 for the active namespace ID list. */
+ cmd.cdw10 = htole32(2);
+ cmd.nsid = htole32(id);
+
+ req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how);
+ if (req == NULL)
+ return (false);
+ /* Attach the caller's buffer as the receive (non-send) data. */
+ mem = memdesc_vaddr(nslist, sizeof(*nslist));
+ nvmf_capsule_append_data(req->nc, &mem, sizeof(*nslist), false,
+ io_cb, io_cb_arg);
+ nvmf_submit_request(req);
+ return (true);
+}
+
+/*
+ * Submit an IDENTIFY command on the admin queue that reads the
+ * namespace data for namespace 'id' into 'nsdata'.
+ *
+ * 'req_cb' is the request completion callback and 'io_cb' the data
+ * transfer completion callback. Returns false if the request could not
+ * be allocated, true once it has been submitted.
+ */
+bool
+nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id,
+    struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb,
+    void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how)
+{
+	struct nvme_command sqe;
+	struct nvmf_request *req;
+	struct memdesc md;
+
+	memset(&sqe, 0, sizeof(sqe));
+	sqe.opc = NVME_OPC_IDENTIFY;
+	sqe.nsid = htole32(id);
+
+	/* 5.15.1 Use CNS of 0x00 for namespace data. */
+	sqe.cdw10 = htole32(0);
+
+	req = nvmf_allocate_request(sc->admin, &sqe, req_cb, req_cb_arg, how);
+	if (req == NULL)
+		return (false);
+
+	/* The caller's buffer receives the identify data. */
+	md = memdesc_vaddr(nsdata, sizeof(*nsdata));
+	nvmf_capsule_append_data(req->nc, &md, sizeof(*nsdata), false,
+	    io_cb, io_cb_arg);
+	nvmf_submit_request(req);
+	return (true);
+}
+
+/*
+ * Submit a GET_LOG_PAGE command on the admin queue that reads 'len'
+ * bytes of log page 'lid', starting at byte 'offset', into 'buf'.
+ *
+ * 'len' must be a non-zero multiple of 4 and 'offset' a multiple of 4
+ * (asserted with MPASS). Returns false if the request could not be
+ * allocated, true once it has been submitted.
+ */
+bool
+nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid,
+ uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how)
+{
+ struct nvme_command cmd;
+ struct memdesc mem;
+ struct nvmf_request *req;
+ size_t numd;
+
+ MPASS(len != 0 && len % 4 == 0);
+ MPASS(offset % 4 == 0);
+
+ /* Number of dwords to transfer, expressed as "count - 1". */
+ numd = (len / 4) - 1;
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opc = NVME_OPC_GET_LOG_PAGE;
+ cmd.nsid = htole32(nsid);
+ /* cdw10: log page ID in the low bits, low half of numd in bits 31:16. */
+ cmd.cdw10 = htole32(numd << 16 | lid);
+ /* cdw11: the remaining upper bits of numd. */
+ cmd.cdw11 = htole32(numd >> 16);
+ /* cdw12/cdw13: low and high 32 bits of the byte offset. */
+ cmd.cdw12 = htole32(offset);
+ cmd.cdw13 = htole32(offset >> 32);
+
+ req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how);
+ if (req == NULL)
+ return (false);
+ mem = memdesc_vaddr(buf, len);
+ nvmf_capsule_append_data(req->nc, &mem, len, false, io_cb, io_cb_arg);
+ nvmf_submit_request(req);
+ return (true);
+}
diff --git a/sys/dev/nvmf/host/nvmf_ctldev.c b/sys/dev/nvmf/host/nvmf_ctldev.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_ctldev.c
@@ -0,0 +1,159 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/malloc.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+static struct cdev *nvmf_cdev;
+
+/*
+ * Handle NVMF_HANDOFF_HOST: create and attach a new "nvme" child of
+ * root_bus for the association described by 'hh'.
+ *
+ * Returns 0 on success, the error from nvmf_init_ivars() or
+ * device_probe_and_attach(), or ENXIO if the child device cannot be
+ * added.
+ */
+static int
+nvmf_handoff_host(struct nvmf_handoff_host *hh)
+{
+ struct nvmf_ivars ivars;
+ device_t dev;
+ int error;
+
+ error = nvmf_init_ivars(&ivars, hh);
+ if (error != 0)
+ return (error);
+
+ bus_topo_lock();
+ dev = device_add_child(root_bus, "nvme", -1);
+ if (dev == NULL) {
+ bus_topo_unlock();
+ error = ENXIO;
+ goto out;
+ }
+
+ /*
+ * The ivars live on this stack frame, so they are only installed
+ * for the duration of probe/attach and cleared again afterwards.
+ */
+ device_set_ivars(dev, &ivars);
+ error = device_probe_and_attach(dev);
+ device_set_ivars(dev, NULL);
+ if (error != 0)
+ device_delete_child(root_bus, dev);
+ bus_topo_unlock();
+
+out:
+ nvmf_free_ivars(&ivars);
+ return (error);
+}
+
+/*
+ * Return true if 'name' equals either the device's name+unit string or
+ * the subsystem NQN stored in the controller data.
+ */
+static bool
+nvmf_matches(device_t dev, char *name)
+{
+	struct nvmf_softc *sc;
+
+	sc = device_get_softc(dev);
+	return (strcmp(device_get_nameunit(dev), name) == 0 ||
+	    strcmp(sc->cdata->subnqn, name) == 0);
+}
+
+/*
+ * Delete every Fabrics-attached "nvme" device that matches 'name'
+ * (see nvmf_matches()). A NULL 'name' matches all such devices.
+ *
+ * Returns 0 if at least one device was deleted, ENOENT if none
+ * matched, or the first error from device_delete_child(), which also
+ * stops the scan.
+ */
+static int
+nvmf_disconnect_by_name(char *name)
+{
+ devclass_t dc;
+ device_t dev;
+ int error, unit;
+ bool found;
+
+ found = false;
+ error = 0;
+ bus_topo_lock();
+ dc = devclass_find("nvme");
+ if (dc == NULL)
+ goto out;
+
+ for (unit = 0; unit < devclass_get_maxunit(dc); unit++) {
+ dev = devclass_get_device(dc, unit);
+ if (dev == NULL)
+ continue;
+ /* Skip "nvme" devices not driven by this driver. */
+ if (device_get_driver(dev) != &nvme_nvmf_driver)
+ continue;
+ /* Only consider children of root_bus, as added by handoff. */
+ if (device_get_parent(dev) != root_bus)
+ continue;
+ if (name != NULL && !nvmf_matches(dev, name))
+ continue;
+
+ error = device_delete_child(root_bus, dev);
+ if (error != 0)
+ break;
+ found = true;
+ }
+out:
+ bus_topo_unlock();
+ if (error == 0 && !found)
+ error = ENOENT;
+ return (error);
+}
+
+/*
+ * Handle NVMF_DISCONNECT_HOST: copy in the string that '*namep' points
+ * to (at most PATH_MAX bytes) and disconnect the matching device(s).
+ *
+ * Returns the copyinstr() error, or the result of
+ * nvmf_disconnect_by_name().
+ */
+static int
+nvmf_disconnect_host(const char **namep)
+{
+	char *kname;
+	int rc;
+
+	kname = malloc(PATH_MAX, M_NVMF, M_WAITOK);
+	rc = copyinstr(*namep, kname, PATH_MAX, NULL);
+	if (rc != 0)
+		goto done;
+	rc = nvmf_disconnect_by_name(kname);
+done:
+	free(kname, M_NVMF);
+	return (rc);
+}
+
+/*
+ * ioctl handler for the control device. Dispatches the handoff and
+ * disconnect requests; any other command returns ENOTTY.
+ */
+static int
+nvmf_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
+    struct thread *td)
+{
+	int error;
+
+	switch (cmd) {
+	case NVMF_HANDOFF_HOST:
+		error = nvmf_handoff_host((struct nvmf_handoff_host *)arg);
+		break;
+	case NVMF_DISCONNECT_HOST:
+		error = nvmf_disconnect_host((const char **)arg);
+		break;
+	case NVMF_DISCONNECT_ALL:
+		/* A NULL name matches every Fabrics host device. */
+		error = nvmf_disconnect_by_name(NULL);
+		break;
+	default:
+		error = ENOTTY;
+		break;
+	}
+	return (error);
+}
+
+/* Character device switch for the control device; only ioctl is provided. */
+static struct cdevsw nvmf_ctl_cdevsw = {
+ .d_version = D_VERSION,
+ .d_ioctl = nvmf_ctl_ioctl
+};
+
+/*
+ * Create the "nvmf" control device node, owned by root:wheel with
+ * mode 0600.
+ *
+ * Returns 0 on success or the make_dev_s() error, in which case
+ * nvmf_cdev is left NULL.
+ */
+int
+nvmf_ctl_load(void)
+{
+	struct make_dev_args args;
+	int rc;
+
+	make_dev_args_init(&args);
+	args.mda_devsw = &nvmf_ctl_cdevsw;
+	args.mda_uid = UID_ROOT;
+	args.mda_gid = GID_WHEEL;
+	args.mda_mode = 0600;
+
+	rc = make_dev_s(&args, &nvmf_cdev, "nvmf");
+	if (rc == 0)
+		return (0);
+
+	nvmf_cdev = NULL;
+	return (rc);
+}
+
+/*
+ * Destroy the control device node if it exists and forget the
+ * pointer, so a repeated call does nothing.
+ */
+void
+nvmf_ctl_unload(void)
+{
+	if (nvmf_cdev == NULL)
+		return;
+
+	destroy_dev(nvmf_cdev);
+	nvmf_cdev = NULL;
+}
diff --git a/sys/dev/nvmf/host/nvmf_ns.c b/sys/dev/nvmf/host/nvmf_ns.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_ns.c
@@ -0,0 +1,483 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/refcount.h>
+#include <sys/sbuf.h>
+#include <machine/stdarg.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+/* Per-namespace state behind one namespace character device. */
+struct nvmf_namespace {
+ /* Owning controller. */
+ struct nvmf_softc *sc;
+ /* nsze * lba_size; reported by DIOCGMEDIASIZE. */
+ uint64_t size;
+ /* Namespace ID placed in the NSID field of commands. */
+ uint32_t id;
+ /* NVME_NS_* capability flags set in nvmf_init_ns(). */
+ u_int flags;
+ /* 1 << LBADS of the active LBA format; reported by DIOCGSECTORSIZE. */
+ uint32_t lba_size;
+ /* While true, new and aborted bios are parked on pending_bios. */
+ bool disconnected;
+
+ /* Bios waiting for nvmf_reconnect_ns() or nvmf_destroy_ns(). */
+ TAILQ_HEAD(, bio) pending_bios;
+ /* Held while reading/writing disconnected and pending_bios. */
+ struct mtx lock;
+ /* Refcount: submitted bios plus one dummy reference until destroy. */
+ volatile u_int active_bios;
+
+ /* The namespace device node created by make_dev_s(). */
+ struct cdev *cdev;
+};
+
+static void nvmf_ns_strategy(struct bio *bio);
+
+/*
+ * printf-style logging helper that prefixes the message with
+ * "<controller nameunit>ns<namespace id>: ".
+ */
+static void
+ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
+{
+ char buf[128];
+ struct sbuf sb;
+ va_list ap;
+
+ /* Fixed-length sbuf over the stack buffer, drained via sbuf_printf_drain. */
+ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+ sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
+
+ sbuf_printf(&sb, "%sns%u: ", device_get_nameunit(ns->sc->dev),
+ ns->id);
+
+ va_start(ap, fmt);
+ sbuf_vprintf(&sb, fmt, ap);
+ va_end(ap);
+
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+}
+
+/*
+ * The I/O completion may trigger after the received CQE if the I/O
+ * used a zero-copy mbuf that isn't harvested until after the NIC
+ * driver processes TX completions. Abuse bio_driver1 as a refcount.
+ * Store I/O errors in bio_driver2.
+ *
+ * bio_refs() returns the storage of bio_driver1 reinterpreted as the
+ * u_int used by the refcount(9) calls in this file.
+ */
+static __inline u_int *
+bio_refs(struct bio *bio)
+{
+ return ((u_int *)&bio->bio_driver1);
+}
+
+/*
+ * Drop one reference on 'bio' (see bio_refs()). Only the caller that
+ * drops the last reference finishes the bio: an aborted bio is
+ * resubmitted or parked, any other bio is completed with biodone().
+ * The namespace's active_bios reference taken at submit time is then
+ * released.
+ */
+static void
+nvmf_ns_biodone(struct bio *bio)
+{
+ struct nvmf_namespace *ns;
+ int error;
+
+ /* Another completion callback is still outstanding for this bio. */
+ if (!refcount_release(bio_refs(bio)))
+ return;
+
+ ns = bio->bio_dev->si_drv1;
+
+ /* If a request is aborted, resubmit or queue it for resubmission. */
+ if (bio->bio_error == ECONNABORTED) {
+ /* Reset the error state left by the aborted attempt. */
+ bio->bio_error = 0;
+ bio->bio_driver2 = 0;
+ mtx_lock(&ns->lock);
+ if (ns->disconnected) {
+ TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
+ mtx_unlock(&ns->lock);
+ } else {
+ mtx_unlock(&ns->lock);
+ nvmf_ns_strategy(bio);
+ }
+ } else {
+ /*
+ * I/O errors take precedence over generic EIO from
+ * CQE errors.
+ */
+ error = (intptr_t)bio->bio_driver2;
+ if (error != 0)
+ bio->bio_error = error;
+ if (bio->bio_error != 0)
+ bio->bio_flags |= BIO_ERROR;
+ biodone(bio);
+ }
+
+ /* Wake nvmf_destroy_ns(), which sleeps on 'ns' until this drops to 0. */
+ if (refcount_release(&ns->active_bios))
+ wakeup(ns);
+}
+
+/*
+ * Data transfer completion callback for BIO_READ/BIO_WRITE. Records
+ * the transfer error in bio_driver2 and the untransferred byte count
+ * in bio_resid, then drops this callback's reference on the bio.
+ */
+static void
+nvmf_ns_io_complete(void *arg, size_t xfered, int error)
+{
+ struct bio *bio = arg;
+
+ KASSERT(xfered <= bio->bio_bcount,
+ ("%s: xfered > bio_bcount", __func__));
+
+ bio->bio_driver2 = (void *)(intptr_t)error;
+ bio->bio_resid = bio->bio_bcount - xfered;
+
+ nvmf_ns_biodone(bio);
+}
+
+/*
+ * Data transfer completion callback for BIO_DELETE. The DSM range
+ * buffer parked in bio_driver2 by nvmf_ns_submit_bio() is freed and
+ * the slot is reused to hold the transfer error. 'xfered' is ignored:
+ * the bio is treated as all-or-nothing.
+ */
+static void
+nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
+{
+	struct bio *bp;
+
+	bp = arg;
+	bp->bio_resid = (error != 0) ? bp->bio_bcount : 0;
+
+	free(bp->bio_driver2, M_NVMF);
+	bp->bio_driver2 = (void *)(intptr_t)error;
+
+	nvmf_ns_biodone(bp);
+}
+
+/*
+ * Request (CQE) completion callback for a bio. An aborted completion
+ * is recorded as ECONNABORTED so nvmf_ns_biodone() retries the bio; any
+ * other non-zero status becomes EIO. Drops this callback's reference.
+ */
+static void
+nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
+{
+ struct bio *bio = arg;
+
+ if (nvmf_cqe_aborted(cqe))
+ bio->bio_error = ECONNABORTED;
+ else if (cqe->status != 0)
+ bio->bio_error = EIO;
+
+ nvmf_ns_biodone(bio);
+}
+
+/*
+ * Translate 'bio' into an NVMe command and submit it on an I/O queue,
+ * or park it on pending_bios if the namespace is disconnected.
+ *
+ * Returns 0 if the bio was submitted or parked, ENOMEM if an
+ * allocation failed, or EOPNOTSUPP for an unhandled bio_cmd. On a
+ * non-zero return the bio has not been completed; the caller must do
+ * so.
+ */
+static int
+nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
+{
+ struct nvme_command cmd;
+ struct nvmf_request *req;
+ struct nvme_dsm_range *dsm_range;
+ struct memdesc mem;
+ uint64_t lba, lba_count;
+
+ dsm_range = NULL;
+ memset(&cmd, 0, sizeof(cmd));
+ switch (bio->bio_cmd) {
+ case BIO_READ:
+ lba = bio->bio_offset / ns->lba_size;
+ lba_count = bio->bio_bcount / ns->lba_size;
+ nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
+ break;
+ case BIO_WRITE:
+ lba = bio->bio_offset / ns->lba_size;
+ lba_count = bio->bio_bcount / ns->lba_size;
+ nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
+ break;
+ case BIO_FLUSH:
+ nvme_ns_flush_cmd(&cmd, ns->id);
+ break;
+ case BIO_DELETE:
+ /* The single DSM range is sent as the command's data payload. */
+ dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
+ M_ZERO);
+ if (dsm_range == NULL)
+ return (ENOMEM);
+ lba = bio->bio_offset / ns->lba_size;
+ lba_count = bio->bio_bcount / ns->lba_size;
+ dsm_range->starting_lba = htole64(lba);
+ dsm_range->length = htole32(lba_count);
+
+ cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
+ cmd.nsid = htole32(ns->id);
+ cmd.cdw10 = htole32(0); /* 1 range */
+ cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+
+ mtx_lock(&ns->lock);
+ if (ns->disconnected) {
+ /* Park the bio; the command is rebuilt when it is resubmitted. */
+ TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
+ mtx_unlock(&ns->lock);
+ free(dsm_range, M_NVMF);
+ return (0);
+ }
+
+ req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
+ nvmf_ns_bio_complete, bio, M_NOWAIT);
+ if (req == NULL) {
+ mtx_unlock(&ns->lock);
+ free(dsm_range, M_NVMF);
+ return (ENOMEM);
+ }
+
+ /*
+ * Commands that carry data get two bio references: one dropped by
+ * the CQE callback and one by the data completion callback.
+ * Commands without data only get the CQE reference.
+ */
+ switch (bio->bio_cmd) {
+ case BIO_READ:
+ case BIO_WRITE:
+ refcount_init(bio_refs(bio), 2);
+ mem = memdesc_bio(bio);
+ nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
+ bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
+ break;
+ case BIO_DELETE:
+ refcount_init(bio_refs(bio), 2);
+ mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
+ nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
+ true, nvmf_ns_delete_complete, bio);
+ /* Freed by nvmf_ns_delete_complete(). */
+ bio->bio_driver2 = dsm_range;
+ break;
+ default:
+ refcount_init(bio_refs(bio), 1);
+ KASSERT(bio->bio_resid == 0,
+ ("%s: input bio_resid != 0", __func__));
+ break;
+ }
+
+ /* Released by nvmf_ns_biodone() when the bio is finished or requeued. */
+ refcount_acquire(&ns->active_bios);
+ nvmf_submit_request(req);
+ mtx_unlock(&ns->lock);
+ return (0);
+}
+
+/*
+ * ioctl handler for a namespace device node. Supports NVMe
+ * passthrough, NVME_GET_NSID and the disk media/sector size queries;
+ * any other command returns ENOTTY.
+ */
+static int
+nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ struct nvmf_namespace *ns = dev->si_drv1;
+ struct nvme_get_nsid *gnsid;
+ struct nvme_pt_command *pt;
+
+ switch (cmd) {
+ case NVME_PASSTHROUGH_CMD:
+ pt = (struct nvme_pt_command *)arg;
+ /* Force the NSID to this namespace regardless of the caller's value. */
+ pt->cmd.nsid = htole32(ns->id);
+ return (nvmf_passthrough_cmd(ns->sc, pt, false));
+ case NVME_GET_NSID:
+ gnsid = (struct nvme_get_nsid *)arg;
+ /* strncpy() may leave the buffer unterminated; terminate explicitly. */
+ strncpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
+ sizeof(gnsid->cdev));
+ gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
+ gnsid->nsid = ns->id;
+ return (0);
+ case DIOCGMEDIASIZE:
+ *(off_t *)arg = ns->size;
+ return (0);
+ case DIOCGSECTORSIZE:
+ *(u_int *)arg = ns->lba_size;
+ return (0);
+ default:
+ return (ENOTTY);
+ }
+}
+
+/*
+ * Open handler for a namespace device node. Opens without FWRITE
+ * always succeed; opens for writing return the result of
+ * securelevel_gt() for the caller's credentials against level 0.
+ */
+static int
+nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+	if ((oflags & FWRITE) == 0)
+		return (0);
+
+	return (securelevel_gt(td->td_ucred, 0));
+}
+
+/*
+ * Strategy routine for namespace device nodes. Hands the bio to
+ * nvmf_ns_submit_bio(); if that fails the bio is completed here with
+ * the error and a residual equal to the whole request.
+ */
+void
+nvmf_ns_strategy(struct bio *bio)
+{
+	struct nvmf_namespace *ns;
+	int rc;
+
+	ns = bio->bio_dev->si_drv1;
+
+	rc = nvmf_ns_submit_bio(ns, bio);
+	if (rc == 0)
+		return;
+
+	bio->bio_error = rc;
+	bio->bio_flags |= BIO_ERROR;
+	bio->bio_resid = bio->bio_bcount;
+	biodone(bio);
+}
+
+/*
+ * Character device switch for namespace device nodes. read/write go
+ * through physread/physwrite, which end up in nvmf_ns_strategy().
+ */
+static struct cdevsw nvmf_ns_cdevsw = {
+ .d_version = D_VERSION,
+ .d_flags = D_DISK,
+ .d_open = nvmf_ns_open,
+ .d_read = physread,
+ .d_write = physwrite,
+ .d_strategy = nvmf_ns_strategy,
+ .d_ioctl = nvmf_ns_ioctl
+};
+
+/*
+ * Create the state and the "<controller>ns<id>" device node for
+ * namespace 'id', using the Identify Namespace data in 'data'.
+ *
+ * Returns the new namespace, or NULL if the namespace is unsupported
+ * or malformed (end-to-end data protection, per-block metadata, an
+ * out-of-range LBA format index or LBA data size) or if the device
+ * node cannot be created.
+ */
+struct nvmf_namespace *
+nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
+    struct nvme_namespace_data *data)
+{
+	struct make_dev_args mda;
+	struct nvmf_namespace *ns;
+	int error;
+	uint8_t lbads, lbaf;
+
+	ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
+	ns->sc = sc;
+	ns->id = id;
+	TAILQ_INIT(&ns->pending_bios);
+	mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);
+
+	/* One dummy bio avoids dropping to 0 until destroy. */
+	refcount_init(&ns->active_bios, 1);
+
+	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
+		ns_printf(ns, "End-to-end data protection not supported\n");
+		goto fail;
+	}
+
+	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
+	if (lbaf > data->nlbaf) {
+		ns_printf(ns, "Invalid LBA format index\n");
+		goto fail;
+	}
+
+	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
+		ns_printf(ns, "Namespaces with metadata are not supported\n");
+		goto fail;
+	}
+
+	/*
+	 * lba_size is a 32-bit value computed as 1 << lbads.  Besides 0,
+	 * reject shift counts of 32 or more, which would be undefined
+	 * behavior; lbads comes straight from the identify data.
+	 */
+	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
+	if (lbads == 0 || lbads >= 32) {
+		ns_printf(ns, "Invalid LBA format index\n");
+		goto fail;
+	}
+
+	/* Unsigned constant so that a shift by 31 is well defined. */
+	ns->lba_size = 1U << lbads;
+	ns->size = data->nsze * ns->lba_size;
+
+	if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
+		ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;
+
+	if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
+		ns->flags |= NVME_NS_FLUSH_SUPPORTED;
+
+	/*
+	 * XXX: Does any of the boundary splitting for NOIOB make any
+	 * sense for Fabrics?
+	 */
+
+	make_dev_args_init(&mda);
+	mda.mda_devsw = &nvmf_ns_cdevsw;
+	mda.mda_uid = UID_ROOT;
+	mda.mda_gid = GID_WHEEL;
+	mda.mda_mode = 0600;
+	mda.mda_si_drv1 = ns;
+	error = make_dev_s(&mda, &ns->cdev, "%sns%u",
+	    device_get_nameunit(sc->dev), id);
+	if (error != 0)
+		goto fail;
+
+	ns->cdev->si_flags |= SI_UNMAPPED;
+
+	return (ns);
+fail:
+	mtx_destroy(&ns->lock);
+	free(ns, M_NVMF);
+	return (NULL);
+}
+
+/*
+ * Mark the namespace as disconnected. From now on bios that are
+ * submitted or aborted are parked on pending_bios instead of being
+ * sent (see nvmf_ns_submit_bio() and nvmf_ns_biodone()).
+ */
+void
+nvmf_disconnect_ns(struct nvmf_namespace *ns)
+{
+ mtx_lock(&ns->lock);
+ ns->disconnected = true;
+ mtx_unlock(&ns->lock);
+}
+
+/*
+ * Mark the namespace as connected again and resubmit every bio that
+ * was parked while it was disconnected. The parked bios are moved to a
+ * private list under the lock and resubmitted after it is dropped.
+ */
+void
+nvmf_reconnect_ns(struct nvmf_namespace *ns)
+{
+	TAILQ_HEAD(, bio) parked;
+	struct bio *bp;
+
+	TAILQ_INIT(&parked);
+
+	mtx_lock(&ns->lock);
+	ns->disconnected = false;
+	TAILQ_CONCAT(&parked, &ns->pending_bios, bio_queue);
+	mtx_unlock(&ns->lock);
+
+	while ((bp = TAILQ_FIRST(&parked)) != NULL) {
+		TAILQ_REMOVE(&parked, bp, bio_queue);
+		nvmf_ns_strategy(bp);
+	}
+}
+
+/*
+ * Tear down a namespace: destroy its device node, wait for all
+ * submitted bios to finish, fail any bios still parked on pending_bios
+ * with ECONNABORTED, and free the namespace.
+ */
+void
+nvmf_destroy_ns(struct nvmf_namespace *ns)
+{
+ TAILQ_HEAD(, bio) bios;
+ struct bio *bio;
+
+ destroy_dev(ns->cdev);
+
+ /*
+ * Wait for active I/O requests to drain. The release drops
+ * the reference on the "dummy bio" when the namespace is
+ * created.
+ *
+ * NOTE(review): nvmf_ns_biodone() drops the last reference and
+ * calls wakeup(ns) without holding ns->lock, so a wakeup issued
+ * between the check below and mtx_sleep() looks like it could be
+ * missed -- confirm.
+ */
+ mtx_lock(&ns->lock);
+ if (!refcount_release(&ns->active_bios)) {
+ while (ns->active_bios != 0)
+ mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
+ }
+
+ /* Abort any pending I/O requests. */
+ TAILQ_INIT(&bios);
+ TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
+ mtx_unlock(&ns->lock);
+
+ /* Complete the parked bios outside the lock. */
+ while (!TAILQ_EMPTY(&bios)) {
+ bio = TAILQ_FIRST(&bios);
+ TAILQ_REMOVE(&bios, bio, bio_queue);
+ bio->bio_error = ECONNABORTED;
+ bio->bio_flags |= BIO_ERROR;
+ bio->bio_resid = bio->bio_bcount;
+ biodone(bio);
+ }
+
+ mtx_destroy(&ns->lock);
+ free(ns, M_NVMF);
+}
+
+/*
+ * Re-validate an existing namespace against fresh Identify Namespace
+ * data and update its block size and total size.
+ *
+ * Returns true if the data describes a supported namespace.  Returns
+ * false, leaving 'ns' unchanged, if it uses end-to-end data
+ * protection, per-block metadata, or an out-of-range LBA format index
+ * or LBA data size.
+ */
+bool
+nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data)
+{
+	uint8_t lbads, lbaf;
+
+	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
+		ns_printf(ns, "End-to-end data protection not supported\n");
+		return (false);
+	}
+
+	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
+	if (lbaf > data->nlbaf) {
+		ns_printf(ns, "Invalid LBA format index\n");
+		return (false);
+	}
+
+	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
+		ns_printf(ns, "Namespaces with metadata are not supported\n");
+		return (false);
+	}
+
+	/*
+	 * lba_size is a 32-bit value computed as 1 << lbads.  Besides 0,
+	 * reject shift counts of 32 or more, which would be undefined
+	 * behavior; lbads comes straight from the identify data.
+	 */
+	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
+	if (lbads == 0 || lbads >= 32) {
+		ns_printf(ns, "Invalid LBA format index\n");
+		return (false);
+	}
+
+	/* Unsigned constant so that a shift by 31 is well defined. */
+	ns->lba_size = 1U << lbads;
+	ns->size = data->nsze * ns->lba_size;
+	return (true);
+}
diff --git a/sys/dev/nvmf/host/nvmf_qpair.c b/sys/dev/nvmf/host/nvmf_qpair.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_qpair.c
@@ -0,0 +1,386 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+/*
+ * One command slot of a host queue pair. A slot is either linked on
+ * the queue's free_commands list or referenced from
+ * active_commands[cid].
+ */
+struct nvmf_host_command {
+ /* Request currently using this slot; NULL while it is free. */
+ struct nvmf_request *req;
+ /* Linkage on nvmf_host_qpair.free_commands. */
+ TAILQ_ENTRY(nvmf_host_command) link;
+ /* Fixed command ID; also this slot's index into active_commands[]. */
+ uint16_t cid;
+};
+
+/* Host-side state wrapped around one transport queue pair. */
+struct nvmf_host_qpair {
+ /* Owning controller. */
+ struct nvmf_softc *sc;
+ /* Transport queue pair; set to NULL once shutdown has started. */
+ struct nvmf_qpair *qp;
+
+ /* sq_flow_control, sqhd and sqtail are copied from the handoff parameters. */
+ bool sq_flow_control;
+ /* True while nvmf_shutdown_qp() is tearing the queue down. */
+ bool shutting_down;
+ /* Callers inside nvmf_allocate_request() using qp outside the lock. */
+ u_int allocating;
+ /* Number of command slots, i.e. entries in active_commands[]. */
+ u_int num_commands;
+ uint16_t sqhd;
+ uint16_t sqtail;
+
+ struct mtx lock;
+
+ /* Command slots not in use. */
+ TAILQ_HEAD(, nvmf_host_command) free_commands;
+ /* Requests waiting for a free command slot. */
+ STAILQ_HEAD(, nvmf_request) pending_requests;
+
+ /* Indexed by cid. */
+ struct nvmf_host_command **active_commands;
+
+ /* Queue name used in log messages. */
+ char name[16];
+};
+
+/*
+ * Allocate a request for submission on 'qp', wrapping the submission
+ * queue entry 'sqe' in a newly allocated command capsule. 'cb' and
+ * 'cb_arg' form the completion callback; 'how' must be M_WAITOK or
+ * M_NOWAIT.
+ *
+ * Returns NULL if memory cannot be allocated or if the queue pair has
+ * already been shut down.
+ */
+struct nvmf_request *
+nvmf_allocate_request(struct nvmf_host_qpair *qp, void *sqe,
+ nvmf_request_complete_t *cb, void *cb_arg, int how)
+{
+ struct nvmf_request *req;
+ struct nvmf_qpair *nq;
+
+ KASSERT(how == M_WAITOK || how == M_NOWAIT,
+ ("%s: invalid how", __func__));
+
+ req = malloc(sizeof(*req), M_NVMF, how | M_ZERO);
+ if (req == NULL)
+ return (NULL);
+
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ if (nq == NULL) {
+ mtx_unlock(&qp->lock);
+ free(req, M_NVMF);
+ return (NULL);
+ }
+ /*
+ * 'allocating' keeps nvmf_shutdown_qp() from freeing 'nq' while it
+ * is used below without the lock held.
+ */
+ qp->allocating++;
+ MPASS(qp->allocating != 0);
+ mtx_unlock(&qp->lock);
+
+ req->qp = qp;
+ req->cb = cb;
+ req->cb_arg = cb_arg;
+ req->nc = nvmf_allocate_command(nq, sqe, how);
+ if (req->nc == NULL) {
+ free(req, M_NVMF);
+ req = NULL;
+ }
+
+ mtx_lock(&qp->lock);
+ qp->allocating--;
+ /* Let a shutdown waiting in nvmf_shutdown_qp() proceed. */
+ if (qp->allocating == 0 && qp->shutting_down)
+ wakeup(qp);
+ mtx_unlock(&qp->lock);
+
+ return (req);
+}
+
+/*
+ * Complete 'req' with a locally built completion entry carrying the
+ * given 'cid' and a path-related "command aborted by host" status.
+ * The request's callback is invoked; the request itself is not freed.
+ */
+static void
+nvmf_abort_request(struct nvmf_request *req, uint16_t cid)
+{
+ struct nvme_completion cqe;
+
+ memset(&cqe, 0, sizeof(cqe));
+ cqe.cid = cid;
+ cqe.status = htole16(NVMEF(NVME_STATUS_SCT, NVME_SCT_PATH_RELATED) |
+ NVMEF(NVME_STATUS_SC, NVME_SC_COMMAND_ABORTED_BY_HOST));
+ req->cb(req->cb_arg, &cqe);
+}
+
+/*
+ * Free a request together with its command capsule, if it still has
+ * one (req->nc may have been cleared by the caller).
+ */
+void
+nvmf_free_request(struct nvmf_request *req)
+{
+ if (req->nc != NULL)
+ nvmf_free_capsule(req->nc);
+ free(req, M_NVMF);
+}
+
+/*
+ * Stamp the slot's command ID into the request's submission queue
+ * entry and transmit the capsule. A transmit failure is logged and
+ * disconnects the whole association.
+ */
+static void
+nvmf_dispatch_command(struct nvmf_host_qpair *qp, struct nvmf_host_command *cmd)
+{
+ struct nvmf_softc *sc = qp->sc;
+ struct nvme_command *sqe;
+ struct nvmf_capsule *nc;
+ int error;
+
+ nc = cmd->req->nc;
+ sqe = nvmf_capsule_sqe(nc);
+
+ /*
+ * NB: Don't bother byte-swapping the cid so that receive
+ * doesn't have to swap.
+ */
+ sqe->cid = cmd->cid;
+
+ error = nvmf_transmit_capsule(nc);
+ if (error != 0) {
+ device_printf(sc->dev,
+ "failed to transmit capsule: %d, disconnecting\n", error);
+ nvmf_disconnect(sc);
+ return;
+ }
+
+ /* Record transmit activity when sc->ka_traffic is set. */
+ if (sc->ka_traffic)
+ atomic_store_int(&sc->ka_active_tx_traffic, 1);
+}
+
+/*
+ * Transport error callback for a queue pair. Logs the error, except
+ * for a plain close (error 0) while the controller is detaching, and
+ * always disconnects the association.
+ */
+static void
+nvmf_qp_error(void *arg, int error)
+{
+	struct nvmf_host_qpair *hqp;
+	struct nvmf_softc *sc;
+
+	hqp = arg;
+	sc = hqp->sc;
+
+	/* Ignore simple close of queue pairs during shutdown. */
+	if (!sc->detaching || error != 0)
+		device_printf(sc->dev, "error %d on %s, disconnecting\n", error,
+		    hqp->name);
+	nvmf_disconnect(sc);
+}
+
+/*
+ * Transport receive callback: handle the completion capsule 'nc'
+ * received on queue pair 'arg'.
+ *
+ * The command ID in the completion selects the command slot.  The
+ * slot is either returned to the free list or immediately reused for
+ * the oldest pending request, after which the completed request's
+ * callback is invoked and the request and capsule are freed.  An
+ * out-of-range or inactive command ID disconnects the association.
+ */
+static void
+nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc)
+{
+	struct nvmf_host_qpair *qp = arg;
+	struct nvmf_softc *sc = qp->sc;
+	struct nvmf_host_command *cmd;
+	struct nvmf_request *req;
+	const struct nvme_completion *cqe;
+	uint16_t cid;
+
+	cqe = nvmf_capsule_cqe(nc);
+
+	/* Record receive activity when sc->ka_traffic is set. */
+	if (sc->ka_traffic)
+		atomic_store_int(&sc->ka_active_rx_traffic, 1);
+
+	/*
+	 * NB: Don't bother byte-swapping the cid as transmit doesn't
+	 * swap either.
+	 */
+	cid = cqe->cid;
+
+	/*
+	 * active_commands[] has num_commands entries, so the valid
+	 * command IDs are 0 .. num_commands - 1.  The value comes from
+	 * the wire and must be checked before it is used as an index.
+	 */
+	if (cid >= qp->num_commands) {
+		device_printf(sc->dev,
+		    "received invalid CID %u, disconnecting\n", cid);
+		nvmf_disconnect(sc);
+		nvmf_free_capsule(nc);
+		return;
+	}
+
+	/*
+	 * If the queue has been shutdown due to an error, silently
+	 * drop the response.
+	 */
+	mtx_lock(&qp->lock);
+	if (qp->qp == NULL) {
+		device_printf(sc->dev,
+		    "received completion for CID %u on shutdown %s\n", cid,
+		    qp->name);
+		mtx_unlock(&qp->lock);
+		nvmf_free_capsule(nc);
+		return;
+	}
+
+	cmd = qp->active_commands[cid];
+	if (cmd == NULL) {
+		mtx_unlock(&qp->lock);
+		device_printf(sc->dev,
+		    "received completion for inactive CID %u, disconnecting\n",
+		    cid);
+		nvmf_disconnect(sc);
+		nvmf_free_capsule(nc);
+		return;
+	}
+
+	KASSERT(cmd->cid == cid, ("%s: CID mismatch", __func__));
+	req = cmd->req;
+	cmd->req = NULL;
+	if (STAILQ_EMPTY(&qp->pending_requests)) {
+		/* Nothing is waiting: return the slot to the free list. */
+		qp->active_commands[cid] = NULL;
+		TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link);
+		mtx_unlock(&qp->lock);
+	} else {
+		/* Reuse the still-active slot for the oldest waiter. */
+		cmd->req = STAILQ_FIRST(&qp->pending_requests);
+		STAILQ_REMOVE_HEAD(&qp->pending_requests, link);
+		mtx_unlock(&qp->lock);
+		nvmf_dispatch_command(qp, cmd);
+	}
+
+	/* Complete the finished request outside the queue lock. */
+	req->cb(req->cb_arg, cqe);
+	nvmf_free_capsule(nc);
+	nvmf_free_request(req);
+}
+
+/*
+ * Create the host state for one queue pair from the handoff
+ * parameters and allocate the underlying transport queue pair.
+ * 'name' is copied (truncated to fit) for use in log messages.
+ *
+ * Returns the new queue pair, or NULL if the transport queue pair
+ * cannot be allocated.
+ */
+struct nvmf_host_qpair *
+nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype,
+ struct nvmf_handoff_qpair_params *handoff, const char *name)
+{
+ struct nvmf_host_command *cmd, *ncmd;
+ struct nvmf_host_qpair *qp;
+ u_int i;
+
+ qp = malloc(sizeof(*qp), M_NVMF, M_WAITOK | M_ZERO);
+ qp->sc = sc;
+ qp->sq_flow_control = handoff->sq_flow_control;
+ qp->sqhd = handoff->sqhd;
+ qp->sqtail = handoff->sqtail;
+ strlcpy(qp->name, name, sizeof(qp->name));
+ mtx_init(&qp->lock, "nvmf qp", NULL, MTX_DEF);
+
+ /*
+ * Allocate a spare command slot for each pending AER command
+ * on the admin queue.
+ */
+ qp->num_commands = handoff->qsize - 1;
+ if (handoff->admin)
+ qp->num_commands += sc->num_aer;
+
+ qp->active_commands = malloc(sizeof(*qp->active_commands) *
+ qp->num_commands, M_NVMF, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&qp->free_commands);
+ /* Every slot starts on the free list with its index as command ID. */
+ for (i = 0; i < qp->num_commands; i++) {
+ cmd = malloc(sizeof(*cmd), M_NVMF, M_WAITOK | M_ZERO);
+ cmd->cid = i;
+ TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link);
+ }
+ STAILQ_INIT(&qp->pending_requests);
+
+ qp->qp = nvmf_allocate_qpair(trtype, false, handoff, nvmf_qp_error,
+ qp, nvmf_receive_capsule, qp);
+ if (qp->qp == NULL) {
+ /* Undo everything allocated above. */
+ TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) {
+ TAILQ_REMOVE(&qp->free_commands, cmd, link);
+ free(cmd, M_NVMF);
+ }
+ free(qp->active_commands, M_NVMF);
+ mtx_destroy(&qp->lock);
+ free(qp, M_NVMF);
+ return (NULL);
+ }
+
+ return (qp);
+}
+
+/*
+ * Shut down a queue pair: free the transport queue pair and abort
+ * every active and pending request. The host queue pair structure
+ * itself stays allocated (see nvmf_destroy_qp()).
+ *
+ * If another thread is already shutting the queue down, this call
+ * waits for that shutdown to finish and returns.
+ */
+void
+nvmf_shutdown_qp(struct nvmf_host_qpair *qp)
+{
+ struct nvmf_host_command *cmd;
+ struct nvmf_request *req;
+ struct nvmf_qpair *nq;
+
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ /* From here on allocation, submission and receive see a dead queue. */
+ qp->qp = NULL;
+
+ if (nq == NULL) {
+ while (qp->shutting_down)
+ mtx_sleep(qp, &qp->lock, 0, "nvmfqpsh", 0);
+ mtx_unlock(&qp->lock);
+ return;
+ }
+ qp->shutting_down = true;
+ /* Wait for nvmf_allocate_request() callers still using 'nq'. */
+ while (qp->allocating != 0)
+ mtx_sleep(qp, &qp->lock, 0, "nvmfqpqu", 0);
+ mtx_unlock(&qp->lock);
+
+ nvmf_free_qpair(nq);
+
+ /*
+ * Abort outstanding requests. Active requests will have
+ * their I/O completions invoked and associated capsules freed
+ * by the transport layer via nvmf_free_qpair. Pending
+ * requests must have their I/O completion invoked via
+ * nvmf_abort_capsule_data.
+ */
+ for (u_int i = 0; i < qp->num_commands; i++) {
+ cmd = qp->active_commands[i];
+ if (cmd != NULL) {
+ if (!cmd->req->aer)
+ printf("%s: aborted active command %p (CID %u)\n",
+ __func__, cmd->req, cmd->cid);
+
+ /* This was freed by nvmf_free_qpair. */
+ cmd->req->nc = NULL;
+ nvmf_abort_request(cmd->req, cmd->cid);
+ nvmf_free_request(cmd->req);
+ /* NB: active_commands[i] is left pointing at the freed slot. */
+ free(cmd, M_NVMF);
+ }
+ }
+ while (!STAILQ_EMPTY(&qp->pending_requests)) {
+ req = STAILQ_FIRST(&qp->pending_requests);
+ STAILQ_REMOVE_HEAD(&qp->pending_requests, link);
+ if (!req->aer)
+ printf("%s: aborted pending command %p\n", __func__,
+ req);
+ nvmf_abort_capsule_data(req->nc, ECONNABORTED);
+ nvmf_abort_request(req, 0);
+ nvmf_free_request(req);
+ }
+
+ /* Release any threads waiting in the nq == NULL path above. */
+ mtx_lock(&qp->lock);
+ qp->shutting_down = false;
+ mtx_unlock(&qp->lock);
+ wakeup(qp);
+}
+
+/*
+ * Shut the queue pair down (if that has not happened yet) and free
+ * all of its remaining host state: the free command slots, the
+ * active command table, the lock and the structure itself.
+ */
+void
+nvmf_destroy_qp(struct nvmf_host_qpair *qp)
+{
+	struct nvmf_host_command *slot;
+
+	nvmf_shutdown_qp(qp);
+
+	while ((slot = TAILQ_FIRST(&qp->free_commands)) != NULL) {
+		TAILQ_REMOVE(&qp->free_commands, slot, link);
+		free(slot, M_NVMF);
+	}
+	free(qp->active_commands, M_NVMF);
+	mtx_destroy(&qp->lock);
+	free(qp, M_NVMF);
+}
+
+/*
+ * Submit a request allocated by nvmf_allocate_request(). The request
+ * is transmitted immediately if a command slot is free, queued until
+ * a slot frees up otherwise, or aborted (callback invoked, request
+ * freed) if the queue pair has been shut down.
+ */
+void
+nvmf_submit_request(struct nvmf_request *req)
+{
+ struct nvmf_host_qpair *qp;
+ struct nvmf_host_command *cmd;
+
+ qp = req->qp;
+ mtx_lock(&qp->lock);
+ if (qp->qp == NULL) {
+ /* Queue already shut down: abort the data and the request. */
+ mtx_unlock(&qp->lock);
+ printf("%s: aborted pending command %p\n", __func__, req);
+ nvmf_abort_capsule_data(req->nc, ECONNABORTED);
+ nvmf_abort_request(req, 0);
+ nvmf_free_request(req);
+ return;
+ }
+ cmd = TAILQ_FIRST(&qp->free_commands);
+ if (cmd == NULL) {
+ /*
+ * Queue this request. Will be sent after enough
+ * in-flight requests have completed.
+ */
+ STAILQ_INSERT_TAIL(&qp->pending_requests, req, link);
+ mtx_unlock(&qp->lock);
+ return;
+ }
+
+ /* Move the slot from the free list to the active table. */
+ TAILQ_REMOVE(&qp->free_commands, cmd, link);
+ KASSERT(qp->active_commands[cmd->cid] == NULL,
+ ("%s: CID already busy", __func__));
+ qp->active_commands[cmd->cid] = cmd;
+ cmd->req = req;
+ mtx_unlock(&qp->lock);
+ nvmf_dispatch_command(qp, cmd);
+}
diff --git a/sys/dev/nvmf/host/nvmf_sim.c b/sys/dev/nvmf/host/nvmf_sim.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_sim.c
@@ -0,0 +1,332 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/refcount.h>
+
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+#include <cam/cam_sim.h>
+#include <cam/cam_xpt_sim.h>
+#include <cam/cam_debug.h>
+
+#include <dev/nvmf/host/nvmf_var.h>
+
+/*
+ * The I/O completion may trigger after the received CQE if the I/O
+ * used a zero-copy mbuf that isn't harvested until after the NIC
+ * driver processes TX completions. Use spriv_field0 as a refcount.
+ *
+ * Store any I/O error returned in spriv_field1.
+ */
+/*
+ * Return the address of the CCB's spriv_field0 storage, viewed as the
+ * u_int reference count used by the completion paths in this file.
+ */
+static __inline u_int *
+ccb_refs(union ccb *ccb)
+{
+	u_int *refs;
+
+	refs = (u_int *)&ccb->ccb_h.spriv_field0;
+	return (refs);
+}
+
+#define spriv_ioerror spriv_field1
+
+/*
+ * Drop one reference on a CCB and, once the last reference is gone,
+ * translate the saved CQE and I/O error into a CAM status and complete
+ * the CCB.  Successful CCBs are completed via xpt_done_direct(); all
+ * other outcomes go through xpt_done().
+ */
+static void
+nvmf_ccb_done(union ccb *ccb)
+{
+	const struct nvme_completion *cpl = &ccb->nvmeio.cpl;
+
+	if (!refcount_release(ccb_refs(ccb)))
+		return;
+
+	if (nvmf_cqe_aborted(cpl)) {
+		/* Aborted by the host: ask CAM to requeue the request. */
+		ccb->ccb_h.status = CAM_REQUEUE_REQ;
+	} else if (cpl->status != 0) {
+		ccb->ccb_h.status = CAM_NVME_STATUS_ERROR;
+	} else if (ccb->ccb_h.spriv_ioerror != 0) {
+		KASSERT(ccb->ccb_h.spriv_ioerror != EJUSTRETURN,
+		    ("%s: zero sized transfer without CQE error", __func__));
+		ccb->ccb_h.status = CAM_REQ_CMP_ERR;
+	} else {
+		/* Clean completion takes the direct completion path. */
+		ccb->ccb_h.status = CAM_REQ_CMP;
+		xpt_done_direct(ccb);
+		return;
+	}
+	xpt_done(ccb);
+}
+
+/*
+ * Data transfer completion callback for a CCB.  Saves the transfer
+ * error in spriv_ioerror and then drops one CCB reference via
+ * nvmf_ccb_done().
+ */
+static void
+nvmf_ccb_io_complete(void *arg, size_t xfered, int error)
+{
+	union ccb *ccb = arg;
+
+	/*
+	 * TODO: Reporting partial completions requires extending
+	 * nvmeio to support resid and updating nda to handle partial
+	 * reads, either by returning partial success (or an error) to
+	 * the caller, or retrying all or part of the request.
+	 */
+	ccb->ccb_h.spriv_ioerror = error;
+	if (error == 0) {
+		if (xfered != 0) {
+			/* A successful transfer must cover the whole buffer. */
+			KASSERT(xfered == ccb->nvmeio.dxfer_len,
+			    ("%s: partial CCB completion", __func__));
+		} else {
+#ifdef INVARIANTS
+			/*
+			 * If the request fails with an error in the CQE
+			 * there will be no data transferred but also no
+			 * I/O error.
+			 */
+			ccb->ccb_h.spriv_ioerror = EJUSTRETURN;
+#endif
+		}
+	}
+
+	nvmf_ccb_done(ccb);
+}
+
+/*
+ * Request completion callback for a CCB: copy the received CQE into
+ * the CCB and drop one CCB reference via nvmf_ccb_done().
+ */
+static void
+nvmf_ccb_complete(void *arg, const struct nvme_completion *cqe)
+{
+	union ccb *ccb;
+
+	ccb = arg;
+	ccb->nvmeio.cpl = *cqe;
+	nvmf_ccb_done(ccb);
+}
+
+/*
+ * Turn an XPT_NVME_IO or XPT_NVME_ADMIN CCB into an nvmf request and
+ * submit it.  sim_mtx is held from the disconnected check until after
+ * the request has been submitted.  If the SIM is disconnected the CCB
+ * is completed with CAM_REQUEUE_REQ; if no request can be allocated it
+ * is completed with CAM_RESRC_UNAVAIL.
+ */
+static void
+nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb)
+{
+ struct ccb_nvmeio *nvmeio = &ccb->nvmeio;
+ struct memdesc mem;
+ struct nvmf_request *req;
+ struct nvmf_host_qpair *qp;
+
+ mtx_lock(&sc->sim_mtx);
+ if (sc->sim_disconnected) {
+ mtx_unlock(&sc->sim_mtx);
+ nvmeio->ccb_h.status = CAM_REQUEUE_REQ;
+ xpt_done(ccb);
+ return;
+ }
+ /* I/O commands use an I/O queue; admin commands use the admin queue. */
+ if (nvmeio->ccb_h.func_code == XPT_NVME_IO)
+ qp = nvmf_select_io_queue(sc);
+ else
+ qp = sc->admin;
+ req = nvmf_allocate_request(qp, &nvmeio->cmd, nvmf_ccb_complete,
+ ccb, M_NOWAIT);
+ if (req == NULL) {
+ mtx_unlock(&sc->sim_mtx);
+ nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL;
+ xpt_done(ccb);
+ return;
+ }
+
+ /*
+ * With a data buffer attached, both nvmf_ccb_complete() and
+ * nvmf_ccb_io_complete() are registered as callbacks and each
+ * drops one reference through nvmf_ccb_done(), so start with two
+ * references.  Without data only nvmf_ccb_complete() is
+ * registered, so one reference suffices.
+ */
+ if (nvmeio->dxfer_len != 0) {
+ refcount_init(ccb_refs(ccb), 2);
+ mem = memdesc_ccb(ccb);
+ nvmf_capsule_append_data(req->nc, &mem, nvmeio->dxfer_len,
+ (ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT,
+ nvmf_ccb_io_complete, ccb);
+ } else
+ refcount_init(ccb_refs(ccb), 1);
+
+ /*
+ * Clear spriv_ioerror as it can hold an earlier error if this
+ * CCB was aborted and has been retried.
+ */
+ ccb->ccb_h.spriv_ioerror = 0;
+ KASSERT(ccb->ccb_h.status == CAM_REQ_INPROG,
+ ("%s: incoming CCB is not in-progress", __func__));
+ ccb->ccb_h.status |= CAM_SIM_QUEUED;
+ nvmf_submit_request(req);
+ mtx_unlock(&sc->sim_mtx);
+}
+
+/*
+ * CAM SIM action routine.
+ *
+ * Path inquiry and transport settings requests are answered inline
+ * from the softc and completed with xpt_done().  XPT_NVME_IO and
+ * XPT_NVME_ADMIN CCBs are handed to nvmf_sim_io(), which completes
+ * them itself.  Any other function code is logged and rejected with
+ * CAM_REQ_INVALID.
+ */
+static void
+nvmf_sim_action(struct cam_sim *sim, union ccb *ccb)
+{
+	struct nvmf_softc *sc = cam_sim_softc(sim);
+
+	CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
+	    ("nvmf_sim_action: func= %#x\n",
+		ccb->ccb_h.func_code));
+
+	switch (ccb->ccb_h.func_code) {
+	case XPT_PATH_INQ:	/* Path routing inquiry */
+	{
+		struct ccb_pathinq *cpi = &ccb->cpi;
+
+		cpi->version_num = 1;
+		cpi->hba_inquiry = 0;
+		cpi->target_sprt = 0;
+		cpi->hba_misc = PIM_UNMAPPED | PIM_NOSCAN;
+		cpi->hba_eng_cnt = 0;
+		cpi->max_target = 0;
+		cpi->max_lun = sc->cdata->nn;
+		cpi->async_flags = 0;
+		cpi->hpath_id = 0;
+		cpi->initiator_id = 0;
+		strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
+		strlcpy(cpi->hba_vid, "NVMeoF", HBA_IDLEN);
+		strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
+		cpi->unit_number = cam_sim_unit(sim);
+		cpi->bus_id = 0;
+
+		/* XXX: Same as iSCSI. */
+		cpi->base_transfer_speed = 150000;
+		cpi->protocol = PROTO_NVME;
+		cpi->protocol_version = sc->vs;
+		cpi->transport = XPORT_NVMF;
+		cpi->transport_version = sc->vs;
+		cpi->xport_specific.nvmf.nsid =
+		    xpt_path_lun_id(ccb->ccb_h.path);
+		cpi->xport_specific.nvmf.trtype = sc->trtype;
+		/*
+		 * Use strlcpy() like the other string fields above so
+		 * the name is always NUL-terminated, even if it has to
+		 * be truncated to fit.
+		 */
+		strlcpy(cpi->xport_specific.nvmf.dev_name,
+		    device_get_nameunit(sc->dev),
+		    sizeof(cpi->xport_specific.nvmf.dev_name));
+		cpi->maxio = sc->max_xfer_size;
+		cpi->hba_vendor = 0;
+		cpi->hba_device = 0;
+		cpi->hba_subvendor = 0;
+		cpi->hba_subdevice = 0;
+		cpi->ccb_h.status = CAM_REQ_CMP;
+		break;
+	}
+	case XPT_GET_TRAN_SETTINGS:	/* Get transport settings */
+	{
+		struct ccb_trans_settings *cts = &ccb->cts;
+		struct ccb_trans_settings_nvme *nvme;
+		struct ccb_trans_settings_nvmf *nvmf;
+
+		cts->protocol = PROTO_NVME;
+		cts->protocol_version = sc->vs;
+		cts->transport = XPORT_NVMF;
+		cts->transport_version = sc->vs;
+
+		nvme = &cts->proto_specific.nvme;
+		nvme->valid = CTS_NVME_VALID_SPEC;
+		nvme->spec = sc->vs;
+
+		nvmf = &cts->xport_specific.nvmf;
+		nvmf->valid = CTS_NVMF_VALID_TRTYPE;
+		nvmf->trtype = sc->trtype;
+		cts->ccb_h.status = CAM_REQ_CMP;
+		break;
+	}
+	case XPT_SET_TRAN_SETTINGS:	/* Set transport settings */
+		/*
+		 * No transfer settings can be set, but nvme_xpt sends
+		 * this anyway.
+		 */
+		ccb->ccb_h.status = CAM_REQ_CMP;
+		break;
+	case XPT_NVME_IO:		/* Execute the requested I/O */
+	case XPT_NVME_ADMIN:		/* or Admin operation */
+		/* nvmf_sim_io() completes the CCB itself; skip xpt_done(). */
+		nvmf_sim_io(sc, ccb);
+		return;
+	default:
+		/* XXX */
+		device_printf(sc->dev, "unhandled sim function %#x\n",
+		    ccb->ccb_h.func_code);
+		ccb->ccb_h.status = CAM_REQ_INVALID;
+		break;
+	}
+	xpt_done(ccb);
+}
+
+/*
+ * Create the CAM SIM for this controller: allocate a SIM queue sized
+ * to three quarters of max_pending_io, allocate the SIM, register the
+ * bus, and create a wildcard path stored in sc->path.
+ *
+ * Returns 0 on success, ENOMEM if the SIM queue cannot be allocated,
+ * or ENXIO if any later step fails.  On failure everything set up so
+ * far is torn down again.
+ */
+int
+nvmf_init_sim(struct nvmf_softc *sc)
+{
+	struct cam_devq *devq;
+	int max_trans;
+
+	max_trans = sc->max_pending_io * 3 / 4;
+	devq = cam_simq_alloc(max_trans);
+	if (devq == NULL) {
+		device_printf(sc->dev, "Failed to allocate CAM simq\n");
+		return (ENOMEM);
+	}
+
+	mtx_init(&sc->sim_mtx, "nvmf sim", NULL, MTX_DEF);
+	sc->sim = cam_sim_alloc(nvmf_sim_action, NULL, "nvme", sc,
+	    device_get_unit(sc->dev), NULL, max_trans, max_trans, devq);
+	if (sc->sim == NULL) {
+		/* No SIM owns the devq yet, so free it directly. */
+		device_printf(sc->dev, "Failed to allocate CAM sim\n");
+		cam_simq_free(devq);
+		mtx_destroy(&sc->sim_mtx);
+		return (ENXIO);
+	}
+
+	if (xpt_bus_register(sc->sim, sc->dev, 0) != CAM_SUCCESS) {
+		device_printf(sc->dev, "Failed to create CAM bus\n");
+		goto fail_sim;
+	}
+
+	if (xpt_create_path(&sc->path, NULL, cam_sim_path(sc->sim),
+	    CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
+		device_printf(sc->dev, "Failed to create CAM path\n");
+		goto fail_bus;
+	}
+	return (0);
+
+fail_bus:
+	xpt_bus_deregister(cam_sim_path(sc->sim));
+fail_sim:
+	cam_sim_free(sc->sim, TRUE);
+	mtx_destroy(&sc->sim_mtx);
+	return (ENXIO);
+}
+
+/*
+ * Ask CAM to rescan the LUN corresponding to namespace 'id'.  Failures
+ * to allocate the CCB or create the path are logged and otherwise
+ * ignored.
+ */
+void
+nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id)
+{
+	union ccb *ccb;
+
+	ccb = xpt_alloc_ccb_nowait();
+	if (ccb == NULL) {
+		device_printf(sc->dev,
+		    "unable to alloc CCB for rescan of namespace %u\n", id);
+		return;
+	}
+
+	/*
+	 * As with nvme_sim, map NVMe namespace IDs onto CAM unit
+	 * LUNs.
+	 */
+	if (xpt_create_path(&ccb->ccb_h.path, NULL, cam_sim_path(sc->sim), 0,
+	    id) == CAM_REQ_CMP) {
+		xpt_rescan(ccb);
+		return;
+	}
+
+	/* Path creation failed: report it and release the unused CCB. */
+	device_printf(sc->dev,
+	    "Unable to create path for rescan of namespace %u\n", id);
+	xpt_free_ccb(ccb);
+}
+
+/*
+ * Mark the SIM as disconnected and freeze its queue.  While the flag
+ * is set, nvmf_sim_io() completes incoming CCBs with CAM_REQUEUE_REQ
+ * instead of submitting them.
+ */
+void
+nvmf_disconnect_sim(struct nvmf_softc *sc)
+{
+ mtx_lock(&sc->sim_mtx);
+ sc->sim_disconnected = true;
+ xpt_freeze_simq(sc->sim, 1);
+ mtx_unlock(&sc->sim_mtx);
+}
+
+/*
+ * Clear the disconnected flag and release the SIM queue freeze taken
+ * by nvmf_disconnect_sim().  The release is done after dropping
+ * sim_mtx.
+ */
+void
+nvmf_reconnect_sim(struct nvmf_softc *sc)
+{
+ mtx_lock(&sc->sim_mtx);
+ sc->sim_disconnected = false;
+ mtx_unlock(&sc->sim_mtx);
+ xpt_release_simq(sc->sim, 1);
+}
+
+/*
+ * Tear down the CAM SIM created by nvmf_init_sim(): post
+ * AC_LOST_DEVICE on the wildcard path, free the path, deregister the
+ * bus, and free the SIM and its mutex.
+ */
+void
+nvmf_destroy_sim(struct nvmf_softc *sc)
+{
+ xpt_async(AC_LOST_DEVICE, sc->path, NULL);
+ /*
+ * Undo the freeze left by nvmf_disconnect_sim() if the SIM is
+ * still disconnected.  The flag is read without sim_mtx here.
+ */
+ if (sc->sim_disconnected)
+ xpt_release_simq(sc->sim, 1);
+ xpt_free_path(sc->path);
+ xpt_bus_deregister(cam_sim_path(sc->sim));
+ cam_sim_free(sc->sim, TRUE);
+ mtx_destroy(&sc->sim_mtx);
+}
diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf_var.h
@@ -0,0 +1,208 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __NVMF_VAR_H__
+#define __NVMF_VAR_H__
+
+#include <sys/_callout.h>
+#include <sys/_lock.h>
+#include <sys/_mutex.h>
+#include <sys/_sx.h>
+#include <sys/_task.h>
+#include <sys/queue.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf_transport.h>
+
+struct nvmf_aer;
+struct nvmf_capsule;
+struct nvmf_host_qpair;
+struct nvmf_namespace;
+
+typedef void nvmf_request_complete_t(void *, const struct nvme_completion *);
+
+/*
+ * Bundle of pointers set up by nvmf_init_ivars() and released by
+ * nvmf_free_ivars().
+ * NOTE(review): presumably carries the handoff data for a new
+ * association to the device attach path -- confirm against nvmf.c.
+ */
+struct nvmf_ivars {
+ struct nvmf_handoff_host *hh;
+ struct nvmf_handoff_qpair_params *io_params;
+ struct nvme_controller_data *cdata;
+};
+
+/*
+ * Per-controller driver state for an NVMe over Fabrics host
+ * association.
+ */
+struct nvmf_softc {
+ device_t dev;
+
+ /*
+ * Queue pairs.  Admin CCBs are submitted on 'admin'; I/O CCBs use
+ * the queue returned by nvmf_select_io_queue(), currently io[0].
+ */
+ struct nvmf_host_qpair *admin;
+ struct nvmf_host_qpair **io;
+ u_int num_io_queues;
+ enum nvmf_trtype trtype;
+
+ /*
+ * CAM state.  sim_mtx protects sim_disconnected and is held
+ * while CCBs are submitted; 'path' is the wildcard path created
+ * by nvmf_init_sim().
+ */
+ struct cam_sim *sim;
+ struct cam_path *path;
+ struct mtx sim_mtx;
+ bool sim_disconnected;
+
+ struct nvmf_namespace **ns;
+
+ /*
+ * Controller properties.  'vs' is reported to CAM as the
+ * protocol and transport version, cdata->nn as the maximum LUN,
+ * and max_xfer_size as maxio; max_pending_io sizes the CAM SIM
+ * queue.
+ */
+ struct nvme_controller_data *cdata;
+ uint64_t cap;
+ uint32_t vs;
+ u_int max_pending_io;
+ u_long max_xfer_size;
+
+ struct cdev *cdev;
+
+ /*
+ * Keep Alive support depends on two timers. The 'tx' timer
+ * is responsible for sending KeepAlive commands and runs at
+ * half the timeout interval. The 'rx' timer is responsible
+ * for detecting an actual timeout.
+ *
+ * For efficient support of TKAS, the host does not reschedule
+ * these timers every time new commands are scheduled.
+ * Instead, the host sets the *_traffic flags when commands
+ * are sent and received. The timeout handlers check and
+ * clear these flags. This does mean it can take up to twice
+ * the timeout time to detect an AWOL controller.
+ */
+ bool ka_traffic; /* Using TKAS? */
+
+ volatile int ka_active_tx_traffic;
+ struct callout ka_tx_timer;
+ sbintime_t ka_tx_sbt;
+
+ volatile int ka_active_rx_traffic;
+ struct callout ka_rx_timer;
+ sbintime_t ka_rx_sbt;
+
+ /*
+ * NOTE(review): connection_lock presumably serializes
+ * disconnect, reconnect and detach -- confirm against nvmf.c.
+ */
+ struct sx connection_lock;
+ struct task disconnect_task;
+ bool detaching;
+
+ /* Asynchronous Event Request state; see nvmf_aer.c. */
+ u_int num_aer;
+ struct nvmf_aer *aer;
+};
+
+/*
+ * A single command to be sent on a host queue pair.  Created by
+ * nvmf_allocate_request(), queued or dispatched by
+ * nvmf_submit_request(), and released with nvmf_free_request().
+ */
+struct nvmf_request {
+ struct nvmf_host_qpair *qp; /* Queue pair to submit on. */
+ struct nvmf_capsule *nc; /* Command capsule and attached data. */
+ nvmf_request_complete_t *cb; /* Completion callback. */
+ void *cb_arg; /* First argument passed to cb. */
+ bool aer; /* Suppresses the abort log message at shutdown. */
+
+ /* Linkage on the queue pair's pending_requests list. */
+ STAILQ_ENTRY(nvmf_request) link;
+};
+
+/*
+ * Completion state for a single command, initialized with
+ * nvmf_status_init().
+ * NOTE(review): presumably filled in by nvmf_complete() and
+ * nvmf_io_complete() and polled by nvmf_wait_for_reply() -- confirm
+ * against nvmf.c.
+ */
+struct nvmf_completion_status {
+ struct nvme_completion cqe;
+ bool done;
+ bool io_done; /* Starts true; cleared by nvmf_status_wait_io(). */
+ int io_error;
+};
+
+/*
+ * Pick the I/O queue pair to use for a new command.  Always returns
+ * the first I/O queue for now.
+ */
+static __inline struct nvmf_host_qpair *
+nvmf_select_io_queue(struct nvmf_softc *sc)
+{
+	struct nvmf_host_qpair *first;
+
+	/* TODO: Support multiple queues? */
+	first = sc->io[0];
+	return (first);
+}
+
+/*
+ * Return true if the completion's status carries the path-related
+ * status code type together with the "command aborted by host" status
+ * code.
+ */
+static __inline bool
+nvmf_cqe_aborted(const struct nvme_completion *cqe)
+{
+	uint16_t status;
+
+	/* The status field is stored little-endian in the CQE. */
+	status = le16toh(cqe->status);
+	if (NVME_STATUS_GET_SCT(status) != NVME_SCT_PATH_RELATED)
+		return (false);
+	return (NVME_STATUS_GET_SC(status) == NVME_SC_COMMAND_ABORTED_BY_HOST);
+}
+
+/*
+ * Reset a completion status before issuing a command.  The data
+ * transfer is marked as already done with no error; callers expecting
+ * a data transfer clear io_done with nvmf_status_wait_io().  The cqe
+ * member is left untouched.
+ */
+static __inline void
+nvmf_status_init(struct nvmf_completion_status *status)
+{
+	status->io_error = 0;
+	status->io_done = true;
+	status->done = false;
+}
+
+/*
+ * Mark a completion status as having an outstanding data transfer by
+ * clearing io_done, which nvmf_status_init() sets to true.
+ */
+static __inline void
+nvmf_status_wait_io(struct nvmf_completion_status *status)
+{
+ status->io_done = false;
+}
+
+#ifdef DRIVER_MODULE
+extern driver_t nvme_nvmf_driver;
+#endif
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_NVMF);
+#endif
+
+/* nvmf.c */
+void nvmf_complete(void *arg, const struct nvme_completion *cqe);
+void nvmf_io_complete(void *arg, size_t xfered, int error);
+void nvmf_wait_for_reply(struct nvmf_completion_status *status);
+int nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh);
+void nvmf_free_ivars(struct nvmf_ivars *ivars);
+void nvmf_disconnect(struct nvmf_softc *sc);
+void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid);
+int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
+ bool admin);
+
+/* nvmf_aer.c */
+void nvmf_init_aer(struct nvmf_softc *sc);
+int nvmf_start_aer(struct nvmf_softc *sc);
+void nvmf_destroy_aer(struct nvmf_softc *sc);
+
+/* nvmf_cmd.c */
+bool nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset,
+ uint8_t size, nvmf_request_complete_t *cb, void *cb_arg, int how);
+bool nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset,
+ uint8_t size, uint64_t value, nvmf_request_complete_t *cb, void *cb_arg,
+ int how);
+bool nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb,
+ void *cb_arg, int how);
+bool nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how);
+bool nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how);
+bool nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid,
+ uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb,
+ void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how);
+
+/* nvmf_ctldev.c */
+int nvmf_ctl_load(void);
+void nvmf_ctl_unload(void);
+
+/* nvmf_ns.c */
+struct nvmf_namespace *nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
+ struct nvme_namespace_data *data);
+void nvmf_disconnect_ns(struct nvmf_namespace *ns);
+void nvmf_reconnect_ns(struct nvmf_namespace *ns);
+void nvmf_destroy_ns(struct nvmf_namespace *ns);
+bool nvmf_update_ns(struct nvmf_namespace *ns,
+ struct nvme_namespace_data *data);
+
+/* nvmf_qpair.c */
+struct nvmf_host_qpair *nvmf_init_qp(struct nvmf_softc *sc,
+ enum nvmf_trtype trtype, struct nvmf_handoff_qpair_params *handoff,
+ const char *name);
+void nvmf_shutdown_qp(struct nvmf_host_qpair *qp);
+void nvmf_destroy_qp(struct nvmf_host_qpair *qp);
+struct nvmf_request *nvmf_allocate_request(struct nvmf_host_qpair *qp,
+ void *sqe, nvmf_request_complete_t *cb, void *cb_arg, int how);
+void nvmf_submit_request(struct nvmf_request *req);
+void nvmf_free_request(struct nvmf_request *req);
+
+/* nvmf_sim.c */
+int nvmf_init_sim(struct nvmf_softc *sc);
+void nvmf_disconnect_sim(struct nvmf_softc *sc);
+void nvmf_reconnect_sim(struct nvmf_softc *sc);
+void nvmf_destroy_sim(struct nvmf_softc *sc);
+void nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id);
+
+#endif /* !__NVMF_VAR_H__ */
diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile
--- a/sys/modules/nvmf/Makefile
+++ b/sys/modules/nvmf/Makefile
@@ -1,4 +1,5 @@
-SUBDIR= nvmf_tcp \
+SUBDIR= nvmf \
+ nvmf_tcp \
nvmf_transport
.include <bsd.subdir.mk>
diff --git a/sys/modules/nvmf/nvmf/Makefile b/sys/modules/nvmf/nvmf/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/nvmf/nvmf/Makefile
@@ -0,0 +1,13 @@
+.PATH: ${SRCTOP}/sys/dev/nvmf/host
+
+KMOD= nvmf
+
+SRCS= nvmf.c \
+ nvmf_aer.c \
+ nvmf_cmd.c \
+ nvmf_ctldev.c \
+ nvmf_ns.c \
+ nvmf_qpair.c \
+ nvmf_sim.c
+
+.include <bsd.kmod.mk>

File Metadata

Mime Type
text/plain
Expires
Mon, Jan 27, 11:48 AM (4 h, 10 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
16195313
Default Alt Text
D44714.diff (77 KB)

Event Timeline