D44726.diff

diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -410,6 +410,7 @@
nvme.4 \
nvmf.4 \
nvmf_tcp.4 \
+ nvmft.4 \
${_nvram.4} \
oce.4 \
ocs_fc.4\
diff --git a/share/man/man4/nvmft.4 b/share/man/man4/nvmft.4
new file mode 100644
--- /dev/null
+++ b/share/man/man4/nvmft.4
@@ -0,0 +1,85 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2024 Chelsio Communications, Inc.
+.\"
+.Dd May 2, 2024
+.Dt NVMFT 4
+.Os
+.Sh NAME
+.Nm nvmft
+.Nd "NVM Express over Fabrics CAM Target Layer frontend"
+.Sh SYNOPSIS
+To compile the subsystem into the kernel,
+place the following lines in the
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device nvmft"
+.Cd "device ctl"
+.Ed
+.Pp
+Alternatively, to load the subsystem as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+nvmft_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver provides the kernel component of an NVM Express over Fabrics
+controller.
+The NVMeoF controller is the server that exports namespaces backed by
+local files and volumes to remote hosts.
+.Nm
+follows the dynamic controller model and creates a new dynamic controller
+for each association.
+.Pp
+.Nm
+is implemented as a
+.Xr ctl 4
+frontend and exports CAM Target Layer LUNs as namespaces to remote hosts.
+LUNs can be configured via
+.Xr ctladm 8 .
+.Pp
+Associations between the local controller and remote hosts are managed
+using both the
+.Xr nvmfd 8
+daemon and the
+.Xr ctladm 8
+utility.
+The
+.Xr nvmfd 8
+daemon listens for new associations and handles transport-specific
+negotiation before handing off connected queue pairs to
+.Nm ,
+which associates queue pairs with a suitable controller instance.
+The
+.Cm nvlist
+.Xr ctladm 8
+command lists active controllers.
+The
+.Cm nvterminate
+command terminates one or more associations between a local controller
+and a remote host.
+.Pp
+Associations require a supported transport, such as
+.Xr nvmf_tcp 4
+for TCP/IP.
+.Sh SEE ALSO
+.Xr ctl 4 ,
+.Xr nvmf 4 ,
+.Xr nvmf_tcp 4 ,
+.Xr ctladm 8 ,
+.Xr nvmfd 8
+.Sh HISTORY
+The
+.Nm
+module first appeared in
+.Fx 15.0 .
+.Sh AUTHORS
+The
+.Nm
+subsystem was developed by
+.An John Baldwin Aq Mt jhb@FreeBSD.org
+under sponsorship from Chelsio Communications, Inc.
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1677,6 +1677,7 @@
#
# nvme: PCI-express NVM Express host controllers
# nvmf: NVM Express over Fabrics host
+# nvmft: NVM Express over Fabrics CAM Target Layer frontend
# nvmf_tcp: TCP transport for NVM Express over Fabrics
# nda: CAM NVMe disk driver
# nvd: non-CAM NVMe disk driver
@@ -1684,6 +1685,7 @@
device nvme # PCI-express NVMe host driver
options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver
device nvmf # NVMeoF host driver
+device nvmft # NVMeoF ctl(4) frontend
device nvmf_tcp # NVMeoF TCP transport
device nda # NVMe direct access devices (aka disks)
device nvd # expose NVMe namespaces as disks, depends on nvme
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2535,6 +2535,10 @@
dev/nvme/nvme_util.c optional nvme
dev/nvmem/nvmem.c optional nvmem fdt
dev/nvmem/nvmem_if.m optional nvmem
+dev/nvmf/controller/ctl_frontend_nvmf.c optional nvmft
+dev/nvmf/controller/nvmft_controller.c optional nvmft
+dev/nvmf/controller/nvmft_subr.c optional nvmft
+dev/nvmf/controller/nvmft_qpair.c optional nvmft
dev/nvmf/host/nvmf.c optional nvmf
dev/nvmf/host/nvmf_aer.c optional nvmf
dev/nvmf/host/nvmf_cmd.c optional nvmf
@@ -2543,7 +2547,7 @@
dev/nvmf/host/nvmf_qpair.c optional nvmf
dev/nvmf/host/nvmf_sim.c optional nvmf
dev/nvmf/nvmf_tcp.c optional nvmf_tcp
-dev/nvmf/nvmf_transport.c optional nvmf
+dev/nvmf/nvmf_transport.c optional nvmf | nvmft
dev/oce/oce_hw.c optional oce pci
dev/oce/oce_if.c optional oce pci
dev/oce/oce_mbox.c optional oce pci
diff --git a/sys/dev/nvmf/controller/ctl_frontend_nvmf.c b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c
@@ -0,0 +1,1123 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/dnv.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/memdesc.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/refcount.h>
+#include <sys/sbuf.h>
+#include <sys/sx.h>
+
+#include <machine/bus.h>
+#include <machine/bus_dma.h>
+
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/controller/nvmft_subr.h>
+#include <dev/nvmf/controller/nvmft_var.h>
+
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_error.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl_frontend.h>
+
+/*
+ * Store pointers to the capsule and qpair in the two pointer members
+ * of CTL_PRIV_FRONTEND.
+ */
+#define NVMFT_NC(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[0])
+#define NVMFT_QP(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[1])
+
+static void nvmft_done(union ctl_io *io);
+static int nvmft_init(void);
+static int nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data,
+ int flag, struct thread *td);
+static int nvmft_shutdown(void);
+
+static TAILQ_HEAD(, nvmft_port) nvmft_ports;
+static struct sx nvmft_ports_lock;
+
+MALLOC_DEFINE(M_NVMFT, "nvmft", "NVMe over Fabrics controller");
+
+static struct ctl_frontend nvmft_frontend = {
+ .name = "nvmf",
+ .init = nvmft_init,
+ .ioctl = nvmft_ioctl,
+ .fe_dump = NULL,
+ .shutdown = nvmft_shutdown,
+};
+
+static void
+nvmft_online(void *arg)
+{
+ struct nvmft_port *np = arg;
+
+ sx_xlock(&np->lock);
+ np->online = true;
+ sx_xunlock(&np->lock);
+}
+
+static void
+nvmft_offline(void *arg)
+{
+ struct nvmft_port *np = arg;
+ struct nvmft_controller *ctrlr;
+
+ sx_xlock(&np->lock);
+ np->online = false;
+
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ nvmft_printf(ctrlr,
+ "shutting down due to port going offline\n");
+ nvmft_controller_error(ctrlr, NULL, ENODEV);
+ }
+
+ while (!TAILQ_EMPTY(&np->controllers))
+ sx_sleep(np, &np->lock, 0, "nvmfoff", 0);
+ sx_xunlock(&np->lock);
+}
+
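+/*
+ * Each CTL LUN is exported as the NVMe namespace with
+ * nsid = lun_id + 1.  Online namespaces are tracked in active_ns, an
+ * array of nsids kept sorted in ascending order so that the Active
+ * Namespace ID list can be built in a single pass.
+ */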
+static int
+nvmft_lun_enable(void *arg, int lun_id)
+{
+ struct nvmft_port *np = arg;
+ struct nvmft_controller *ctrlr;
+ uint32_t *old_ns, *new_ns;
+ uint32_t nsid;
+ u_int i;
+
+ if (lun_id >= le32toh(np->cdata.nn)) {
+ printf("NVMFT: %s lun %d larger than maximum nsid %u\n",
+ np->cdata.subnqn, lun_id, le32toh(np->cdata.nn));
+ return (EOPNOTSUPP);
+ }
+ nsid = lun_id + 1;
+
+ sx_xlock(&np->lock);
+ new_ns = mallocarray(np->num_ns + 1, sizeof(*new_ns), M_NVMFT,
+ M_WAITOK);
+ for (i = 0; i < np->num_ns; i++) {
+ if (np->active_ns[i] < nsid)
+ continue;
+ if (np->active_ns[i] == nsid) {
+ sx_xunlock(&np->lock);
+ free(new_ns, M_NVMFT);
+ printf("NVMFT: %s duplicate lun %d\n",
+ np->cdata.subnqn, lun_id);
+ return (EINVAL);
+ }
+ break;
+ }
+
+ /* Copy over IDs smaller than nsid. */
+ memcpy(new_ns, np->active_ns, i * sizeof(*np->active_ns));
+
+ /* Insert nsid. */
+ new_ns[i] = nsid;
+
+ /* Copy over IDs greater than nsid. */
+ memcpy(new_ns + i + 1, np->active_ns + i, (np->num_ns - i) *
+ sizeof(*np->active_ns));
+
+ np->num_ns++;
+ old_ns = np->active_ns;
+ np->active_ns = new_ns;
+
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ nvmft_controller_lun_changed(ctrlr, lun_id);
+ }
+
+ sx_xunlock(&np->lock);
+ free(old_ns, M_NVMFT);
+
+ return (0);
+}
+
+static int
+nvmft_lun_disable(void *arg, int lun_id)
+{
+ struct nvmft_port *np = arg;
+ struct nvmft_controller *ctrlr;
+ uint32_t nsid;
+ u_int i;
+
+ if (lun_id >= le32toh(np->cdata.nn))
+ return (0);
+ nsid = lun_id + 1;
+
+ sx_xlock(&np->lock);
+ for (i = 0; i < np->num_ns; i++) {
+ if (np->active_ns[i] == nsid)
+ goto found;
+ }
+ sx_xunlock(&np->lock);
+ printf("NVMFT: %s request to disable nonexistent lun %d\n",
+ np->cdata.subnqn, lun_id);
+ return (EINVAL);
+
+found:
+ /* Move down IDs greater than nsid. */
+ memmove(np->active_ns + i, np->active_ns + i + 1,
+ (np->num_ns - (i + 1)) * sizeof(*np->active_ns));
+ np->num_ns--;
+
+ /* NB: Don't bother freeing the old active_ns array. */
+
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ nvmft_controller_lun_changed(ctrlr, lun_id);
+ }
+
+ sx_xunlock(&np->lock);
+
+ return (0);
+}
+
+void
+nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid,
+ struct nvme_ns_list *nslist)
+{
+ u_int i, count;
+
+ sx_slock(&np->lock);
+ count = 0;
+ for (i = 0; i < np->num_ns; i++) {
+ if (np->active_ns[i] <= nsid)
+ continue;
+ nslist->ns[count] = htole32(np->active_ns[i]);
+ count++;
+ if (count == nitems(nslist->ns))
+ break;
+ }
+ sx_sunlock(&np->lock);
+}
+
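+/*
+ * Queue a command capsule to CTL for execution.  Pointers to the
+ * capsule and qpair are stashed in the CTL I/O so that nvmft_done()
+ * can send the completion on the correct queue.
+ */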
+void
+nvmft_dispatch_command(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ bool admin)
+{
+ struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+ struct nvmft_port *np = ctrlr->np;
+ union ctl_io *io;
+ int error;
+
+ if (cmd->nsid == htole32(0)) {
+ nvmft_send_generic_error(qp, nc,
+ NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->pending_commands == 0)
+ ctrlr->start_busy = sbinuptime();
+ ctrlr->pending_commands++;
+ mtx_unlock(&ctrlr->lock);
+ io = ctl_alloc_io(np->port.ctl_pool_ref);
+ ctl_zero_io(io);
+ NVMFT_NC(io) = nc;
+ NVMFT_QP(io) = qp;
+ io->io_hdr.io_type = admin ? CTL_IO_NVME_ADMIN : CTL_IO_NVME;
+ io->io_hdr.nexus.initid = ctrlr->cntlid;
+ io->io_hdr.nexus.targ_port = np->port.targ_port;
+ io->io_hdr.nexus.targ_lun = le32toh(cmd->nsid) - 1;
+ io->nvmeio.cmd = *cmd;
+ error = ctl_run(io);
+ if (error != 0) {
+ nvmft_printf(ctrlr, "ctl_run failed for command on %s: %d\n",
+ nvmft_qpair_name(qp), error);
+ ctl_nvme_set_generic_error(&io->nvmeio,
+ NVME_SC_INTERNAL_DEVICE_ERROR);
+ nvmft_done(io);
+
+ nvmft_controller_error(ctrlr, qp, ENXIO);
+ }
+}
+
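+/*
+ * Abort any CTL commands still pending for this controller by
+ * submitting an I_T nexus reset task request.
+ */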
+void
+nvmft_terminate_commands(struct nvmft_controller *ctrlr)
+{
+ struct nvmft_port *np = ctrlr->np;
+ union ctl_io *io;
+ int error;
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->pending_commands == 0)
+ ctrlr->start_busy = sbinuptime();
+ ctrlr->pending_commands++;
+ mtx_unlock(&ctrlr->lock);
+ io = ctl_alloc_io(np->port.ctl_pool_ref);
+ ctl_zero_io(io);
+ NVMFT_QP(io) = ctrlr->admin;
+ io->io_hdr.io_type = CTL_IO_TASK;
+ io->io_hdr.nexus.initid = ctrlr->cntlid;
+ io->io_hdr.nexus.targ_port = np->port.targ_port;
+ io->io_hdr.nexus.targ_lun = 0;
+ io->taskio.tag_type = CTL_TAG_SIMPLE; /* XXX: unused? */
+ io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET;
+ error = ctl_run(io);
+ if (error != CTL_RETVAL_COMPLETE) {
+ nvmft_printf(ctrlr, "failed to terminate tasks: %d\n", error);
+#ifdef INVARIANTS
+ io->io_hdr.status = CTL_SUCCESS;
+#endif
+ nvmft_done(io);
+ }
+}
+
+static void
+nvmft_datamove_out_cb(void *arg, size_t xfered, int error)
+{
+ struct ctl_nvmeio *ctnio = arg;
+
+ if (error != 0) {
+ ctl_nvme_set_data_transfer_error(ctnio);
+ } else {
+ MPASS(xfered == ctnio->kern_data_len);
+ ctnio->kern_data_resid -= xfered;
+ }
+
+ if (ctnio->kern_sg_entries) {
+ free(ctnio->ext_data_ptr, M_NVMFT);
+ ctnio->ext_data_ptr = NULL;
+ } else
+ MPASS(ctnio->ext_data_ptr == NULL);
+ ctl_datamove_done((union ctl_io *)ctnio, false);
+}
+
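+/*
+ * Host-to-controller transfer (e.g. a WRITE): wrap the CTL data
+ * buffer, described by either a single kernel virtual address or a
+ * scatter/gather list, in a memdesc and ask the transport to place
+ * the capsule data into it.
+ */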
+static void
+nvmft_datamove_out(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc)
+{
+ struct memdesc mem;
+ int error;
+
+ MPASS(ctnio->ext_data_ptr == NULL);
+ if (ctnio->kern_sg_entries > 0) {
+ struct ctl_sg_entry *sgl;
+ struct bus_dma_segment *vlist;
+
+ vlist = mallocarray(ctnio->kern_sg_entries, sizeof(*vlist),
+ M_NVMFT, M_WAITOK);
+ ctnio->ext_data_ptr = (void *)vlist;
+ sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
+ for (u_int i = 0; i < ctnio->kern_sg_entries; i++) {
+ vlist[i].ds_addr = (uintptr_t)sgl[i].addr;
+ vlist[i].ds_len = sgl[i].len;
+ }
+ mem = memdesc_vlist(vlist, ctnio->kern_sg_entries);
+ } else
+ mem = memdesc_vaddr(ctnio->kern_data_ptr, ctnio->kern_data_len);
+
+ error = nvmf_receive_controller_data(nc, ctnio->kern_rel_offset, &mem,
+ ctnio->kern_data_len, nvmft_datamove_out_cb, ctnio);
+ if (error == 0)
+ return;
+
+ nvmft_printf(nvmft_qpair_ctrlr(qp),
+ "Failed to request capsule data: %d\n", error);
+ ctl_nvme_set_data_transfer_error(ctnio);
+
+ if (ctnio->kern_sg_entries) {
+ free(ctnio->ext_data_ptr, M_NVMFT);
+ ctnio->ext_data_ptr = NULL;
+ } else
+ MPASS(ctnio->ext_data_ptr == NULL);
+ ctl_datamove_done((union ctl_io *)ctnio, true);
+}
+
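+/*
+ * Copying transmit path for controller-to-host data: allocate a
+ * plain mbuf chain and copy the CTL data buffer (or each S/G
+ * segment) into it.
+ */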
+static struct mbuf *
+nvmft_copy_data(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_sg_entry *sgl;
+ struct mbuf *m0, *m;
+ uint32_t resid, off, todo;
+ int mlen;
+
+ MPASS(ctnio->kern_data_len != 0);
+
+ m0 = m_getm2(NULL, ctnio->kern_data_len, M_WAITOK, MT_DATA, 0);
+
+ if (ctnio->kern_sg_entries == 0) {
+ m_copyback(m0, 0, ctnio->kern_data_len, ctnio->kern_data_ptr);
+ return (m0);
+ }
+
+ resid = ctnio->kern_data_len;
+ sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
+ off = 0;
+ m = m0;
+ mlen = M_TRAILINGSPACE(m);
+ for (;;) {
+ todo = MIN(mlen, sgl->len - off);
+ memcpy(mtod(m, char *) + m->m_len, (char *)sgl->addr + off,
+ todo);
+ m->m_len += todo;
+ resid -= todo;
+ if (resid == 0) {
+ MPASS(m->m_next == NULL);
+ break;
+ }
+
+ off += todo;
+ if (off == sgl->len) {
+ sgl++;
+ off = 0;
+ }
+ mlen -= todo;
+ if (mlen == 0) {
+ m = m->m_next;
+ mlen = M_TRAILINGSPACE(m);
+ }
+ }
+
+ return (m0);
+}
+
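+/*
+ * Zero-copy transmit path: wrap each CTL data buffer in an external
+ * mbuf that holds a reference on the CTL I/O via kern_data_ref so
+ * the buffer remains valid until the transport frees the chain.
+ */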
+static void
+m_free_ref_data(struct mbuf *m)
+{
+ ctl_ref kern_data_ref = m->m_ext.ext_arg1;
+
+ kern_data_ref(m->m_ext.ext_arg2, -1);
+}
+
+static struct mbuf *
+m_get_ref_data(struct ctl_nvmeio *ctnio, void *buf, u_int size)
+{
+ struct mbuf *m;
+
+ m = m_get(M_WAITOK, MT_DATA);
+ m_extadd(m, buf, size, m_free_ref_data, ctnio->kern_data_ref,
+ ctnio->kern_data_arg, M_RDONLY, EXT_CTL);
+ m->m_len = size;
+ ctnio->kern_data_ref(ctnio->kern_data_arg, 1);
+ return (m);
+}
+
+static struct mbuf *
+nvmft_ref_data(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_sg_entry *sgl;
+ struct mbuf *m0, *m;
+
+ MPASS(ctnio->kern_data_len != 0);
+
+ if (ctnio->kern_sg_entries == 0)
+ return (m_get_ref_data(ctnio, ctnio->kern_data_ptr,
+ ctnio->kern_data_len));
+
+ sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
+ m0 = m_get_ref_data(ctnio, sgl[0].addr, sgl[0].len);
+ m = m0;
+ for (u_int i = 1; i < ctnio->kern_sg_entries; i++) {
+ m->m_next = m_get_ref_data(ctnio, sgl[i].addr, sgl[i].len);
+ m = m->m_next;
+ }
+ return (m0);
+}
+
+static void
+nvmft_datamove_in(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc)
+{
+ struct mbuf *m;
+ u_int status;
+
+ if (ctnio->kern_data_ref != NULL)
+ m = nvmft_ref_data(ctnio);
+ else
+ m = nvmft_copy_data(ctnio);
+ status = nvmf_send_controller_data(nc, ctnio->kern_rel_offset, m,
+ ctnio->kern_data_len);
+ switch (status) {
+ case NVMF_SUCCESS_SENT:
+ ctnio->success_sent = true;
+ nvmft_command_completed(qp, nc);
+ /* FALLTHROUGH */
+ case NVMF_MORE:
+ case NVME_SC_SUCCESS:
+ break;
+ default:
+ ctl_nvme_set_generic_error(ctnio, status);
+ break;
+ }
+ ctl_datamove_done((union ctl_io *)ctnio, true);
+}
+
+static void
+nvmft_datamove(union ctl_io *io)
+{
+ struct nvmf_capsule *nc;
+ struct nvmft_qpair *qp;
+
+ /* Some CTL commands preemptively set a success status. */
+ MPASS(io->io_hdr.status == CTL_STATUS_NONE ||
+ io->io_hdr.status == CTL_SUCCESS);
+ MPASS(!io->nvmeio.success_sent);
+
+ nc = NVMFT_NC(io);
+ qp = NVMFT_QP(io);
+
+ if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN)
+ nvmft_datamove_in(&io->nvmeio, qp, nc);
+ else
+ nvmft_datamove_out(&io->nvmeio, qp, nc);
+}
+
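+/*
+ * Add to one of the 128-bit counters in the Health Information Page.
+ * Each counter is a pair of little-endian 64-bit words; a carry out
+ * of the low word propagates into the high word.
+ */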
+static void
+hip_add(uint64_t pair[2], uint64_t addend)
+{
+ uint64_t old, new;
+
+ old = le64toh(pair[0]);
+ new = old + addend;
+ pair[0] = htole64(new);
+ if (new < old)
+ pair[1] = htole64(le64toh(pair[1]) + 1);
+}
+
+static void
+nvmft_done(union ctl_io *io)
+{
+ struct nvmft_controller *ctrlr;
+ const struct nvme_command *cmd;
+ struct nvmft_qpair *qp;
+ struct nvmf_capsule *nc;
+ size_t len;
+
+ KASSERT(io->io_hdr.status == CTL_SUCCESS ||
+ io->io_hdr.status == CTL_NVME_ERROR,
+ ("%s: bad status %u", __func__, io->io_hdr.status));
+
+ nc = NVMFT_NC(io);
+ qp = NVMFT_QP(io);
+ ctrlr = nvmft_qpair_ctrlr(qp);
+
+ if (nc == NULL) {
+ /* Completion of nvmft_terminate_commands. */
+ goto end;
+ }
+
+ cmd = nvmf_capsule_sqe(nc);
+
+ if (io->io_hdr.status == CTL_SUCCESS)
+ len = nvmf_capsule_data_len(nc) / 512;
+ else
+ len = 0;
+ switch (cmd->opc) {
+ case NVME_OPC_WRITE:
+ mtx_lock(&ctrlr->lock);
+ hip_add(ctrlr->hip.host_write_commands, 1);
+ len += ctrlr->partial_duw;
+ if (len >= 1000)
+ hip_add(ctrlr->hip.data_units_written, len / 1000);
+ ctrlr->partial_duw = len % 1000;
+ mtx_unlock(&ctrlr->lock);
+ break;
+ case NVME_OPC_READ:
+ case NVME_OPC_COMPARE:
+ case NVME_OPC_VERIFY:
+ mtx_lock(&ctrlr->lock);
+ if (cmd->opc != NVME_OPC_VERIFY)
+ hip_add(ctrlr->hip.host_read_commands, 1);
+ len += ctrlr->partial_dur;
+ if (len >= 1000)
+ hip_add(ctrlr->hip.data_units_read, len / 1000);
+ ctrlr->partial_dur = len % 1000;
+ mtx_unlock(&ctrlr->lock);
+ break;
+ }
+
+ if (io->nvmeio.success_sent) {
+ MPASS(io->io_hdr.status == CTL_SUCCESS);
+ } else {
+ io->nvmeio.cpl.cid = cmd->cid;
+ nvmft_send_response(qp, &io->nvmeio.cpl);
+ }
+ nvmf_free_capsule(nc);
+end:
+ ctl_free_io(io);
+ mtx_lock(&ctrlr->lock);
+ ctrlr->pending_commands--;
+ if (ctrlr->pending_commands == 0)
+ ctrlr->busy_total += sbinuptime() - ctrlr->start_busy;
+ mtx_unlock(&ctrlr->lock);
+}
+
+static int
+nvmft_init(void)
+{
+ TAILQ_INIT(&nvmft_ports);
+ sx_init(&nvmft_ports_lock, "nvmft ports");
+ return (0);
+}
+
+void
+nvmft_port_free(struct nvmft_port *np)
+{
+ KASSERT(TAILQ_EMPTY(&np->controllers),
+ ("%s(%p): active controllers", __func__, np));
+
+ if (np->port.targ_port != -1) {
+ if (ctl_port_deregister(&np->port) != 0)
+ printf("%s: ctl_port_deregister() failed\n", __func__);
+ }
+
+ free(np->active_ns, M_NVMFT);
+ clean_unrhdr(np->ids);
+ delete_unrhdr(np->ids);
+ sx_destroy(&np->lock);
+ free(np, M_NVMFT);
+}
+
+static struct nvmft_port *
+nvmft_port_find(const char *subnqn)
+{
+ struct nvmft_port *np;
+
+ KASSERT(nvmf_nqn_valid(subnqn), ("%s: invalid nqn", __func__));
+
+ sx_assert(&nvmft_ports_lock, SA_LOCKED);
+ TAILQ_FOREACH(np, &nvmft_ports, link) {
+ if (strcmp(np->cdata.subnqn, subnqn) == 0)
+ break;
+ }
+ return (np);
+}
+
+static struct nvmft_port *
+nvmft_port_find_by_id(int port_id)
+{
+ struct nvmft_port *np;
+
+ sx_assert(&nvmft_ports_lock, SA_LOCKED);
+ TAILQ_FOREACH(np, &nvmft_ports, link) {
+ if (np->port.targ_port == port_id)
+ break;
+ }
+ return (np);
+}
+
+/*
+ * Helper function to fetch a number stored as a string in an nv_list.
+ * Returns false if the string was not a valid number.
+ */
+static bool
+dnvlist_get_strnum(nvlist_t *nvl, const char *name, u_long default_value,
+ u_long *value)
+{
+ const char *str;
+ char *cp;
+
+ str = dnvlist_get_string(nvl, name, NULL);
+ if (str == NULL) {
+ *value = default_value;
+ return (true);
+ }
+ if (*str == '\0')
+ return (false);
+ *value = strtoul(str, &cp, 0);
+ if (*cp != '\0')
+ return (false);
+ return (true);
+}
+
+/*
+ * NVMeoF ports support the following parameters:
+ *
+ * Mandatory:
+ *
+ * subnqn: subsystem NVMe Qualified Name
+ * portid: integer port ID from Discovery Log Page entry
+ *
+ * Optional:
+ * serial: Serial Number string
+ * max_io_qsize: Maximum number of I/O queue entries
+ * enable_timeout: Timeout for controller enable in milliseconds
+ * ioccsz: Maximum command capsule size
+ * iorcsz: Maximum response capsule size
+ * nn: Number of namespaces
+ */
+static void
+nvmft_port_create(struct ctl_req *req)
+{
+ struct nvmft_port *np;
+ struct ctl_port *port;
+ const char *serial, *subnqn;
+ char serial_buf[NVME_SERIAL_NUMBER_LENGTH];
+ u_long enable_timeout, hostid, ioccsz, iorcsz, max_io_qsize, nn, portid;
+ int error;
+
+ /* Required parameters. */
+ subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL);
+ if (subnqn == NULL || !nvlist_exists_string(req->args_nvl, "portid")) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Missing required argument");
+ return;
+ }
+ if (!nvmf_nqn_valid(subnqn)) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid SubNQN");
+ return;
+ }
+ if (!dnvlist_get_strnum(req->args_nvl, "portid", UINT16_MAX, &portid) ||
+ portid > UINT16_MAX) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid port ID");
+ return;
+ }
+
+ /* Optional parameters. */
+ if (!dnvlist_get_strnum(req->args_nvl, "max_io_qsize",
+ NVMF_MAX_IO_ENTRIES, &max_io_qsize) ||
+ max_io_qsize < NVME_MIN_IO_ENTRIES ||
+ max_io_qsize > NVME_MAX_IO_ENTRIES) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid maximum I/O queue size");
+ return;
+ }
+
+ if (!dnvlist_get_strnum(req->args_nvl, "enable_timeout",
+ NVMF_CC_EN_TIMEOUT * 500, &enable_timeout) ||
+ (enable_timeout % 500) != 0 || (enable_timeout / 500) > 255) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid enable timeout");
+ return;
+ }
+
+ if (!dnvlist_get_strnum(req->args_nvl, "ioccsz", NVMF_IOCCSZ,
+ &ioccsz) || ioccsz < sizeof(struct nvme_command) ||
+ (ioccsz % 16) != 0) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid Command Capsule size");
+ return;
+ }
+
+ if (!dnvlist_get_strnum(req->args_nvl, "iorcsz", NVMF_IORCSZ,
+ &iorcsz) || iorcsz < sizeof(struct nvme_completion) ||
+ (iorcsz % 16) != 0) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid Response Capsule size");
+ return;
+ }
+
+ if (!dnvlist_get_strnum(req->args_nvl, "nn", NVMF_NN, &nn) ||
+ nn < 1 || nn > UINT32_MAX) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid number of namespaces");
+ return;
+ }
+
+ serial = dnvlist_get_string(req->args_nvl, "serial", NULL);
+ if (serial == NULL) {
+ getcredhostid(curthread->td_ucred, &hostid);
+ nvmf_controller_serial(serial_buf, sizeof(serial_buf), hostid);
+ serial = serial_buf;
+ }
+
+ sx_xlock(&nvmft_ports_lock);
+
+ np = nvmft_port_find(subnqn);
+ if (np != NULL) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "SubNQN \"%s\" already exists", subnqn);
+ sx_xunlock(&nvmft_ports_lock);
+ return;
+ }
+
+ np = malloc(sizeof(*np), M_NVMFT, M_WAITOK | M_ZERO);
+ refcount_init(&np->refs, 1);
+ np->max_io_qsize = max_io_qsize;
+ np->cap = _nvmf_controller_cap(max_io_qsize, enable_timeout / 500);
+ sx_init(&np->lock, "nvmft port");
+ np->ids = new_unrhdr(0, MIN(CTL_MAX_INIT_PER_PORT - 1,
+ NVMF_CNTLID_STATIC_MAX), UNR_NO_MTX);
+ TAILQ_INIT(&np->controllers);
+
+ /* The controller ID is set later for individual controllers. */
+ _nvmf_init_io_controller_data(0, max_io_qsize, serial, ostype,
+ osrelease, subnqn, nn, ioccsz, iorcsz, &np->cdata);
+ np->cdata.aerl = NVMFT_NUM_AER - 1;
+ np->cdata.oaes = htole32(NVME_ASYNC_EVENT_NS_ATTRIBUTE);
+ np->cdata.oncs = htole16(NVMEF(NVME_CTRLR_DATA_ONCS_VERIFY, 1) |
+ NVMEF(NVME_CTRLR_DATA_ONCS_WRZERO, 1) |
+ NVMEF(NVME_CTRLR_DATA_ONCS_DSM, 1) |
+ NVMEF(NVME_CTRLR_DATA_ONCS_COMPARE, 1));
+ np->cdata.fuses = NVMEF(NVME_CTRLR_DATA_FUSES_CNW, 1);
+
+ np->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1);
+ memcpy(np->fp.revision[0], np->cdata.fr, sizeof(np->cdata.fr));
+
+ port = &np->port;
+
+ port->frontend = &nvmft_frontend;
+ port->port_type = CTL_PORT_NVMF;
+ port->num_requested_ctl_io = max_io_qsize;
+ port->port_name = "nvmf";
+ port->physical_port = portid;
+ port->virtual_port = 0;
+ port->port_online = nvmft_online;
+ port->port_offline = nvmft_offline;
+ port->onoff_arg = np;
+ port->lun_enable = nvmft_lun_enable;
+ port->lun_disable = nvmft_lun_disable;
+ port->targ_lun_arg = np;
+ port->fe_datamove = nvmft_datamove;
+ port->fe_done = nvmft_done;
+ port->targ_port = -1;
+ port->options = nvlist_clone(req->args_nvl);
+
+ error = ctl_port_register(port);
+ if (error != 0) {
+ sx_xunlock(&nvmft_ports_lock);
+ nvlist_destroy(port->options);
+ nvmft_port_rele(np);
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Failed to register CTL port with error %d", error);
+ return;
+ }
+
+ TAILQ_INSERT_TAIL(&nvmft_ports, np, link);
+ sx_xunlock(&nvmft_ports_lock);
+
+ req->status = CTL_LUN_OK;
+ req->result_nvl = nvlist_create(0);
+ nvlist_add_number(req->result_nvl, "port_id", port->targ_port);
+}
+
+static void
+nvmft_port_remove(struct ctl_req *req)
+{
+ struct nvmft_port *np;
+ const char *subnqn;
+ u_long port_id;
+
+ /*
+ * ctladm port -r just provides the port_id, so permit looking
+ * up a port either by "subnqn" or "port_id".
+ */
+ port_id = ULONG_MAX;
+ subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL);
+ if (subnqn == NULL) {
+ if (!nvlist_exists_string(req->args_nvl, "port_id")) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Missing required argument");
+ return;
+ }
+ if (!dnvlist_get_strnum(req->args_nvl, "port_id", ULONG_MAX,
+ &port_id)) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid CTL port ID");
+ return;
+ }
+ } else {
+ if (nvlist_exists_string(req->args_nvl, "port_id")) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Ambiguous port removal request");
+ return;
+ }
+ }
+
+ sx_xlock(&nvmft_ports_lock);
+
+ if (subnqn != NULL) {
+ np = nvmft_port_find(subnqn);
+ if (np == NULL) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "SubNQN \"%s\" does not exist", subnqn);
+ sx_xunlock(&nvmft_ports_lock);
+ return;
+ }
+ } else {
+ np = nvmft_port_find_by_id(port_id);
+ if (np == NULL) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "CTL port %lu is not a NVMF port", port_id);
+ sx_xunlock(&nvmft_ports_lock);
+ return;
+ }
+ }
+
+ TAILQ_REMOVE(&nvmft_ports, np, link);
+ sx_xunlock(&nvmft_ports_lock);
+
+ ctl_port_offline(&np->port);
+ nvmft_port_rele(np);
+ req->status = CTL_LUN_OK;
+}
+
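+/*
+ * Accept a queue pair handed off from nvmfd(8): copy in the CONNECT
+ * command and data, look up the port by SubNQN, and pass the queue
+ * pair to the admin or I/O queue handler.
+ */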
+static void
+nvmft_handoff(struct ctl_nvmf *cn)
+{
+ struct nvmf_fabric_connect_cmd cmd;
+ struct nvmf_handoff_controller_qpair *handoff;
+ struct nvmf_fabric_connect_data *data;
+ struct nvmft_port *np;
+ int error;
+
+ np = NULL;
+ data = NULL;
+ handoff = &cn->data.handoff;
+ error = copyin(handoff->cmd, &cmd, sizeof(cmd));
+ if (error != 0) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to copyin CONNECT SQE");
+ return;
+ }
+
+ data = malloc(sizeof(*data), M_NVMFT, M_WAITOK);
+ error = copyin(handoff->data, data, sizeof(*data));
+ if (error != 0) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to copyin CONNECT data");
+ goto out;
+ }
+
+ if (!nvmf_nqn_valid(data->subnqn)) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Invalid SubNQN");
+ goto out;
+ }
+
+ sx_slock(&nvmft_ports_lock);
+ np = nvmft_port_find(data->subnqn);
+ if (np == NULL) {
+ sx_sunlock(&nvmft_ports_lock);
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Unknown SubNQN");
+ goto out;
+ }
+ if (!np->online) {
+ sx_sunlock(&nvmft_ports_lock);
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "CTL port offline");
+ np = NULL;
+ goto out;
+ }
+ nvmft_port_ref(np);
+ sx_sunlock(&nvmft_ports_lock);
+
+ if (handoff->params.admin) {
+ error = nvmft_handoff_admin_queue(np, handoff, &cmd, data);
+ if (error != 0) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to handoff admin queue: %d", error);
+ goto out;
+ }
+ } else {
+ error = nvmft_handoff_io_queue(np, handoff, &cmd, data);
+ if (error != 0) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to handoff admin queue: %d", error);
+ goto out;
+ }
+ }
+
+ cn->status = CTL_NVMF_OK;
+out:
+ if (np != NULL)
+ nvmft_port_rele(np);
+ free(data, M_NVMFT);
+}
+
+static void
+nvmft_list(struct ctl_nvmf *cn)
+{
+ struct ctl_nvmf_list_params *lp;
+ struct nvmft_controller *ctrlr;
+ struct nvmft_port *np;
+ struct sbuf *sb;
+ int error;
+
+ lp = &cn->data.list;
+
+ sb = sbuf_new(NULL, NULL, lp->alloc_len, SBUF_FIXEDLEN |
+ SBUF_INCLUDENUL);
+ if (sb == NULL) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to allocate NVMeoF session list");
+ return;
+ }
+
+ sbuf_printf(sb, "<ctlnvmflist>\n");
+ sx_slock(&nvmft_ports_lock);
+ TAILQ_FOREACH(np, &nvmft_ports, link) {
+ sx_slock(&np->lock);
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ sbuf_printf(sb, "<connection id=\"%d\">"
+ "<hostnqn>%s</hostnqn>"
+ "<subnqn>%s</subnqn>"
+ "<trtype>%u</trtype>"
+ "</connection>\n",
+ ctrlr->cntlid,
+ ctrlr->hostnqn,
+ np->cdata.subnqn,
+ ctrlr->trtype);
+ }
+ sx_sunlock(&np->lock);
+ }
+ sx_sunlock(&nvmft_ports_lock);
+ sbuf_printf(sb, "</ctlnvmflist>\n");
+ if (sbuf_finish(sb) != 0) {
+ sbuf_delete(sb);
+ cn->status = CTL_NVMF_LIST_NEED_MORE_SPACE;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Out of space, %d bytes is too small", lp->alloc_len);
+ return;
+ }
+
+ error = copyout(sbuf_data(sb), lp->conn_xml, sbuf_len(sb));
+ if (error != 0) {
+ sbuf_delete(sb);
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to copyout session list: %d", error);
+ return;
+ }
+ lp->fill_len = sbuf_len(sb);
+ cn->status = CTL_NVMF_OK;
+ sbuf_delete(sb);
+}
+
+static void
+nvmft_terminate(struct ctl_nvmf *cn)
+{
+ struct ctl_nvmf_terminate_params *tp;
+ struct nvmft_controller *ctrlr;
+ struct nvmft_port *np;
+ bool found, match;
+
+ tp = &cn->data.terminate;
+
+ found = false;
+ sx_slock(&nvmft_ports_lock);
+ TAILQ_FOREACH(np, &nvmft_ports, link) {
+ sx_slock(&np->lock);
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ if (tp->all != 0)
+ match = true;
+ else if (tp->cntlid != -1)
+ match = tp->cntlid == ctrlr->cntlid;
+ else if (tp->hostnqn[0] != '\0')
+ match = strncmp(tp->hostnqn, ctrlr->hostnqn,
+ sizeof(tp->hostnqn)) == 0;
+ else
+ match = false;
+ if (!match)
+ continue;
+ nvmft_printf(ctrlr,
+ "disconnecting due to administrative request\n");
+ nvmft_controller_error(ctrlr, NULL, ECONNABORTED);
+ found = true;
+ }
+ sx_sunlock(&np->lock);
+ }
+ sx_sunlock(&nvmft_ports_lock);
+
+ if (!found) {
+ cn->status = CTL_NVMF_ASSOCIATION_NOT_FOUND;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "No matching associations found");
+ return;
+ }
+ cn->status = CTL_NVMF_OK;
+}
+
+static int
+nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int flag,
+ struct thread *td)
+{
+ struct ctl_nvmf *cn;
+ struct ctl_req *req;
+
+ switch (cmd) {
+ case CTL_PORT_REQ:
+ req = (struct ctl_req *)data;
+ switch (req->reqtype) {
+ case CTL_REQ_CREATE:
+ nvmft_port_create(req);
+ break;
+ case CTL_REQ_REMOVE:
+ nvmft_port_remove(req);
+ break;
+ default:
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Unsupported request type %d", req->reqtype);
+ break;
+ }
+ return (0);
+ case CTL_NVMF:
+ cn = (struct ctl_nvmf *)data;
+ switch (cn->type) {
+ case CTL_NVMF_HANDOFF:
+ nvmft_handoff(cn);
+ break;
+ case CTL_NVMF_LIST:
+ nvmft_list(cn);
+ break;
+ case CTL_NVMF_TERMINATE:
+ nvmft_terminate(cn);
+ break;
+ default:
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Invalid NVMeoF request type %d", cn->type);
+ break;
+ }
+ return (0);
+ default:
+ return (ENOTTY);
+ }
+}
+
+static int
+nvmft_shutdown(void)
+{
+ /* TODO: Need to check for active controllers. */
+ if (!TAILQ_EMPTY(&nvmft_ports))
+ return (EBUSY);
+
+ sx_destroy(&nvmft_ports_lock);
+ return (0);
+}
+
+CTL_FRONTEND_DECLARE(nvmft, nvmft_frontend);
+MODULE_DEPEND(nvmft, nvmf_transport, 1, 1, 1);
diff --git a/sys/dev/nvmf/controller/nvmft_controller.c b/sys/dev/nvmf/controller/nvmft_controller.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/controller/nvmft_controller.c
@@ -0,0 +1,1130 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/memdesc.h>
+#include <sys/mutex.h>
+#include <sys/sbuf.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/controller/nvmft_subr.h>
+#include <dev/nvmf/controller/nvmft_var.h>
+
+static void nvmft_controller_shutdown(void *arg, int pending);
+static void nvmft_controller_terminate(void *arg, int pending);
+
+int
+nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...)
+{
+ char buf[128];
+ struct sbuf sb;
+ va_list ap;
+ size_t retval;
+
+ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+ sbuf_set_drain(&sb, sbuf_printf_drain, &retval);
+
+ sbuf_printf(&sb, "nvmft%u: ", ctrlr->cntlid);
+
+ va_start(ap, fmt);
+ sbuf_vprintf(&sb, fmt, ap);
+ va_end(ap);
+
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+
+ return (retval);
+}
+
+static struct nvmft_controller *
+nvmft_controller_alloc(struct nvmft_port *np, uint16_t cntlid,
+ const struct nvmf_fabric_connect_data *data)
+{
+ struct nvmft_controller *ctrlr;
+
+ ctrlr = malloc(sizeof(*ctrlr), M_NVMFT, M_WAITOK | M_ZERO);
+ ctrlr->cntlid = cntlid;
+ nvmft_port_ref(np);
+ TAILQ_INSERT_TAIL(&np->controllers, ctrlr, link);
+ ctrlr->np = np;
+ mtx_init(&ctrlr->lock, "nvmft controller", NULL, MTX_DEF);
+ callout_init(&ctrlr->ka_timer, 1);
+ TASK_INIT(&ctrlr->shutdown_task, 0, nvmft_controller_shutdown, ctrlr);
+ TIMEOUT_TASK_INIT(taskqueue_thread, &ctrlr->terminate_task, 0,
+ nvmft_controller_terminate, ctrlr);
+
+ ctrlr->cdata = np->cdata;
+ ctrlr->cdata.ctrlr_id = htole16(cntlid);
+ memcpy(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid));
+ memcpy(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn));
+ ctrlr->hip.power_cycles[0] = 1;
+ ctrlr->create_time = sbinuptime();
+
+ ctrlr->changed_ns = malloc(sizeof(*ctrlr->changed_ns), M_NVMFT,
+ M_WAITOK | M_ZERO);
+
+ return (ctrlr);
+}
+
+static void
+nvmft_controller_free(struct nvmft_controller *ctrlr)
+{
+ mtx_destroy(&ctrlr->lock);
+ MPASS(ctrlr->io_qpairs == NULL);
+ free(ctrlr->changed_ns, M_NVMFT);
+ free(ctrlr, M_NVMFT);
+}
+
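+/*
+ * KeepAlive timer: any admin or I/O command received since the last
+ * expiration counts as traffic.  If the timer fires with no traffic
+ * observed, the association is terminated.
+ */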
+static void
+nvmft_keep_alive_timer(void *arg)
+{
+ struct nvmft_controller *ctrlr = arg;
+ int traffic;
+
+ if (ctrlr->shutdown)
+ return;
+
+ traffic = atomic_readandclear_int(&ctrlr->ka_active_traffic);
+ if (traffic == 0) {
+ nvmft_printf(ctrlr,
+ "disconnecting due to KeepAlive timeout\n");
+ nvmft_controller_error(ctrlr, NULL, ETIMEDOUT);
+ return;
+ }
+
+ callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, C_HARDCLOCK);
+}
+
+int
+nvmft_handoff_admin_queue(struct nvmft_port *np,
+ const struct nvmf_handoff_controller_qpair *handoff,
+ const struct nvmf_fabric_connect_cmd *cmd,
+ const struct nvmf_fabric_connect_data *data)
+{
+ struct nvmft_controller *ctrlr;
+ struct nvmft_qpair *qp;
+ uint32_t kato;
+ int cntlid;
+
+ if (cmd->qid != htole16(0))
+ return (EINVAL);
+
+ qp = nvmft_qpair_init(handoff->trtype, &handoff->params, 0,
+ "admin queue");
+
+ sx_xlock(&np->lock);
+ cntlid = alloc_unr(np->ids);
+ if (cntlid == -1) {
+ sx_xunlock(&np->lock);
+ printf("NVMFT: Unable to allocate controller for %.*s\n",
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_error(qp, cmd, NVME_SCT_COMMAND_SPECIFIC,
+ NVMF_FABRIC_SC_INVALID_HOST);
+ nvmft_qpair_destroy(qp);
+ return (ENOMEM);
+ }
+
+#ifdef INVARIANTS
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ KASSERT(ctrlr->cntlid != cntlid,
+ ("%s: duplicate controllers with id %d", __func__, cntlid));
+ }
+#endif
+
+ ctrlr = nvmft_controller_alloc(np, cntlid, data);
+ nvmft_printf(ctrlr, "associated with %.*s\n",
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ ctrlr->admin = qp;
+ ctrlr->trtype = handoff->trtype;
+
+ /*
+ * The spec requires a non-zero KeepAlive timer, but allow a
+ * zero KATO value to match Linux.
+ */
+ kato = le32toh(cmd->kato);
+ if (kato != 0) {
+ /*
+ * Round up to 1 second matching granularity
+ * advertised in cdata.
+ */
+ ctrlr->ka_sbt = mstosbt(roundup(kato, 1000));
+ callout_reset_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
+ nvmft_keep_alive_timer, ctrlr, C_HARDCLOCK);
+ }
+
+ nvmft_finish_accept(qp, cmd, ctrlr);
+ sx_xunlock(&np->lock);
+
+ return (0);
+}
+
+int
+nvmft_handoff_io_queue(struct nvmft_port *np,
+ const struct nvmf_handoff_controller_qpair *handoff,
+ const struct nvmf_fabric_connect_cmd *cmd,
+ const struct nvmf_fabric_connect_data *data)
+{
+ struct nvmft_controller *ctrlr;
+ struct nvmft_qpair *qp;
+ char name[16];
+ uint16_t cntlid, qid;
+
+ qid = le16toh(cmd->qid);
+ if (qid == 0)
+ return (EINVAL);
+ cntlid = le16toh(data->cntlid);
+
+ snprintf(name, sizeof(name), "I/O queue %u", qid);
+ qp = nvmft_qpair_init(handoff->trtype, &handoff->params, qid, name);
+
+ sx_slock(&np->lock);
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ if (ctrlr->cntlid == cntlid)
+ break;
+ }
+ if (ctrlr == NULL) {
+ sx_sunlock(&np->lock);
+ printf("NVMFT: Nonexistent controller %u for I/O queue %u from %.*s\n",
+ ctrlr->cntlid, qid, (int)sizeof(data->hostnqn),
+ data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, true,
+ offsetof(struct nvmf_fabric_connect_data, cntlid));
+ nvmft_qpair_destroy(qp);
+ return (ENOENT);
+ }
+
+ if (memcmp(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)) != 0) {
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "hostid mismatch for I/O queue %u from %.*s\n", qid,
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, true,
+ offsetof(struct nvmf_fabric_connect_data, hostid));
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+ if (memcmp(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)) != 0) {
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "hostnqn mismatch for I/O queue %u from %.*s\n", qid,
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, true,
+ offsetof(struct nvmf_fabric_connect_data, hostnqn));
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+
+ /* XXX: Require handoff->trtype == ctrlr->trtype? */
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->shutdown) {
+ mtx_unlock(&ctrlr->lock);
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "attempt to create I/O queue %u on disabled controller from %.*s\n",
+ qid, (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, true,
+ offsetof(struct nvmf_fabric_connect_data, cntlid));
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+ if (ctrlr->num_io_queues == 0) {
+ mtx_unlock(&ctrlr->lock);
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "attempt to create I/O queue %u without enabled queues from %.*s\n",
+ qid, (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+ if (qid > ctrlr->num_io_queues) {
+ mtx_unlock(&ctrlr->lock);
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "attempt to create invalid I/O queue %u from %.*s\n", qid,
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, false,
+ offsetof(struct nvmf_fabric_connect_cmd, qid));
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+ if (ctrlr->io_qpairs[qid - 1].qp != NULL) {
+ mtx_unlock(&ctrlr->lock);
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "attempt to re-create I/O queue %u from %.*s\n", qid,
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+
+ ctrlr->io_qpairs[qid - 1].qp = qp;
+ mtx_unlock(&ctrlr->lock);
+ nvmft_finish_accept(qp, cmd, ctrlr);
+ sx_sunlock(&np->lock);
+
+ return (0);
+}
+
+static void
+nvmft_controller_shutdown(void *arg, int pending)
+{
+ struct nvmft_controller *ctrlr = arg;
+
+ MPASS(pending == 1);
+
+ /*
+ * Shutdown all I/O queues to terminate pending datamoves and
+ * stop receiving new commands.
+ */
+ mtx_lock(&ctrlr->lock);
+ for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
+ if (ctrlr->io_qpairs[i].qp != NULL) {
+ ctrlr->io_qpairs[i].shutdown = true;
+ mtx_unlock(&ctrlr->lock);
+ nvmft_qpair_shutdown(ctrlr->io_qpairs[i].qp);
+ mtx_lock(&ctrlr->lock);
+ }
+ }
+ mtx_unlock(&ctrlr->lock);
+
+ /* Terminate active CTL commands. */
+ nvmft_terminate_commands(ctrlr);
+
+ /* Wait for all pending CTL commands to complete. */
+ mtx_lock(&ctrlr->lock);
+ while (ctrlr->pending_commands != 0)
+ mtx_sleep(&ctrlr->pending_commands, &ctrlr->lock, 0, "nvmftsh",
+ hz / 100);
+ mtx_unlock(&ctrlr->lock);
+
+ /* Delete all of the I/O queues. */
+ for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
+ if (ctrlr->io_qpairs[i].qp != NULL)
+ nvmft_qpair_destroy(ctrlr->io_qpairs[i].qp);
+ }
+ free(ctrlr->io_qpairs, M_NVMFT);
+ ctrlr->io_qpairs = NULL;
+
+ mtx_lock(&ctrlr->lock);
+ ctrlr->num_io_queues = 0;
+
+ /* Mark shutdown complete. */
+ if (NVMEV(NVME_CSTS_REG_SHST, ctrlr->csts) == NVME_SHST_OCCURRING) {
+ ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
+ ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE);
+ }
+
+ if (NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) == 0) {
+ ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_RDY);
+ ctrlr->shutdown = false;
+ }
+ mtx_unlock(&ctrlr->lock);
+
+ /*
+ * If the admin queue was closed while shutting down or a
+ * fatal controller error has occurred, terminate the
+ * association immediately, otherwise wait up to 2 minutes
+ * (NVMe-over-Fabrics 1.1 4.6).
+ */
+ if (ctrlr->admin_closed || NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) != 0)
+ nvmft_controller_terminate(ctrlr, 0);
+ else
+ taskqueue_enqueue_timeout(taskqueue_thread,
+ &ctrlr->terminate_task, hz * 60 * 2);
+}
+
+static void
+nvmft_controller_terminate(void *arg, int pending)
+{
+ struct nvmft_controller *ctrlr = arg;
+ struct nvmft_port *np;
+ bool wakeup_np;
+
+ /* If the controller has been re-enabled, nothing to do. */
+ mtx_lock(&ctrlr->lock);
+ if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) != 0) {
+ mtx_unlock(&ctrlr->lock);
+
+ if (ctrlr->ka_sbt != 0)
+ callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
+ C_HARDCLOCK);
+ return;
+ }
+
+ /* Disable updates to CC while destroying admin qpair. */
+ ctrlr->shutdown = true;
+ mtx_unlock(&ctrlr->lock);
+
+ nvmft_qpair_destroy(ctrlr->admin);
+
+ /* Remove association (CNTLID). */
+ np = ctrlr->np;
+ sx_xlock(&np->lock);
+ TAILQ_REMOVE(&np->controllers, ctrlr, link);
+ free_unr(np->ids, ctrlr->cntlid);
+ wakeup_np = (!np->online && TAILQ_EMPTY(&np->controllers));
+ sx_xunlock(&np->lock);
+ if (wakeup_np)
+ wakeup(np);
+
+ callout_drain(&ctrlr->ka_timer);
+
+ nvmft_printf(ctrlr, "association terminated\n");
+ nvmft_controller_free(ctrlr);
+ nvmft_port_rele(np);
+}
+
+void
+nvmft_controller_error(struct nvmft_controller *ctrlr, struct nvmft_qpair *qp,
+ int error)
+{
+ /*
+ * If a queue pair is closed, that isn't an error per se.
+ * That just means additional commands cannot be received on
+ * that queue pair.
+ *
+ * If the admin queue pair is closed while idle or while
+ * shutting down, terminate the association immediately.
+ *
+ * If an I/O queue pair is closed, just ignore it.
+ */
+ if (error == 0) {
+ if (qp != ctrlr->admin)
+ return;
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->shutdown) {
+ ctrlr->admin_closed = true;
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+
+ if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0) {
+ MPASS(ctrlr->num_io_queues == 0);
+ mtx_unlock(&ctrlr->lock);
+
+ /*
+ * Ok to drop lock here since ctrlr->cc can't
+ * change if the admin queue pair has closed.
+ * This also means no new queues can be handed
+ * off, etc. Note that since there are no I/O
+ * queues, only the admin queue needs to be
+ * destroyed, so it is safe to skip
+ * nvmft_controller_shutdown and just schedule
+ * nvmft_controller_terminate. Note that we
+ * cannot call nvmft_controller_terminate from
+ * here directly as this is called from the
+ * transport layer and freeing the admin qpair
+ * might deadlock waiting for the current
+ * thread to exit.
+ */
+ if (taskqueue_cancel_timeout(taskqueue_thread,
+ &ctrlr->terminate_task, NULL) == 0)
+ taskqueue_enqueue_timeout(taskqueue_thread,
+ &ctrlr->terminate_task, 0);
+ return;
+ }
+
+ /*
+ * Treat closing of the admin queue pair while enabled
+ * as a transport error. Note that the admin queue
+ * pair has been closed.
+ */
+ ctrlr->admin_closed = true;
+ } else
+ mtx_lock(&ctrlr->lock);
+
+ /* Ignore transport errors if we are already shutting down. */
+ if (ctrlr->shutdown) {
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+
+ ctrlr->csts |= NVMEF(NVME_CSTS_REG_CFS, 1);
+ ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
+ ctrlr->shutdown = true;
+ mtx_unlock(&ctrlr->lock);
+
+ callout_stop(&ctrlr->ka_timer);
+ taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
+}
+
+/* Wrapper around m_getm2 that also sets m_len in the mbufs in the chain. */
+static struct mbuf *
+m_getml(size_t len, int how)
+{
+ struct mbuf *m, *n;
+
+ m = m_getm2(NULL, len, how, MT_DATA, 0);
+ if (m == NULL)
+ return (NULL);
+ for (n = m; len > 0; n = n->m_next) {
+ n->m_len = M_SIZE(n);
+ if (n->m_len >= len) {
+ n->m_len = len;
+ MPASS(n->m_next == NULL);
+ }
+ len -= n->m_len;
+ }
+ return (m);
+}
+
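+/* Zero len bytes of an mbuf chain starting at the given offset. */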
+static void
+m_zero(struct mbuf *m, u_int offset, u_int len)
+{
+ u_int todo;
+
+ if (len == 0)
+ return;
+
+ while (m->m_len <= offset) {
+ offset -= m->m_len;
+ m = m->m_next;
+ }
+
+ todo = m->m_len - offset;
+ if (todo > len)
+ todo = len;
+ memset(mtodo(m, offset), 0, todo);
+ m = m->m_next;
+ len -= todo;
+
+ while (len > 0) {
+ todo = m->m_len;
+ if (todo > len)
+ todo = len;
+ memset(mtod(m, void *), 0, todo);
+ m = m->m_next;
+ len -= todo;
+ }
+}
+
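+/*
+ * GET LOG PAGE: the log page id, Retain Asynchronous Event bit, and
+ * 0's based dword count are decoded from CDW10/11 and the byte
+ * offset from CDW12/13.  Reads beyond the end of a log page are
+ * padded with zeroes.
+ */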
+static void
+handle_get_log_page(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc, const struct nvme_command *cmd)
+{
+ struct mbuf *m;
+ uint64_t offset;
+ uint32_t numd;
+ size_t len, todo;
+ u_int status;
+ uint8_t lid;
+ bool rae;
+
+ lid = le32toh(cmd->cdw10) & 0xff;
+ rae = (le32toh(cmd->cdw10) & (1U << 15)) != 0;
+ numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16;
+ offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32;
+
+ if (offset % 3 != 0) {
+ status = NVME_SC_INVALID_FIELD;
+ goto done;
+ }
+
+ len = (numd + 1) * 4;
+
+ switch (lid) {
+ case NVME_LOG_ERROR:
+ todo = 0;
+
+ m = m_getml(len, M_WAITOK);
+ if (todo != len)
+ m_zero(m, todo, len - todo);
+ status = nvmf_send_controller_data(nc, 0, m, len);
+ MPASS(status != NVMF_MORE);
+ break;
+ case NVME_LOG_HEALTH_INFORMATION:
+ {
+ struct nvme_health_information_page hip;
+
+ if (offset >= sizeof(hip)) {
+ status = NVME_SC_INVALID_FIELD;
+ goto done;
+ }
+ todo = sizeof(hip) - offset;
+ if (todo > len)
+ todo = len;
+
+ mtx_lock(&ctrlr->lock);
+ hip = ctrlr->hip;
+ hip.controller_busy_time[0] =
+ sbintime_getsec(ctrlr->busy_total) / 60;
+ hip.power_on_hours[0] =
+ sbintime_getsec(sbinuptime() - ctrlr->create_time) / 3600;
+ mtx_unlock(&ctrlr->lock);
+
+ m = m_getml(len, M_WAITOK);
+ m_copyback(m, 0, todo, (char *)&hip + offset);
+ if (todo != len)
+ m_zero(m, todo, len - todo);
+ status = nvmf_send_controller_data(nc, 0, m, len);
+ MPASS(status != NVMF_MORE);
+ break;
+ }
+ case NVME_LOG_FIRMWARE_SLOT:
+ if (offset >= sizeof(ctrlr->np->fp)) {
+ status = NVME_SC_INVALID_FIELD;
+ goto done;
+ }
+ todo = sizeof(ctrlr->np->fp) - offset;
+ if (todo > len)
+ todo = len;
+
+ m = m_getml(len, M_WAITOK);
+ m_copyback(m, 0, todo, (char *)&ctrlr->np->fp + offset);
+ if (todo != len)
+ m_zero(m, todo, len - todo);
+ status = nvmf_send_controller_data(nc, 0, m, len);
+ MPASS(status != NVMF_MORE);
+ break;
+ case NVME_LOG_CHANGED_NAMESPACE:
+ if (offset >= sizeof(*ctrlr->changed_ns)) {
+ status = NVME_SC_INVALID_FIELD;
+ goto done;
+ }
+ todo = sizeof(*ctrlr->changed_ns) - offset;
+ if (todo > len)
+ todo = len;
+
+ m = m_getml(len, M_WAITOK);
+ mtx_lock(&ctrlr->lock);
+ m_copyback(m, 0, todo, (char *)ctrlr->changed_ns + offset);
+ if (offset == 0 && len == sizeof(*ctrlr->changed_ns))
+ memset(ctrlr->changed_ns, 0,
+ sizeof(*ctrlr->changed_ns));
+ if (!rae)
+ ctrlr->changed_ns_reported = false;
+ mtx_unlock(&ctrlr->lock);
+ if (todo != len)
+ m_zero(m, todo, len - todo);
+ status = nvmf_send_controller_data(nc, 0, m, len);
+ MPASS(status != NVMF_MORE);
+ break;
+ default:
+ nvmft_printf(ctrlr, "Unsupported page %#x for GET_LOG_PAGE\n",
+ lid);
+ status = NVME_SC_INVALID_FIELD;
+ break;
+ }
+
+done:
+ if (status == NVMF_SUCCESS_SENT)
+ nvmft_command_completed(ctrlr->admin, nc);
+ else
+ nvmft_send_generic_error(ctrlr->admin, nc, status);
+ nvmf_free_capsule(nc);
+}
+
+static void
+m_free_nslist(struct mbuf *m)
+{
+ free(m->m_ext.ext_arg1, M_NVMFT);
+}
+
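+/*
+ * IDENTIFY: controller data and the active namespace list are
+ * answered locally; namespace-specific CNS values are dispatched to
+ * CTL, which owns the backing LUNs.
+ */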
+static void
+handle_identify_command(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc, const struct nvme_command *cmd)
+{
+ struct mbuf *m;
+ size_t data_len;
+ u_int status;
+ uint8_t cns;
+
+ cns = le32toh(cmd->cdw10) & 0xFF;
+ data_len = nvmf_capsule_data_len(nc);
+ if (data_len != sizeof(ctrlr->cdata)) {
+ nvmft_printf(ctrlr,
+ "Invalid length %zu for IDENTIFY with CNS %#x\n", data_len,
+ cns);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_INVALID_OPCODE);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ switch (cns) {
+ case 0: /* Namespace data. */
+ case 3: /* Namespace Identification Descriptor list. */
+ nvmft_dispatch_command(ctrlr->admin, nc, true);
+ return;
+ case 1:
+ /* Controller data. */
+ m = m_getml(sizeof(ctrlr->cdata), M_WAITOK);
+ m_copyback(m, 0, sizeof(ctrlr->cdata), (void *)&ctrlr->cdata);
+ status = nvmf_send_controller_data(nc, 0, m,
+ sizeof(ctrlr->cdata));
+ MPASS(status != NVMF_MORE);
+ break;
+ case 2:
+ {
+ /* Active namespace list. */
+ struct nvme_ns_list *nslist;
+ uint32_t nsid;
+
+ nsid = le32toh(cmd->nsid);
+ if (nsid >= 0xfffffffe) {
+ status = NVME_SC_INVALID_FIELD;
+ break;
+ }
+
+ nslist = malloc(sizeof(*nslist), M_NVMFT, M_WAITOK | M_ZERO);
+ nvmft_populate_active_nslist(ctrlr->np, nsid, nslist);
+ m = m_get(M_WAITOK, MT_DATA);
+ m_extadd(m, (void *)nslist, sizeof(*nslist), m_free_nslist,
+ nslist, NULL, 0, EXT_CTL);
+ m->m_len = sizeof(*nslist);
+ status = nvmf_send_controller_data(nc, 0, m, m->m_len);
+ MPASS(status != NVMF_MORE);
+ break;
+ }
+ default:
+ nvmft_printf(ctrlr, "Unsupported CNS %#x for IDENTIFY\n", cns);
+ status = NVME_SC_INVALID_FIELD;
+ break;
+ }
+
+ if (status == NVMF_SUCCESS_SENT)
+ nvmft_command_completed(ctrlr->admin, nc);
+ else
+ nvmft_send_generic_error(ctrlr->admin, nc, status);
+ nvmf_free_capsule(nc);
+}
+
+static void
+handle_set_features(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc, const struct nvme_command *cmd)
+{
+ struct nvme_completion cqe;
+ uint8_t fid;
+
+ fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10));
+ switch (fid) {
+ case NVME_FEAT_NUMBER_OF_QUEUES:
+ {
+ uint32_t num_queues;
+ struct nvmft_io_qpair *io_qpairs;
+
+ num_queues = le32toh(cmd->cdw11) & 0xffff;
+
+ /* 5.12.1.7: 65535 is invalid. */
+ if (num_queues == 65535)
+ goto error;
+
+ /* Fabrics requires the same number of SQs and CQs. */
+ if (le32toh(cmd->cdw11) >> 16 != num_queues)
+ goto error;
+
+ /* Convert to 1's based */
+ num_queues++;
+
+ io_qpairs = mallocarray(num_queues, sizeof(*io_qpairs),
+ M_NVMFT, M_WAITOK | M_ZERO);
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->num_io_queues != 0) {
+ mtx_unlock(&ctrlr->lock);
+ free(io_qpairs, M_NVMFT);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ ctrlr->num_io_queues = num_queues;
+ ctrlr->io_qpairs = io_qpairs;
+ mtx_unlock(&ctrlr->lock);
+
+ nvmft_init_cqe(&cqe, nc, 0);
+ cqe.cdw0 = cmd->cdw11;
+ nvmft_send_response(ctrlr->admin, &cqe);
+ nvmf_free_capsule(nc);
+ return;
+ }
+ case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
+ {
+ uint32_t aer_mask;
+
+ aer_mask = le32toh(cmd->cdw11);
+
+ /* Check for any reserved or unimplemented feature bits. */
+ if ((aer_mask & 0xffffc000) != 0)
+ goto error;
+
+ mtx_lock(&ctrlr->lock);
+ ctrlr->aer_mask = aer_mask;
+ mtx_unlock(&ctrlr->lock);
+ nvmft_send_success(ctrlr->admin, nc);
+ return;
+ }
+ default:
+ nvmft_printf(ctrlr,
+ "Unsupported feature ID %u for SET_FEATURES\n", fid);
+ goto error;
+ }
+
+error:
+ nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
+ nvmf_free_capsule(nc);
+}
+
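+/*
+ * Validate and apply a host write to the CC register, detecting
+ * enable, reset, and shutdown transitions.  Returns false if the
+ * write is rejected; *need_shutdown is set when the caller must
+ * schedule the controller shutdown task.
+ */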
+static bool
+update_cc(struct nvmft_controller *ctrlr, uint32_t new_cc, bool *need_shutdown)
+{
+ struct nvmft_port *np = ctrlr->np;
+ uint32_t changes;
+
+ *need_shutdown = false;
+
+ mtx_lock(&ctrlr->lock);
+
+ /* Don't allow any changes while shutting down. */
+ if (ctrlr->shutdown) {
+ mtx_unlock(&ctrlr->lock);
+ return (false);
+ }
+
+ if (!_nvmf_validate_cc(np->max_io_qsize, np->cap, ctrlr->cc, new_cc)) {
+ mtx_unlock(&ctrlr->lock);
+ return (false);
+ }
+
+ changes = ctrlr->cc ^ new_cc;
+ ctrlr->cc = new_cc;
+
+ /* Handle shutdown requests. */
+ if (NVMEV(NVME_CC_REG_SHN, changes) != 0 &&
+ NVMEV(NVME_CC_REG_SHN, new_cc) != 0) {
+ ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
+ ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_OCCURRING);
+ ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
+ ctrlr->shutdown = true;
+ *need_shutdown = true;
+ nvmft_printf(ctrlr, "shutdown requested\n");
+ }
+
+ if (NVMEV(NVME_CC_REG_EN, changes) != 0) {
+ if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) {
+ /* Controller reset. */
+ nvmft_printf(ctrlr, "reset requested\n");
+ ctrlr->shutdown = true;
+ *need_shutdown = true;
+ } else
+ ctrlr->csts |= NVMEF(NVME_CSTS_REG_RDY, 1);
+ }
+ mtx_unlock(&ctrlr->lock);
+
+ return (true);
+}
+
+static void
+handle_property_get(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
+ const struct nvmf_fabric_prop_get_cmd *pget)
+{
+ struct nvmf_fabric_prop_get_rsp rsp;
+
+ nvmft_init_cqe(&rsp, nc, 0);
+
+ switch (le32toh(pget->ofst)) {
+ case NVMF_PROP_CAP:
+ if (pget->attrib.size != NVMF_PROP_SIZE_8)
+ goto error;
+ rsp.value.u64 = htole64(ctrlr->np->cap);
+ break;
+ case NVMF_PROP_VS:
+ if (pget->attrib.size != NVMF_PROP_SIZE_4)
+ goto error;
+ rsp.value.u32.low = ctrlr->cdata.ver;
+ break;
+ case NVMF_PROP_CC:
+ if (pget->attrib.size != NVMF_PROP_SIZE_4)
+ goto error;
+ rsp.value.u32.low = htole32(ctrlr->cc);
+ break;
+ case NVMF_PROP_CSTS:
+ if (pget->attrib.size != NVMF_PROP_SIZE_4)
+ goto error;
+ rsp.value.u32.low = htole32(ctrlr->csts);
+ break;
+ default:
+ goto error;
+ }
+
+ nvmft_send_response(ctrlr->admin, &rsp);
+ return;
+error:
+ nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
+}
+
+static void
+handle_property_set(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
+ const struct nvmf_fabric_prop_set_cmd *pset)
+{
+ bool need_shutdown;
+
+ need_shutdown = false;
+ switch (le32toh(pset->ofst)) {
+ case NVMF_PROP_CC:
+ if (pset->attrib.size != NVMF_PROP_SIZE_4)
+ goto error;
+ if (!update_cc(ctrlr, le32toh(pset->value.u32.low),
+ &need_shutdown))
+ goto error;
+ break;
+ default:
+ goto error;
+ }
+
+ nvmft_send_success(ctrlr->admin, nc);
+ if (need_shutdown) {
+ callout_stop(&ctrlr->ka_timer);
+ taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
+ }
+ return;
+error:
+ nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
+}
+
+static void
+handle_admin_fabrics_command(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc)
+{
+ switch (fc->fctype) {
+ case NVMF_FABRIC_COMMAND_PROPERTY_GET:
+ handle_property_get(ctrlr, nc,
+ (const struct nvmf_fabric_prop_get_cmd *)fc);
+ break;
+ case NVMF_FABRIC_COMMAND_PROPERTY_SET:
+ handle_property_set(ctrlr, nc,
+ (const struct nvmf_fabric_prop_set_cmd *)fc);
+ break;
+ case NVMF_FABRIC_COMMAND_CONNECT:
+ nvmft_printf(ctrlr,
+ "CONNECT command on connected admin queue\n");
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ break;
+ case NVMF_FABRIC_COMMAND_DISCONNECT:
+ nvmft_printf(ctrlr, "DISCONNECT command on admin queue\n");
+ nvmft_send_error(ctrlr->admin, nc, NVME_SCT_COMMAND_SPECIFIC,
+ NVMF_FABRIC_SC_INVALID_QUEUE_TYPE);
+ break;
+ default:
+ nvmft_printf(ctrlr, "Unsupported fabrics command %#x\n",
+ fc->fctype);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_INVALID_OPCODE);
+ break;
+ }
+ nvmf_free_capsule(nc);
+}
+
+void
+nvmft_handle_admin_command(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc)
+{
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+ /* Only permit Fabrics commands while a controller is disabled. */
+ if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0 &&
+ cmd->opc != NVME_OPC_FABRICS_COMMANDS) {
+ nvmft_printf(ctrlr,
+ "Unsupported admin opcode %#x whiled disabled\n", cmd->opc);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ atomic_store_int(&ctrlr->ka_active_traffic, 1);
+
+ switch (cmd->opc) {
+ case NVME_OPC_GET_LOG_PAGE:
+ handle_get_log_page(ctrlr, nc, cmd);
+ break;
+ case NVME_OPC_IDENTIFY:
+ handle_identify_command(ctrlr, nc, cmd);
+ break;
+ case NVME_OPC_SET_FEATURES:
+ handle_set_features(ctrlr, nc, cmd);
+ break;
+ case NVME_OPC_ASYNC_EVENT_REQUEST:
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->aer_pending == NVMFT_NUM_AER) {
+ mtx_unlock(&ctrlr->lock);
+ nvmft_send_error(ctrlr->admin, nc,
+ NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
+ } else {
+ /* NB: Store the CID without byte-swapping. */
+ ctrlr->aer_cids[ctrlr->aer_pidx] = cmd->cid;
+ ctrlr->aer_pending++;
+ ctrlr->aer_pidx = (ctrlr->aer_pidx + 1) % NVMFT_NUM_AER;
+ mtx_unlock(&ctrlr->lock);
+ }
+ nvmf_free_capsule(nc);
+ break;
+ case NVME_OPC_KEEP_ALIVE:
+ nvmft_send_success(ctrlr->admin, nc);
+ nvmf_free_capsule(nc);
+ break;
+ case NVME_OPC_FABRICS_COMMANDS:
+ handle_admin_fabrics_command(ctrlr, nc,
+ (const struct nvmf_fabric_cmd *)cmd);
+ break;
+ default:
+ nvmft_printf(ctrlr, "Unsupported admin opcode %#x\n", cmd->opc);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_INVALID_OPCODE);
+ nvmf_free_capsule(nc);
+ break;
+ }
+}
+
+void
+nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid,
+ struct nvmf_capsule *nc)
+{
+ struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+ atomic_store_int(&ctrlr->ka_active_traffic, 1);
+
+ switch (cmd->opc) {
+ case NVME_OPC_FLUSH:
+ if (cmd->nsid == htole32(0xffffffff)) {
+ nvmft_send_generic_error(qp, nc,
+ NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+ nvmf_free_capsule(nc);
+ break;
+ }
+ /* FALLTHROUGH */
+ case NVME_OPC_WRITE:
+ case NVME_OPC_READ:
+ case NVME_OPC_WRITE_UNCORRECTABLE:
+ case NVME_OPC_COMPARE:
+ case NVME_OPC_WRITE_ZEROES:
+ case NVME_OPC_DATASET_MANAGEMENT:
+ case NVME_OPC_VERIFY:
+ nvmft_dispatch_command(qp, nc, false);
+ break;
+ default:
+ nvmft_printf(ctrlr, "Unsupported I/O opcode %#x\n", cmd->opc);
+ nvmft_send_generic_error(qp, nc,
+ NVME_SC_INVALID_OPCODE);
+ nvmf_free_capsule(nc);
+ break;
+ }
+}
+
+static void
+nvmft_report_aer(struct nvmft_controller *ctrlr, uint32_t aer_mask,
+ u_int type, uint8_t info, uint8_t log_page_id)
+{
+ struct nvme_completion cpl;
+
+ MPASS(type <= 7);
+
+ /* Drop events that are not enabled. */
+ mtx_lock(&ctrlr->lock);
+ if ((ctrlr->aer_mask & aer_mask) == 0) {
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+
+ /*
+ * If there is no pending AER command, drop it.
+ * XXX: Should we queue these?
+ */
+ if (ctrlr->aer_pending == 0) {
+ mtx_unlock(&ctrlr->lock);
+ nvmft_printf(ctrlr,
+ "dropping AER type %u, info %#x, page %#x\n",
+ type, info, log_page_id);
+ return;
+ }
+
+ memset(&cpl, 0, sizeof(cpl));
+ cpl.cid = ctrlr->aer_cids[ctrlr->aer_cidx];
+ ctrlr->aer_pending--;
+ ctrlr->aer_cidx = (ctrlr->aer_cidx + 1) % NVMFT_NUM_AER;
+ mtx_unlock(&ctrlr->lock);
+
+ cpl.cdw0 = htole32(NVMEF(NVME_ASYNC_EVENT_TYPE, type) |
+ NVMEF(NVME_ASYNC_EVENT_INFO, info) |
+ NVMEF(NVME_ASYNC_EVENT_LOG_PAGE_ID, log_page_id));
+
+ nvmft_send_response(ctrlr->admin, &cpl);
+}
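
The AER bookkeeping above is a small producer/consumer ring: incoming Asynchronous Event Request commands park their CIDs at aer_pidx, and each reported event consumes one parked CID at aer_cidx; events arriving with nothing parked are dropped. A minimal user-space model of that ring, with illustrative names (NUM_AER mirrors NVMFT_NUM_AER; the kernel additionally serializes access with ctrlr->lock):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NUM_AER 16

    static uint16_t aer_cids[NUM_AER];
    static unsigned aer_pending, aer_cidx, aer_pidx;

    /* Park the CID of a host AER command; false if the limit is hit. */
    static bool
    aer_post(uint16_t cid)
    {
        if (aer_pending == NUM_AER)
            return (false);     /* AER limit exceeded */
        aer_cids[aer_pidx] = cid;
        aer_pidx = (aer_pidx + 1) % NUM_AER;
        aer_pending++;
        return (true);
    }

    /* Consume one parked CID to complete an async event. */
    static bool
    aer_report(uint16_t *cidp)
    {
        if (aer_pending == 0)
            return (false);     /* nothing parked; event is dropped */
        *cidp = aer_cids[aer_cidx];
        aer_cidx = (aer_cidx + 1) % NUM_AER;
        aer_pending--;
        return (true);
    }

    int
    main(void)
    {
        uint16_t cid;

        aer_post(7);
        if (aer_report(&cid))
            printf("completing AER with CID %u\n", cid);
        return (0);
    }
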
+
+void
+nvmft_controller_lun_changed(struct nvmft_controller *ctrlr, int lun_id)
+{
+ struct nvme_ns_list *nslist;
+ uint32_t new_nsid, nsid;
+ u_int i;
+
+ new_nsid = lun_id + 1;
+
+ mtx_lock(&ctrlr->lock);
+ nslist = ctrlr->changed_ns;
+
+ /* If the first entry is 0xffffffff, the list is already full. */
+ if (nslist->ns[0] != 0xffffffff) {
+ /* Find the insertion point for this namespace ID. */
+ for (i = 0; i < nitems(nslist->ns); i++) {
+ nsid = le32toh(nslist->ns[i]);
+ if (nsid == new_nsid) {
+ /* Already reported, nothing to do. */
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+
+ if (nsid == 0 || nsid > new_nsid)
+ break;
+ }
+
+ if (nslist->ns[nitems(nslist->ns) - 1] != htole32(0)) {
+ /* List is full. */
+ memset(ctrlr->changed_ns, 0,
+ sizeof(*ctrlr->changed_ns));
+ ctrlr->changed_ns->ns[0] = 0xffffffff;
+ } else if (nslist->ns[i] == htole32(0)) {
+ /*
+ * Optimize case where this ID is appended to
+ * the end.
+ */
+ nslist->ns[i] = htole32(new_nsid);
+ } else {
+ memmove(&nslist->ns[i + 1], &nslist->ns[i],
+ (nitems(nslist->ns) - i - 1) *
+ sizeof(nslist->ns[0]));
+ nslist->ns[i] = htole32(new_nsid);
+ }
+ }
+
+ if (ctrlr->changed_ns_reported) {
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+ ctrlr->changed_ns_reported = true;
+ mtx_unlock(&ctrlr->lock);
+
+ nvmft_report_aer(ctrlr, NVME_ASYNC_EVENT_NS_ATTRIBUTE, 0x2, 0x0,
+ NVME_LOG_CHANGED_NAMESPACE);
+}
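
The changed-namespace list maintained above is a fixed array of ascending, non-zero NSIDs terminated by zero entries; on overflow the whole list collapses to the single sentinel 0xffffffff, which tells the host to rescan all namespaces. A portable sketch of the same insertion logic (NSLIST_LEN is illustrative, the real struct nvme_ns_list holds 1024 entries, and the kernel code additionally stores entries little-endian):

    #include <stdint.h>
    #include <string.h>

    #define NSLIST_LEN 8

    static void
    nslist_add(uint32_t ns[NSLIST_LEN], uint32_t new_nsid)
    {
        unsigned i;

        if (ns[0] == 0xffffffff)    /* already marked as overflowed */
            return;

        /* Find the insertion point, keeping the list sorted. */
        for (i = 0; i < NSLIST_LEN; i++) {
            if (ns[i] == new_nsid)
                return;             /* already recorded */
            if (ns[i] == 0 || ns[i] > new_nsid)
                break;
        }

        if (ns[NSLIST_LEN - 1] != 0) {
            /* No free slot: collapse to the overflow sentinel. */
            memset(ns, 0, NSLIST_LEN * sizeof(ns[0]));
            ns[0] = 0xffffffff;
        } else if (ns[i] == 0) {
            ns[i] = new_nsid;       /* append at the tail */
        } else {
            /* Shift the tail up and insert in sorted position. */
            memmove(&ns[i + 1], &ns[i],
                (NSLIST_LEN - i - 1) * sizeof(ns[0]));
            ns[i] = new_nsid;
        }
    }
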
diff --git a/sys/dev/nvmf/controller/nvmft_qpair.c b/sys/dev/nvmf/controller/nvmft_qpair.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/controller/nvmft_qpair.c
@@ -0,0 +1,361 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/_bitset.h>
+#include <sys/bitset.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/controller/nvmft_var.h>
+
+/*
+ * A bitmask of command ID values. This is used to detect duplicate
+ * commands with the same ID.
+ */
+#define NUM_CIDS (UINT16_MAX + 1)
+BITSET_DEFINE(cidset, NUM_CIDS);
+
+struct nvmft_qpair {
+ struct nvmft_controller *ctrlr;
+ struct nvmf_qpair *qp;
+ struct cidset *cids;
+
+ bool admin;
+ bool sq_flow_control;
+ uint16_t qid;
+ u_int qsize;
+ uint16_t sqhd;
+ uint16_t sqtail;
+ volatile u_int qp_refs; /* Internal references on 'qp'. */
+
+ struct mtx lock;
+
+ char name[16];
+};
+
+static int _nvmft_send_generic_error(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc, uint8_t sc_status);
+
+static void
+nvmft_qpair_error(void *arg, int error)
+{
+ struct nvmft_qpair *qp = arg;
+ struct nvmft_controller *ctrlr = qp->ctrlr;
+
+ /*
+ * XXX: The Linux TCP initiator sends a RST immediately after
+ * the FIN, so treat ECONNRESET as plain EOF to avoid spurious
+ * errors on shutdown.
+ */
+ if (error == ECONNRESET)
+ error = 0;
+
+ if (error != 0)
+ nvmft_printf(ctrlr, "error %d on %s\n", error, qp->name);
+ nvmft_controller_error(ctrlr, qp, error);
+}
+
+static void
+nvmft_receive_capsule(void *arg, struct nvmf_capsule *nc)
+{
+ struct nvmft_qpair *qp = arg;
+ struct nvmft_controller *ctrlr = qp->ctrlr;
+ const struct nvme_command *cmd;
+ uint8_t sc_status;
+
+ cmd = nvmf_capsule_sqe(nc);
+ if (ctrlr == NULL) {
+ printf("NVMFT: %s received CID %u opcode %u on newborn queue\n",
+ qp->name, le16toh(cmd->cid), cmd->opc);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ sc_status = nvmf_validate_command_capsule(nc);
+ if (sc_status != NVME_SC_SUCCESS) {
+ _nvmft_send_generic_error(qp, nc, sc_status);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ /* Don't bother byte-swapping CID. */
+ if (BIT_TEST_SET_ATOMIC(NUM_CIDS, cmd->cid, qp->cids)) {
+ _nvmft_send_generic_error(qp, nc, NVME_SC_COMMAND_ID_CONFLICT);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ if (qp->admin)
+ nvmft_handle_admin_command(ctrlr, nc);
+ else
+ nvmft_handle_io_command(qp, qp->qid, nc);
+}
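
The duplicate-command check above relies on a 65536-bit bitmap, one bit per possible 16-bit CID, set atomically when a command arrives and cleared when it completes. A C11 sketch of the same test-and-set, standing in for FreeBSD's BIT_TEST_SET_ATOMIC()/BIT_CLR_ATOMIC():

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define NUM_CIDS (UINT16_MAX + 1)

    static _Atomic uint64_t cid_bits[NUM_CIDS / 64];

    /* Returns true if the CID was already busy (duplicate command). */
    static bool
    cid_test_and_set(uint16_t cid)
    {
        uint64_t mask = 1ull << (cid % 64);
        uint64_t old;

        old = atomic_fetch_or(&cid_bits[cid / 64], mask);
        return ((old & mask) != 0);
    }

    /* Mark the CID free again once its completion has been sent. */
    static void
    cid_clear(uint16_t cid)
    {
        atomic_fetch_and(&cid_bits[cid / 64], ~(1ull << (cid % 64)));
    }
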
+
+struct nvmft_qpair *
+nvmft_qpair_init(enum nvmf_trtype trtype,
+ const struct nvmf_handoff_qpair_params *handoff, uint16_t qid,
+ const char *name)
+{
+ struct nvmft_qpair *qp;
+
+ qp = malloc(sizeof(*qp), M_NVMFT, M_WAITOK | M_ZERO);
+ qp->admin = handoff->admin;
+ qp->sq_flow_control = handoff->sq_flow_control;
+ qp->qsize = handoff->qsize;
+ qp->qid = qid;
+ qp->sqhd = handoff->sqhd;
+ qp->sqtail = handoff->sqtail;
+ strlcpy(qp->name, name, sizeof(qp->name));
+ mtx_init(&qp->lock, "nvmft qp", NULL, MTX_DEF);
+ qp->cids = BITSET_ALLOC(NUM_CIDS, M_NVMFT, M_WAITOK | M_ZERO);
+
+ qp->qp = nvmf_allocate_qpair(trtype, true, handoff, nvmft_qpair_error,
+ qp, nvmft_receive_capsule, qp);
+ if (qp->qp == NULL) {
+ mtx_destroy(&qp->lock);
+ free(qp->cids, M_NVMFT);
+ free(qp, M_NVMFT);
+ return (NULL);
+ }
+
+ refcount_init(&qp->qp_refs, 1);
+ return (qp);
+}
+
+void
+nvmft_qpair_shutdown(struct nvmft_qpair *qp)
+{
+ struct nvmf_qpair *nq;
+
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ qp->qp = NULL;
+ mtx_unlock(&qp->lock);
+ if (nq != NULL && refcount_release(&qp->qp_refs))
+ nvmf_free_qpair(nq);
+}
+
+void
+nvmft_qpair_destroy(struct nvmft_qpair *qp)
+{
+ nvmft_qpair_shutdown(qp);
+ mtx_destroy(&qp->lock);
+ free(qp->cids, M_NVMFT);
+ free(qp, M_NVMFT);
+}
+
+struct nvmft_controller *
+nvmft_qpair_ctrlr(struct nvmft_qpair *qp)
+{
+ return (qp->ctrlr);
+}
+
+uint16_t
+nvmft_qpair_id(struct nvmft_qpair *qp)
+{
+ return (qp->qid);
+}
+
+const char *
+nvmft_qpair_name(struct nvmft_qpair *qp)
+{
+ return (qp->name);
+}
+
+static int
+_nvmft_send_response(struct nvmft_qpair *qp, const void *cqe)
+{
+ struct nvme_completion cpl;
+ struct nvmf_qpair *nq;
+ struct nvmf_capsule *rc;
+ int error;
+
+ memcpy(&cpl, cqe, sizeof(cpl));
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ if (nq == NULL) {
+ mtx_unlock(&qp->lock);
+ return (ENOTCONN);
+ }
+ refcount_acquire(&qp->qp_refs);
+
+ /* Set SQHD. */
+ if (qp->sq_flow_control) {
+ qp->sqhd = (qp->sqhd + 1) % qp->qsize;
+ cpl.sqhd = htole16(qp->sqhd);
+ } else
+ cpl.sqhd = 0;
+ mtx_unlock(&qp->lock);
+
+ rc = nvmf_allocate_response(nq, &cpl, M_WAITOK);
+ error = nvmf_transmit_capsule(rc);
+ nvmf_free_capsule(rc);
+
+ if (refcount_release(&qp->qp_refs))
+ nvmf_free_qpair(nq);
+ return (error);
+}
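
When SQ flow control is negotiated, each completion carries an updated submission queue head so the host can retire SQ slots; the head simply advances by one per completion, modulo the queue size, as above. A tiny model of that arithmetic (names illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    struct sq_state {
        bool     flow_control;
        uint16_t sqhd;
        uint16_t qsize;
    };

    /* Advance and return SQHD for one completion; 0 if no flow control. */
    static uint16_t
    next_sqhd(struct sq_state *sq)
    {
        if (!sq->flow_control)
            return (0);
        sq->sqhd = (sq->sqhd + 1) % sq->qsize;
        return (sq->sqhd);
    }
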
+
+void
+nvmft_command_completed(struct nvmft_qpair *qp, struct nvmf_capsule *nc)
+{
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+ /* Don't bother byte-swapping CID. */
+ KASSERT(BIT_ISSET(NUM_CIDS, cmd->cid, qp->cids),
+ ("%s: CID %u not busy", __func__, cmd->cid));
+
+ BIT_CLR_ATOMIC(NUM_CIDS, cmd->cid, qp->cids);
+}
+
+int
+nvmft_send_response(struct nvmft_qpair *qp, const void *cqe)
+{
+ const struct nvme_completion *cpl = cqe;
+
+ /* Don't bother byte-swapping CID. */
+ KASSERT(BIT_ISSET(NUM_CIDS, cpl->cid, qp->cids),
+ ("%s: CID %u not busy", __func__, cpl->cid));
+
+ BIT_CLR_ATOMIC(NUM_CIDS, cpl->cid, qp->cids);
+ return (_nvmft_send_response(qp, cqe));
+}
+
+void
+nvmft_init_cqe(void *cqe, struct nvmf_capsule *nc, uint16_t status)
+{
+ struct nvme_completion *cpl = cqe;
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+ memset(cpl, 0, sizeof(*cpl));
+ cpl->cid = cmd->cid;
+ cpl->status = htole16(status);
+}
+
+int
+nvmft_send_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ uint8_t sc_type, uint8_t sc_status)
+{
+ struct nvme_completion cpl;
+ uint16_t status;
+
+ status = NVMEF(NVME_STATUS_SCT, sc_type) |
+ NVMEF(NVME_STATUS_SC, sc_status);
+ nvmft_init_cqe(&cpl, nc, status);
+ return (nvmft_send_response(qp, &cpl));
+}
+
+int
+nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ uint8_t sc_status)
+{
+ return (nvmft_send_error(qp, nc, NVME_SCT_GENERIC, sc_status));
+}
+
+/*
+ * This version doesn't clear CID in qp->cids and is used for errors
+ * before the CID is validated.
+ */
+static int
+_nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ uint8_t sc_status)
+{
+ struct nvme_completion cpl;
+ uint16_t status;
+
+ status = NVMEF(NVME_STATUS_SCT, NVME_SCT_GENERIC) |
+ NVMEF(NVME_STATUS_SC, sc_status);
+ nvmft_init_cqe(&cpl, nc, status);
+ return (_nvmft_send_response(qp, &cpl));
+}
+
+int
+nvmft_send_success(struct nvmft_qpair *qp, struct nvmf_capsule *nc)
+{
+ return (nvmft_send_generic_error(qp, nc, NVME_SC_SUCCESS));
+}
+
+static void
+nvmft_init_connect_rsp(struct nvmf_fabric_connect_rsp *rsp,
+ const struct nvmf_fabric_connect_cmd *cmd, uint16_t status)
+{
+ memset(rsp, 0, sizeof(*rsp));
+ rsp->cid = cmd->cid;
+ rsp->status = htole16(status);
+}
+
+static int
+nvmft_send_connect_response(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_rsp *rsp)
+{
+ struct nvmf_capsule *rc;
+ struct nvmf_qpair *nq;
+ int error;
+
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ if (nq == NULL) {
+ mtx_unlock(&qp->lock);
+ return (ENOTCONN);
+ }
+ refcount_acquire(&qp->qp_refs);
+ mtx_unlock(&qp->lock);
+
+ rc = nvmf_allocate_response(nq, rsp, M_WAITOK);
+ error = nvmf_transmit_capsule(rc);
+ nvmf_free_capsule(rc);
+
+ if (refcount_release(&qp->qp_refs))
+ nvmf_free_qpair(nq);
+ return (error);
+}
+
+void
+nvmft_connect_error(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, uint8_t sc_type,
+ uint8_t sc_status)
+{
+ struct nvmf_fabric_connect_rsp rsp;
+ uint16_t status;
+
+ status = NVMEF(NVME_STATUS_SCT, sc_type) |
+ NVMEF(NVME_STATUS_SC, sc_status);
+ nvmft_init_connect_rsp(&rsp, cmd, status);
+ nvmft_send_connect_response(qp, &rsp);
+}
+
+void
+nvmft_connect_invalid_parameters(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, bool data, uint16_t offset)
+{
+ struct nvmf_fabric_connect_rsp rsp;
+
+ nvmft_init_connect_rsp(&rsp, cmd,
+ NVMEF(NVME_STATUS_SCT, NVME_SCT_COMMAND_SPECIFIC) |
+ NVMEF(NVME_STATUS_SC, NVMF_FABRIC_SC_INVALID_PARAM));
+ rsp.status_code_specific.invalid.ipo = htole16(offset);
+ rsp.status_code_specific.invalid.iattr = data ? 1 : 0;
+ nvmft_send_connect_response(qp, &rsp);
+}
+
+int
+nvmft_finish_accept(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, struct nvmft_controller *ctrlr)
+{
+ struct nvmf_fabric_connect_rsp rsp;
+
+ qp->ctrlr = ctrlr;
+ nvmft_init_connect_rsp(&rsp, cmd, 0);
+ if (qp->sq_flow_control)
+ rsp.sqhd = htole16(qp->sqhd);
+ else
+ rsp.sqhd = htole16(0xffff);
+ rsp.status_code_specific.success.cntlid = htole16(ctrlr->cntlid);
+ return (nvmft_send_connect_response(qp, &rsp));
+}
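
Note that the CONNECT response reuses the same flow-control state: it reports the current SQHD when flow control is enabled and the all-ones sentinel otherwise, which is how the host learns whether to track SQHD at all. As a one-line sketch:

    #include <stdbool.h>
    #include <stdint.h>

    /* 0xffff in the CONNECT response advertises "no SQ flow control". */
    static uint16_t
    connect_rsp_sqhd(bool sq_flow_control, uint16_t sqhd)
    {
        return (sq_flow_control ? sqhd : 0xffff);
    }
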
diff --git a/sys/dev/nvmf/controller/nvmft_var.h b/sys/dev/nvmf/controller/nvmft_var.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/controller/nvmft_var.h
@@ -0,0 +1,174 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __NVMFT_VAR_H__
+#define __NVMFT_VAR_H__
+
+#include <sys/_callout.h>
+#include <sys/refcount.h>
+#include <sys/taskqueue.h>
+
+#include <dev/nvmf/nvmf_proto.h>
+
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl_frontend.h>
+
+struct nvmf_capsule;
+struct nvmft_controller;
+struct nvmft_qpair;
+
+#define NVMFT_NUM_AER 16
+
+struct nvmft_port {
+ TAILQ_ENTRY(nvmft_port) link;
+ u_int refs;
+ struct ctl_port port;
+ struct nvme_controller_data cdata;
+ struct nvme_firmware_page fp;
+ uint64_t cap;
+ uint32_t max_io_qsize;
+ bool online;
+
+ struct sx lock;
+
+ struct unrhdr *ids;
+ TAILQ_HEAD(, nvmft_controller) controllers;
+
+ uint32_t *active_ns;
+ u_int num_ns;
+};
+
+struct nvmft_io_qpair {
+ struct nvmft_qpair *qp;
+
+ bool shutdown;
+};
+
+struct nvmft_controller {
+ struct nvmft_qpair *admin;
+ struct nvmft_io_qpair *io_qpairs;
+ u_int num_io_queues;
+ bool shutdown;
+ bool admin_closed;
+ uint16_t cntlid;
+ uint32_t cc;
+ uint32_t csts;
+
+ struct nvmft_port *np;
+ struct mtx lock;
+
+ struct nvme_controller_data cdata;
+ struct nvme_health_information_page hip;
+ sbintime_t create_time;
+ sbintime_t start_busy;
+ sbintime_t busy_total;
+ uint16_t partial_dur;
+ uint16_t partial_duw;
+
+ uint8_t hostid[16];
+ uint8_t hostnqn[NVME_NQN_FIELD_SIZE];
+ u_int trtype;
+
+ TAILQ_ENTRY(nvmft_controller) link;
+
+ /*
+ * Each queue can have at most UINT16_MAX commands, so the total
+ * across all queues will fit in a uint32_t.
+ */
+ uint32_t pending_commands;
+
+ volatile int ka_active_traffic;
+ struct callout ka_timer;
+ sbintime_t ka_sbt;
+
+ /* AER fields. */
+ uint32_t aer_mask;
+ uint16_t aer_cids[NVMFT_NUM_AER];
+ uint8_t aer_pending;
+ uint8_t aer_cidx;
+ uint8_t aer_pidx;
+
+ /* Changed namespace IDs. */
+ struct nvme_ns_list *changed_ns;
+ bool changed_ns_reported;
+
+ struct task shutdown_task;
+ struct timeout_task terminate_task;
+};
+
+MALLOC_DECLARE(M_NVMFT);
+
+/* ctl_frontend_nvmf.c */
+void nvmft_port_free(struct nvmft_port *np);
+void nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid,
+ struct nvme_ns_list *nslist);
+void nvmft_dispatch_command(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc, bool admin);
+void nvmft_terminate_commands(struct nvmft_controller *ctrlr);
+
+/* nvmft_controller.c */
+void nvmft_controller_error(struct nvmft_controller *ctrlr,
+ struct nvmft_qpair *qp, int error);
+void nvmft_controller_lun_changed(struct nvmft_controller *ctrlr,
+ int lun_id);
+void nvmft_handle_admin_command(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc);
+void nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid,
+ struct nvmf_capsule *nc);
+int nvmft_handoff_admin_queue(struct nvmft_port *np,
+ const struct nvmf_handoff_controller_qpair *handoff,
+ const struct nvmf_fabric_connect_cmd *cmd,
+ const struct nvmf_fabric_connect_data *data);
+int nvmft_handoff_io_queue(struct nvmft_port *np,
+ const struct nvmf_handoff_controller_qpair *handoff,
+ const struct nvmf_fabric_connect_cmd *cmd,
+ const struct nvmf_fabric_connect_data *data);
+int nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...)
+ __printflike(2, 3);
+
+/* nvmft_qpair.c */
+struct nvmft_qpair *nvmft_qpair_init(enum nvmf_trtype trtype,
+ const struct nvmf_handoff_qpair_params *handoff, uint16_t qid,
+ const char *name);
+void nvmft_qpair_shutdown(struct nvmft_qpair *qp);
+void nvmft_qpair_destroy(struct nvmft_qpair *qp);
+struct nvmft_controller *nvmft_qpair_ctrlr(struct nvmft_qpair *qp);
+uint16_t nvmft_qpair_id(struct nvmft_qpair *qp);
+const char *nvmft_qpair_name(struct nvmft_qpair *qp);
+void nvmft_command_completed(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc);
+int nvmft_send_response(struct nvmft_qpair *qp, const void *cqe);
+void nvmft_init_cqe(void *cqe, struct nvmf_capsule *nc, uint16_t status);
+int nvmft_send_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ uint8_t sc_type, uint8_t sc_status);
+int nvmft_send_generic_error(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc, uint8_t sc_status);
+int nvmft_send_success(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc);
+void nvmft_connect_error(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, uint8_t sc_type,
+ uint8_t sc_status);
+void nvmft_connect_invalid_parameters(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, bool data, uint16_t offset);
+int nvmft_finish_accept(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, struct nvmft_controller *ctrlr);
+
+static __inline void
+nvmft_port_ref(struct nvmft_port *np)
+{
+ refcount_acquire(&np->refs);
+}
+
+static __inline void
+nvmft_port_rele(struct nvmft_port *np)
+{
+ if (refcount_release(&np->refs))
+ nvmft_port_free(np);
+}
+
+#endif /* !__NVMFT_VAR_H__ */
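
The port structure is reference-counted so controllers can outlive the configuration that created them; nvmft_port_rele() frees the port when the last reference drops. A hypothetical caller illustrating the idiom (example_use_port is invented for illustration):

    /* Hypothetical caller: hold a reference across any use of 'np'. */
    static void
    example_use_port(struct nvmft_port *np)
    {
        nvmft_port_ref(np);
        /* ... safely dereference np->cdata, np->cap, etc. ... */
        nvmft_port_rele(np);    /* may free 'np' on last release */
    }
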
diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile
--- a/sys/modules/nvmf/Makefile
+++ b/sys/modules/nvmf/Makefile
@@ -1,5 +1,6 @@
SUBDIR= nvmf \
nvmf_tcp \
- nvmf_transport
+ nvmf_transport \
+ nvmft
.include <bsd.subdir.mk>
diff --git a/sys/modules/nvmf/nvmft/Makefile b/sys/modules/nvmf/nvmft/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/nvmf/nvmft/Makefile
@@ -0,0 +1,10 @@
+.PATH: ${SRCTOP}/sys/dev/nvmf/controller
+
+KMOD= nvmft
+
+SRCS= ctl_frontend_nvmf.c \
+ nvmft_controller.c \
+ nvmft_subr.c \
+ nvmft_qpair.c
+
+.include <bsd.kmod.mk>
