D44726.diff

diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -410,6 +410,7 @@
nvme.4 \
nvmf.4 \
nvmf_tcp.4 \
+ nvmft.4 \
${_nvram.4} \
oce.4 \
ocs_fc.4\
diff --git a/share/man/man4/nvmft.4 b/share/man/man4/nvmft.4
new file mode 100644
--- /dev/null
+++ b/share/man/man4/nvmft.4
@@ -0,0 +1,85 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2024 Chelsio Communications, Inc.
+.\"
+.Dd May 2, 2024
+.Dt NVMFT 4
+.Os
+.Sh NAME
+.Nm nvmft
+.Nd "NVM Express over Fabrics CAM Target Layer frontend"
+.Sh SYNOPSIS
+To compile the subsystem into the kernel,
+place the following lines in the
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device nvmft"
+.Cd "device ctl"
+.Ed
+.Pp
+Alternatively, to load the subsystem as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+nvmft_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver provides the kernel component of an NVM Express over Fabrics
+controller.
+The NVMeoF controller is the server that exports namespaces backed by
+local files and volumes to remote hosts.
+.Nm
+follows the dynamic controller model and creates a new dynamic controller
+for each association.
+.Pp
+.Nm
+is implemented as a
+.Xr ctl 4
+frontend and exports CAM Target Layer LUNs as namespaces to remote hosts.
+LUNs can be configured via
+.Xr ctladm 8 .
+.Pp
+Associations between the local controller and remote hosts are managed
+using both the
+.Xr nvmfd 8
+daemon and the
+.Xr ctladm 8
+utility.
+The
+.Xr nvmfd 8
+daemon listens for new associations and handles transport-specific
+negotiation before handing off connected queue pairs to
+.Nm ,
+which associates queue pairs with a suitable controller instance.
+The
+.Cm nvlist
+.Xr ctladm 8
+command lists active controllers.
+The
+.Cm nvterminate
+command terminates one or more associations between a local controller
+and a remote host.
+.Pp
+Associations require a supported transport, such as
+.Xr nvmf_tcp 4
+for TCP/IP.
+.Sh SEE ALSO
+.Xr ctl 4 ,
+.Xr nvmf 4 ,
+.Xr nvmf_tcp 4 ,
+.Xr ctladm 8 ,
+.Xr nvmfd 8
+.Sh HISTORY
+The
+.Nm
+module first appeared in
+.Fx 15.0 .
+.Sh AUTHORS
+The
+.Nm
+subsystem was developed by
+.An John Baldwin Aq Mt jhb@FreeBSD.org
+under sponsorship from Chelsio Communications, Inc.
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1677,6 +1677,7 @@
#
# nvme: PCI-express NVM Express host controllers
# nvmf: NVM Express over Fabrics host
+# nvmft: NVM Express over Fabrics CAM Target Layer frontend
# nvmf_tcp: TCP transport for NVM Express over Fabrics
# nda: CAM NVMe disk driver
# nvd: non-CAM NVMe disk driver
@@ -1684,6 +1685,7 @@
device nvme # PCI-express NVMe host driver
options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver
device nvmf # NVMeoF host driver
+device nvmft # NVMeoF ctl(4) frontend
device nvmf_tcp # NVMeoF TCP transport
device nda # NVMe direct access devices (aka disks)
device nvd # expose NVMe namespaces as disks, depends on nvme
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2535,6 +2535,10 @@
dev/nvme/nvme_util.c optional nvme
dev/nvmem/nvmem.c optional nvmem fdt
dev/nvmem/nvmem_if.m optional nvmem
+dev/nvmf/controller/ctl_frontend_nvmf.c optional nvmft
+dev/nvmf/controller/nvmft_controller.c optional nvmft
+dev/nvmf/controller/nvmft_subr.c optional nvmft
+dev/nvmf/controller/nvmft_qpair.c optional nvmft
dev/nvmf/host/nvmf.c optional nvmf
dev/nvmf/host/nvmf_aer.c optional nvmf
dev/nvmf/host/nvmf_cmd.c optional nvmf
@@ -2543,7 +2547,7 @@
dev/nvmf/host/nvmf_qpair.c optional nvmf
dev/nvmf/host/nvmf_sim.c optional nvmf
dev/nvmf/nvmf_tcp.c optional nvmf_tcp
-dev/nvmf/nvmf_transport.c optional nvmf
+dev/nvmf/nvmf_transport.c optional nvmf | nvmft
dev/oce/oce_hw.c optional oce pci
dev/oce/oce_if.c optional oce pci
dev/oce/oce_mbox.c optional oce pci
diff --git a/sys/dev/nvmf/controller/ctl_frontend_nvmf.c b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c
@@ -0,0 +1,1123 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/dnv.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/memdesc.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/refcount.h>
+#include <sys/sbuf.h>
+#include <sys/sx.h>
+
+#include <machine/bus.h>
+#include <machine/bus_dma.h>
+
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/controller/nvmft_subr.h>
+#include <dev/nvmf/controller/nvmft_var.h>
+
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_error.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl_frontend.h>
+
+/*
+ * Store pointers to the capsule and qpair in the two pointer members
+ * of CTL_PRIV_FRONTEND.
+ */
+#define NVMFT_NC(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[0])
+#define NVMFT_QP(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[1])
+
+static void nvmft_done(union ctl_io *io);
+static int nvmft_init(void);
+static int nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data,
+ int flag, struct thread *td);
+static int nvmft_shutdown(void);
+
+static TAILQ_HEAD(, nvmft_port) nvmft_ports;
+static struct sx nvmft_ports_lock;
+
+MALLOC_DEFINE(M_NVMFT, "nvmft", "NVMe over Fabrics controller");
+
+static struct ctl_frontend nvmft_frontend = {
+ .name = "nvmf",
+ .init = nvmft_init,
+ .ioctl = nvmft_ioctl,
+ .fe_dump = NULL,
+ .shutdown = nvmft_shutdown,
+};
+
+static void
+nvmft_online(void *arg)
+{
+ struct nvmft_port *np = arg;
+
+ sx_xlock(&np->lock);
+ np->online = true;
+ sx_xunlock(&np->lock);
+}
+
+static void
+nvmft_offline(void *arg)
+{
+ struct nvmft_port *np = arg;
+ struct nvmft_controller *ctrlr;
+
+ sx_xlock(&np->lock);
+ np->online = false;
+
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ nvmft_printf(ctrlr,
+ "shutting down due to port going offline\n");
+ nvmft_controller_error(ctrlr, NULL, ENODEV);
+ }
+
+ while (!TAILQ_EMPTY(&np->controllers))
+ sx_sleep(np, &np->lock, 0, "nvmfoff", 0);
+ sx_xunlock(&np->lock);
+}
+
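+/*
+ * Each CTL LUN is exported as the NVMe namespace with
+ * nsid = lun_id + 1.  Online namespaces are tracked in active_ns, an
+ * array of nsids kept sorted in ascending order so that the Active
+ * Namespace ID list can be built in a single pass.
+ */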
+static int
+nvmft_lun_enable(void *arg, int lun_id)
+{
+ struct nvmft_port *np = arg;
+ struct nvmft_controller *ctrlr;
+ uint32_t *old_ns, *new_ns;
+ uint32_t nsid;
+ u_int i;
+
+ if (lun_id >= le32toh(np->cdata.nn)) {
+ printf("NVMFT: %s lun %d larger than maximum nsid %u\n",
+ np->cdata.subnqn, lun_id, le32toh(np->cdata.nn));
+ return (EOPNOTSUPP);
+ }
+ nsid = lun_id + 1;
+
+ sx_xlock(&np->lock);
+ new_ns = mallocarray(np->num_ns + 1, sizeof(*new_ns), M_NVMFT,
+ M_WAITOK);
+ for (i = 0; i < np->num_ns; i++) {
+ if (np->active_ns[i] < nsid)
+ continue;
+ if (np->active_ns[i] == nsid) {
+ sx_xunlock(&np->lock);
+ free(new_ns, M_NVMFT);
+ printf("NVMFT: %s duplicate lun %d\n",
+ np->cdata.subnqn, lun_id);
+ return (EINVAL);
+ }
+ break;
+ }
+
+ /* Copy over IDs smaller than nsid. */
+ memcpy(new_ns, np->active_ns, i * sizeof(*np->active_ns));
+
+ /* Insert nsid. */
+ new_ns[i] = nsid;
+
+ /* Copy over IDs greater than nsid. */
+ memcpy(new_ns + i + 1, np->active_ns + i, (np->num_ns - i) *
+ sizeof(*np->active_ns));
+
+ np->num_ns++;
+ old_ns = np->active_ns;
+ np->active_ns = new_ns;
+
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ nvmft_controller_lun_changed(ctrlr, lun_id);
+ }
+
+ sx_xunlock(&np->lock);
+ free(old_ns, M_NVMFT);
+
+ return (0);
+}
+
+static int
+nvmft_lun_disable(void *arg, int lun_id)
+{
+ struct nvmft_port *np = arg;
+ struct nvmft_controller *ctrlr;
+ uint32_t nsid;
+ u_int i;
+
+ if (lun_id >= le32toh(np->cdata.nn))
+ return (0);
+ nsid = lun_id + 1;
+
+ sx_xlock(&np->lock);
+ for (i = 0; i < np->num_ns; i++) {
+ if (np->active_ns[i] == nsid)
+ goto found;
+ }
+ sx_xunlock(&np->lock);
+ printf("NVMFT: %s request to disable nonexistent lun %d\n",
+ np->cdata.subnqn, lun_id);
+ return (EINVAL);
+
+found:
+ /* Move down IDs greater than nsid. */
+ memmove(np->active_ns + i, np->active_ns + i + 1,
+ (np->num_ns - (i + 1)) * sizeof(*np->active_ns));
+ np->num_ns--;
+
+ /* NB: Don't bother freeing the old active_ns array. */
+
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ nvmft_controller_lun_changed(ctrlr, lun_id);
+ }
+
+ sx_xunlock(&np->lock);
+
+ return (0);
+}
+
+void
+nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid,
+ struct nvme_ns_list *nslist)
+{
+ u_int i, count;
+
+ sx_slock(&np->lock);
+ count = 0;
+ for (i = 0; i < np->num_ns; i++) {
+ if (np->active_ns[i] <= nsid)
+ continue;
+ nslist->ns[count] = htole32(np->active_ns[i]);
+ count++;
+ if (count == nitems(nslist->ns))
+ break;
+ }
+ sx_sunlock(&np->lock);
+}
+
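+/*
+ * Queue a command capsule to CTL for execution.  Pointers to the
+ * capsule and qpair are stashed in the CTL I/O so that nvmft_done()
+ * can send the completion on the correct queue.
+ */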
+void
+nvmft_dispatch_command(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ bool admin)
+{
+ struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+ struct nvmft_port *np = ctrlr->np;
+ union ctl_io *io;
+ int error;
+
+ if (cmd->nsid == htole32(0)) {
+ nvmft_send_generic_error(qp, nc,
+ NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->pending_commands == 0)
+ ctrlr->start_busy = sbinuptime();
+ ctrlr->pending_commands++;
+ mtx_unlock(&ctrlr->lock);
+ io = ctl_alloc_io(np->port.ctl_pool_ref);
+ ctl_zero_io(io);
+ NVMFT_NC(io) = nc;
+ NVMFT_QP(io) = qp;
+ io->io_hdr.io_type = admin ? CTL_IO_NVME_ADMIN : CTL_IO_NVME;
+ io->io_hdr.nexus.initid = ctrlr->cntlid;
+ io->io_hdr.nexus.targ_port = np->port.targ_port;
+ io->io_hdr.nexus.targ_lun = le32toh(cmd->nsid) - 1;
+ io->nvmeio.cmd = *cmd;
+ error = ctl_run(io);
+ if (error != 0) {
+ nvmft_printf(ctrlr, "ctl_run failed for command on %s: %d\n",
+ nvmft_qpair_name(qp), error);
+ ctl_nvme_set_generic_error(&io->nvmeio,
+ NVME_SC_INTERNAL_DEVICE_ERROR);
+ nvmft_done(io);
+
+ nvmft_controller_error(ctrlr, qp, ENXIO);
+ }
+}
+
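+/*
+ * Abort any CTL commands still pending for this controller by
+ * submitting an I_T nexus reset task request.
+ */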
+void
+nvmft_terminate_commands(struct nvmft_controller *ctrlr)
+{
+ struct nvmft_port *np = ctrlr->np;
+ union ctl_io *io;
+ int error;
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->pending_commands == 0)
+ ctrlr->start_busy = sbinuptime();
+ ctrlr->pending_commands++;
+ mtx_unlock(&ctrlr->lock);
+ io = ctl_alloc_io(np->port.ctl_pool_ref);
+ ctl_zero_io(io);
+ NVMFT_QP(io) = ctrlr->admin;
+ io->io_hdr.io_type = CTL_IO_TASK;
+ io->io_hdr.nexus.initid = ctrlr->cntlid;
+ io->io_hdr.nexus.targ_port = np->port.targ_port;
+ io->io_hdr.nexus.targ_lun = 0;
+ io->taskio.tag_type = CTL_TAG_SIMPLE; /* XXX: unused? */
+ io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET;
+ error = ctl_run(io);
+ if (error != CTL_RETVAL_COMPLETE) {
+ nvmft_printf(ctrlr, "failed to terminate tasks: %d\n", error);
+#ifdef INVARIANTS
+ io->io_hdr.status = CTL_SUCCESS;
+#endif
+ nvmft_done(io);
+ }
+}
+
+static void
+nvmft_datamove_out_cb(void *arg, size_t xfered, int error)
+{
+ struct ctl_nvmeio *ctnio = arg;
+
+ if (error != 0) {
+ ctl_nvme_set_data_transfer_error(ctnio);
+ } else {
+ MPASS(xfered == ctnio->kern_data_len);
+ ctnio->kern_data_resid -= xfered;
+ }
+
+ if (ctnio->kern_sg_entries) {
+ free(ctnio->ext_data_ptr, M_NVMFT);
+ ctnio->ext_data_ptr = NULL;
+ } else
+ MPASS(ctnio->ext_data_ptr == NULL);
+ ctl_datamove_done((union ctl_io *)ctnio, false);
+}
+
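+/*
+ * Host-to-controller transfer (e.g. a WRITE): wrap the CTL data
+ * buffer, described by either a single kernel virtual address or a
+ * scatter/gather list, in a memdesc and ask the transport to place
+ * the capsule data into it.
+ */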
+static void
+nvmft_datamove_out(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc)
+{
+ struct memdesc mem;
+ int error;
+
+ MPASS(ctnio->ext_data_ptr == NULL);
+ if (ctnio->kern_sg_entries > 0) {
+ struct ctl_sg_entry *sgl;
+ struct bus_dma_segment *vlist;
+
+ vlist = mallocarray(ctnio->kern_sg_entries, sizeof(*vlist),
+ M_NVMFT, M_WAITOK);
+ ctnio->ext_data_ptr = (void *)vlist;
+ sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
+ for (u_int i = 0; i < ctnio->kern_sg_entries; i++) {
+ vlist[i].ds_addr = (uintptr_t)sgl[i].addr;
+ vlist[i].ds_len = sgl[i].len;
+ }
+ mem = memdesc_vlist(vlist, ctnio->kern_sg_entries);
+ } else
+ mem = memdesc_vaddr(ctnio->kern_data_ptr, ctnio->kern_data_len);
+
+ error = nvmf_receive_controller_data(nc, ctnio->kern_rel_offset, &mem,
+ ctnio->kern_data_len, nvmft_datamove_out_cb, ctnio);
+ if (error == 0)
+ return;
+
+ nvmft_printf(nvmft_qpair_ctrlr(qp),
+ "Failed to request capsule data: %d\n", error);
+ ctl_nvme_set_data_transfer_error(ctnio);
+
+ if (ctnio->kern_sg_entries) {
+ free(ctnio->ext_data_ptr, M_NVMFT);
+ ctnio->ext_data_ptr = NULL;
+ } else
+ MPASS(ctnio->ext_data_ptr == NULL);
+ ctl_datamove_done((union ctl_io *)ctnio, true);
+}
+
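+/*
+ * Copying transmit path for controller-to-host data: allocate a
+ * plain mbuf chain and copy the CTL data buffer (or each S/G
+ * segment) into it.
+ */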
+static struct mbuf *
+nvmft_copy_data(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_sg_entry *sgl;
+ struct mbuf *m0, *m;
+ uint32_t resid, off, todo;
+ int mlen;
+
+ MPASS(ctnio->kern_data_len != 0);
+
+ m0 = m_getm2(NULL, ctnio->kern_data_len, M_WAITOK, MT_DATA, 0);
+
+ if (ctnio->kern_sg_entries == 0) {
+ m_copyback(m0, 0, ctnio->kern_data_len, ctnio->kern_data_ptr);
+ return (m0);
+ }
+
+ resid = ctnio->kern_data_len;
+ sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
+ off = 0;
+ m = m0;
+ mlen = M_TRAILINGSPACE(m);
+ for (;;) {
+ todo = MIN(mlen, sgl->len - off);
+ memcpy(mtod(m, char *) + m->m_len, (char *)sgl->addr + off,
+ todo);
+ m->m_len += todo;
+ resid -= todo;
+ if (resid == 0) {
+ MPASS(m->m_next == NULL);
+ break;
+ }
+
+ off += todo;
+ if (off == sgl->len) {
+ sgl++;
+ off = 0;
+ }
+ mlen -= todo;
+ if (mlen == 0) {
+ m = m->m_next;
+ mlen = M_TRAILINGSPACE(m);
+ }
+ }
+
+ return (m0);
+}
+
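+/*
+ * Zero-copy transmit path: wrap each CTL data buffer in an external
+ * mbuf that holds a reference on the CTL I/O via kern_data_ref so
+ * the buffer remains valid until the transport frees the chain.
+ */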
+static void
+m_free_ref_data(struct mbuf *m)
+{
+ ctl_ref kern_data_ref = m->m_ext.ext_arg1;
+
+ kern_data_ref(m->m_ext.ext_arg2, -1);
+}
+
+static struct mbuf *
+m_get_ref_data(struct ctl_nvmeio *ctnio, void *buf, u_int size)
+{
+ struct mbuf *m;
+
+ m = m_get(M_WAITOK, MT_DATA);
+ m_extadd(m, buf, size, m_free_ref_data, ctnio->kern_data_ref,
+ ctnio->kern_data_arg, M_RDONLY, EXT_CTL);
+ m->m_len = size;
+ ctnio->kern_data_ref(ctnio->kern_data_arg, 1);
+ return (m);
+}
+
+static struct mbuf *
+nvmft_ref_data(struct ctl_nvmeio *ctnio)
+{
+ struct ctl_sg_entry *sgl;
+ struct mbuf *m0, *m;
+
+ MPASS(ctnio->kern_data_len != 0);
+
+ if (ctnio->kern_sg_entries == 0)
+ return (m_get_ref_data(ctnio, ctnio->kern_data_ptr,
+ ctnio->kern_data_len));
+
+ sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
+ m0 = m_get_ref_data(ctnio, sgl[0].addr, sgl[0].len);
+ m = m0;
+ for (u_int i = 1; i < ctnio->kern_sg_entries; i++) {
+ m->m_next = m_get_ref_data(ctnio, sgl[i].addr, sgl[i].len);
+ m = m->m_next;
+ }
+ return (m0);
+}
+
+static void
+nvmft_datamove_in(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc)
+{
+ struct mbuf *m;
+ u_int status;
+
+ if (ctnio->kern_data_ref != NULL)
+ m = nvmft_ref_data(ctnio);
+ else
+ m = nvmft_copy_data(ctnio);
+ status = nvmf_send_controller_data(nc, ctnio->kern_rel_offset, m,
+ ctnio->kern_data_len);
+ switch (status) {
+ case NVMF_SUCCESS_SENT:
+ ctnio->success_sent = true;
+ nvmft_command_completed(qp, nc);
+ /* FALLTHROUGH */
+ case NVMF_MORE:
+ case NVME_SC_SUCCESS:
+ break;
+ default:
+ ctl_nvme_set_generic_error(ctnio, status);
+ break;
+ }
+ ctl_datamove_done((union ctl_io *)ctnio, true);
+}
+
+static void
+nvmft_datamove(union ctl_io *io)
+{
+ struct nvmf_capsule *nc;
+ struct nvmft_qpair *qp;
+
+ /* Some CTL commands preemptively set a success status. */
+ MPASS(io->io_hdr.status == CTL_STATUS_NONE ||
+ io->io_hdr.status == CTL_SUCCESS);
+ MPASS(!io->nvmeio.success_sent);
+
+ nc = NVMFT_NC(io);
+ qp = NVMFT_QP(io);
+
+ if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN)
+ nvmft_datamove_in(&io->nvmeio, qp, nc);
+ else
+ nvmft_datamove_out(&io->nvmeio, qp, nc);
+}
+
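+/*
+ * Add to one of the 128-bit counters in the Health Information Page.
+ * Each counter is a pair of little-endian 64-bit words; a carry out
+ * of the low word propagates into the high word.
+ */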
+static void
+hip_add(uint64_t pair[2], uint64_t addend)
+{
+ uint64_t old, new;
+
+ old = le64toh(pair[0]);
+ new = old + addend;
+ pair[0] = htole64(new);
+ if (new < old)
+ pair[1] = htole64(le64toh(pair[1]) + 1);
+}
+
+static void
+nvmft_done(union ctl_io *io)
+{
+ struct nvmft_controller *ctrlr;
+ const struct nvme_command *cmd;
+ struct nvmft_qpair *qp;
+ struct nvmf_capsule *nc;
+ size_t len;
+
+ KASSERT(io->io_hdr.status == CTL_SUCCESS ||
+ io->io_hdr.status == CTL_NVME_ERROR,
+ ("%s: bad status %u", __func__, io->io_hdr.status));
+
+ nc = NVMFT_NC(io);
+ qp = NVMFT_QP(io);
+ ctrlr = nvmft_qpair_ctrlr(qp);
+
+ if (nc == NULL) {
+ /* Completion of nvmft_terminate_commands. */
+ goto end;
+ }
+
+ cmd = nvmf_capsule_sqe(nc);
+
+ if (io->io_hdr.status == CTL_SUCCESS)
+ len = nvmf_capsule_data_len(nc) / 512;
+ else
+ len = 0;
+ switch (cmd->opc) {
+ case NVME_OPC_WRITE:
+ mtx_lock(&ctrlr->lock);
+ hip_add(ctrlr->hip.host_write_commands, 1);
+ len += ctrlr->partial_duw;
+ if (len >= 1000)
+ hip_add(ctrlr->hip.data_units_written, len / 1000);
+ ctrlr->partial_duw = len % 1000;
+ mtx_unlock(&ctrlr->lock);
+ break;
+ case NVME_OPC_READ:
+ case NVME_OPC_COMPARE:
+ case NVME_OPC_VERIFY:
+ mtx_lock(&ctrlr->lock);
+ if (cmd->opc != NVME_OPC_VERIFY)
+ hip_add(ctrlr->hip.host_read_commands, 1);
+ len += ctrlr->partial_dur;
+ if (len >= 1000)
+ hip_add(ctrlr->hip.data_units_read, len / 1000);
+ ctrlr->partial_dur = len % 1000;
+ mtx_unlock(&ctrlr->lock);
+ break;
+ }
+
+ if (io->nvmeio.success_sent) {
+ MPASS(io->io_hdr.status == CTL_SUCCESS);
+ } else {
+ io->nvmeio.cpl.cid = cmd->cid;
+ nvmft_send_response(qp, &io->nvmeio.cpl);
+ }
+ nvmf_free_capsule(nc);
+end:
+ ctl_free_io(io);
+ mtx_lock(&ctrlr->lock);
+ ctrlr->pending_commands--;
+ if (ctrlr->pending_commands == 0)
+ ctrlr->busy_total += sbinuptime() - ctrlr->start_busy;
+ mtx_unlock(&ctrlr->lock);
+}
+
+static int
+nvmft_init(void)
+{
+ TAILQ_INIT(&nvmft_ports);
+ sx_init(&nvmft_ports_lock, "nvmft ports");
+ return (0);
+}
+
+void
+nvmft_port_free(struct nvmft_port *np)
+{
+ KASSERT(TAILQ_EMPTY(&np->controllers),
+ ("%s(%p): active controllers", __func__, np));
+
+ if (np->port.targ_port != -1) {
+ if (ctl_port_deregister(&np->port) != 0)
+ printf("%s: ctl_port_deregister() failed\n", __func__);
+ }
+
+ free(np->active_ns, M_NVMFT);
+ clean_unrhdr(np->ids);
+ delete_unrhdr(np->ids);
+ sx_destroy(&np->lock);
+ free(np, M_NVMFT);
+}
+
+static struct nvmft_port *
+nvmft_port_find(const char *subnqn)
+{
+ struct nvmft_port *np;
+
+ KASSERT(nvmf_nqn_valid(subnqn), ("%s: invalid nqn", __func__));
+
+ sx_assert(&nvmft_ports_lock, SA_LOCKED);
+ TAILQ_FOREACH(np, &nvmft_ports, link) {
+ if (strcmp(np->cdata.subnqn, subnqn) == 0)
+ break;
+ }
+ return (np);
+}
+
+static struct nvmft_port *
+nvmft_port_find_by_id(int port_id)
+{
+ struct nvmft_port *np;
+
+ sx_assert(&nvmft_ports_lock, SA_LOCKED);
+ TAILQ_FOREACH(np, &nvmft_ports, link) {
+ if (np->port.targ_port == port_id)
+ break;
+ }
+ return (np);
+}
+
+/*
+ * Helper function to fetch a number stored as a string in an nv_list.
+ * Returns false if the string was not a valid number.
+ */
+static bool
+dnvlist_get_strnum(nvlist_t *nvl, const char *name, u_long default_value,
+ u_long *value)
+{
+ const char *str;
+ char *cp;
+
+ str = dnvlist_get_string(nvl, name, NULL);
+ if (str == NULL) {
+ *value = default_value;
+ return (true);
+ }
+ if (*str == '\0')
+ return (false);
+ *value = strtoul(str, &cp, 0);
+ if (*cp != '\0')
+ return (false);
+ return (true);
+}
+
+/*
+ * NVMeoF ports support the following parameters:
+ *
+ * Mandatory:
+ *
+ * subnqn: subsystem NVMe Qualified Name
+ * portid: integer port ID from Discovery Log Page entry
+ *
+ * Optional:
+ * serial: Serial Number string
+ * max_io_qsize: Maximum number of I/O queue entries
+ * enable_timeout: Timeout for controller enable in milliseconds
+ * ioccsz: Maximum command capsule size
+ * iorcsz: Maximum response capsule size
+ * nn: Number of namespaces
+ */
+static void
+nvmft_port_create(struct ctl_req *req)
+{
+ struct nvmft_port *np;
+ struct ctl_port *port;
+ const char *serial, *subnqn;
+ char serial_buf[NVME_SERIAL_NUMBER_LENGTH];
+ u_long enable_timeout, hostid, ioccsz, iorcsz, max_io_qsize, nn, portid;
+ int error;
+
+ /* Required parameters. */
+ subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL);
+ if (subnqn == NULL || !nvlist_exists_string(req->args_nvl, "portid")) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Missing required argument");
+ return;
+ }
+ if (!nvmf_nqn_valid(subnqn)) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid SubNQN");
+ return;
+ }
+ if (!dnvlist_get_strnum(req->args_nvl, "portid", UINT16_MAX, &portid) ||
+ portid > UINT16_MAX) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid port ID");
+ return;
+ }
+
+ /* Optional parameters. */
+ if (!dnvlist_get_strnum(req->args_nvl, "max_io_qsize",
+ NVMF_MAX_IO_ENTRIES, &max_io_qsize) ||
+ max_io_qsize < NVME_MIN_IO_ENTRIES ||
+ max_io_qsize > NVME_MAX_IO_ENTRIES) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid maximum I/O queue size");
+ return;
+ }
+
+ if (!dnvlist_get_strnum(req->args_nvl, "enable_timeout",
+ NVMF_CC_EN_TIMEOUT * 500, &enable_timeout) ||
+ (enable_timeout % 500) != 0 || (enable_timeout / 500) > 255) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid enable timeout");
+ return;
+ }
+
+ if (!dnvlist_get_strnum(req->args_nvl, "ioccsz", NVMF_IOCCSZ,
+ &ioccsz) || ioccsz < sizeof(struct nvme_command) ||
+ (ioccsz % 16) != 0) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid Command Capsule size");
+ return;
+ }
+
+ if (!dnvlist_get_strnum(req->args_nvl, "iorcsz", NVMF_IORCSZ,
+ &iorcsz) || iorcsz < sizeof(struct nvme_completion) ||
+ (iorcsz % 16) != 0) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid Response Capsule size");
+ return;
+ }
+
+ if (!dnvlist_get_strnum(req->args_nvl, "nn", NVMF_NN, &nn) ||
+ nn < 1 || nn > UINT32_MAX) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid number of namespaces");
+ return;
+ }
+
+ serial = dnvlist_get_string(req->args_nvl, "serial", NULL);
+ if (serial == NULL) {
+ getcredhostid(curthread->td_ucred, &hostid);
+ nvmf_controller_serial(serial_buf, sizeof(serial_buf), hostid);
+ serial = serial_buf;
+ }
+
+ sx_xlock(&nvmft_ports_lock);
+
+ np = nvmft_port_find(subnqn);
+ if (np != NULL) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "SubNQN \"%s\" already exists", subnqn);
+ sx_xunlock(&nvmft_ports_lock);
+ return;
+ }
+
+ np = malloc(sizeof(*np), M_NVMFT, M_WAITOK | M_ZERO);
+ refcount_init(&np->refs, 1);
+ np->max_io_qsize = max_io_qsize;
+ np->cap = _nvmf_controller_cap(max_io_qsize, enable_timeout / 500);
+ sx_init(&np->lock, "nvmft port");
+ np->ids = new_unrhdr(0, MIN(CTL_MAX_INIT_PER_PORT - 1,
+ NVMF_CNTLID_STATIC_MAX), UNR_NO_MTX);
+ TAILQ_INIT(&np->controllers);
+
+ /* The controller ID is set later for individual controllers. */
+ _nvmf_init_io_controller_data(0, max_io_qsize, serial, ostype,
+ osrelease, subnqn, nn, ioccsz, iorcsz, &np->cdata);
+ np->cdata.aerl = NVMFT_NUM_AER - 1;
+ np->cdata.oaes = htole32(NVME_ASYNC_EVENT_NS_ATTRIBUTE);
+ np->cdata.oncs = htole16(NVMEF(NVME_CTRLR_DATA_ONCS_VERIFY, 1) |
+ NVMEF(NVME_CTRLR_DATA_ONCS_WRZERO, 1) |
+ NVMEF(NVME_CTRLR_DATA_ONCS_DSM, 1) |
+ NVMEF(NVME_CTRLR_DATA_ONCS_COMPARE, 1));
+ np->cdata.fuses = NVMEF(NVME_CTRLR_DATA_FUSES_CNW, 1);
+
+ np->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1);
+ memcpy(np->fp.revision[0], np->cdata.fr, sizeof(np->cdata.fr));
+
+ port = &np->port;
+
+ port->frontend = &nvmft_frontend;
+ port->port_type = CTL_PORT_NVMF;
+ port->num_requested_ctl_io = max_io_qsize;
+ port->port_name = "nvmf";
+ port->physical_port = portid;
+ port->virtual_port = 0;
+ port->port_online = nvmft_online;
+ port->port_offline = nvmft_offline;
+ port->onoff_arg = np;
+ port->lun_enable = nvmft_lun_enable;
+ port->lun_disable = nvmft_lun_disable;
+ port->targ_lun_arg = np;
+ port->fe_datamove = nvmft_datamove;
+ port->fe_done = nvmft_done;
+ port->targ_port = -1;
+ port->options = nvlist_clone(req->args_nvl);
+
+ error = ctl_port_register(port);
+ if (error != 0) {
+ sx_xunlock(&nvmft_ports_lock);
+ nvlist_destroy(port->options);
+ nvmft_port_rele(np);
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Failed to register CTL port with error %d", error);
+ return;
+ }
+
+ TAILQ_INSERT_TAIL(&nvmft_ports, np, link);
+ sx_xunlock(&nvmft_ports_lock);
+
+ req->status = CTL_LUN_OK;
+ req->result_nvl = nvlist_create(0);
+ nvlist_add_number(req->result_nvl, "port_id", port->targ_port);
+}
+
+static void
+nvmft_port_remove(struct ctl_req *req)
+{
+ struct nvmft_port *np;
+ const char *subnqn;
+ u_long port_id;
+
+ /*
+ * ctladm port -r just provides the port_id, so permit looking
+ * up a port either by "subnqn" or "port_id".
+ */
+ port_id = ULONG_MAX;
+ subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL);
+ if (subnqn == NULL) {
+ if (!nvlist_exists_string(req->args_nvl, "port_id")) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Missing required argument");
+ return;
+ }
+ if (!dnvlist_get_strnum(req->args_nvl, "port_id", ULONG_MAX,
+ &port_id)) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Invalid CTL port ID");
+ return;
+ }
+ } else {
+ if (nvlist_exists_string(req->args_nvl, "port_id")) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Ambiguous port removal request");
+ return;
+ }
+ }
+
+ sx_xlock(&nvmft_ports_lock);
+
+ if (subnqn != NULL) {
+ np = nvmft_port_find(subnqn);
+ if (np == NULL) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "SubNQN \"%s\" does not exist", subnqn);
+ sx_xunlock(&nvmft_ports_lock);
+ return;
+ }
+ } else {
+ np = nvmft_port_find_by_id(port_id);
+ if (np == NULL) {
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "CTL port %lu is not a NVMF port", port_id);
+ sx_xunlock(&nvmft_ports_lock);
+ return;
+ }
+ }
+
+ TAILQ_REMOVE(&nvmft_ports, np, link);
+ sx_xunlock(&nvmft_ports_lock);
+
+ ctl_port_offline(&np->port);
+ nvmft_port_rele(np);
+ req->status = CTL_LUN_OK;
+}
+
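+/*
+ * Accept a queue pair handed off from nvmfd(8): copy in the CONNECT
+ * command and data, look up the port by SubNQN, and pass the queue
+ * pair to the admin or I/O queue handler.
+ */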
+static void
+nvmft_handoff(struct ctl_nvmf *cn)
+{
+ struct nvmf_fabric_connect_cmd cmd;
+ struct nvmf_handoff_controller_qpair *handoff;
+ struct nvmf_fabric_connect_data *data;
+ struct nvmft_port *np;
+ int error;
+
+ np = NULL;
+ data = NULL;
+ handoff = &cn->data.handoff;
+ error = copyin(handoff->cmd, &cmd, sizeof(cmd));
+ if (error != 0) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to copyin CONNECT SQE");
+ return;
+ }
+
+ data = malloc(sizeof(*data), M_NVMFT, M_WAITOK);
+ error = copyin(handoff->data, data, sizeof(*data));
+ if (error != 0) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to copyin CONNECT data");
+ goto out;
+ }
+
+ if (!nvmf_nqn_valid(data->subnqn)) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Invalid SubNQN");
+ goto out;
+ }
+
+ sx_slock(&nvmft_ports_lock);
+ np = nvmft_port_find(data->subnqn);
+ if (np == NULL) {
+ sx_sunlock(&nvmft_ports_lock);
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Unknown SubNQN");
+ goto out;
+ }
+ if (!np->online) {
+ sx_sunlock(&nvmft_ports_lock);
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "CTL port offline");
+ np = NULL;
+ goto out;
+ }
+ nvmft_port_ref(np);
+ sx_sunlock(&nvmft_ports_lock);
+
+ if (handoff->params.admin) {
+ error = nvmft_handoff_admin_queue(np, handoff, &cmd, data);
+ if (error != 0) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to handoff admin queue: %d", error);
+ goto out;
+ }
+ } else {
+ error = nvmft_handoff_io_queue(np, handoff, &cmd, data);
+ if (error != 0) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to handoff admin queue: %d", error);
+ goto out;
+ }
+ }
+
+ cn->status = CTL_NVMF_OK;
+out:
+ if (np != NULL)
+ nvmft_port_rele(np);
+ free(data, M_NVMFT);
+}
+
+static void
+nvmft_list(struct ctl_nvmf *cn)
+{
+ struct ctl_nvmf_list_params *lp;
+ struct nvmft_controller *ctrlr;
+ struct nvmft_port *np;
+ struct sbuf *sb;
+ int error;
+
+ lp = &cn->data.list;
+
+ sb = sbuf_new(NULL, NULL, lp->alloc_len, SBUF_FIXEDLEN |
+ SBUF_INCLUDENUL);
+ if (sb == NULL) {
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to allocate NVMeoF session list");
+ return;
+ }
+
+ sbuf_printf(sb, "<ctlnvmflist>\n");
+ sx_slock(&nvmft_ports_lock);
+ TAILQ_FOREACH(np, &nvmft_ports, link) {
+ sx_slock(&np->lock);
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ sbuf_printf(sb, "<connection id=\"%d\">"
+ "<hostnqn>%s</hostnqn>"
+ "<subnqn>%s</subnqn>"
+ "<trtype>%u</trtype>"
+ "</connection>\n",
+ ctrlr->cntlid,
+ ctrlr->hostnqn,
+ np->cdata.subnqn,
+ ctrlr->trtype);
+ }
+ sx_sunlock(&np->lock);
+ }
+ sx_sunlock(&nvmft_ports_lock);
+ sbuf_printf(sb, "</ctlnvmflist>\n");
+ if (sbuf_finish(sb) != 0) {
+ sbuf_delete(sb);
+ cn->status = CTL_NVMF_LIST_NEED_MORE_SPACE;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Out of space, %d bytes is too small", lp->alloc_len);
+ return;
+ }
+
+ error = copyout(sbuf_data(sb), lp->conn_xml, sbuf_len(sb));
+ if (error != 0) {
+ sbuf_delete(sb);
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Failed to copyout session list: %d", error);
+ return;
+ }
+ lp->fill_len = sbuf_len(sb);
+ cn->status = CTL_NVMF_OK;
+ sbuf_delete(sb);
+}
+
+static void
+nvmft_terminate(struct ctl_nvmf *cn)
+{
+ struct ctl_nvmf_terminate_params *tp;
+ struct nvmft_controller *ctrlr;
+ struct nvmft_port *np;
+ bool found, match;
+
+ tp = &cn->data.terminate;
+
+ found = false;
+ sx_slock(&nvmft_ports_lock);
+ TAILQ_FOREACH(np, &nvmft_ports, link) {
+ sx_slock(&np->lock);
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ if (tp->all != 0)
+ match = true;
+ else if (tp->cntlid != -1)
+ match = tp->cntlid == ctrlr->cntlid;
+ else if (tp->hostnqn[0] != '\0')
+ match = strncmp(tp->hostnqn, ctrlr->hostnqn,
+ sizeof(tp->hostnqn)) == 0;
+ else
+ match = false;
+ if (!match)
+ continue;
+ nvmft_printf(ctrlr,
+ "disconnecting due to administrative request\n");
+ nvmft_controller_error(ctrlr, NULL, ECONNABORTED);
+ found = true;
+ }
+ sx_sunlock(&np->lock);
+ }
+ sx_sunlock(&nvmft_ports_lock);
+
+ if (!found) {
+ cn->status = CTL_NVMF_ASSOCIATION_NOT_FOUND;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "No matching associations found");
+ return;
+ }
+ cn->status = CTL_NVMF_OK;
+}
+
+static int
+nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int flag,
+ struct thread *td)
+{
+ struct ctl_nvmf *cn;
+ struct ctl_req *req;
+
+ switch (cmd) {
+ case CTL_PORT_REQ:
+ req = (struct ctl_req *)data;
+ switch (req->reqtype) {
+ case CTL_REQ_CREATE:
+ nvmft_port_create(req);
+ break;
+ case CTL_REQ_REMOVE:
+ nvmft_port_remove(req);
+ break;
+ default:
+ req->status = CTL_LUN_ERROR;
+ snprintf(req->error_str, sizeof(req->error_str),
+ "Unsupported request type %d", req->reqtype);
+ break;
+ }
+ return (0);
+ case CTL_NVMF:
+ cn = (struct ctl_nvmf *)data;
+ switch (cn->type) {
+ case CTL_NVMF_HANDOFF:
+ nvmft_handoff(cn);
+ break;
+ case CTL_NVMF_LIST:
+ nvmft_list(cn);
+ break;
+ case CTL_NVMF_TERMINATE:
+ nvmft_terminate(cn);
+ break;
+ default:
+ cn->status = CTL_NVMF_ERROR;
+ snprintf(cn->error_str, sizeof(cn->error_str),
+ "Invalid NVMeoF request type %d", cn->type);
+ break;
+ }
+ return (0);
+ default:
+ return (ENOTTY);
+ }
+}
+
+static int
+nvmft_shutdown(void)
+{
+ /* TODO: Need to check for active controllers. */
+ if (!TAILQ_EMPTY(&nvmft_ports))
+ return (EBUSY);
+
+ sx_destroy(&nvmft_ports_lock);
+ return (0);
+}
+
+CTL_FRONTEND_DECLARE(nvmft, nvmft_frontend);
+MODULE_DEPEND(nvmft, nvmf_transport, 1, 1, 1);
diff --git a/sys/dev/nvmf/controller/nvmft_controller.c b/sys/dev/nvmf/controller/nvmft_controller.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/controller/nvmft_controller.c
@@ -0,0 +1,1130 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/memdesc.h>
+#include <sys/mutex.h>
+#include <sys/sbuf.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/controller/nvmft_subr.h>
+#include <dev/nvmf/controller/nvmft_var.h>
+
+static void nvmft_controller_shutdown(void *arg, int pending);
+static void nvmft_controller_terminate(void *arg, int pending);
+
+int
+nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...)
+{
+ char buf[128];
+ struct sbuf sb;
+ va_list ap;
+ size_t retval;
+
+ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+ sbuf_set_drain(&sb, sbuf_printf_drain, &retval);
+
+ sbuf_printf(&sb, "nvmft%u: ", ctrlr->cntlid);
+
+ va_start(ap, fmt);
+ sbuf_vprintf(&sb, fmt, ap);
+ va_end(ap);
+
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+
+ return (retval);
+}
+
+static struct nvmft_controller *
+nvmft_controller_alloc(struct nvmft_port *np, uint16_t cntlid,
+ const struct nvmf_fabric_connect_data *data)
+{
+ struct nvmft_controller *ctrlr;
+
+ ctrlr = malloc(sizeof(*ctrlr), M_NVMFT, M_WAITOK | M_ZERO);
+ ctrlr->cntlid = cntlid;
+ nvmft_port_ref(np);
+ TAILQ_INSERT_TAIL(&np->controllers, ctrlr, link);
+ ctrlr->np = np;
+ mtx_init(&ctrlr->lock, "nvmft controller", NULL, MTX_DEF);
+ callout_init(&ctrlr->ka_timer, 1);
+ TASK_INIT(&ctrlr->shutdown_task, 0, nvmft_controller_shutdown, ctrlr);
+ TIMEOUT_TASK_INIT(taskqueue_thread, &ctrlr->terminate_task, 0,
+ nvmft_controller_terminate, ctrlr);
+
+ ctrlr->cdata = np->cdata;
+ ctrlr->cdata.ctrlr_id = htole16(cntlid);
+ memcpy(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid));
+ memcpy(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn));
+ ctrlr->hip.power_cycles[0] = 1;
+ ctrlr->create_time = sbinuptime();
+
+ ctrlr->changed_ns = malloc(sizeof(*ctrlr->changed_ns), M_NVMFT,
+ M_WAITOK | M_ZERO);
+
+ return (ctrlr);
+}
+
+static void
+nvmft_controller_free(struct nvmft_controller *ctrlr)
+{
+ mtx_destroy(&ctrlr->lock);
+ MPASS(ctrlr->io_qpairs == NULL);
+ free(ctrlr->changed_ns, M_NVMFT);
+ free(ctrlr, M_NVMFT);
+}
+
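+/*
+ * KeepAlive timer: any admin or I/O command received since the last
+ * expiration counts as traffic.  If the timer fires with no traffic
+ * observed, the association is terminated.
+ */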
+static void
+nvmft_keep_alive_timer(void *arg)
+{
+ struct nvmft_controller *ctrlr = arg;
+ int traffic;
+
+ if (ctrlr->shutdown)
+ return;
+
+ traffic = atomic_readandclear_int(&ctrlr->ka_active_traffic);
+ if (traffic == 0) {
+ nvmft_printf(ctrlr,
+ "disconnecting due to KeepAlive timeout\n");
+ nvmft_controller_error(ctrlr, NULL, ETIMEDOUT);
+ return;
+ }
+
+ callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, C_HARDCLOCK);
+}
+
+int
+nvmft_handoff_admin_queue(struct nvmft_port *np,
+ const struct nvmf_handoff_controller_qpair *handoff,
+ const struct nvmf_fabric_connect_cmd *cmd,
+ const struct nvmf_fabric_connect_data *data)
+{
+ struct nvmft_controller *ctrlr;
+ struct nvmft_qpair *qp;
+ uint32_t kato;
+ int cntlid;
+
+ if (cmd->qid != htole16(0))
+ return (EINVAL);
+
+ qp = nvmft_qpair_init(handoff->trtype, &handoff->params, 0,
+ "admin queue");
+
+ sx_xlock(&np->lock);
+ cntlid = alloc_unr(np->ids);
+ if (cntlid == -1) {
+ sx_xunlock(&np->lock);
+ printf("NVMFT: Unable to allocate controller for %.*s\n",
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_error(qp, cmd, NVME_SCT_COMMAND_SPECIFIC,
+ NVMF_FABRIC_SC_INVALID_HOST);
+ nvmft_qpair_destroy(qp);
+ return (ENOMEM);
+ }
+
+#ifdef INVARIANTS
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ KASSERT(ctrlr->cntlid != cntlid,
+ ("%s: duplicate controllers with id %d", __func__, cntlid));
+ }
+#endif
+
+ ctrlr = nvmft_controller_alloc(np, cntlid, data);
+ nvmft_printf(ctrlr, "associated with %.*s\n",
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ ctrlr->admin = qp;
+ ctrlr->trtype = handoff->trtype;
+
+ /*
+ * The spec requires a non-zero KeepAlive timer, but allow a
+ * zero KATO value to match Linux.
+ */
+ kato = le32toh(cmd->kato);
+ if (kato != 0) {
+ /*
+ * Round up to 1 second matching granularity
+ * advertised in cdata.
+ */
+ ctrlr->ka_sbt = mstosbt(roundup(kato, 1000));
+ callout_reset_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
+ nvmft_keep_alive_timer, ctrlr, C_HARDCLOCK);
+ }
+
+ nvmft_finish_accept(qp, cmd, ctrlr);
+ sx_xunlock(&np->lock);
+
+ return (0);
+}
+
+int
+nvmft_handoff_io_queue(struct nvmft_port *np,
+ const struct nvmf_handoff_controller_qpair *handoff,
+ const struct nvmf_fabric_connect_cmd *cmd,
+ const struct nvmf_fabric_connect_data *data)
+{
+ struct nvmft_controller *ctrlr;
+ struct nvmft_qpair *qp;
+ char name[16];
+ uint16_t cntlid, qid;
+
+ qid = le16toh(cmd->qid);
+ if (qid == 0)
+ return (EINVAL);
+ cntlid = le16toh(data->cntlid);
+
+ snprintf(name, sizeof(name), "I/O queue %u", qid);
+ qp = nvmft_qpair_init(handoff->trtype, &handoff->params, qid, name);
+
+ sx_slock(&np->lock);
+ TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+ if (ctrlr->cntlid == cntlid)
+ break;
+ }
+ if (ctrlr == NULL) {
+ sx_sunlock(&np->lock);
+ printf("NVMFT: Nonexistent controller %u for I/O queue %u from %.*s\n",
+ ctrlr->cntlid, qid, (int)sizeof(data->hostnqn),
+ data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, true,
+ offsetof(struct nvmf_fabric_connect_data, cntlid));
+ nvmft_qpair_destroy(qp);
+ return (ENOENT);
+ }
+
+ if (memcmp(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)) != 0) {
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "hostid mismatch for I/O queue %u from %.*s\n", qid,
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, true,
+ offsetof(struct nvmf_fabric_connect_data, hostid));
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+ if (memcmp(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)) != 0) {
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "hostnqn mismatch for I/O queue %u from %.*s\n", qid,
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, true,
+ offsetof(struct nvmf_fabric_connect_data, hostnqn));
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+
+ /* XXX: Require handoff->trtype == ctrlr->trtype? */
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->shutdown) {
+ mtx_unlock(&ctrlr->lock);
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "attempt to create I/O queue %u on disabled controller from %.*s\n",
+ qid, (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, true,
+ offsetof(struct nvmf_fabric_connect_data, cntlid));
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+ if (ctrlr->num_io_queues == 0) {
+ mtx_unlock(&ctrlr->lock);
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "attempt to create I/O queue %u without enabled queues from %.*s\n",
+ qid, (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+ if (qid > ctrlr->num_io_queues) {
+ mtx_unlock(&ctrlr->lock);
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "attempt to create invalid I/O queue %u from %.*s\n", qid,
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_invalid_parameters(qp, cmd, false,
+ offsetof(struct nvmf_fabric_connect_cmd, qid));
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+ if (ctrlr->io_qpairs[qid - 1].qp != NULL) {
+ mtx_unlock(&ctrlr->lock);
+ sx_sunlock(&np->lock);
+ nvmft_printf(ctrlr,
+ "attempt to re-create I/O queue %u from %.*s\n", qid,
+ (int)sizeof(data->hostnqn), data->hostnqn);
+ nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ nvmft_qpair_destroy(qp);
+ return (EINVAL);
+ }
+
+ ctrlr->io_qpairs[qid - 1].qp = qp;
+ mtx_unlock(&ctrlr->lock);
+ nvmft_finish_accept(qp, cmd, ctrlr);
+ sx_sunlock(&np->lock);
+
+ return (0);
+}
+
+static void
+nvmft_controller_shutdown(void *arg, int pending)
+{
+ struct nvmft_controller *ctrlr = arg;
+
+ MPASS(pending == 1);
+
+ /*
+ * Shutdown all I/O queues to terminate pending datamoves and
+ * stop receiving new commands.
+ */
+ mtx_lock(&ctrlr->lock);
+ for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
+ if (ctrlr->io_qpairs[i].qp != NULL) {
+ ctrlr->io_qpairs[i].shutdown = true;
+ mtx_unlock(&ctrlr->lock);
+ nvmft_qpair_shutdown(ctrlr->io_qpairs[i].qp);
+ mtx_lock(&ctrlr->lock);
+ }
+ }
+ mtx_unlock(&ctrlr->lock);
+
+ /* Terminate active CTL commands. */
+ nvmft_terminate_commands(ctrlr);
+
+ /* Wait for all pending CTL commands to complete. */
+ mtx_lock(&ctrlr->lock);
+ while (ctrlr->pending_commands != 0)
+ mtx_sleep(&ctrlr->pending_commands, &ctrlr->lock, 0, "nvmftsh",
+ hz / 100);
+ mtx_unlock(&ctrlr->lock);
+
+ /* Delete all of the I/O queues. */
+ for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
+ if (ctrlr->io_qpairs[i].qp != NULL)
+ nvmft_qpair_destroy(ctrlr->io_qpairs[i].qp);
+ }
+ free(ctrlr->io_qpairs, M_NVMFT);
+ ctrlr->io_qpairs = NULL;
+
+ mtx_lock(&ctrlr->lock);
+ ctrlr->num_io_queues = 0;
+
+ /* Mark shutdown complete. */
+ if (NVMEV(NVME_CSTS_REG_SHST, ctrlr->csts) == NVME_SHST_OCCURRING) {
+ ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
+ ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE);
+ }
+
+ if (NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) == 0) {
+ ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_RDY);
+ ctrlr->shutdown = false;
+ }
+ mtx_unlock(&ctrlr->lock);
+
+ /*
+ * If the admin queue was closed while shutting down or a
+ * fatal controller error has occurred, terminate the
+ * association immediately, otherwise wait up to 2 minutes
+ * (NVMe-over-Fabrics 1.1 4.6).
+ */
+ if (ctrlr->admin_closed || NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) != 0)
+ nvmft_controller_terminate(ctrlr, 0);
+ else
+ taskqueue_enqueue_timeout(taskqueue_thread,
+ &ctrlr->terminate_task, hz * 60 * 2);
+}
+
+static void
+nvmft_controller_terminate(void *arg, int pending)
+{
+ struct nvmft_controller *ctrlr = arg;
+ struct nvmft_port *np;
+ bool wakeup_np;
+
+ /* If the controller has been re-enabled, nothing to do. */
+ mtx_lock(&ctrlr->lock);
+ if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) != 0) {
+ mtx_unlock(&ctrlr->lock);
+
+ if (ctrlr->ka_sbt != 0)
+ callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
+ C_HARDCLOCK);
+ return;
+ }
+
+ /* Disable updates to CC while destroying admin qpair. */
+ ctrlr->shutdown = true;
+ mtx_unlock(&ctrlr->lock);
+
+ nvmft_qpair_destroy(ctrlr->admin);
+
+ /* Remove association (CNTLID). */
+ np = ctrlr->np;
+ sx_xlock(&np->lock);
+ TAILQ_REMOVE(&np->controllers, ctrlr, link);
+ free_unr(np->ids, ctrlr->cntlid);
+ wakeup_np = (!np->online && TAILQ_EMPTY(&np->controllers));
+ sx_xunlock(&np->lock);
+ if (wakeup_np)
+ wakeup(np);
+
+ callout_drain(&ctrlr->ka_timer);
+
+ nvmft_printf(ctrlr, "association terminated\n");
+ nvmft_controller_free(ctrlr);
+ nvmft_port_rele(np);
+}
+
+void
+nvmft_controller_error(struct nvmft_controller *ctrlr, struct nvmft_qpair *qp,
+ int error)
+{
+ /*
+ * If a queue pair is closed, that isn't an error per se.
+ * That just means additional commands cannot be received on
+ * that queue pair.
+ *
+ * If the admin queue pair is closed while idle or while
+ * shutting down, terminate the association immediately.
+ *
+ * If an I/O queue pair is closed, just ignore it.
+ */
+ if (error == 0) {
+ if (qp != ctrlr->admin)
+ return;
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->shutdown) {
+ ctrlr->admin_closed = true;
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+
+ if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0) {
+ MPASS(ctrlr->num_io_queues == 0);
+ mtx_unlock(&ctrlr->lock);
+
+ /*
+ * Ok to drop lock here since ctrlr->cc can't
+ * change if the admin queue pair has closed.
+ * This also means no new queues can be handed
+ * off, etc. Note that since there are no I/O
+ * queues, only the admin queue needs to be
+ * destroyed, so it is safe to skip
+ * nvmft_controller_shutdown and just schedule
+ * nvmft_controller_terminate. Note that we
+ * cannot call nvmft_controller_terminate from
+ * here directly as this is called from the
+ * transport layer and freeing the admin qpair
+ * might deadlock waiting for the current
+ * thread to exit.
+ */
+ if (taskqueue_cancel_timeout(taskqueue_thread,
+ &ctrlr->terminate_task, NULL) == 0)
+ taskqueue_enqueue_timeout(taskqueue_thread,
+ &ctrlr->terminate_task, 0);
+ return;
+ }
+
+ /*
+ * Treat closing of the admin queue pair while enabled
+ * as a transport error. Note that the admin queue
+ * pair has been closed.
+ */
+ ctrlr->admin_closed = true;
+ } else
+ mtx_lock(&ctrlr->lock);
+
+ /* Ignore transport errors if we are already shutting down. */
+ if (ctrlr->shutdown) {
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+
+ ctrlr->csts |= NVMEF(NVME_CSTS_REG_CFS, 1);
+ ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
+ ctrlr->shutdown = true;
+ mtx_unlock(&ctrlr->lock);
+
+ callout_stop(&ctrlr->ka_timer);
+ taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
+}
+
+/* Wrapper around m_getm2 that also sets m_len in the mbufs in the chain. */
+static struct mbuf *
+m_getml(size_t len, int how)
+{
+ struct mbuf *m, *n;
+
+ m = m_getm2(NULL, len, how, MT_DATA, 0);
+ if (m == NULL)
+ return (NULL);
+ for (n = m; len > 0; n = n->m_next) {
+ n->m_len = M_SIZE(n);
+ if (n->m_len >= len) {
+ n->m_len = len;
+ MPASS(n->m_next == NULL);
+ }
+ len -= n->m_len;
+ }
+ return (m);
+}
+
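+/* Zero len bytes of an mbuf chain starting at the given offset. */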
+static void
+m_zero(struct mbuf *m, u_int offset, u_int len)
+{
+ u_int todo;
+
+ if (len == 0)
+ return;
+
+ while (m->m_len <= offset) {
+ offset -= m->m_len;
+ m = m->m_next;
+ }
+
+ todo = m->m_len - offset;
+ if (todo > len)
+ todo = len;
+ memset(mtodo(m, offset), 0, todo);
+ m = m->m_next;
+ len -= todo;
+
+ while (len > 0) {
+ todo = m->m_len;
+ if (todo > len)
+ todo = len;
+ memset(mtod(m, void *), 0, todo);
+ m = m->m_next;
+ len -= todo;
+ }
+}
+
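+/*
+ * GET LOG PAGE: the log page id, Retain Asynchronous Event bit, and
+ * 0's based dword count are decoded from CDW10/11 and the byte
+ * offset from CDW12/13.  Reads beyond the end of a log page are
+ * padded with zeroes.
+ */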
+static void
+handle_get_log_page(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc, const struct nvme_command *cmd)
+{
+ struct mbuf *m;
+ uint64_t offset;
+ uint32_t numd;
+ size_t len, todo;
+ u_int status;
+ uint8_t lid;
+ bool rae;
+
+ lid = le32toh(cmd->cdw10) & 0xff;
+ rae = (le32toh(cmd->cdw10) & (1U << 15)) != 0;
+ numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16;
+ offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32;
+
+ if (offset % 3 != 0) {
+ status = NVME_SC_INVALID_FIELD;
+ goto done;
+ }
+
+ len = (numd + 1) * 4;
+
+ switch (lid) {
+ case NVME_LOG_ERROR:
+ todo = 0;
+
+ m = m_getml(len, M_WAITOK);
+ if (todo != len)
+ m_zero(m, todo, len - todo);
+ status = nvmf_send_controller_data(nc, 0, m, len);
+ MPASS(status != NVMF_MORE);
+ break;
+ case NVME_LOG_HEALTH_INFORMATION:
+ {
+ struct nvme_health_information_page hip;
+
+ if (offset >= sizeof(hip)) {
+ status = NVME_SC_INVALID_FIELD;
+ goto done;
+ }
+ todo = sizeof(hip) - offset;
+ if (todo > len)
+ todo = len;
+
+ mtx_lock(&ctrlr->lock);
+ hip = ctrlr->hip;
+ hip.controller_busy_time[0] =
+ sbintime_getsec(ctrlr->busy_total) / 60;
+ hip.power_on_hours[0] =
+ sbintime_getsec(sbinuptime() - ctrlr->create_time) / 3600;
+ mtx_unlock(&ctrlr->lock);
+
+ m = m_getml(len, M_WAITOK);
+ m_copyback(m, 0, todo, (char *)&hip + offset);
+ if (todo != len)
+ m_zero(m, todo, len - todo);
+ status = nvmf_send_controller_data(nc, 0, m, len);
+ MPASS(status != NVMF_MORE);
+ break;
+ }
+ case NVME_LOG_FIRMWARE_SLOT:
+ if (offset >= sizeof(ctrlr->np->fp)) {
+ status = NVME_SC_INVALID_FIELD;
+ goto done;
+ }
+ todo = sizeof(ctrlr->np->fp) - offset;
+ if (todo > len)
+ todo = len;
+
+ m = m_getml(len, M_WAITOK);
+ m_copyback(m, 0, todo, (char *)&ctrlr->np->fp + offset);
+ if (todo != len)
+ m_zero(m, todo, len - todo);
+ status = nvmf_send_controller_data(nc, 0, m, len);
+ MPASS(status != NVMF_MORE);
+ break;
+ case NVME_LOG_CHANGED_NAMESPACE:
+ if (offset >= sizeof(*ctrlr->changed_ns)) {
+ status = NVME_SC_INVALID_FIELD;
+ goto done;
+ }
+ todo = sizeof(*ctrlr->changed_ns) - offset;
+ if (todo > len)
+ todo = len;
+
+ m = m_getml(len, M_WAITOK);
+ mtx_lock(&ctrlr->lock);
+ m_copyback(m, 0, todo, (char *)ctrlr->changed_ns + offset);
+ if (offset == 0 && len == sizeof(*ctrlr->changed_ns))
+ memset(ctrlr->changed_ns, 0,
+ sizeof(*ctrlr->changed_ns));
+ if (!rae)
+ ctrlr->changed_ns_reported = false;
+ mtx_unlock(&ctrlr->lock);
+ if (todo != len)
+ m_zero(m, todo, len - todo);
+ status = nvmf_send_controller_data(nc, 0, m, len);
+ MPASS(status != NVMF_MORE);
+ break;
+ default:
+ nvmft_printf(ctrlr, "Unsupported page %#x for GET_LOG_PAGE\n",
+ lid);
+ status = NVME_SC_INVALID_FIELD;
+ break;
+ }
+
+done:
+ if (status == NVMF_SUCCESS_SENT)
+ nvmft_command_completed(ctrlr->admin, nc);
+ else
+ nvmft_send_generic_error(ctrlr->admin, nc, status);
+ nvmf_free_capsule(nc);
+}
+
+static void
+m_free_nslist(struct mbuf *m)
+{
+ free(m->m_ext.ext_arg1, M_NVMFT);
+}
+
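+/*
+ * IDENTIFY: controller data and the active namespace list are
+ * answered locally; namespace-specific CNS values are dispatched to
+ * CTL, which owns the backing LUNs.
+ */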
+static void
+handle_identify_command(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc, const struct nvme_command *cmd)
+{
+ struct mbuf *m;
+ size_t data_len;
+ u_int status;
+ uint8_t cns;
+
+ cns = le32toh(cmd->cdw10) & 0xFF;
+ data_len = nvmf_capsule_data_len(nc);
+ if (data_len != sizeof(ctrlr->cdata)) {
+ nvmft_printf(ctrlr,
+ "Invalid length %zu for IDENTIFY with CNS %#x\n", data_len,
+ cns);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_INVALID_OPCODE);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ switch (cns) {
+ case 0: /* Namespace data. */
+ case 3: /* Namespace Identification Descriptor list. */
+ nvmft_dispatch_command(ctrlr->admin, nc, true);
+ return;
+ case 1:
+ /* Controller data. */
+ m = m_getml(sizeof(ctrlr->cdata), M_WAITOK);
+ m_copyback(m, 0, sizeof(ctrlr->cdata), (void *)&ctrlr->cdata);
+ status = nvmf_send_controller_data(nc, 0, m,
+ sizeof(ctrlr->cdata));
+ MPASS(status != NVMF_MORE);
+ break;
+ case 2:
+ {
+ /* Active namespace list. */
+ struct nvme_ns_list *nslist;
+ uint32_t nsid;
+
+ nsid = le32toh(cmd->nsid);
+ if (nsid >= 0xfffffffe) {
+ status = NVME_SC_INVALID_FIELD;
+ break;
+ }
+
+ nslist = malloc(sizeof(*nslist), M_NVMFT, M_WAITOK | M_ZERO);
+ nvmft_populate_active_nslist(ctrlr->np, nsid, nslist);
+ m = m_get(M_WAITOK, MT_DATA);
+ m_extadd(m, (void *)nslist, sizeof(*nslist), m_free_nslist,
+ nslist, NULL, 0, EXT_CTL);
+ m->m_len = sizeof(*nslist);
+ status = nvmf_send_controller_data(nc, 0, m, m->m_len);
+ MPASS(status != NVMF_MORE);
+ break;
+ }
+ default:
+ nvmft_printf(ctrlr, "Unsupported CNS %#x for IDENTIFY\n", cns);
+ status = NVME_SC_INVALID_FIELD;
+ break;
+ }
+
+ if (status == NVMF_SUCCESS_SENT)
+ nvmft_command_completed(ctrlr->admin, nc);
+ else
+ nvmft_send_generic_error(ctrlr->admin, nc, status);
+ nvmf_free_capsule(nc);
+}
+
+static void
+handle_set_features(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc, const struct nvme_command *cmd)
+{
+ struct nvme_completion cqe;
+ uint8_t fid;
+
+ fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10));
+ switch (fid) {
+ case NVME_FEAT_NUMBER_OF_QUEUES:
+ {
+ uint32_t num_queues;
+ struct nvmft_io_qpair *io_qpairs;
+
+ num_queues = le32toh(cmd->cdw11) & 0xffff;
+
+ /* 5.12.1.7: 65535 is invalid. */
+ if (num_queues == 65535)
+ goto error;
+
+ /* Fabrics requires the same number of SQs and CQs. */
+ if (le32toh(cmd->cdw11) >> 16 != num_queues)
+ goto error;
+
+ /* Convert to 1's based */
+ num_queues++;
+
+ io_qpairs = mallocarray(num_queues, sizeof(*io_qpairs),
+ M_NVMFT, M_WAITOK | M_ZERO);
+
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->num_io_queues != 0) {
+ mtx_unlock(&ctrlr->lock);
+ free(io_qpairs, M_NVMFT);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ ctrlr->num_io_queues = num_queues;
+ ctrlr->io_qpairs = io_qpairs;
+ mtx_unlock(&ctrlr->lock);
+
+ nvmft_init_cqe(&cqe, nc, 0);
+ cqe.cdw0 = cmd->cdw11;
+ nvmft_send_response(ctrlr->admin, &cqe);
+ nvmf_free_capsule(nc);
+ return;
+ }
+ case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
+ {
+ uint32_t aer_mask;
+
+ aer_mask = le32toh(cmd->cdw11);
+
+ /* Check for any reserved or unimplemented feature bits. */
+ if ((aer_mask & 0xffffc000) != 0)
+ goto error;
+
+ mtx_lock(&ctrlr->lock);
+ ctrlr->aer_mask = aer_mask;
+ mtx_unlock(&ctrlr->lock);
+ nvmft_send_success(ctrlr->admin, nc);
+ return;
+ }
+ default:
+ nvmft_printf(ctrlr,
+ "Unsupported feature ID %u for SET_FEATURES\n", fid);
+ goto error;
+ }
+
+error:
+ nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
+ nvmf_free_capsule(nc);
+}
+
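+/*
+ * Validate and apply a host write to the CC register, detecting
+ * enable, reset, and shutdown transitions.  Returns false if the
+ * write is rejected; *need_shutdown is set when the caller must
+ * schedule the controller shutdown task.
+ */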
+static bool
+update_cc(struct nvmft_controller *ctrlr, uint32_t new_cc, bool *need_shutdown)
+{
+ struct nvmft_port *np = ctrlr->np;
+ uint32_t changes;
+
+ *need_shutdown = false;
+
+ mtx_lock(&ctrlr->lock);
+
+ /* Don't allow any changes while shutting down. */
+ if (ctrlr->shutdown) {
+ mtx_unlock(&ctrlr->lock);
+ return (false);
+ }
+
+ if (!_nvmf_validate_cc(np->max_io_qsize, np->cap, ctrlr->cc, new_cc)) {
+ mtx_unlock(&ctrlr->lock);
+ return (false);
+ }
+
+ changes = ctrlr->cc ^ new_cc;
+ ctrlr->cc = new_cc;
+
+ /* Handle shutdown requests. */
+ if (NVMEV(NVME_CC_REG_SHN, changes) != 0 &&
+ NVMEV(NVME_CC_REG_SHN, new_cc) != 0) {
+ ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
+ ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_OCCURRING);
+ ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
+ ctrlr->shutdown = true;
+ *need_shutdown = true;
+ nvmft_printf(ctrlr, "shutdown requested\n");
+ }
+
+ if (NVMEV(NVME_CC_REG_EN, changes) != 0) {
+ if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) {
+ /* Controller reset. */
+ nvmft_printf(ctrlr, "reset requested\n");
+ ctrlr->shutdown = true;
+ *need_shutdown = true;
+ } else
+ ctrlr->csts |= NVMEF(NVME_CSTS_REG_RDY, 1);
+ }
+ mtx_unlock(&ctrlr->lock);
+
+ return (true);
+}
+
+static void
+handle_property_get(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
+ const struct nvmf_fabric_prop_get_cmd *pget)
+{
+ struct nvmf_fabric_prop_get_rsp rsp;
+
+ nvmft_init_cqe(&rsp, nc, 0);
+
+ switch (le32toh(pget->ofst)) {
+ case NVMF_PROP_CAP:
+ if (pget->attrib.size != NVMF_PROP_SIZE_8)
+ goto error;
+ rsp.value.u64 = htole64(ctrlr->np->cap);
+ break;
+ case NVMF_PROP_VS:
+ if (pget->attrib.size != NVMF_PROP_SIZE_4)
+ goto error;
+ rsp.value.u32.low = ctrlr->cdata.ver;
+ break;
+ case NVMF_PROP_CC:
+ if (pget->attrib.size != NVMF_PROP_SIZE_4)
+ goto error;
+ rsp.value.u32.low = htole32(ctrlr->cc);
+ break;
+ case NVMF_PROP_CSTS:
+ if (pget->attrib.size != NVMF_PROP_SIZE_4)
+ goto error;
+ rsp.value.u32.low = htole32(ctrlr->csts);
+ break;
+ default:
+ goto error;
+ }
+
+ nvmft_send_response(ctrlr->admin, &rsp);
+ return;
+error:
+ nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
+}
+
+static void
+handle_property_set(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
+ const struct nvmf_fabric_prop_set_cmd *pset)
+{
+ bool need_shutdown;
+
+ need_shutdown = false;
+ switch (le32toh(pset->ofst)) {
+ case NVMF_PROP_CC:
+ if (pset->attrib.size != NVMF_PROP_SIZE_4)
+ goto error;
+ if (!update_cc(ctrlr, le32toh(pset->value.u32.low),
+ &need_shutdown))
+ goto error;
+ break;
+ default:
+ goto error;
+ }
+
+ nvmft_send_success(ctrlr->admin, nc);
+ if (need_shutdown) {
+ callout_stop(&ctrlr->ka_timer);
+ taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
+ }
+ return;
+error:
+ nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
+}
+
+static void
+handle_admin_fabrics_command(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc)
+{
+ switch (fc->fctype) {
+ case NVMF_FABRIC_COMMAND_PROPERTY_GET:
+ handle_property_get(ctrlr, nc,
+ (const struct nvmf_fabric_prop_get_cmd *)fc);
+ break;
+ case NVMF_FABRIC_COMMAND_PROPERTY_SET:
+ handle_property_set(ctrlr, nc,
+ (const struct nvmf_fabric_prop_set_cmd *)fc);
+ break;
+ case NVMF_FABRIC_COMMAND_CONNECT:
+ nvmft_printf(ctrlr,
+ "CONNECT command on connected admin queue\n");
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ break;
+ case NVMF_FABRIC_COMMAND_DISCONNECT:
+ nvmft_printf(ctrlr, "DISCONNECT command on admin queue\n");
+ nvmft_send_error(ctrlr->admin, nc, NVME_SCT_COMMAND_SPECIFIC,
+ NVMF_FABRIC_SC_INVALID_QUEUE_TYPE);
+ break;
+ default:
+ nvmft_printf(ctrlr, "Unsupported fabrics command %#x\n",
+ fc->fctype);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_INVALID_OPCODE);
+ break;
+ }
+ nvmf_free_capsule(nc);
+}
+
+void
+nvmft_handle_admin_command(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc)
+{
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+ /* Only permit Fabrics commands while a controller is disabled. */
+ if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0 &&
+ cmd->opc != NVME_OPC_FABRICS_COMMANDS) {
+ nvmft_printf(ctrlr,
+ "Unsupported admin opcode %#x whiled disabled\n", cmd->opc);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ atomic_store_int(&ctrlr->ka_active_traffic, 1);
+
+ switch (cmd->opc) {
+ case NVME_OPC_GET_LOG_PAGE:
+ handle_get_log_page(ctrlr, nc, cmd);
+ break;
+ case NVME_OPC_IDENTIFY:
+ handle_identify_command(ctrlr, nc, cmd);
+ break;
+ case NVME_OPC_SET_FEATURES:
+ handle_set_features(ctrlr, nc, cmd);
+ break;
+ case NVME_OPC_ASYNC_EVENT_REQUEST:
+ mtx_lock(&ctrlr->lock);
+ if (ctrlr->aer_pending == NVMFT_NUM_AER) {
+ mtx_unlock(&ctrlr->lock);
+ nvmft_send_error(ctrlr->admin, nc,
+ NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
+ } else {
+ /* NB: Store the CID without byte-swapping. */
+ ctrlr->aer_cids[ctrlr->aer_pidx] = cmd->cid;
+ ctrlr->aer_pending++;
+ ctrlr->aer_pidx = (ctrlr->aer_pidx + 1) % NVMFT_NUM_AER;
+ mtx_unlock(&ctrlr->lock);
+ }
+ nvmf_free_capsule(nc);
+ break;
+ case NVME_OPC_KEEP_ALIVE:
+ nvmft_send_success(ctrlr->admin, nc);
+ nvmf_free_capsule(nc);
+ break;
+ case NVME_OPC_FABRICS_COMMANDS:
+ handle_admin_fabrics_command(ctrlr, nc,
+ (const struct nvmf_fabric_cmd *)cmd);
+ break;
+ default:
+ nvmft_printf(ctrlr, "Unsupported admin opcode %#x\n", cmd->opc);
+ nvmft_send_generic_error(ctrlr->admin, nc,
+ NVME_SC_INVALID_OPCODE);
+ nvmf_free_capsule(nc);
+ break;
+ }
+}
+
+void
+nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid,
+ struct nvmf_capsule *nc)
+{
+ struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+ atomic_store_int(&ctrlr->ka_active_traffic, 1);
+
+ switch (cmd->opc) {
+ case NVME_OPC_FLUSH:
+ if (cmd->nsid == htole32(0xffffffff)) {
+ nvmft_send_generic_error(qp, nc,
+ NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+ nvmf_free_capsule(nc);
+ break;
+ }
+ /* FALLTHROUGH */
+ case NVME_OPC_WRITE:
+ case NVME_OPC_READ:
+ case NVME_OPC_WRITE_UNCORRECTABLE:
+ case NVME_OPC_COMPARE:
+ case NVME_OPC_WRITE_ZEROES:
+ case NVME_OPC_DATASET_MANAGEMENT:
+ case NVME_OPC_VERIFY:
+ nvmft_dispatch_command(qp, nc, false);
+ break;
+ default:
+ nvmft_printf(ctrlr, "Unsupported I/O opcode %#x\n", cmd->opc);
+ nvmft_send_generic_error(qp, nc,
+ NVME_SC_INVALID_OPCODE);
+ nvmf_free_capsule(nc);
+ break;
+ }
+}
+
+static void
+nvmft_report_aer(struct nvmft_controller *ctrlr, uint32_t aer_mask,
+ u_int type, uint8_t info, uint8_t log_page_id)
+{
+ struct nvme_completion cpl;
+
+ MPASS(type <= 7);
+
+ /* Drop events that are not enabled. */
+ mtx_lock(&ctrlr->lock);
+ if ((ctrlr->aer_mask & aer_mask) == 0) {
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+
+ /*
+ * If there is no pending AER command, drop it.
+ * XXX: Should we queue these?
+ */
+ if (ctrlr->aer_pending == 0) {
+ mtx_unlock(&ctrlr->lock);
+ nvmft_printf(ctrlr,
+ "dropping AER type %u, info %#x, page %#x\n",
+ type, info, log_page_id);
+ return;
+ }
+
+ memset(&cpl, 0, sizeof(cpl));
+ cpl.cid = ctrlr->aer_cids[ctrlr->aer_cidx];
+ ctrlr->aer_pending--;
+ ctrlr->aer_cidx = (ctrlr->aer_cidx + 1) % NVMFT_NUM_AER;
+ mtx_unlock(&ctrlr->lock);
+
+ cpl.cdw0 = htole32(NVMEF(NVME_ASYNC_EVENT_TYPE, type) |
+ NVMEF(NVME_ASYNC_EVENT_INFO, info) |
+ NVMEF(NVME_ASYNC_EVENT_LOG_PAGE_ID, log_page_id));
+
+ nvmft_send_response(ctrlr->admin, &cpl);
+}
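
The AER bookkeeping above is a small producer/consumer ring: incoming Asynchronous Event Request commands park their CIDs at aer_pidx, and each reported event consumes one parked CID at aer_cidx; events arriving with nothing parked are dropped. A minimal user-space model of that ring, with illustrative names (NUM_AER mirrors NVMFT_NUM_AER; the kernel additionally serializes access with ctrlr->lock):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NUM_AER 16

    static uint16_t aer_cids[NUM_AER];
    static unsigned aer_pending, aer_cidx, aer_pidx;

    /* Park the CID of a host AER command; false if the limit is hit. */
    static bool
    aer_post(uint16_t cid)
    {
        if (aer_pending == NUM_AER)
            return (false);     /* AER limit exceeded */
        aer_cids[aer_pidx] = cid;
        aer_pidx = (aer_pidx + 1) % NUM_AER;
        aer_pending++;
        return (true);
    }

    /* Consume one parked CID to complete an async event. */
    static bool
    aer_report(uint16_t *cidp)
    {
        if (aer_pending == 0)
            return (false);     /* nothing parked; event is dropped */
        *cidp = aer_cids[aer_cidx];
        aer_cidx = (aer_cidx + 1) % NUM_AER;
        aer_pending--;
        return (true);
    }

    int
    main(void)
    {
        uint16_t cid;

        aer_post(7);
        if (aer_report(&cid))
            printf("completing AER with CID %u\n", cid);
        return (0);
    }
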
+
+void
+nvmft_controller_lun_changed(struct nvmft_controller *ctrlr, int lun_id)
+{
+ struct nvme_ns_list *nslist;
+ uint32_t new_nsid, nsid;
+ u_int i;
+
+ new_nsid = lun_id + 1;
+
+ mtx_lock(&ctrlr->lock);
+ nslist = ctrlr->changed_ns;
+
+ /* If the first entry is 0xffffffff, the list is already full. */
+ if (nslist->ns[0] != 0xffffffff) {
+ /* Find the insertion point for this namespace ID. */
+ for (i = 0; i < nitems(nslist->ns); i++) {
+ nsid = le32toh(nslist->ns[i]);
+ if (nsid == new_nsid) {
+ /* Already reported, nothing to do. */
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+
+ if (nsid == 0 || nsid > new_nsid)
+ break;
+ }
+
+ if (nslist->ns[nitems(nslist->ns) - 1] != htole32(0)) {
+ /* List is full. */
+ memset(ctrlr->changed_ns, 0,
+ sizeof(*ctrlr->changed_ns));
+ ctrlr->changed_ns->ns[0] = 0xffffffff;
+ } else if (nslist->ns[i] == htole32(0)) {
+ /*
+ * Optimize case where this ID is appended to
+ * the end.
+ */
+ nslist->ns[i] = htole32(new_nsid);
+ } else {
+ memmove(&nslist->ns[i + 1], &nslist->ns[i],
+ (nitems(nslist->ns) - i - 1) *
+ sizeof(nslist->ns[0]));
+ nslist->ns[i] = htole32(new_nsid);
+ }
+ }
+
+ if (ctrlr->changed_ns_reported) {
+ mtx_unlock(&ctrlr->lock);
+ return;
+ }
+ ctrlr->changed_ns_reported = true;
+ mtx_unlock(&ctrlr->lock);
+
+ nvmft_report_aer(ctrlr, NVME_ASYNC_EVENT_NS_ATTRIBUTE, 0x2, 0x0,
+ NVME_LOG_CHANGED_NAMESPACE);
+}
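
The changed-namespace list maintained above is a fixed array of ascending, non-zero NSIDs terminated by zero entries; on overflow the whole list collapses to the single sentinel 0xffffffff, which tells the host to rescan all namespaces. A portable sketch of the same insertion logic (NSLIST_LEN is illustrative, the real struct nvme_ns_list holds 1024 entries, and the kernel code additionally stores entries little-endian):

    #include <stdint.h>
    #include <string.h>

    #define NSLIST_LEN 8

    static void
    nslist_add(uint32_t ns[NSLIST_LEN], uint32_t new_nsid)
    {
        unsigned i;

        if (ns[0] == 0xffffffff)    /* already marked as overflowed */
            return;

        /* Find the insertion point, keeping the list sorted. */
        for (i = 0; i < NSLIST_LEN; i++) {
            if (ns[i] == new_nsid)
                return;             /* already recorded */
            if (ns[i] == 0 || ns[i] > new_nsid)
                break;
        }

        if (ns[NSLIST_LEN - 1] != 0) {
            /* No free slot: collapse to the overflow sentinel. */
            memset(ns, 0, NSLIST_LEN * sizeof(ns[0]));
            ns[0] = 0xffffffff;
        } else if (ns[i] == 0) {
            ns[i] = new_nsid;       /* append at the tail */
        } else {
            /* Shift the tail up and insert in sorted position. */
            memmove(&ns[i + 1], &ns[i],
                (NSLIST_LEN - i - 1) * sizeof(ns[0]));
            ns[i] = new_nsid;
        }
    }
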
diff --git a/sys/dev/nvmf/controller/nvmft_qpair.c b/sys/dev/nvmf/controller/nvmft_qpair.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/controller/nvmft_qpair.c
@@ -0,0 +1,361 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/types.h>
+#include <sys/_bitset.h>
+#include <sys/bitset.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/controller/nvmft_var.h>
+
+/*
+ * A bitmask of command ID values. This is used to detect duplicate
+ * commands with the same ID.
+ */
+#define NUM_CIDS (UINT16_MAX + 1)
+BITSET_DEFINE(cidset, NUM_CIDS);
+
+struct nvmft_qpair {
+ struct nvmft_controller *ctrlr;
+ struct nvmf_qpair *qp;
+ struct cidset *cids;
+
+ bool admin;
+ bool sq_flow_control;
+ uint16_t qid;
+ u_int qsize;
+ uint16_t sqhd;
+ uint16_t sqtail;
+ volatile u_int qp_refs; /* Internal references on 'qp'. */
+
+ struct mtx lock;
+
+ char name[16];
+};
+
+static int _nvmft_send_generic_error(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc, uint8_t sc_status);
+
+static void
+nvmft_qpair_error(void *arg, int error)
+{
+ struct nvmft_qpair *qp = arg;
+ struct nvmft_controller *ctrlr = qp->ctrlr;
+
+ /*
+ * XXX: The Linux TCP initiator sends a RST immediately after
+ * the FIN, so treat ECONNRESET as plain EOF to avoid spurious
+ * errors on shutdown.
+ */
+ if (error == ECONNRESET)
+ error = 0;
+
+ if (error != 0)
+ nvmft_printf(ctrlr, "error %d on %s\n", error, qp->name);
+ nvmft_controller_error(ctrlr, qp, error);
+}
+
+static void
+nvmft_receive_capsule(void *arg, struct nvmf_capsule *nc)
+{
+ struct nvmft_qpair *qp = arg;
+ struct nvmft_controller *ctrlr = qp->ctrlr;
+ const struct nvme_command *cmd;
+ uint8_t sc_status;
+
+ cmd = nvmf_capsule_sqe(nc);
+ if (ctrlr == NULL) {
+ printf("NVMFT: %s received CID %u opcode %u on newborn queue\n",
+ qp->name, le16toh(cmd->cid), cmd->opc);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ sc_status = nvmf_validate_command_capsule(nc);
+ if (sc_status != NVME_SC_SUCCESS) {
+ _nvmft_send_generic_error(qp, nc, sc_status);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ /* Don't bother byte-swapping CID. */
+ if (BIT_TEST_SET_ATOMIC(NUM_CIDS, cmd->cid, qp->cids)) {
+ _nvmft_send_generic_error(qp, nc, NVME_SC_COMMAND_ID_CONFLICT);
+ nvmf_free_capsule(nc);
+ return;
+ }
+
+ if (qp->admin)
+ nvmft_handle_admin_command(ctrlr, nc);
+ else
+ nvmft_handle_io_command(qp, qp->qid, nc);
+}
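
The duplicate-command check above relies on a 65536-bit bitmap, one bit per possible 16-bit CID, set atomically when a command arrives and cleared when it completes. A C11 sketch of the same test-and-set, standing in for FreeBSD's BIT_TEST_SET_ATOMIC()/BIT_CLR_ATOMIC():

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define NUM_CIDS (UINT16_MAX + 1)

    static _Atomic uint64_t cid_bits[NUM_CIDS / 64];

    /* Returns true if the CID was already busy (duplicate command). */
    static bool
    cid_test_and_set(uint16_t cid)
    {
        uint64_t mask = 1ull << (cid % 64);
        uint64_t old;

        old = atomic_fetch_or(&cid_bits[cid / 64], mask);
        return ((old & mask) != 0);
    }

    /* Mark the CID free again once its completion has been sent. */
    static void
    cid_clear(uint16_t cid)
    {
        atomic_fetch_and(&cid_bits[cid / 64], ~(1ull << (cid % 64)));
    }
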
+
+struct nvmft_qpair *
+nvmft_qpair_init(enum nvmf_trtype trtype,
+ const struct nvmf_handoff_qpair_params *handoff, uint16_t qid,
+ const char *name)
+{
+ struct nvmft_qpair *qp;
+
+ qp = malloc(sizeof(*qp), M_NVMFT, M_WAITOK | M_ZERO);
+ qp->admin = handoff->admin;
+ qp->sq_flow_control = handoff->sq_flow_control;
+ qp->qsize = handoff->qsize;
+ qp->qid = qid;
+ qp->sqhd = handoff->sqhd;
+ qp->sqtail = handoff->sqtail;
+ strlcpy(qp->name, name, sizeof(qp->name));
+ mtx_init(&qp->lock, "nvmft qp", NULL, MTX_DEF);
+ qp->cids = BITSET_ALLOC(NUM_CIDS, M_NVMFT, M_WAITOK | M_ZERO);
+
+ qp->qp = nvmf_allocate_qpair(trtype, true, handoff, nvmft_qpair_error,
+ qp, nvmft_receive_capsule, qp);
+ if (qp->qp == NULL) {
+ mtx_destroy(&qp->lock);
+ free(qp->cids, M_NVMFT);
+ free(qp, M_NVMFT);
+ return (NULL);
+ }
+
+ refcount_init(&qp->qp_refs, 1);
+ return (qp);
+}
+
+void
+nvmft_qpair_shutdown(struct nvmft_qpair *qp)
+{
+ struct nvmf_qpair *nq;
+
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ qp->qp = NULL;
+ mtx_unlock(&qp->lock);
+ if (nq != NULL && refcount_release(&qp->qp_refs))
+ nvmf_free_qpair(nq);
+}
+
+void
+nvmft_qpair_destroy(struct nvmft_qpair *qp)
+{
+ nvmft_qpair_shutdown(qp);
+ mtx_destroy(&qp->lock);
+ free(qp->cids, M_NVMFT);
+ free(qp, M_NVMFT);
+}
+
+struct nvmft_controller *
+nvmft_qpair_ctrlr(struct nvmft_qpair *qp)
+{
+ return (qp->ctrlr);
+}
+
+uint16_t
+nvmft_qpair_id(struct nvmft_qpair *qp)
+{
+ return (qp->qid);
+}
+
+const char *
+nvmft_qpair_name(struct nvmft_qpair *qp)
+{
+ return (qp->name);
+}
+
+static int
+_nvmft_send_response(struct nvmft_qpair *qp, const void *cqe)
+{
+ struct nvme_completion cpl;
+ struct nvmf_qpair *nq;
+ struct nvmf_capsule *rc;
+ int error;
+
+ memcpy(&cpl, cqe, sizeof(cpl));
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ if (nq == NULL) {
+ mtx_unlock(&qp->lock);
+ return (ENOTCONN);
+ }
+ refcount_acquire(&qp->qp_refs);
+
+ /* Set SQHD. */
+ if (qp->sq_flow_control) {
+ qp->sqhd = (qp->sqhd + 1) % qp->qsize;
+ cpl.sqhd = htole16(qp->sqhd);
+ } else
+ cpl.sqhd = 0;
+ mtx_unlock(&qp->lock);
+
+ rc = nvmf_allocate_response(nq, &cpl, M_WAITOK);
+ error = nvmf_transmit_capsule(rc);
+ nvmf_free_capsule(rc);
+
+ if (refcount_release(&qp->qp_refs))
+ nvmf_free_qpair(nq);
+ return (error);
+}
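
When SQ flow control is negotiated, each completion carries an updated submission queue head so the host can retire SQ slots; the head simply advances by one per completion, modulo the queue size, as above. A tiny model of that arithmetic (names illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    struct sq_state {
        bool     flow_control;
        uint16_t sqhd;
        uint16_t qsize;
    };

    /* Advance and return SQHD for one completion; 0 if no flow control. */
    static uint16_t
    next_sqhd(struct sq_state *sq)
    {
        if (!sq->flow_control)
            return (0);
        sq->sqhd = (sq->sqhd + 1) % sq->qsize;
        return (sq->sqhd);
    }
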
+
+void
+nvmft_command_completed(struct nvmft_qpair *qp, struct nvmf_capsule *nc)
+{
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+ /* Don't bother byte-swapping CID. */
+ KASSERT(BIT_ISSET(NUM_CIDS, cmd->cid, qp->cids),
+ ("%s: CID %u not busy", __func__, cmd->cid));
+
+ BIT_CLR_ATOMIC(NUM_CIDS, cmd->cid, qp->cids);
+}
+
+int
+nvmft_send_response(struct nvmft_qpair *qp, const void *cqe)
+{
+ const struct nvme_completion *cpl = cqe;
+
+ /* Don't bother byte-swapping CID. */
+ KASSERT(BIT_ISSET(NUM_CIDS, cpl->cid, qp->cids),
+ ("%s: CID %u not busy", __func__, cpl->cid));
+
+ BIT_CLR_ATOMIC(NUM_CIDS, cpl->cid, qp->cids);
+ return (_nvmft_send_response(qp, cqe));
+}
+
+void
+nvmft_init_cqe(void *cqe, struct nvmf_capsule *nc, uint16_t status)
+{
+ struct nvme_completion *cpl = cqe;
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+ memset(cpl, 0, sizeof(*cpl));
+ cpl->cid = cmd->cid;
+ cpl->status = htole16(status);
+}
+
+int
+nvmft_send_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ uint8_t sc_type, uint8_t sc_status)
+{
+ struct nvme_completion cpl;
+ uint16_t status;
+
+ status = NVMEF(NVME_STATUS_SCT, sc_type) |
+ NVMEF(NVME_STATUS_SC, sc_status);
+ nvmft_init_cqe(&cpl, nc, status);
+ return (nvmft_send_response(qp, &cpl));
+}
+
+int
+nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ uint8_t sc_status)
+{
+ return (nvmft_send_error(qp, nc, NVME_SCT_GENERIC, sc_status));
+}
+
+/*
+ * This version doesn't clear CID in qp->cids and is used for errors
+ * before the CID is validated.
+ */
+static int
+_nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ uint8_t sc_status)
+{
+ struct nvme_completion cpl;
+ uint16_t status;
+
+ status = NVMEF(NVME_STATUS_SCT, NVME_SCT_GENERIC) |
+ NVMEF(NVME_STATUS_SC, sc_status);
+ nvmft_init_cqe(&cpl, nc, status);
+ return (_nvmft_send_response(qp, &cpl));
+}
+
+int
+nvmft_send_success(struct nvmft_qpair *qp, struct nvmf_capsule *nc)
+{
+ return (nvmft_send_generic_error(qp, nc, NVME_SC_SUCCESS));
+}
+
+static void
+nvmft_init_connect_rsp(struct nvmf_fabric_connect_rsp *rsp,
+ const struct nvmf_fabric_connect_cmd *cmd, uint16_t status)
+{
+ memset(rsp, 0, sizeof(*rsp));
+ rsp->cid = cmd->cid;
+ rsp->status = htole16(status);
+}
+
+static int
+nvmft_send_connect_response(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_rsp *rsp)
+{
+ struct nvmf_capsule *rc;
+ struct nvmf_qpair *nq;
+ int error;
+
+ mtx_lock(&qp->lock);
+ nq = qp->qp;
+ if (nq == NULL) {
+ mtx_unlock(&qp->lock);
+ return (ENOTCONN);
+ }
+ refcount_acquire(&qp->qp_refs);
+ mtx_unlock(&qp->lock);
+
+ rc = nvmf_allocate_response(nq, rsp, M_WAITOK);
+ error = nvmf_transmit_capsule(rc);
+ nvmf_free_capsule(rc);
+
+ if (refcount_release(&qp->qp_refs))
+ nvmf_free_qpair(nq);
+ return (error);
+}
+
+void
+nvmft_connect_error(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, uint8_t sc_type,
+ uint8_t sc_status)
+{
+ struct nvmf_fabric_connect_rsp rsp;
+ uint16_t status;
+
+ status = NVMEF(NVME_STATUS_SCT, sc_type) |
+ NVMEF(NVME_STATUS_SC, sc_status);
+ nvmft_init_connect_rsp(&rsp, cmd, status);
+ nvmft_send_connect_response(qp, &rsp);
+}
+
+void
+nvmft_connect_invalid_parameters(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, bool data, uint16_t offset)
+{
+ struct nvmf_fabric_connect_rsp rsp;
+
+ nvmft_init_connect_rsp(&rsp, cmd,
+ NVMEF(NVME_STATUS_SCT, NVME_SCT_COMMAND_SPECIFIC) |
+ NVMEF(NVME_STATUS_SC, NVMF_FABRIC_SC_INVALID_PARAM));
+ rsp.status_code_specific.invalid.ipo = htole16(offset);
+ rsp.status_code_specific.invalid.iattr = data ? 1 : 0;
+ nvmft_send_connect_response(qp, &rsp);
+}
+
+int
+nvmft_finish_accept(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, struct nvmft_controller *ctrlr)
+{
+ struct nvmf_fabric_connect_rsp rsp;
+
+ qp->ctrlr = ctrlr;
+ nvmft_init_connect_rsp(&rsp, cmd, 0);
+ if (qp->sq_flow_control)
+ rsp.sqhd = htole16(qp->sqhd);
+ else
+ rsp.sqhd = htole16(0xffff);
+ rsp.status_code_specific.success.cntlid = htole16(ctrlr->cntlid);
+ return (nvmft_send_connect_response(qp, &rsp));
+}
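
Note that the CONNECT response reuses the same flow-control state: it reports the current SQHD when flow control is enabled and the all-ones sentinel otherwise, which is how the host learns whether to track SQHD at all. As a one-line sketch:

    #include <stdbool.h>
    #include <stdint.h>

    /* 0xffff in the CONNECT response advertises "no SQ flow control". */
    static uint16_t
    connect_rsp_sqhd(bool sq_flow_control, uint16_t sqhd)
    {
        return (sq_flow_control ? sqhd : 0xffff);
    }
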
diff --git a/sys/dev/nvmf/controller/nvmft_var.h b/sys/dev/nvmf/controller/nvmft_var.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/controller/nvmft_var.h
@@ -0,0 +1,174 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __NVMFT_VAR_H__
+#define __NVMFT_VAR_H__
+
+#include <sys/_callout.h>
+#include <sys/refcount.h>
+#include <sys/taskqueue.h>
+
+#include <dev/nvmf/nvmf_proto.h>
+
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl_frontend.h>
+
+struct nvmf_capsule;
+struct nvmft_controller;
+struct nvmft_qpair;
+
+#define NVMFT_NUM_AER 16
+
+struct nvmft_port {
+ TAILQ_ENTRY(nvmft_port) link;
+ u_int refs;
+ struct ctl_port port;
+ struct nvme_controller_data cdata;
+ struct nvme_firmware_page fp;
+ uint64_t cap;
+ uint32_t max_io_qsize;
+ bool online;
+
+ struct sx lock;
+
+ struct unrhdr *ids;
+ TAILQ_HEAD(, nvmft_controller) controllers;
+
+ uint32_t *active_ns;
+ u_int num_ns;
+};
+
+struct nvmft_io_qpair {
+ struct nvmft_qpair *qp;
+
+ bool shutdown;
+};
+
+struct nvmft_controller {
+ struct nvmft_qpair *admin;
+ struct nvmft_io_qpair *io_qpairs;
+ u_int num_io_queues;
+ bool shutdown;
+ bool admin_closed;
+ uint16_t cntlid;
+ uint32_t cc;
+ uint32_t csts;
+
+ struct nvmft_port *np;
+ struct mtx lock;
+
+ struct nvme_controller_data cdata;
+ struct nvme_health_information_page hip;
+ sbintime_t create_time;
+ sbintime_t start_busy;
+ sbintime_t busy_total;
+ uint16_t partial_dur;
+ uint16_t partial_duw;
+
+ uint8_t hostid[16];
+ uint8_t hostnqn[NVME_NQN_FIELD_SIZE];
+ u_int trtype;
+
+ TAILQ_ENTRY(nvmft_controller) link;
+
+ /*
+ * Each queue can have at most UINT16_MAX commands, so the total
+ * across all queues will fit in a uint32_t.
+ */
+ uint32_t pending_commands;
+
+ volatile int ka_active_traffic;
+ struct callout ka_timer;
+ sbintime_t ka_sbt;
+
+ /* AER fields. */
+ uint32_t aer_mask;
+ uint16_t aer_cids[NVMFT_NUM_AER];
+ uint8_t aer_pending;
+ uint8_t aer_cidx;
+ uint8_t aer_pidx;
+
+ /* Changed namespace IDs. */
+ struct nvme_ns_list *changed_ns;
+ bool changed_ns_reported;
+
+ struct task shutdown_task;
+ struct timeout_task terminate_task;
+};
+
+MALLOC_DECLARE(M_NVMFT);
+
+/* ctl_frontend_nvmf.c */
+void nvmft_port_free(struct nvmft_port *np);
+void nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid,
+ struct nvme_ns_list *nslist);
+void nvmft_dispatch_command(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc, bool admin);
+void nvmft_terminate_commands(struct nvmft_controller *ctrlr);
+
+/* nvmft_controller.c */
+void nvmft_controller_error(struct nvmft_controller *ctrlr,
+ struct nvmft_qpair *qp, int error);
+void nvmft_controller_lun_changed(struct nvmft_controller *ctrlr,
+ int lun_id);
+void nvmft_handle_admin_command(struct nvmft_controller *ctrlr,
+ struct nvmf_capsule *nc);
+void nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid,
+ struct nvmf_capsule *nc);
+int nvmft_handoff_admin_queue(struct nvmft_port *np,
+ const struct nvmf_handoff_controller_qpair *handoff,
+ const struct nvmf_fabric_connect_cmd *cmd,
+ const struct nvmf_fabric_connect_data *data);
+int nvmft_handoff_io_queue(struct nvmft_port *np,
+ const struct nvmf_handoff_controller_qpair *handoff,
+ const struct nvmf_fabric_connect_cmd *cmd,
+ const struct nvmf_fabric_connect_data *data);
+int nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...)
+ __printflike(2, 3);
+
+/* nvmft_qpair.c */
+struct nvmft_qpair *nvmft_qpair_init(enum nvmf_trtype trtype,
+ const struct nvmf_handoff_qpair_params *handoff, uint16_t qid,
+ const char *name);
+void nvmft_qpair_shutdown(struct nvmft_qpair *qp);
+void nvmft_qpair_destroy(struct nvmft_qpair *qp);
+struct nvmft_controller *nvmft_qpair_ctrlr(struct nvmft_qpair *qp);
+uint16_t nvmft_qpair_id(struct nvmft_qpair *qp);
+const char *nvmft_qpair_name(struct nvmft_qpair *qp);
+void nvmft_command_completed(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc);
+int nvmft_send_response(struct nvmft_qpair *qp, const void *cqe);
+void nvmft_init_cqe(void *cqe, struct nvmf_capsule *nc, uint16_t status);
+int nvmft_send_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+ uint8_t sc_type, uint8_t sc_status);
+int nvmft_send_generic_error(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc, uint8_t sc_status);
+int nvmft_send_success(struct nvmft_qpair *qp,
+ struct nvmf_capsule *nc);
+void nvmft_connect_error(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, uint8_t sc_type,
+ uint8_t sc_status);
+void nvmft_connect_invalid_parameters(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, bool data, uint16_t offset);
+int nvmft_finish_accept(struct nvmft_qpair *qp,
+ const struct nvmf_fabric_connect_cmd *cmd, struct nvmft_controller *ctrlr);
+
+static __inline void
+nvmft_port_ref(struct nvmft_port *np)
+{
+ refcount_acquire(&np->refs);
+}
+
+static __inline void
+nvmft_port_rele(struct nvmft_port *np)
+{
+ if (refcount_release(&np->refs))
+ nvmft_port_free(np);
+}
+
+#endif /* !__NVMFT_VAR_H__ */
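
The port structure is reference-counted so controllers can outlive the configuration that created them; nvmft_port_rele() frees the port when the last reference drops. A hypothetical caller illustrating the idiom (example_use_port is invented for illustration):

    /* Hypothetical caller: hold a reference across any use of 'np'. */
    static void
    example_use_port(struct nvmft_port *np)
    {
        nvmft_port_ref(np);
        /* ... safely dereference np->cdata, np->cap, etc. ... */
        nvmft_port_rele(np);    /* may free 'np' on last release */
    }
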
diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile
--- a/sys/modules/nvmf/Makefile
+++ b/sys/modules/nvmf/Makefile
@@ -1,5 +1,6 @@
SUBDIR= nvmf \
nvmf_tcp \
- nvmf_transport
+ nvmf_transport \
+ nvmft
.include <bsd.subdir.mk>
diff --git a/sys/modules/nvmf/nvmft/Makefile b/sys/modules/nvmf/nvmft/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/nvmf/nvmft/Makefile
@@ -0,0 +1,10 @@
+.PATH: ${SRCTOP}/sys/dev/nvmf/controller
+
+KMOD= nvmft
+
+SRCS= ctl_frontend_nvmf.c \
+ nvmft_controller.c \
+ nvmft_subr.c \
+ nvmft_qpair.c
+
+.include <bsd.kmod.mk>
