D44712: nvmf_tcp: Add a TCP transport for NVMe over Fabrics
diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -408,6 +408,7 @@
nvd.4 \
${_nvdimm.4} \
nvme.4 \
+ nvmf_tcp.4 \
${_nvram.4} \
oce.4 \
ocs_fc.4\
diff --git a/share/man/man4/nvmf_tcp.4 b/share/man/man4/nvmf_tcp.4
new file mode 100644
--- /dev/null
+++ b/share/man/man4/nvmf_tcp.4
@@ -0,0 +1,57 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2024 Chelsio Communications, Inc.
+.\"
+.Dd May 2, 2024
+.Dt NVMF_TCP 4
+.Os
+.Sh NAME
+.Nm nvmf_tcp
+.Nd "TCP transport for NVM Express over Fabrics"
+.Sh SYNOPSIS
+To compile the module into the kernel,
+place the following line in the
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device nvmf_tcp"
+.Ed
+.Pp
+Alternatively, to load the
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+nvmf_tcp_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+module implements the software TCP/IP transport for NVM Express over Fabrics.
+It can be used by either the in-kernel NVMeoF host driver or controller.
+.Sh SYSCTL VARIABLES
+The following variables are available as both
+.Xr sysctl 8
+variables and
+.Xr loader 8
+tunables:
+.Bl -tag -width indent
+.It Va kern.nvmf.tcp.max_c2hdata
+The maximum data payload size of a
+.Va C2H_DATA
+PDU sent by the controller to a remote host.
+The default size is 256 kilobytes.
+.El
+.Sh SEE ALSO
+.Xr nvmf 4 ,
+.Xr nvmft 4
+.Sh HISTORY
+The
+.Nm
+module first appeared in
+.Fx 15.0 .
+.Sh AUTHORS
+The
+.Nm
+module was developed by
+.An John Baldwin Aq Mt jhb@FreeBSD.org
+under sponsorship from Chelsio Communications, Inc.
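
The kern.nvmf.tcp.max_c2hdata tunable documented above can also be examined or changed at runtime with sysctl(8), e.g. "sysctl kern.nvmf.tcp.max_c2hdata". The short userspace sketch below is illustrative only and not part of the patch; it reads the same value through sysctlbyname(3) and assumes the nvmf_tcp module is loaded so that the sysctl node exists.

/*
 * Illustrative only (not part of the patch): read the
 * kern.nvmf.tcp.max_c2hdata tunable described in nvmf_tcp.4.
 * Assumes the nvmf_tcp module is loaded.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        unsigned int max_c2hdata;
        size_t len = sizeof(max_c2hdata);

        if (sysctlbyname("kern.nvmf.tcp.max_c2hdata", &max_c2hdata, &len,
            NULL, 0) != 0) {
                perror("sysctlbyname");
                return (1);
        }
        printf("max C2H_DATA payload: %u bytes\n", max_c2hdata);
        return (0);
}
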
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1676,11 +1676,13 @@
# NVM Express
#
# nvme: PCI-express NVM Express host controllers
+# nvmf_tcp: TCP transport for NVM Express over Fabrics
# nda: CAM NVMe disk driver
# nvd: non-CAM NVMe disk driver
device nvme # base NVMe driver
options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver
+device nvmf_tcp # NVMeoF TCP transport
device nda # NVMe direct access devices (aka disks)
device nvd # expose NVMe namespaces as disks, depends on nvme
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2533,6 +2533,7 @@
dev/nvme/nvme_util.c optional nvme
dev/nvmem/nvmem.c optional nvmem fdt
dev/nvmem/nvmem_if.m optional nvmem
+dev/nvmf/nvmf_tcp.c optional nvmf_tcp
dev/oce/oce_hw.c optional oce pci
dev/oce/oce_if.c optional oce pci
dev/oce/oce_mbox.c optional oce pci
diff --git a/sys/dev/nvmf/nvmf_tcp.c b/sys/dev/nvmf/nvmf_tcp.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/nvmf_tcp.c
@@ -0,0 +1,1867 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/capsicum.h>
+#include <sys/condvar.h>
+#include <sys/file.h>
+#include <sys/gsb_crc32.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/protosw.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <netinet/in.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_proto.h>
+#include <dev/nvmf/nvmf_tcp.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/nvmf_transport_internal.h>
+
+struct nvmf_tcp_capsule;
+struct nvmf_tcp_qpair;
+
+struct nvmf_tcp_command_buffer {
+ struct nvmf_tcp_qpair *qp;
+
+ struct nvmf_io_request io;
+ size_t data_len;
+ size_t data_xfered;
+ uint32_t data_offset;
+
+ u_int refs;
+ int error;
+
+ uint16_t cid;
+ uint16_t ttag;
+
+ TAILQ_ENTRY(nvmf_tcp_command_buffer) link;
+
+ /* Controller only */
+ struct nvmf_tcp_capsule *tc;
+};
+
+struct nvmf_tcp_command_buffer_list {
+ TAILQ_HEAD(, nvmf_tcp_command_buffer) head;
+ struct mtx lock;
+};
+
+struct nvmf_tcp_qpair {
+ struct nvmf_qpair qp;
+
+ struct socket *so;
+
+ volatile u_int refs; /* Every allocated capsule holds a reference */
+ uint8_t txpda;
+ uint8_t rxpda;
+ bool header_digests;
+ bool data_digests;
+ uint32_t maxr2t;
+ uint32_t maxh2cdata; /* Controller only */
+ uint32_t max_tx_data;
+ uint32_t max_icd; /* Host only */
+ uint16_t next_ttag; /* Controller only */
+ u_int num_ttags; /* Controller only */
+ u_int active_ttags; /* Controller only */
+ bool send_success; /* Controller only */
+
+ /* Receive state. */
+ struct thread *rx_thread;
+ struct cv rx_cv;
+ bool rx_shutdown;
+
+ /* Transmit state. */
+ struct thread *tx_thread;
+ struct cv tx_cv;
+ bool tx_shutdown;
+ struct mbufq tx_pdus;
+ STAILQ_HEAD(, nvmf_tcp_capsule) tx_capsules;
+
+ struct nvmf_tcp_command_buffer_list tx_buffers;
+ struct nvmf_tcp_command_buffer_list rx_buffers;
+
+ /*
+ * For the controller, an RX command buffer can be in one of
+ * two locations, all protected by the rx_buffers.lock. If a
+ * receive request is waiting for either an R2T slot for its
+	 * command (due to exceeding MAXR2T) or a transfer tag, it is
+ * placed on the rx_buffers list. When a request is allocated
+ * an active transfer tag, it moves to the open_ttags[] array
+ * (indexed by the tag) until it completes.
+ */
+ struct nvmf_tcp_command_buffer **open_ttags; /* Controller only */
+};
+
+struct nvmf_tcp_rxpdu {
+ struct mbuf *m;
+ const struct nvme_tcp_common_pdu_hdr *hdr;
+ uint32_t data_len;
+ bool data_digest_mismatch;
+};
+
+struct nvmf_tcp_capsule {
+ struct nvmf_capsule nc;
+
+ volatile u_int refs;
+
+ struct nvmf_tcp_rxpdu rx_pdu;
+
+ uint32_t active_r2ts; /* Controller only */
+#ifdef INVARIANTS
+ uint32_t tx_data_offset; /* Controller only */
+ u_int pending_r2ts; /* Controller only */
+#endif
+
+ STAILQ_ENTRY(nvmf_tcp_capsule) link;
+};
+
+#define TCAP(nc) ((struct nvmf_tcp_capsule *)(nc))
+#define TQP(qp) ((struct nvmf_tcp_qpair *)(qp))
+
+static void tcp_release_capsule(struct nvmf_tcp_capsule *tc);
+static void tcp_free_qpair(struct nvmf_qpair *nq);
+
+SYSCTL_NODE(_kern_nvmf, OID_AUTO, tcp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "TCP transport");
+static u_int tcp_max_transmit_data = 256 * 1024;
+SYSCTL_UINT(_kern_nvmf_tcp, OID_AUTO, max_c2hdata, CTLFLAG_RWTUN,
+ &tcp_max_transmit_data, 0,
+ "Maximum size of data payload in a transmitted PDU");
+
+static MALLOC_DEFINE(M_NVMF_TCP, "nvmf_tcp", "NVMe over TCP");
+
+static int
+mbuf_crc32c_helper(void *arg, void *data, u_int len)
+{
+ uint32_t *digestp = arg;
+
+ *digestp = calculate_crc32c(*digestp, data, len);
+ return (0);
+}
+
+static uint32_t
+mbuf_crc32c(struct mbuf *m, u_int offset, u_int len)
+{
+ uint32_t digest = 0xffffffff;
+
+ m_apply(m, offset, len, mbuf_crc32c_helper, &digest);
+ digest = digest ^ 0xffffffff;
+
+ return (digest);
+}
+
+static uint32_t
+compute_digest(const void *buf, size_t len)
+{
+ return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
+}
+
+static struct nvmf_tcp_command_buffer *
+tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp,
+ const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
+ uint16_t cid)
+{
+ struct nvmf_tcp_command_buffer *cb;
+
+ cb = malloc(sizeof(*cb), M_NVMF_TCP, M_WAITOK);
+ cb->qp = qp;
+ cb->io = *io;
+ cb->data_offset = data_offset;
+ cb->data_len = data_len;
+ cb->data_xfered = 0;
+ refcount_init(&cb->refs, 1);
+ cb->error = 0;
+ cb->cid = cid;
+ cb->ttag = 0;
+ cb->tc = NULL;
+
+ return (cb);
+}
+
+static void
+tcp_hold_command_buffer(struct nvmf_tcp_command_buffer *cb)
+{
+ refcount_acquire(&cb->refs);
+}
+
+static void
+tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
+{
+ nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
+ if (cb->tc != NULL)
+ tcp_release_capsule(cb->tc);
+ free(cb, M_NVMF_TCP);
+}
+
+static void
+tcp_release_command_buffer(struct nvmf_tcp_command_buffer *cb)
+{
+ if (refcount_release(&cb->refs))
+ tcp_free_command_buffer(cb);
+}
+
+static void
+tcp_add_command_buffer(struct nvmf_tcp_command_buffer_list *list,
+ struct nvmf_tcp_command_buffer *cb)
+{
+ mtx_assert(&list->lock, MA_OWNED);
+ TAILQ_INSERT_HEAD(&list->head, cb, link);
+}
+
+static struct nvmf_tcp_command_buffer *
+tcp_find_command_buffer(struct nvmf_tcp_command_buffer_list *list,
+ uint16_t cid, uint16_t ttag)
+{
+ struct nvmf_tcp_command_buffer *cb;
+
+ mtx_assert(&list->lock, MA_OWNED);
+ TAILQ_FOREACH(cb, &list->head, link) {
+ if (cb->cid == cid && cb->ttag == ttag)
+ return (cb);
+ }
+ return (NULL);
+}
+
+static void
+tcp_remove_command_buffer(struct nvmf_tcp_command_buffer_list *list,
+ struct nvmf_tcp_command_buffer *cb)
+{
+ mtx_assert(&list->lock, MA_OWNED);
+ TAILQ_REMOVE(&list->head, cb, link);
+}
+
+static void
+tcp_purge_command_buffer(struct nvmf_tcp_command_buffer_list *list,
+ uint16_t cid, uint16_t ttag)
+{
+ struct nvmf_tcp_command_buffer *cb;
+
+ mtx_lock(&list->lock);
+ cb = tcp_find_command_buffer(list, cid, ttag);
+ if (cb != NULL) {
+ tcp_remove_command_buffer(list, cb);
+ mtx_unlock(&list->lock);
+ tcp_release_command_buffer(cb);
+ } else
+ mtx_unlock(&list->lock);
+}
+
+static void
+nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, struct mbuf *m)
+{
+ struct socket *so = qp->so;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ mbufq_enqueue(&qp->tx_pdus, m);
+ /* XXX: Do we need to handle sb_hiwat being wrong? */
+ if (sowriteable(so))
+ cv_signal(&qp->tx_cv);
+ SOCKBUF_UNLOCK(&so->so_snd);
+}
+
+static void
+nvmf_tcp_report_error(struct nvmf_tcp_qpair *qp, uint16_t fes, uint32_t fei,
+ struct mbuf *rx_pdu, u_int hlen)
+{
+ struct nvme_tcp_term_req_hdr *hdr;
+ struct mbuf *m;
+
+ if (hlen != 0) {
+ hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
+ hlen = min(hlen, m_length(rx_pdu, NULL));
+ }
+
+ m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, 0);
+ m->m_len = sizeof(*hdr) + hlen;
+ hdr = mtod(m, void *);
+ memset(hdr, 0, sizeof(*hdr));
+ hdr->common.pdu_type = qp->qp.nq_controller ?
+ NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
+ hdr->common.hlen = sizeof(*hdr);
+ hdr->common.plen = sizeof(*hdr) + hlen;
+ hdr->fes = htole16(fes);
+ le32enc(hdr->fei, fei);
+ if (hlen != 0)
+ m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));
+
+ nvmf_tcp_write_pdu(qp, m);
+}
+
+static int
+nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+ const struct nvme_tcp_common_pdu_hdr *ch;
+ struct mbuf *m = pdu->m;
+ uint32_t data_len, fei, plen;
+ uint32_t digest, rx_digest;
+ u_int hlen;
+ int error;
+ uint16_t fes;
+
+ /* Determine how large of a PDU header to return for errors. */
+ ch = pdu->hdr;
+ hlen = ch->hlen;
+ plen = le32toh(ch->plen);
+ if (hlen < sizeof(*ch) || hlen > plen)
+ hlen = sizeof(*ch);
+
+ error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
+ qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
+ &fei);
+ if (error != 0) {
+ if (error != ECONNRESET)
+ nvmf_tcp_report_error(qp, fes, fei, m, hlen);
+ return (error);
+ }
+
+ /* Check header digest if present. */
+ if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
+ digest = mbuf_crc32c(m, 0, ch->hlen);
+ m_copydata(m, ch->hlen, sizeof(rx_digest), (caddr_t)&rx_digest);
+ if (digest != rx_digest) {
+ printf("NVMe/TCP: Header digest mismatch\n");
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
+ hlen);
+ return (EBADMSG);
+ }
+ }
+
+ /* Check data digest if present. */
+ pdu->data_digest_mismatch = false;
+ if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
+ digest = mbuf_crc32c(m, ch->pdo, data_len);
+ m_copydata(m, plen - sizeof(rx_digest), sizeof(rx_digest),
+ (caddr_t)&rx_digest);
+ if (digest != rx_digest) {
+ printf("NVMe/TCP: Data digest mismatch\n");
+ pdu->data_digest_mismatch = true;
+ }
+ }
+
+ pdu->data_len = data_len;
+ return (0);
+}
+
+static void
+nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
+{
+ m_freem(pdu->m);
+ pdu->m = NULL;
+ pdu->hdr = NULL;
+}
+
+static int
+nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
+{
+ const struct nvme_tcp_term_req_hdr *hdr;
+
+ hdr = (const void *)pdu->hdr;
+
+ printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
+ le16toh(hdr->fes), le32dec(hdr->fei));
+ nvmf_tcp_free_pdu(pdu);
+ return (ECONNRESET);
+}
+
+static int
+nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
+ struct nvmf_tcp_rxpdu *pdu)
+{
+ const struct nvme_tcp_cmd *cmd;
+ struct nvmf_capsule *nc;
+ struct nvmf_tcp_capsule *tc;
+
+ cmd = (const void *)pdu->hdr;
+
+ nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);
+
+ tc = TCAP(nc);
+ tc->rx_pdu = *pdu;
+
+ nvmf_capsule_received(&qp->qp, nc);
+ return (0);
+}
+
+static int
+nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
+ struct nvmf_tcp_rxpdu *pdu)
+{
+ const struct nvme_tcp_rsp *rsp;
+ struct nvmf_capsule *nc;
+ struct nvmf_tcp_capsule *tc;
+
+ rsp = (const void *)pdu->hdr;
+
+ nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe, M_WAITOK);
+
+ nc->nc_sqhd_valid = true;
+ tc = TCAP(nc);
+ tc->rx_pdu = *pdu;
+
+ /*
+ * Once the CQE has been received, no further transfers to the
+ * command buffer for the associated CID can occur.
+ */
+ tcp_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid, 0);
+ tcp_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid, 0);
+
+ nvmf_capsule_received(&qp->qp, nc);
+ return (0);
+}
+
+/*
+ * Construct a PDU that contains an optional data payload. This
+ * includes dealing with digests and the length fields in the common
+ * header.
+ */
+static struct mbuf *
+nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
+ struct mbuf *data, uint32_t data_len)
+{
+ struct nvme_tcp_common_pdu_hdr *ch;
+ struct mbuf *top;
+ uint32_t digest, pad, pdo, plen, mlen;
+
+ plen = hlen;
+ if (qp->header_digests)
+ plen += sizeof(digest);
+ if (data_len != 0) {
+ KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
+ pdo = roundup2(plen, qp->txpda);
+ pad = pdo - plen;
+ plen = pdo + data_len;
+ if (qp->data_digests)
+ plen += sizeof(digest);
+ mlen = pdo;
+ } else {
+ KASSERT(data == NULL, ("payload mbuf with zero length"));
+ pdo = 0;
+ pad = 0;
+ mlen = plen;
+ }
+
+ top = m_get2(mlen, M_WAITOK, MT_DATA, 0);
+ top->m_len = mlen;
+ ch = mtod(top, void *);
+ memcpy(ch, hdr, hlen);
+ ch->hlen = hlen;
+ if (qp->header_digests)
+ ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
+ if (qp->data_digests && data_len != 0)
+ ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
+ ch->pdo = pdo;
+ ch->plen = htole32(plen);
+
+ /* HDGST */
+ if (qp->header_digests) {
+ digest = compute_digest(ch, hlen);
+ memcpy((char *)ch + hlen, &digest, sizeof(digest));
+ }
+
+ if (pad != 0) {
+ /* PAD */
+ memset((char *)ch + pdo - pad, 0, pad);
+ }
+
+ if (data_len != 0) {
+ /* DATA */
+ top->m_next = data;
+
+ /* DDGST */
+ if (qp->data_digests) {
+ digest = mbuf_crc32c(data, 0, data_len);
+
+ /* XXX: Can't use m_append as it uses M_NOWAIT. */
+ while (data->m_next != NULL)
+ data = data->m_next;
+
+ data->m_next = m_get(M_WAITOK, MT_DATA);
+ data->m_next->m_len = sizeof(digest);
+ memcpy(mtod(data->m_next, void *), &digest,
+ sizeof(digest));
+ }
+ }
+
+ return (top);
+}
+
+/* Find the next command buffer eligible to schedule for R2T. */
+static struct nvmf_tcp_command_buffer *
+nvmf_tcp_next_r2t(struct nvmf_tcp_qpair *qp)
+{
+ struct nvmf_tcp_command_buffer *cb;
+
+ mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
+ MPASS(qp->active_ttags < qp->num_ttags);
+
+ TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) {
+ /* NB: maxr2t is 0's based. */
+ if (cb->tc->active_r2ts > qp->maxr2t)
+ continue;
+#ifdef INVARIANTS
+ cb->tc->pending_r2ts--;
+#endif
+ TAILQ_REMOVE(&qp->rx_buffers.head, cb, link);
+ return (cb);
+ }
+ return (NULL);
+}
+
+/* Allocate the next free transfer tag and assign it to cb. */
+static void
+nvmf_tcp_allocate_ttag(struct nvmf_tcp_qpair *qp,
+ struct nvmf_tcp_command_buffer *cb)
+{
+ uint16_t ttag;
+
+ mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
+
+ ttag = qp->next_ttag;
+ for (;;) {
+ if (qp->open_ttags[ttag] == NULL)
+ break;
+ if (ttag == qp->num_ttags - 1)
+ ttag = 0;
+ else
+ ttag++;
+ MPASS(ttag != qp->next_ttag);
+ }
+ if (ttag == qp->num_ttags - 1)
+ qp->next_ttag = 0;
+ else
+ qp->next_ttag = ttag + 1;
+
+ cb->tc->active_r2ts++;
+ qp->active_ttags++;
+ qp->open_ttags[ttag] = cb;
+
+ /*
+ * Don't bother byte-swapping ttag as it is just a cookie
+ * value returned by the other end as-is.
+ */
+ cb->ttag = ttag;
+}
+
+/* NB: cid and ttag are both little-endian already. */
+static void
+tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
+ uint32_t data_offset, uint32_t data_len)
+{
+ struct nvme_tcp_r2t_hdr r2t;
+ struct mbuf *m;
+
+ memset(&r2t, 0, sizeof(r2t));
+ r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
+ r2t.cccid = cid;
+ r2t.ttag = ttag;
+ r2t.r2to = htole32(data_offset);
+ r2t.r2tl = htole32(data_len);
+
+ m = nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
+ nvmf_tcp_write_pdu(qp, m);
+}
+
+/*
+ * Release a transfer tag and schedule another R2T.
+ *
+ * NB: This drops the rx_buffers.lock mutex.
+ */
+static void
+nvmf_tcp_send_next_r2t(struct nvmf_tcp_qpair *qp,
+ struct nvmf_tcp_command_buffer *cb)
+{
+ struct nvmf_tcp_command_buffer *ncb;
+
+ mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
+ MPASS(qp->open_ttags[cb->ttag] == cb);
+
+ /* Release this transfer tag. */
+ qp->open_ttags[cb->ttag] = NULL;
+ qp->active_ttags--;
+ cb->tc->active_r2ts--;
+
+ /* Schedule another R2T. */
+ ncb = nvmf_tcp_next_r2t(qp);
+ if (ncb != NULL) {
+ nvmf_tcp_allocate_ttag(qp, ncb);
+ mtx_unlock(&qp->rx_buffers.lock);
+ tcp_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset,
+ ncb->data_len);
+ } else
+ mtx_unlock(&qp->rx_buffers.lock);
+}
+
+/*
+ * Copy len bytes starting at offset skip from an mbuf chain into an
+ * I/O buffer at destination offset io_offset.
+ */
+static void
+mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
+ struct nvmf_io_request *io, u_int io_offset)
+{
+ u_int todo;
+
+ while (m->m_len <= skip) {
+ skip -= m->m_len;
+ m = m->m_next;
+ }
+ while (len != 0) {
+ MPASS((m->m_flags & M_EXTPG) == 0);
+
+ todo = m->m_len - skip;
+ if (todo > len)
+ todo = len;
+
+ memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
+ skip = 0;
+ io_offset += todo;
+ len -= todo;
+ m = m->m_next;
+ }
+}
+
+static int
+nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+ const struct nvme_tcp_h2c_data_hdr *h2c;
+ struct nvmf_tcp_command_buffer *cb;
+ uint32_t data_len, data_offset;
+ uint16_t ttag;
+
+ h2c = (const void *)pdu->hdr;
+ if (le32toh(h2c->datal) > qp->maxh2cdata) {
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
+ pdu->m, pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ /*
+ * NB: Don't bother byte-swapping ttag as we don't byte-swap
+ * it when sending.
+ */
+ ttag = h2c->ttag;
+ if (ttag >= qp->num_ttags) {
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+ offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
+ pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ mtx_lock(&qp->rx_buffers.lock);
+ cb = qp->open_ttags[ttag];
+ if (cb == NULL) {
+ mtx_unlock(&qp->rx_buffers.lock);
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+ offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
+ pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+ MPASS(cb->ttag == ttag);
+
+ /* For a data digest mismatch, fail the I/O request. */
+ if (pdu->data_digest_mismatch) {
+ nvmf_tcp_send_next_r2t(qp, cb);
+ cb->error = EINTEGRITY;
+ tcp_release_command_buffer(cb);
+ nvmf_tcp_free_pdu(pdu);
+ return (0);
+ }
+
+ data_len = le32toh(h2c->datal);
+ if (data_len != pdu->data_len) {
+ mtx_unlock(&qp->rx_buffers.lock);
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+ offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
+ pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ data_offset = le32toh(h2c->datao);
+ if (data_offset < cb->data_offset ||
+ data_offset + data_len > cb->data_offset + cb->data_len) {
+ mtx_unlock(&qp->rx_buffers.lock);
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
+ pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ if (data_offset != cb->data_offset + cb->data_xfered) {
+ mtx_unlock(&qp->rx_buffers.lock);
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
+ pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ if ((cb->data_xfered + data_len == cb->data_len) !=
+ ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
+ mtx_unlock(&qp->rx_buffers.lock);
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
+ pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ cb->data_xfered += data_len;
+ data_offset -= cb->data_offset;
+ if (cb->data_xfered == cb->data_len) {
+ nvmf_tcp_send_next_r2t(qp, cb);
+ } else {
+ tcp_hold_command_buffer(cb);
+ mtx_unlock(&qp->rx_buffers.lock);
+ }
+
+ mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);
+
+ tcp_release_command_buffer(cb);
+ nvmf_tcp_free_pdu(pdu);
+ return (0);
+}
+
+static int
+nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+ const struct nvme_tcp_c2h_data_hdr *c2h;
+ struct nvmf_tcp_command_buffer *cb;
+ uint32_t data_len, data_offset;
+
+ c2h = (const void *)pdu->hdr;
+
+ mtx_lock(&qp->rx_buffers.lock);
+ cb = tcp_find_command_buffer(&qp->rx_buffers, c2h->cccid, 0);
+ if (cb == NULL) {
+ mtx_unlock(&qp->rx_buffers.lock);
+ /*
+ * XXX: Could be PDU sequence error if cccid is for a
+ * command that doesn't use a command buffer.
+ */
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+ offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
+ pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ /* For a data digest mismatch, fail the I/O request. */
+ if (pdu->data_digest_mismatch) {
+ cb->error = EINTEGRITY;
+ tcp_remove_command_buffer(&qp->rx_buffers, cb);
+ mtx_unlock(&qp->rx_buffers.lock);
+ tcp_release_command_buffer(cb);
+ nvmf_tcp_free_pdu(pdu);
+ return (0);
+ }
+
+ data_len = le32toh(c2h->datal);
+ if (data_len != pdu->data_len) {
+ mtx_unlock(&qp->rx_buffers.lock);
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+ offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
+ pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ data_offset = le32toh(c2h->datao);
+ if (data_offset < cb->data_offset ||
+ data_offset + data_len > cb->data_offset + cb->data_len) {
+ mtx_unlock(&qp->rx_buffers.lock);
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
+ pdu->m, pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ if (data_offset != cb->data_offset + cb->data_xfered) {
+ mtx_unlock(&qp->rx_buffers.lock);
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
+ pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ if ((cb->data_xfered + data_len == cb->data_len) !=
+ ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
+ mtx_unlock(&qp->rx_buffers.lock);
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
+ pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ cb->data_xfered += data_len;
+ data_offset -= cb->data_offset;
+ if (cb->data_xfered == cb->data_len)
+ tcp_remove_command_buffer(&qp->rx_buffers, cb);
+ else
+ tcp_hold_command_buffer(cb);
+ mtx_unlock(&qp->rx_buffers.lock);
+
+ mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);
+
+ tcp_release_command_buffer(cb);
+
+ if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
+ struct nvme_completion cqe;
+ struct nvmf_capsule *nc;
+
+ memset(&cqe, 0, sizeof(cqe));
+ cqe.cid = c2h->cccid;
+
+ nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK);
+ nc->nc_sqhd_valid = false;
+
+ nvmf_capsule_received(&qp->qp, nc);
+ }
+
+ nvmf_tcp_free_pdu(pdu);
+ return (0);
+}
+
+/* Called when m_free drops refcount to 0. */
+static void
+nvmf_tcp_mbuf_done(struct mbuf *m)
+{
+ struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;
+
+ tcp_free_command_buffer(cb);
+}
+
+static struct mbuf *
+nvmf_tcp_mbuf(void *arg, int how, void *data, size_t len)
+{
+ struct nvmf_tcp_command_buffer *cb = arg;
+ struct mbuf *m;
+
+ m = m_get(how, MT_DATA);
+ m->m_flags |= M_RDONLY;
+ m_extaddref(m, data, len, &cb->refs, nvmf_tcp_mbuf_done, cb, NULL);
+ m->m_len = len;
+ return (m);
+}
+
+static void
+nvmf_tcp_free_mext_pg(struct mbuf *m)
+{
+ struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1;
+
+ M_ASSERTEXTPG(m);
+ tcp_release_command_buffer(cb);
+}
+
+static struct mbuf *
+nvmf_tcp_mext_pg(void *arg, int how)
+{
+ struct nvmf_tcp_command_buffer *cb = arg;
+ struct mbuf *m;
+
+ m = mb_alloc_ext_pgs(how, nvmf_tcp_free_mext_pg);
+ m->m_ext.ext_arg1 = cb;
+ tcp_hold_command_buffer(cb);
+ return (m);
+}
+
+/*
+ * Return an mbuf chain for a range of data belonging to a command
+ * buffer.
+ *
+ * The mbuf chain uses M_EXT mbufs which hold references on the
+ * command buffer so that it remains "alive" until the data has been
+ * fully transmitted. If can_truncate is true, the returned chain
+ * may be shorter than the requested length to avoid gratuitously
+ * splitting up a page.
+ */
+static struct mbuf *
+nvmf_tcp_command_buffer_mbuf(struct nvmf_tcp_command_buffer *cb,
+ uint32_t data_offset, uint32_t data_len, uint32_t *actual_len,
+ bool can_truncate)
+{
+ struct mbuf *m;
+ size_t len;
+
+ m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_tcp_mbuf,
+ nvmf_tcp_mext_pg, cb, M_WAITOK, data_offset, data_len, &len,
+ can_truncate);
+ if (actual_len != NULL)
+ *actual_len = len;
+ return (m);
+}
+
+/* NB: cid and ttag are both little-endian already. */
+static void
+tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
+ uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu)
+{
+ struct nvme_tcp_h2c_data_hdr h2c;
+ struct mbuf *top;
+
+ memset(&h2c, 0, sizeof(h2c));
+ h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
+ if (last_pdu)
+ h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
+ h2c.cccid = cid;
+ h2c.ttag = ttag;
+ h2c.datao = htole32(data_offset);
+ h2c.datal = htole32(len);
+
+ top = nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), m, len);
+ nvmf_tcp_write_pdu(qp, top);
+}
+
+static int
+nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+ const struct nvme_tcp_r2t_hdr *r2t;
+ struct nvmf_tcp_command_buffer *cb;
+ uint32_t data_len, data_offset;
+
+ r2t = (const void *)pdu->hdr;
+
+ mtx_lock(&qp->tx_buffers.lock);
+ cb = tcp_find_command_buffer(&qp->tx_buffers, r2t->cccid, 0);
+ if (cb == NULL) {
+ mtx_unlock(&qp->tx_buffers.lock);
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+ offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m,
+ pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ data_offset = le32toh(r2t->r2to);
+ if (data_offset != cb->data_xfered) {
+ mtx_unlock(&qp->tx_buffers.lock);
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
+ pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ /*
+	 * XXX: The spec does not specify how to handle R2T transfers
+ * out of range of the original command.
+ */
+ data_len = le32toh(r2t->r2tl);
+ if (data_offset + data_len > cb->data_len) {
+ mtx_unlock(&qp->tx_buffers.lock);
+ nvmf_tcp_report_error(qp,
+ NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
+ pdu->m, pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ cb->data_xfered += data_len;
+ if (cb->data_xfered == cb->data_len)
+ tcp_remove_command_buffer(&qp->tx_buffers, cb);
+ else
+ tcp_hold_command_buffer(cb);
+ mtx_unlock(&qp->tx_buffers.lock);
+
+ /*
+ * Queue one or more H2C_DATA PDUs containing the requested
+ * data.
+ */
+ while (data_len > 0) {
+ struct mbuf *m;
+ uint32_t sent, todo;
+
+ todo = data_len;
+ if (todo > qp->max_tx_data)
+ todo = qp->max_tx_data;
+ m = nvmf_tcp_command_buffer_mbuf(cb, data_offset, todo, &sent,
+ todo < data_len);
+ tcp_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m,
+ sent, sent == data_len);
+
+ data_offset += sent;
+ data_len -= sent;
+ }
+
+ tcp_release_command_buffer(cb);
+ nvmf_tcp_free_pdu(pdu);
+ return (0);
+}
+
+/*
+ * A variant of m_pullup that uses M_WAITOK instead of failing. It
+ * also doesn't do anything if enough bytes are already present in the
+ * first mbuf.
+ */
+static struct mbuf *
+pullup_pdu_hdr(struct mbuf *m, int len)
+{
+ struct mbuf *n, *p;
+
+ KASSERT(len <= MCLBYTES, ("%s: len too large", __func__));
+ if (m->m_len >= len)
+ return (m);
+
+ n = m_get2(len, M_WAITOK, MT_DATA, 0);
+ n->m_len = len;
+ m_copydata(m, 0, len, mtod(n, void *));
+
+ while (m != NULL && m->m_len <= len) {
+ p = m->m_next;
+ len -= m->m_len;
+ m_free(m);
+ m = p;
+ }
+ if (len > 0) {
+ m->m_data += len;
+ m->m_len -= len;
+ }
+ n->m_next = m;
+ return (n);
+}
+
+static int
+nvmf_tcp_dispatch_pdu(struct nvmf_tcp_qpair *qp,
+ const struct nvme_tcp_common_pdu_hdr *ch, struct nvmf_tcp_rxpdu *pdu)
+{
+ /* Ensure the PDU header is contiguous. */
+ pdu->m = pullup_pdu_hdr(pdu->m, ch->hlen);
+ pdu->hdr = mtod(pdu->m, const void *);
+
+ switch (ch->pdu_type) {
+ default:
+ __assert_unreachable();
+ break;
+ case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
+ case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
+ return (nvmf_tcp_handle_term_req(pdu));
+ case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
+ return (nvmf_tcp_save_command_capsule(qp, pdu));
+ case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
+ return (nvmf_tcp_save_response_capsule(qp, pdu));
+ case NVME_TCP_PDU_TYPE_H2C_DATA:
+ return (nvmf_tcp_handle_h2c_data(qp, pdu));
+ case NVME_TCP_PDU_TYPE_C2H_DATA:
+ return (nvmf_tcp_handle_c2h_data(qp, pdu));
+ case NVME_TCP_PDU_TYPE_R2T:
+ return (nvmf_tcp_handle_r2t(qp, pdu));
+ }
+}
+
+static void
+nvmf_tcp_receive(void *arg)
+{
+ struct nvmf_tcp_qpair *qp = arg;
+ struct socket *so = qp->so;
+ struct nvmf_tcp_rxpdu pdu;
+ struct nvme_tcp_common_pdu_hdr ch;
+ struct uio uio;
+ struct iovec iov[1];
+ struct mbuf *m, *n, *tail;
+ u_int avail, needed;
+ int error, flags, terror;
+ bool have_header;
+
+ m = tail = NULL;
+ have_header = false;
+ SOCKBUF_LOCK(&so->so_rcv);
+ while (!qp->rx_shutdown) {
+ /* Wait until there is enough data for the next step. */
+ if (so->so_error != 0 || so->so_rerror != 0) {
+ if (so->so_error != 0)
+ error = so->so_error;
+ else
+ error = so->so_rerror;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ error:
+ m_freem(m);
+ nvmf_qpair_error(&qp->qp, error);
+ SOCKBUF_LOCK(&so->so_rcv);
+ while (!qp->rx_shutdown)
+ cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
+ break;
+ }
+ avail = sbavail(&so->so_rcv);
+ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
+ if (!have_header && avail == 0)
+ error = 0;
+ else
+ error = ECONNRESET;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto error;
+ }
+ if (avail == 0 || (!have_header && avail < sizeof(ch))) {
+ cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
+ continue;
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ if (!have_header) {
+ KASSERT(m == NULL, ("%s: m != NULL but no header",
+ __func__));
+ memset(&uio, 0, sizeof(uio));
+ iov[0].iov_base = &ch;
+ iov[0].iov_len = sizeof(ch);
+ uio.uio_iov = iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_resid = sizeof(ch);
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_rw = UIO_READ;
+ flags = MSG_DONTWAIT | MSG_PEEK;
+
+ error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
+ if (error != 0)
+ goto error;
+ KASSERT(uio.uio_resid == 0, ("%s: short CH read",
+ __func__));
+
+ have_header = true;
+ needed = le32toh(ch.plen);
+
+ /*
+ * Malformed PDUs will be reported as errors
+ * by nvmf_tcp_validate_pdu. Just pass along
+ * garbage headers if the lengths mismatch.
+ */
+ if (needed < sizeof(ch) || ch.hlen > needed)
+ needed = sizeof(ch);
+
+ memset(&uio, 0, sizeof(uio));
+ uio.uio_resid = needed;
+ }
+
+ flags = MSG_DONTWAIT;
+ error = soreceive(so, NULL, &uio, &n, NULL, &flags);
+ if (error != 0)
+ goto error;
+
+ if (m == NULL)
+ m = n;
+ else
+ tail->m_next = n;
+
+ if (uio.uio_resid != 0) {
+ tail = n;
+ while (tail->m_next != NULL)
+ tail = tail->m_next;
+
+ SOCKBUF_LOCK(&so->so_rcv);
+ continue;
+ }
+#ifdef INVARIANTS
+ tail = NULL;
+#endif
+
+ pdu.m = m;
+ m = NULL;
+ pdu.hdr = &ch;
+ error = nvmf_tcp_validate_pdu(qp, &pdu);
+ if (error != 0)
+ m_freem(pdu.m);
+ else
+ error = nvmf_tcp_dispatch_pdu(qp, &ch, &pdu);
+ if (error != 0) {
+ /*
+ * If we received a termination request, close
+ * the connection immediately.
+ */
+ if (error == ECONNRESET)
+ goto error;
+
+ /*
+ * Wait for up to 30 seconds for the socket to
+ * be closed by the other end.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
+ terror = cv_timedwait(&qp->rx_cv,
+ SOCKBUF_MTX(&so->so_rcv), 30 * hz);
+ if (terror == ETIMEDOUT)
+ printf("NVMe/TCP: Timed out after sending terminate request\n");
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto error;
+ }
+
+ have_header = false;
+ SOCKBUF_LOCK(&so->so_rcv);
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ kthread_exit();
+}
+
+static struct mbuf *
+tcp_command_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
+{
+ struct nvmf_capsule *nc = &tc->nc;
+ struct nvmf_tcp_command_buffer *cb;
+ struct nvme_sgl_descriptor *sgl;
+ struct nvme_tcp_cmd cmd;
+ struct mbuf *top, *m;
+ bool use_icd;
+
+ use_icd = false;
+ cb = NULL;
+ m = NULL;
+
+ if (nc->nc_data.io_len != 0) {
+ cb = tcp_alloc_command_buffer(qp, &nc->nc_data, 0,
+ nc->nc_data.io_len, nc->nc_sqe.cid);
+
+ if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) {
+ use_icd = true;
+ m = nvmf_tcp_command_buffer_mbuf(cb, 0,
+ nc->nc_data.io_len, NULL, false);
+ cb->data_xfered = nc->nc_data.io_len;
+ tcp_release_command_buffer(cb);
+ } else if (nc->nc_send_data) {
+ mtx_lock(&qp->tx_buffers.lock);
+ tcp_add_command_buffer(&qp->tx_buffers, cb);
+ mtx_unlock(&qp->tx_buffers.lock);
+ } else {
+ mtx_lock(&qp->rx_buffers.lock);
+ tcp_add_command_buffer(&qp->rx_buffers, cb);
+ mtx_unlock(&qp->rx_buffers.lock);
+ }
+ }
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
+ cmd.ccsqe = nc->nc_sqe;
+
+ /* Populate SGL in SQE. */
+ sgl = &cmd.ccsqe.sgl;
+ memset(sgl, 0, sizeof(*sgl));
+ sgl->address = 0;
+ sgl->length = htole32(nc->nc_data.io_len);
+ if (use_icd) {
+ /* Use in-capsule data. */
+ sgl->type = NVME_SGL_TYPE_ICD;
+ } else {
+ /* Use a command buffer. */
+ sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
+ }
+
+ top = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ?
+ nc->nc_data.io_len : 0);
+ return (top);
+}
+
+static struct mbuf *
+tcp_response_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
+{
+ struct nvmf_capsule *nc = &tc->nc;
+ struct nvme_tcp_rsp rsp;
+
+ memset(&rsp, 0, sizeof(rsp));
+ rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
+ rsp.rccqe = nc->nc_cqe;
+
+ return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
+}
+
+static struct mbuf *
+capsule_to_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc)
+{
+ if (tc->nc.nc_qe_len == sizeof(struct nvme_command))
+ return (tcp_command_pdu(qp, tc));
+ else
+ return (tcp_response_pdu(qp, tc));
+}
+
+static void
+nvmf_tcp_send(void *arg)
+{
+ struct nvmf_tcp_qpair *qp = arg;
+ struct nvmf_tcp_capsule *tc;
+ struct socket *so = qp->so;
+ struct mbuf *m, *n, *p;
+ u_long space, tosend;
+ int error;
+
+ m = NULL;
+ SOCKBUF_LOCK(&so->so_snd);
+ while (!qp->tx_shutdown) {
+ if (so->so_error != 0) {
+ error = so->so_error;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error:
+ m_freem(m);
+ nvmf_qpair_error(&qp->qp, error);
+ SOCKBUF_LOCK(&so->so_snd);
+ while (!qp->tx_shutdown)
+ cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
+ break;
+ }
+
+ if (m == NULL) {
+ /* Next PDU to send. */
+ m = mbufq_dequeue(&qp->tx_pdus);
+ }
+ if (m == NULL) {
+ if (STAILQ_EMPTY(&qp->tx_capsules)) {
+ cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
+ continue;
+ }
+
+ /* Convert a capsule into a PDU. */
+ tc = STAILQ_FIRST(&qp->tx_capsules);
+ STAILQ_REMOVE_HEAD(&qp->tx_capsules, link);
+ SOCKBUF_UNLOCK(&so->so_snd);
+
+ n = capsule_to_pdu(qp, tc);
+ tcp_release_capsule(tc);
+
+ SOCKBUF_LOCK(&so->so_snd);
+ mbufq_enqueue(&qp->tx_pdus, n);
+ continue;
+ }
+
+ /*
+ * Wait until there is enough room to send some data.
+ * If the socket buffer is empty, always send at least
+ * something.
+ */
+ space = sbspace(&so->so_snd);
+ if (space < m->m_len && sbused(&so->so_snd) != 0) {
+ cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
+ continue;
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+
+ /*
+ * If 'm' is too big, then the socket buffer must be
+ * empty. Split 'm' to make at least some forward
+ * progress.
+ *
+ * Otherwise, chain up as many pending mbufs from 'm'
+ * that will fit.
+ */
+ if (m->m_len > space) {
+ n = m_split(m, space, M_WAITOK);
+ } else {
+ tosend = m->m_len;
+ n = m->m_next;
+ p = m;
+ while (n != NULL && tosend + n->m_len <= space) {
+ tosend += n->m_len;
+ p = n;
+ n = n->m_next;
+ }
+ KASSERT(p->m_next == n, ("%s: p not before n",
+ __func__));
+ p->m_next = NULL;
+
+ KASSERT(m_length(m, NULL) == tosend,
+ ("%s: length mismatch", __func__));
+ }
+ error = sosend(so, NULL, NULL, m, NULL, MSG_DONTWAIT, NULL);
+ if (error != 0) {
+ m = NULL;
+ m_freem(n);
+ goto error;
+ }
+ m = n;
+ SOCKBUF_LOCK(&so->so_snd);
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+ kthread_exit();
+}
+
+static int
+nvmf_soupcall_receive(struct socket *so, void *arg, int waitflag)
+{
+ struct nvmf_tcp_qpair *qp = arg;
+
+ if (soreadable(so))
+ cv_signal(&qp->rx_cv);
+ return (SU_OK);
+}
+
+static int
+nvmf_soupcall_send(struct socket *so, void *arg, int waitflag)
+{
+ struct nvmf_tcp_qpair *qp = arg;
+
+ if (sowriteable(so))
+ cv_signal(&qp->tx_cv);
+ return (SU_OK);
+}
+
+static struct nvmf_qpair *
+tcp_allocate_qpair(bool controller,
+ const struct nvmf_handoff_qpair_params *params)
+{
+ struct nvmf_tcp_qpair *qp;
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget(curthread, params->tcp.fd, cap_rights_init_one(&rights,
+ CAP_SOCK_CLIENT), &fp);
+ if (error != 0)
+ return (NULL);
+ if (fp->f_type != DTYPE_SOCKET) {
+ fdrop(fp, curthread);
+ return (NULL);
+ }
+ so = fp->f_data;
+ if (so->so_type != SOCK_STREAM ||
+ so->so_proto->pr_protocol != IPPROTO_TCP) {
+ fdrop(fp, curthread);
+ return (NULL);
+ }
+
+ /* Claim socket from file descriptor. */
+ fp->f_ops = &badfileops;
+ fp->f_data = NULL;
+ fdrop(fp, curthread);
+
+ qp = malloc(sizeof(*qp), M_NVMF_TCP, M_WAITOK | M_ZERO);
+ qp->so = so;
+ refcount_init(&qp->refs, 1);
+ qp->txpda = params->tcp.txpda;
+ qp->rxpda = params->tcp.rxpda;
+ qp->header_digests = params->tcp.header_digests;
+ qp->data_digests = params->tcp.data_digests;
+ qp->maxr2t = params->tcp.maxr2t;
+ qp->maxh2cdata = params->tcp.maxh2cdata;
+ qp->max_tx_data = tcp_max_transmit_data;
+ if (!controller) {
+ if (qp->max_tx_data > params->tcp.maxh2cdata)
+ qp->max_tx_data = params->tcp.maxh2cdata;
+ }
+ qp->max_icd = params->tcp.max_icd;
+
+ if (controller) {
+ /* Use the SUCCESS flag if SQ flow control is disabled. */
+ qp->send_success = !params->sq_flow_control;
+
+ /* NB: maxr2t is 0's based. */
+ qp->num_ttags = MIN((u_int)UINT16_MAX + 1,
+ (uint64_t)params->qsize * (uint64_t)qp->maxr2t + 1);
+ qp->open_ttags = mallocarray(qp->num_ttags,
+ sizeof(*qp->open_ttags), M_NVMF_TCP, M_WAITOK | M_ZERO);
+ }
+
+ TAILQ_INIT(&qp->rx_buffers.head);
+ TAILQ_INIT(&qp->tx_buffers.head);
+ mtx_init(&qp->rx_buffers.lock, "nvmf/tcp rx buffers", NULL, MTX_DEF);
+ mtx_init(&qp->tx_buffers.lock, "nvmf/tcp tx buffers", NULL, MTX_DEF);
+
+ cv_init(&qp->rx_cv, "-");
+ cv_init(&qp->tx_cv, "-");
+ mbufq_init(&qp->tx_pdus, 0);
+ STAILQ_INIT(&qp->tx_capsules);
+
+ /* Register socket upcalls. */
+ SOCKBUF_LOCK(&so->so_rcv);
+ soupcall_set(so, SO_RCV, nvmf_soupcall_receive, qp);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_LOCK(&so->so_snd);
+ soupcall_set(so, SO_SND, nvmf_soupcall_send, qp);
+ SOCKBUF_UNLOCK(&so->so_snd);
+
+ /* Spin up kthreads. */
+ error = kthread_add(nvmf_tcp_receive, qp, NULL, &qp->rx_thread, 0, 0,
+ "nvmef tcp rx");
+ if (error != 0) {
+ tcp_free_qpair(&qp->qp);
+ return (NULL);
+ }
+ error = kthread_add(nvmf_tcp_send, qp, NULL, &qp->tx_thread, 0, 0,
+ "nvmef tcp tx");
+ if (error != 0) {
+ tcp_free_qpair(&qp->qp);
+ return (NULL);
+ }
+
+ return (&qp->qp);
+}
+
+static void
+tcp_release_qpair(struct nvmf_tcp_qpair *qp)
+{
+ if (refcount_release(&qp->refs))
+ free(qp, M_NVMF_TCP);
+}
+
+static void
+tcp_free_qpair(struct nvmf_qpair *nq)
+{
+ struct nvmf_tcp_qpair *qp = TQP(nq);
+ struct nvmf_tcp_command_buffer *ncb, *cb;
+ struct nvmf_tcp_capsule *ntc, *tc;
+ struct socket *so = qp->so;
+
+ /* Shut down kthreads and clear upcalls */
+ SOCKBUF_LOCK(&so->so_snd);
+ qp->tx_shutdown = true;
+ if (qp->tx_thread != NULL) {
+ cv_signal(&qp->tx_cv);
+ mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0,
+ "nvtcptx", 0);
+ }
+ soupcall_clear(so, SO_SND);
+ SOCKBUF_UNLOCK(&so->so_snd);
+
+ SOCKBUF_LOCK(&so->so_rcv);
+ qp->rx_shutdown = true;
+ if (qp->rx_thread != NULL) {
+ cv_signal(&qp->rx_cv);
+ mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0,
+ "nvtcprx", 0);
+ }
+ soupcall_clear(so, SO_RCV);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ STAILQ_FOREACH_SAFE(tc, &qp->tx_capsules, link, ntc) {
+ nvmf_abort_capsule_data(&tc->nc, ECONNABORTED);
+ tcp_release_capsule(tc);
+ }
+ mbufq_drain(&qp->tx_pdus);
+
+ cv_destroy(&qp->tx_cv);
+ cv_destroy(&qp->rx_cv);
+
+ if (qp->open_ttags != NULL) {
+ for (u_int i = 0; i < qp->num_ttags; i++) {
+ cb = qp->open_ttags[i];
+ if (cb != NULL) {
+ cb->error = ECONNABORTED;
+ tcp_release_command_buffer(cb);
+ }
+ }
+ free(qp->open_ttags, M_NVMF_TCP);
+ }
+
+ mtx_lock(&qp->rx_buffers.lock);
+ TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) {
+ tcp_remove_command_buffer(&qp->rx_buffers, cb);
+ mtx_unlock(&qp->rx_buffers.lock);
+ cb->error = ECONNABORTED;
+ tcp_release_command_buffer(cb);
+ mtx_lock(&qp->rx_buffers.lock);
+ }
+ mtx_destroy(&qp->rx_buffers.lock);
+
+ mtx_lock(&qp->tx_buffers.lock);
+ TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) {
+ tcp_remove_command_buffer(&qp->tx_buffers, cb);
+ mtx_unlock(&qp->tx_buffers.lock);
+ cb->error = ECONNABORTED;
+ tcp_release_command_buffer(cb);
+ mtx_lock(&qp->tx_buffers.lock);
+ }
+ mtx_destroy(&qp->tx_buffers.lock);
+
+ soclose(so);
+
+ tcp_release_qpair(qp);
+}
+
+static struct nvmf_capsule *
+tcp_allocate_capsule(struct nvmf_qpair *nq, int how)
+{
+ struct nvmf_tcp_qpair *qp = TQP(nq);
+ struct nvmf_tcp_capsule *tc;
+
+ tc = malloc(sizeof(*tc), M_NVMF_TCP, how | M_ZERO);
+ if (tc == NULL)
+ return (NULL);
+ refcount_init(&tc->refs, 1);
+ refcount_acquire(&qp->refs);
+ return (&tc->nc);
+}
+
+static void
+tcp_release_capsule(struct nvmf_tcp_capsule *tc)
+{
+ struct nvmf_tcp_qpair *qp = TQP(tc->nc.nc_qpair);
+
+ if (!refcount_release(&tc->refs))
+ return;
+
+ MPASS(tc->active_r2ts == 0);
+ MPASS(tc->pending_r2ts == 0);
+
+ nvmf_tcp_free_pdu(&tc->rx_pdu);
+ free(tc, M_NVMF_TCP);
+ tcp_release_qpair(qp);
+}
+
+static void
+tcp_free_capsule(struct nvmf_capsule *nc)
+{
+ struct nvmf_tcp_capsule *tc = TCAP(nc);
+
+ tcp_release_capsule(tc);
+}
+
+static int
+tcp_transmit_capsule(struct nvmf_capsule *nc)
+{
+ struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
+ struct nvmf_tcp_capsule *tc = TCAP(nc);
+ struct socket *so = qp->so;
+
+ refcount_acquire(&tc->refs);
+ SOCKBUF_LOCK(&so->so_snd);
+ STAILQ_INSERT_TAIL(&qp->tx_capsules, tc, link);
+ if (sowriteable(so))
+ cv_signal(&qp->tx_cv);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (0);
+}
+
+static uint8_t
+tcp_validate_command_capsule(struct nvmf_capsule *nc)
+{
+ struct nvmf_tcp_capsule *tc = TCAP(nc);
+ struct nvme_sgl_descriptor *sgl;
+
+ KASSERT(tc->rx_pdu.hdr != NULL, ("capsule wasn't received"));
+
+ sgl = &nc->nc_sqe.sgl;
+ switch (sgl->type) {
+ case NVME_SGL_TYPE_ICD:
+ if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
+ printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
+ return (NVME_SC_DATA_SGL_LENGTH_INVALID);
+ }
+ break;
+ case NVME_SGL_TYPE_COMMAND_BUFFER:
+ if (tc->rx_pdu.data_len != 0) {
+ printf("NVMe/TCP: Command Buffer SGL with ICD\n");
+ return (NVME_SC_INVALID_FIELD);
+ }
+ break;
+ default:
+ printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
+ return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
+ }
+
+ if (sgl->address != 0) {
+ printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
+ return (NVME_SC_SGL_OFFSET_INVALID);
+ }
+
+ return (NVME_SC_SUCCESS);
+}
+
+static size_t
+tcp_capsule_data_len(const struct nvmf_capsule *nc)
+{
+ MPASS(nc->nc_qe_len == sizeof(struct nvme_command));
+ return (le32toh(nc->nc_sqe.sgl.length));
+}
+
+static void
+tcp_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset,
+ struct nvmf_io_request *io)
+{
+ struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
+ struct nvmf_tcp_capsule *tc = TCAP(nc);
+ struct nvmf_tcp_command_buffer *cb;
+
+ cb = tcp_alloc_command_buffer(qp, io, data_offset, io->io_len,
+ nc->nc_sqe.cid);
+
+ cb->tc = tc;
+ refcount_acquire(&tc->refs);
+
+ /*
+ * If this command has too many active R2Ts or there are no
+ * available transfer tags, queue the request for later.
+ *
+ * NB: maxr2t is 0's based.
+ */
+ mtx_lock(&qp->rx_buffers.lock);
+ if (tc->active_r2ts > qp->maxr2t || qp->active_ttags == qp->num_ttags) {
+#ifdef INVARIANTS
+ tc->pending_r2ts++;
+#endif
+ TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link);
+ mtx_unlock(&qp->rx_buffers.lock);
+ return;
+ }
+
+ nvmf_tcp_allocate_ttag(qp, cb);
+ mtx_unlock(&qp->rx_buffers.lock);
+
+ tcp_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len);
+}
+
+static void
+tcp_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset,
+ struct nvmf_io_request *io)
+{
+ struct nvmf_tcp_capsule *tc = TCAP(nc);
+
+ mbuf_copyto_io(tc->rx_pdu.m, tc->rx_pdu.hdr->pdo + data_offset,
+ io->io_len, io, 0);
+ nvmf_complete_io_request(io, io->io_len, 0);
+}
+
+static int
+tcp_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
+ struct nvmf_io_request *io)
+{
+ struct nvme_sgl_descriptor *sgl;
+ size_t data_len;
+
+ if (nc->nc_qe_len != sizeof(struct nvme_command) ||
+ !nc->nc_qpair->nq_controller)
+ return (EINVAL);
+
+ sgl = &nc->nc_sqe.sgl;
+ data_len = le32toh(sgl->length);
+ if (data_offset + io->io_len > data_len)
+ return (EFBIG);
+
+ if (sgl->type == NVME_SGL_TYPE_ICD)
+ tcp_receive_icd_data(nc, data_offset, io);
+ else
+ tcp_receive_r2t_data(nc, data_offset, io);
+ return (0);
+}
+
+/* NB: cid is little-endian already. */
+static void
+tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint32_t data_offset,
+ struct mbuf *m, size_t len, bool last_pdu, bool success)
+{
+ struct nvme_tcp_c2h_data_hdr c2h;
+ struct mbuf *top;
+
+ memset(&c2h, 0, sizeof(c2h));
+ c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
+ if (last_pdu)
+ c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
+ if (success)
+ c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
+ c2h.cccid = cid;
+ c2h.datao = htole32(data_offset);
+ c2h.datal = htole32(len);
+
+ top = nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h), m, len);
+ nvmf_tcp_write_pdu(qp, top);
+}
+
+static u_int
+tcp_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
+ struct mbuf *m, size_t len)
+{
+ struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
+ struct nvme_sgl_descriptor *sgl;
+ struct mbuf *n, *p;
+ uint32_t data_len;
+ bool last_pdu, last_xfer;
+
+ if (nc->nc_qe_len != sizeof(struct nvme_command) ||
+ !qp->qp.nq_controller) {
+ m_freem(m);
+ return (NVME_SC_INVALID_FIELD);
+ }
+
+ sgl = &nc->nc_sqe.sgl;
+ data_len = le32toh(sgl->length);
+ if (data_offset + len > data_len) {
+ m_freem(m);
+ return (NVME_SC_INVALID_FIELD);
+ }
+ last_xfer = (data_offset + len == data_len);
+
+ if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
+ m_freem(m);
+ return (NVME_SC_INVALID_FIELD);
+ }
+
+ KASSERT(data_offset == TCAP(nc)->tx_data_offset,
+ ("%s: starting data_offset %u doesn't match end of previous xfer %u",
+ __func__, data_offset, TCAP(nc)->tx_data_offset));
+
+	/* Queue one or more C2H_DATA PDUs containing the data from 'm'. */
+ while (m != NULL) {
+ uint32_t todo;
+
+ todo = m->m_len;
+ p = m;
+ n = p->m_next;
+ while (n != NULL) {
+ if (todo + n->m_len > qp->max_tx_data) {
+ p->m_next = NULL;
+ break;
+ }
+ todo += n->m_len;
+ p = n;
+ n = p->m_next;
+ }
+ MPASS(m_length(m, NULL) == todo);
+
+ last_pdu = (n == NULL && last_xfer);
+ tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo,
+ last_pdu, last_pdu && qp->send_success);
+
+ data_offset += todo;
+ data_len -= todo;
+ m = n;
+ }
+ MPASS(data_len == 0);
+
+#ifdef INVARIANTS
+ TCAP(nc)->tx_data_offset = data_offset;
+#endif
+ if (!last_xfer)
+ return (NVMF_MORE);
+ else if (qp->send_success)
+ return (NVMF_SUCCESS_SENT);
+ else
+ return (NVME_SC_SUCCESS);
+}
+
+struct nvmf_transport_ops tcp_ops = {
+ .allocate_qpair = tcp_allocate_qpair,
+ .free_qpair = tcp_free_qpair,
+ .allocate_capsule = tcp_allocate_capsule,
+ .free_capsule = tcp_free_capsule,
+ .transmit_capsule = tcp_transmit_capsule,
+ .validate_command_capsule = tcp_validate_command_capsule,
+ .capsule_data_len = tcp_capsule_data_len,
+ .receive_controller_data = tcp_receive_controller_data,
+ .send_controller_data = tcp_send_controller_data,
+ .trtype = NVMF_TRTYPE_TCP,
+ .priority = 0,
+};
+
+NVMF_TRANSPORT(tcp, tcp_ops);
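
The header and data digests handled above by compute_digest() and mbuf_crc32c() follow the NVMe/TCP convention: CRC-32C seeded with 0xffffffff and XORed with 0xffffffff at the end, which matches the standard CRC-32C. The standalone sketch below is illustrative only and not part of the patch; crc32c_update() is a hypothetical bit-at-a-time stand-in for the kernel's calculate_crc32c(), and digest() mirrors compute_digest().

/*
 * Illustrative only (not part of the patch): the digest convention used
 * by compute_digest() above, i.e. standard CRC-32C (seed 0xffffffff,
 * final XOR 0xffffffff), with a bit-at-a-time CRC in place of the
 * kernel's calculate_crc32c().
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
crc32c_update(uint32_t crc, const void *buf, size_t len)
{
        const uint8_t *p = buf;

        while (len-- > 0) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ (0x82f63b78 & -(crc & 1));
        }
        return (crc);
}

static uint32_t
digest(const void *buf, size_t len)
{
        return (crc32c_update(0xffffffff, buf, len) ^ 0xffffffff);
}

int
main(void)
{
        const char *check = "123456789";

        /* The well-known CRC-32C check value for this string is 0xe3069283. */
        printf("%#x\n", digest(check, strlen(check)));
        return (0);
}
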
diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile
--- a/sys/modules/nvmf/Makefile
+++ b/sys/modules/nvmf/Makefile
@@ -1,3 +1,4 @@
-SUBDIR= nvmf_transport
+SUBDIR= nvmf_tcp \
+ nvmf_transport
.include <bsd.subdir.mk>
diff --git a/sys/modules/nvmf/nvmf_tcp/Makefile b/sys/modules/nvmf/nvmf_tcp/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/nvmf/nvmf_tcp/Makefile
@@ -0,0 +1,7 @@
+.PATH: ${SRCTOP}/sys/dev/nvmf
+
+KMOD= nvmf_tcp
+
+SRCS= nvmf_tcp.c
+
+.include <bsd.kmod.mk>
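
One further illustrative note on the data path in nvmf_tcp.c above: both nvmf_tcp_handle_r2t() and tcp_send_controller_data() split a transfer into data PDUs of roughly qp->max_tx_data bytes each (initialized from the kern.nvmf.tcp.max_c2hdata tunable and, on the host side, further capped by the controller's MAXH2CDATA), marking the final PDU of the transfer as LAST. The sketch below shows only that chunking arithmetic; it ignores the page-boundary truncation that nvmf_tcp_command_buffer_mbuf() may apply, and send_chunks()/print_pdu() are hypothetical names, not part of the patch.

/*
 * Illustrative only (not part of the patch): the chunking arithmetic used
 * when queueing H2C_DATA/C2H_DATA PDUs, mirroring the loop in
 * nvmf_tcp_handle_r2t().
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static void
send_chunks(uint32_t data_offset, uint32_t data_len, uint32_t max_tx_data,
    void (*send_pdu)(uint32_t off, uint32_t len, bool last))
{
        while (data_len > 0) {
                uint32_t todo = data_len;

                if (todo > max_tx_data)
                        todo = max_tx_data;
                send_pdu(data_offset, todo, todo == data_len);
                data_offset += todo;
                data_len -= todo;
        }
}

static void
print_pdu(uint32_t off, uint32_t len, bool last)
{
        printf("PDU: offset %u, length %u%s\n", off, len,
            last ? " (LAST)" : "");
}

int
main(void)
{
        /* A 1 MiB transfer with the default 256 KiB cap yields four PDUs. */
        send_chunks(0, 1024 * 1024, 256 * 1024, print_pdu);
        return (0);
}
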