D44710.diff

diff --git a/lib/Makefile b/lib/Makefile
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -78,6 +78,7 @@
libnetbsd \
libnetmap \
libnv \
+ libnvmf \
libopenbsd \
libpam \
libpathconv \
diff --git a/lib/libnvmf/Makefile b/lib/libnvmf/Makefile
new file mode 100644
--- /dev/null
+++ b/lib/libnvmf/Makefile
@@ -0,0 +1,22 @@
+.PATH: ${SRCTOP}/sys/dev/nvmf/controller
+.PATH: ${SRCTOP}/sys/libkern
+
+LIB= nvmf
+INTERNALLIB=
+PACKAGE= nvmf
+
+INCS= libnvmf.h
+
+SRCS= gsb_crc32.c \
+ nvmf_controller.c \
+ nvmf_host.c \
+ nvmf_tcp.c \
+ nvmf_transport.c \
+ nvmft_subr.c
+
+CFLAGS+= -I${SRCTOP}/sys/dev/nvmf/controller
+CFLAGS+= -I${SRCTOP}/sys/dev/nvmf
+
+.include <bsd.lib.mk>
+
+CWARNFLAGS.gsb_crc32.c= -Wno-cast-align
diff --git a/lib/libnvmf/internal.h b/lib/libnvmf/internal.h
new file mode 100644
--- /dev/null
+++ b/lib/libnvmf/internal.h
@@ -0,0 +1,116 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __LIBNVMF_INTERNAL_H__
+#define __LIBNVMF_INTERNAL_H__
+
+#include <sys/queue.h>
+
+struct nvmf_transport_ops {
+ /* Association management. */
+ struct nvmf_association *(*allocate_association)(bool controller,
+ const struct nvmf_association_params *params);
+ void (*update_association)(struct nvmf_association *na,
+ const struct nvme_controller_data *cdata);
+ void (*free_association)(struct nvmf_association *na);
+
+ /* Queue pair management. */
+ struct nvmf_qpair *(*allocate_qpair)(struct nvmf_association *na,
+ const struct nvmf_qpair_params *params);
+ void (*free_qpair)(struct nvmf_qpair *qp);
+
+ /* Create params for kernel handoff. */
+ int (*kernel_handoff_params)(struct nvmf_qpair *qp,
+ struct nvmf_handoff_qpair_params *qparams);
+
+ /* Capsule operations. */
+ struct nvmf_capsule *(*allocate_capsule)(struct nvmf_qpair *qp);
+ void (*free_capsule)(struct nvmf_capsule *nc);
+ int (*transmit_capsule)(struct nvmf_capsule *nc);
+ int (*receive_capsule)(struct nvmf_qpair *qp,
+ struct nvmf_capsule **ncp);
+ uint8_t (*validate_command_capsule)(const struct nvmf_capsule *nc);
+
+ /* Transferring controller data. */
+ size_t (*capsule_data_len)(const struct nvmf_capsule *nc);
+ int (*receive_controller_data)(const struct nvmf_capsule *nc,
+ uint32_t data_offset, void *buf, size_t len);
+ int (*send_controller_data)(const struct nvmf_capsule *nc,
+ const void *buf, size_t len);
+};
+
+struct nvmf_association {
+ struct nvmf_transport_ops *na_ops;
+ enum nvmf_trtype na_trtype;
+ bool na_controller;
+
+ struct nvmf_association_params na_params;
+
+ /* Each qpair holds a reference on an association. */
+ u_int na_refs;
+
+ char *na_last_error;
+};
+
+struct nvmf_qpair {
+ struct nvmf_association *nq_association;
+ bool nq_admin;
+
+ uint16_t nq_cid; /* host only */
+
+ /*
+ * Queue sizes. This assumes the same size for both the
+ * completion and submission queues within a pair.
+ */
+ u_int nq_qsize;
+
+ /* Flow control management for submission queues. */
+ bool nq_flow_control;
+ uint16_t nq_sqhd;
+ uint16_t nq_sqtail; /* host only */
+
+ /* Value in response to/from CONNECT. */
+ uint16_t nq_cntlid;
+
+ uint32_t nq_kato; /* valid on admin queue only */
+
+ TAILQ_HEAD(, nvmf_capsule) nq_rx_capsules;
+};
+
+struct nvmf_capsule {
+ struct nvmf_qpair *nc_qpair;
+
+ /* Either a SQE or CQE. */
+ union {
+ struct nvme_command nc_sqe;
+ struct nvme_completion nc_cqe;
+ };
+ int nc_qe_len;
+
+ /*
+ * Is SQHD in received capsule valid? False for locally-
+ * synthesized responses.
+ */
+ bool nc_sqhd_valid;
+
+ /* Data buffer. */
+ bool nc_send_data;
+ void *nc_data;
+ size_t nc_data_len;
+
+ TAILQ_ENTRY(nvmf_capsule) nc_link;
+};
+
+extern struct nvmf_transport_ops tcp_ops;
+
+void na_clear_error(struct nvmf_association *na);
+void na_error(struct nvmf_association *na, const char *fmt, ...);
+
+int nvmf_kernel_handoff_params(struct nvmf_qpair *qp,
+ struct nvmf_handoff_qpair_params *qparams);
+
+#endif /* !__LIBNVMF_INTERNAL_H__ */
diff --git a/lib/libnvmf/libnvmf.h b/lib/libnvmf/libnvmf.h
new file mode 100644
--- /dev/null
+++ b/lib/libnvmf/libnvmf.h
@@ -0,0 +1,363 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __LIBNVMF_H__
+#define __LIBNVMF_H__
+
+#include <sys/uio.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_proto.h>
+
+struct nvmf_capsule;
+struct nvmf_association;
+struct nvmf_qpair;
+
+/*
+ * Parameters shared by all queue-pairs of an association. Note that
+ * this contains the requested values used to initiate transport
+ * negotiation.
+ */
+struct nvmf_association_params {
+ bool sq_flow_control; /* SQ flow control required. */
+ bool dynamic_controller_model; /* Controller only */
+ uint16_t max_admin_qsize; /* Controller only */
+ uint32_t max_io_qsize; /* Controller only, 0 for discovery */
+ union {
+ struct {
+ uint8_t pda; /* Tx-side PDA. */
+ bool header_digests;
+ bool data_digests;
+ uint32_t maxr2t; /* Host only */
+ uint32_t maxh2cdata; /* Controller only */
+ } tcp;
+ };
+};
+
+/* Parameters specific to a single queue pair of an association. */
+struct nvmf_qpair_params {
+ bool admin; /* Host only */
+ union {
+ struct {
+ int fd;
+ } tcp;
+ };
+};
+
+/* Transport-independent APIs. */
+
+/*
+ * A host should allocate a new association for each association with
+ * a controller. After the admin queue has been allocated and the
+ * controller's data has been fetched, it should be passed to
+ * nvmf_update_association to update internal transport-specific
+ * parameters before allocating I/O queues.
+ *
+ * A controller uses a single association to manage all incoming
+ * queues since it is not known until after parsing the CONNECT
+ * command which transport queues are admin vs I/O and which
+ * controller they are created against.
+ */
+struct nvmf_association *nvmf_allocate_association(enum nvmf_trtype trtype,
+ bool controller, const struct nvmf_association_params *params);
+void nvmf_update_assocation(struct nvmf_association *na,
+ const struct nvme_controller_data *cdata);
+void nvmf_free_association(struct nvmf_association *na);
+
+/* The most recent association-wide error message. */
+const char *nvmf_association_error(const struct nvmf_association *na);
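+
+/*
+ * Example (illustrative sketch only; error handling is abbreviated and
+ * the TCP parameter values are arbitrary): a host creates one
+ * association per remote controller before connecting any queues.
+ *
+ *   struct nvmf_association_params aparams;
+ *   struct nvmf_association *na;
+ *
+ *   memset(&aparams, 0, sizeof(aparams));
+ *   aparams.sq_flow_control = false;
+ *   aparams.tcp.pda = 0;
+ *   aparams.tcp.header_digests = true;
+ *   aparams.tcp.data_digests = true;
+ *   aparams.tcp.maxr2t = 1;
+ *   na = nvmf_allocate_association(NVMF_TRTYPE_TCP, false, &aparams);
+ *   if (na == NULL)
+ *      err(1, "nvmf_allocate_association");
+ */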
+
+/*
+ * A queue pair represents either an Admin or I/O
+ * submission/completion queue pair.
+ *
+ * Each open qpair holds a reference on its association. Once queue
+ * pairs are allocated, callers can safely free the association to
+ * ease bookkeeping.
+ *
+ * If nvmf_allocate_qpair fails, a detailed error message can be obtained
+ * from nvmf_association_error.
+ */
+struct nvmf_qpair *nvmf_allocate_qpair(struct nvmf_association *na,
+ const struct nvmf_qpair_params *params);
+void nvmf_free_qpair(struct nvmf_qpair *qp);
+
+/*
+ * Capsules are either commands (host -> controller) or responses
+ * (controller -> host). A single data buffer segment may be
+ * associated with a command capsule. Transmitted data is not copied
+ * by this API but instead must be preserved until the capsule is
+ * transmitted and freed.
+ */
+struct nvmf_capsule *nvmf_allocate_command(struct nvmf_qpair *qp,
+ const void *sqe);
+struct nvmf_capsule *nvmf_allocate_response(struct nvmf_qpair *qp,
+ const void *cqe);
+void nvmf_free_capsule(struct nvmf_capsule *nc);
+int nvmf_capsule_append_data(struct nvmf_capsule *nc,
+ void *buf, size_t len, bool send);
+int nvmf_transmit_capsule(struct nvmf_capsule *nc);
+int nvmf_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp);
+const void *nvmf_capsule_sqe(const struct nvmf_capsule *nc);
+const void *nvmf_capsule_cqe(const struct nvmf_capsule *nc);
+
+/* Return a string name for a transport type. */
+const char *nvmf_transport_type(uint8_t trtype);
+
+/* Validate an NVMe Qualified Name. */
+bool nvmf_nqn_valid(const char *nqn);
+
+/* Controller-specific APIs. */
+
+/*
+ * A controller calls this function to check for any
+ * transport-specific errors (invalid fields) in a received command
+ * capsule.  It returns a generic command status value:
+ * NVME_SC_SUCCESS if no error is found.
+ */
+uint8_t nvmf_validate_command_capsule(const struct nvmf_capsule *nc);
+
+/*
+ * A controller calls this function to query the amount of data
+ * associated with a command capsule.
+ */
+size_t nvmf_capsule_data_len(const struct nvmf_capsule *cc);
+
+/*
+ * A controller calls this function to receive data associated with a
+ * command capsule (e.g. the data for a WRITE command). This can
+ * either return in-capsule data or fetch data from the host
+ * (e.g. using a R2T PDU over TCP). The received command capsule
+ * should be passed in 'nc'. The received data is stored in '*buf'.
+ */
+int nvmf_receive_controller_data(const struct nvmf_capsule *nc,
+ uint32_t data_offset, void *buf, size_t len);
+
+/*
+ * A controller calls this function to send data in response to a
+ * command along with a response capsule. If the data transfer
+ * succeeds, a success response is sent. If the data transfer fails,
+ * an appropriate error status capsule is sent. Regardless, a
+ * response capsule is always sent.
+ */
+int nvmf_send_controller_data(const struct nvmf_capsule *nc,
+ const void *buf, size_t len);
+
+/*
+ * Construct a CQE for a reply to a command capsule in 'nc' with the
+ * completion status 'status'. This is useful when additional CQE
+ * info is required beyond the completion status.
+ */
+void nvmf_init_cqe(void *cqe, const struct nvmf_capsule *nc,
+ uint16_t status);
+
+/*
+ * Construct and send a response capsule to a command capsule with
+ * the supplied CQE.
+ */
+int nvmf_send_response(const struct nvmf_capsule *nc, const void *cqe);
+
+/*
+ * Wait for a single command capsule and return it in *ncp. This can
+ * fail if an invalid capsule is received or an I/O error occurs.
+ */
+int nvmf_controller_receive_capsule(struct nvmf_qpair *qp,
+ struct nvmf_capsule **ncp);
+
+/* Send a response capsule from a controller. */
+int nvmf_controller_transmit_response(struct nvmf_capsule *nc);
+
+/* Construct and send an error response capsule. */
+int nvmf_send_error(const struct nvmf_capsule *cc, uint8_t sc_type,
+ uint8_t sc_status);
+
+/*
+ * Construct and send an error response capsule using a generic status
+ * code.
+ */
+int nvmf_send_generic_error(const struct nvmf_capsule *nc,
+ uint8_t sc_status);
+
+/* Construct and send a simple success response capsule. */
+int nvmf_send_success(const struct nvmf_capsule *nc);
+
+/*
+ * Allocate a new queue pair and wait for the CONNECT command capsule.
+ * If this fails, a detailed error message can be obtained from
+ * nvmf_association_error. On success, the command capsule is saved
+ * in '*ccp' and the connect data is saved in 'data'. The caller
+ * must send an explicit response and free the command capsule.
+ */
+struct nvmf_qpair *nvmf_accept(struct nvmf_association *na,
+ const struct nvmf_qpair_params *params, struct nvmf_capsule **ccp,
+ struct nvmf_fabric_connect_data *data);
+
+/*
+ * Construct and send a response capsule with the Fabrics CONNECT
+ * invalid parameters error status.  If 'data' is true, the offset is
+ * relative to the CONNECT data structure; otherwise the offset is
+ * relative to the SQE.
+ */
+void nvmf_connect_invalid_parameters(const struct nvmf_capsule *cc,
+ bool data, uint16_t offset);
+
+/* Construct and send a response capsule for a successful CONNECT. */
+int nvmf_finish_accept(const struct nvmf_capsule *cc, uint16_t cntlid);
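+
+/*
+ * Example (controller-side sketch; 'listen_fd' and the controller ID
+ * assignment are assumptions): accept a new queue pair, inspect the
+ * CONNECT data, and complete the handshake.
+ *
+ *   struct nvmf_qpair_params qparams;
+ *   struct nvmf_fabric_connect_data data;
+ *   struct nvmf_capsule *cc;
+ *   struct nvmf_qpair *qp;
+ *
+ *   memset(&qparams, 0, sizeof(qparams));
+ *   qparams.tcp.fd = accept(listen_fd, NULL, NULL);
+ *   qp = nvmf_accept(na, &qparams, &cc, &data);
+ *   if (qp == NULL)
+ *      errx(1, "%s", nvmf_association_error(na));
+ *   ... validate data.subnqn / data.hostnqn and choose 'cntlid' ...
+ *   nvmf_finish_accept(cc, cntlid);
+ *   nvmf_free_capsule(cc);
+ */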
+
+/* Compute the initial state of CAP for a controller. */
+uint64_t nvmf_controller_cap(struct nvmf_qpair *qp);
+
+/* Generate a serial number string from a host ID. */
+void nvmf_controller_serial(char *buf, size_t len, u_long hostid);
+
+/*
+ * Populate an Identify Controller data structure for a Discovery
+ * controller.
+ */
+void nvmf_init_discovery_controller_data(struct nvmf_qpair *qp,
+ struct nvme_controller_data *cdata);
+
+/*
+ * Populate an Identify Controller data structure for an I/O
+ * controller.
+ */
+void nvmf_init_io_controller_data(struct nvmf_qpair *qp, const char *serial,
+ const char *subnqn, int nn, uint32_t ioccsz,
+ struct nvme_controller_data *cdata);
+
+/*
+ * Validate if a new value for CC is legal given the existing values of
+ * CAP and CC.
+ */
+bool nvmf_validate_cc(struct nvmf_qpair *qp, uint64_t cap, uint32_t old_cc,
+ uint32_t new_cc);
+
+/* Return the log page id (LID) of a GET_LOG_PAGE command. */
+uint8_t nvmf_get_log_page_id(const struct nvme_command *cmd);
+
+/* Return the requested data length of a GET_LOG_PAGE command. */
+uint64_t nvmf_get_log_page_length(const struct nvme_command *cmd);
+
+/* Return the requested data offset of a GET_LOG_PAGE command. */
+uint64_t nvmf_get_log_page_offset(const struct nvme_command *cmd);
+
+/* Prepare to hand off a controller qpair. */
+int nvmf_handoff_controller_qpair(struct nvmf_qpair *qp,
+ struct nvmf_handoff_controller_qpair *h);
+
+/* Host-specific APIs. */
+
+/*
+ * Connect to an admin or I/O queue. If this fails, a detailed error
+ * message can be obtained from nvmf_association_error.
+ */
+struct nvmf_qpair *nvmf_connect(struct nvmf_association *na,
+ const struct nvmf_qpair_params *params, uint16_t qid, u_int queue_size,
+ const uint8_t hostid[16], uint16_t cntlid, const char *subnqn,
+ const char *hostnqn, uint32_t kato);
+
+/* Return the CNTLID for a queue returned from CONNECT. */
+uint16_t nvmf_cntlid(struct nvmf_qpair *qp);
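+
+/*
+ * Example (host-side sketch; 's' is assumed to be a TCP socket already
+ * connected to the remote port, and the queue size and KATO values are
+ * arbitrary): connect the Admin queue of a Discovery controller.
+ *
+ *   struct nvmf_qpair_params qparams;
+ *   struct nvmf_qpair *admin;
+ *   char hostnqn[NVMF_NQN_MAX_LEN];
+ *   uint8_t hostid[16];
+ *
+ *   if (nvmf_hostid_from_hostuuid(hostid) != 0 ||
+ *       nvmf_nqn_from_hostuuid(hostnqn) != 0)
+ *      errx(1, "failed to generate HostID/HostNQN");
+ *   memset(&qparams, 0, sizeof(qparams));
+ *   qparams.admin = true;
+ *   qparams.tcp.fd = s;
+ *   admin = nvmf_connect(na, &qparams, 0, 32, hostid,
+ *       NVMF_CNTLID_DYNAMIC, NVMF_DISCOVERY_NQN, hostnqn, 120000);
+ *   if (admin == NULL)
+ *      errx(1, "%s", nvmf_association_error(na));
+ */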
+
+/*
+ * Send a command to the controller. This can fail with EBUSY if the
+ * submission queue is full.
+ */
+int nvmf_host_transmit_command(struct nvmf_capsule *nc);
+
+/*
+ * Wait for a response to a command. If there are no outstanding
+ * commands in the SQ, this fails with EWOULDBLOCK.
+ */
+int nvmf_host_receive_response(struct nvmf_qpair *qp,
+ struct nvmf_capsule **rcp);
+
+/*
+ * Wait for a response to a specific command. The command must have been
+ * successfully sent previously.
+ */
+int nvmf_host_wait_for_response(struct nvmf_capsule *cc,
+ struct nvmf_capsule **rcp);
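+
+/*
+ * Example (illustrative sketch; the opcode, log page dwords, and buffer
+ * are placeholders): issue a command that returns data, wait for the
+ * matching completion, and check its status.
+ *
+ *   struct nvme_command cmd;
+ *   struct nvmf_capsule *cc, *rc;
+ *   const struct nvme_completion *cqe;
+ *   char buf[512];
+ *   int error;
+ *
+ *   memset(&cmd, 0, sizeof(cmd));
+ *   cmd.opc = NVME_OPC_GET_LOG_PAGE;
+ *   ... set cmd.cdw10/cdw11 for the desired log page and length ...
+ *   cc = nvmf_allocate_command(qp, &cmd);
+ *   if (cc == NULL)
+ *      err(1, "nvmf_allocate_command");
+ *   error = nvmf_capsule_append_data(cc, buf, sizeof(buf), false);
+ *   if (error == 0)
+ *      error = nvmf_host_transmit_command(cc);
+ *   if (error == 0)
+ *      error = nvmf_host_wait_for_response(cc, &rc);
+ *   nvmf_free_capsule(cc);
+ *   if (error != 0)
+ *      errc(1, error, "command failed");
+ *   cqe = nvmf_capsule_cqe(rc);
+ *   if (le16toh(cqe->status) != 0)
+ *      errx(1, "command completed with status %#x", le16toh(cqe->status));
+ *   nvmf_free_capsule(rc);
+ */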
+
+/* Build a KeepAlive command. */
+struct nvmf_capsule *nvmf_keepalive(struct nvmf_qpair *qp);
+
+/* Read a controller property. */
+int nvmf_read_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size,
+ uint64_t *value);
+
+/* Write a controller property. */
+int nvmf_write_property(struct nvmf_qpair *qp, uint32_t offset,
+ uint8_t size, uint64_t value);
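+
+/*
+ * Example (sketch; the register offsets follow the NVMe specification,
+ * 0x0 for CAP and 0x14 for CC, and the NVMEF()/NVME_CC_REG_EN macros
+ * from <dev/nvme/nvme.h> are assumed): read CAP and set CC.EN on the
+ * Admin queue to enable the controller.
+ *
+ *   uint64_t cap, cc;
+ *   int error;
+ *
+ *   error = nvmf_read_property(admin, 0x0, 8, &cap);
+ *   if (error == 0)
+ *      error = nvmf_read_property(admin, 0x14, 4, &cc);
+ *   if (error == 0)
+ *      error = nvmf_write_property(admin, 0x14, 4,
+ *          cc | NVMEF(NVME_CC_REG_EN, 1));
+ *   if (error != 0)
+ *      errc(1, error, "failed to enable controller");
+ */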
+
+/* Construct a 16-byte HostId from kern.hostuuid. */
+int nvmf_hostid_from_hostuuid(uint8_t hostid[16]);
+
+/* Construct an NQN from kern.hostuuid. */
+int nvmf_nqn_from_hostuuid(char nqn[NVMF_NQN_MAX_LEN]);
+
+/* Fetch controller data via IDENTIFY. */
+int nvmf_host_identify_controller(struct nvmf_qpair *qp,
+ struct nvme_controller_data *data);
+
+/* Fetch namespace data via IDENTIFY. */
+int nvmf_host_identify_namespace(struct nvmf_qpair *qp, uint32_t nsid,
+ struct nvme_namespace_data *nsdata);
+
+/*
+ * Fetch discovery log page. The memory for the log page is allocated
+ * by malloc() and returned in *logp. The caller must free the
+ * memory.
+ */
+int nvmf_host_fetch_discovery_log_page(struct nvmf_qpair *qp,
+ struct nvme_discovery_log **logp);
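+
+/*
+ * Example (host-side sketch; error handling abbreviated): walk the
+ * discovery log returned by a Discovery controller.
+ *
+ *   struct nvme_discovery_log *log;
+ *   int error;
+ *
+ *   error = nvmf_host_fetch_discovery_log_page(admin, &log);
+ *   if (error != 0)
+ *      errc(1, error, "failed to fetch discovery log");
+ *   for (uint32_t i = 0; i < log->numrec; i++)
+ *      printf("subnqn: %.*s\n",
+ *          (int)sizeof(log->entries[i].subnqn), log->entries[i].subnqn);
+ *   free(log);
+ */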
+
+/*
+ * Request a desired number of I/O queues via SET_FEATURES. The
+ * number of actual I/O queues available is returned in *actual on
+ * success.
+ */
+int nvmf_host_request_queues(struct nvmf_qpair *qp, u_int requested,
+ u_int *actual);
+
+/*
+ * Hand off an active host association to the kernel.  This frees the
+ * qpairs (even on error).
+ */
+int nvmf_handoff_host(struct nvmf_qpair *admin_qp, u_int num_queues,
+ struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata);
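+
+/*
+ * Example (sketch; assumes 'admin' and the 'io' array of 'nio' queue
+ * pairs were created with nvmf_connect and 'cdata' was fetched with
+ * nvmf_host_identify_controller): hand the association off to the
+ * in-kernel host.  The queue pairs must be idle and are consumed even
+ * if the handoff fails.
+ *
+ *   error = nvmf_handoff_host(admin, nio, io, &cdata);
+ *   if (error != 0)
+ *      errc(1, error, "failed to hand off queues to the kernel");
+ */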
+
+/*
+ * Disconnect an active host association previously handed off to the
+ * kernel.  'host' is either the name of the device (nvmeX) for this
+ * association or the remote subsystem NQN.
+ */
+int nvmf_disconnect_host(const char *host);
+
+/*
+ * Disconnect all active host associations previously handed off to
+ * the kernel.
+ */
+int nvmf_disconnect_all(void);
+
+/*
+ * Fetch reconnect parameters from an existing kernel host to use for
+ * establishing a new association.
+ */
+int nvmf_reconnect_params(int fd, struct nvmf_reconnect_params *rparams);
+
+/*
+ * Hand off an active host association to an existing host in the kernel.
+ * This frees the qpairs (even on error).
+ */
+int nvmf_reconnect_host(int fd, struct nvmf_qpair *admin_qp,
+ u_int num_queues, struct nvmf_qpair **io_queues,
+ const struct nvme_controller_data *cdata);
+
+#endif /* !__LIBNVMF_H__ */
diff --git a/lib/libnvmf/nvmf_controller.c b/lib/libnvmf/nvmf_controller.c
new file mode 100644
--- /dev/null
+++ b/lib/libnvmf/nvmf_controller.c
@@ -0,0 +1,463 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/utsname.h>
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "libnvmf.h"
+#include "internal.h"
+#include "nvmft_subr.h"
+
+void
+nvmf_init_cqe(void *cqe, const struct nvmf_capsule *nc, uint16_t status)
+{
+ struct nvme_completion *cpl = cqe;
+ const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+
+ memset(cpl, 0, sizeof(*cpl));
+ cpl->cid = cmd->cid;
+ cpl->status = htole16(status);
+}
+
+static struct nvmf_capsule *
+nvmf_simple_response(const struct nvmf_capsule *nc, uint8_t sc_type,
+ uint8_t sc_status)
+{
+ struct nvme_completion cpl;
+ uint16_t status;
+
+ status = NVMEF(NVME_STATUS_SCT, sc_type) |
+ NVMEF(NVME_STATUS_SC, sc_status);
+ nvmf_init_cqe(&cpl, nc, status);
+ return (nvmf_allocate_response(nc->nc_qpair, &cpl));
+}
+
+int
+nvmf_controller_receive_capsule(struct nvmf_qpair *qp,
+ struct nvmf_capsule **ncp)
+{
+ struct nvmf_capsule *nc;
+ int error;
+ uint8_t sc_status;
+
+ *ncp = NULL;
+ error = nvmf_receive_capsule(qp, &nc);
+ if (error != 0)
+ return (error);
+
+ sc_status = nvmf_validate_command_capsule(nc);
+ if (sc_status != NVME_SC_SUCCESS) {
+ nvmf_send_generic_error(nc, sc_status);
+ nvmf_free_capsule(nc);
+ return (EPROTO);
+ }
+
+ *ncp = nc;
+ return (0);
+}
+
+int
+nvmf_controller_transmit_response(struct nvmf_capsule *nc)
+{
+ struct nvmf_qpair *qp = nc->nc_qpair;
+
+ /* Set SQHD. */
+ if (qp->nq_flow_control) {
+ qp->nq_sqhd = (qp->nq_sqhd + 1) % qp->nq_qsize;
+ nc->nc_cqe.sqhd = htole16(qp->nq_sqhd);
+ } else
+ nc->nc_cqe.sqhd = 0;
+
+ return (nvmf_transmit_capsule(nc));
+}
+
+int
+nvmf_send_response(const struct nvmf_capsule *cc, const void *cqe)
+{
+ struct nvmf_capsule *rc;
+ int error;
+
+ rc = nvmf_allocate_response(cc->nc_qpair, cqe);
+ if (rc == NULL)
+ return (ENOMEM);
+ error = nvmf_controller_transmit_response(rc);
+ nvmf_free_capsule(rc);
+ return (error);
+}
+
+int
+nvmf_send_error(const struct nvmf_capsule *cc, uint8_t sc_type,
+ uint8_t sc_status)
+{
+ struct nvmf_capsule *rc;
+ int error;
+
+ rc = nvmf_simple_response(cc, sc_type, sc_status);
+ if (rc == NULL)
+ return (ENOMEM);
+ error = nvmf_controller_transmit_response(rc);
+ nvmf_free_capsule(rc);
+ return (error);
+}
+
+int
+nvmf_send_generic_error(const struct nvmf_capsule *nc, uint8_t sc_status)
+{
+ return (nvmf_send_error(nc, NVME_SCT_GENERIC, sc_status));
+}
+
+int
+nvmf_send_success(const struct nvmf_capsule *nc)
+{
+ return (nvmf_send_generic_error(nc, NVME_SC_SUCCESS));
+}
+
+void
+nvmf_connect_invalid_parameters(const struct nvmf_capsule *cc, bool data,
+ uint16_t offset)
+{
+ struct nvmf_fabric_connect_rsp rsp;
+ struct nvmf_capsule *rc;
+
+ nvmf_init_cqe(&rsp, cc,
+ NVMEF(NVME_STATUS_SCT, NVME_SCT_COMMAND_SPECIFIC) |
+ NVMEF(NVME_STATUS_SC, NVMF_FABRIC_SC_INVALID_PARAM));
+ rsp.status_code_specific.invalid.ipo = htole16(offset);
+ rsp.status_code_specific.invalid.iattr = data ? 1 : 0;
+ rc = nvmf_allocate_response(cc->nc_qpair, &rsp);
+ nvmf_transmit_capsule(rc);
+ nvmf_free_capsule(rc);
+}
+
+struct nvmf_qpair *
+nvmf_accept(struct nvmf_association *na, const struct nvmf_qpair_params *params,
+ struct nvmf_capsule **ccp, struct nvmf_fabric_connect_data *data)
+{
+ static const char hostid_zero[sizeof(data->hostid)];
+ const struct nvmf_fabric_connect_cmd *cmd;
+ struct nvmf_qpair *qp;
+ struct nvmf_capsule *cc, *rc;
+ u_int qsize;
+ int error;
+ uint16_t cntlid;
+ uint8_t sc_status;
+
+ qp = NULL;
+ cc = NULL;
+ rc = NULL;
+ *ccp = NULL;
+ na_clear_error(na);
+ if (!na->na_controller) {
+ na_error(na, "Cannot accept on a host");
+ goto error;
+ }
+
+ qp = nvmf_allocate_qpair(na, params);
+ if (qp == NULL)
+ goto error;
+
+ /* Read the CONNECT capsule. */
+ error = nvmf_receive_capsule(qp, &cc);
+ if (error != 0) {
+ na_error(na, "Failed to receive CONNECT: %s", strerror(error));
+ goto error;
+ }
+
+ sc_status = nvmf_validate_command_capsule(cc);
+ if (sc_status != 0) {
+ na_error(na, "CONNECT command failed to validate: %u",
+ sc_status);
+ rc = nvmf_simple_response(cc, NVME_SCT_GENERIC, sc_status);
+ goto error;
+ }
+
+ cmd = nvmf_capsule_sqe(cc);
+ if (cmd->opcode != NVME_OPC_FABRICS_COMMANDS ||
+ cmd->fctype != NVMF_FABRIC_COMMAND_CONNECT) {
+ na_error(na, "Invalid opcode in CONNECT (%u,%u)", cmd->opcode,
+ cmd->fctype);
+ rc = nvmf_simple_response(cc, NVME_SCT_GENERIC,
+ NVME_SC_INVALID_OPCODE);
+ goto error;
+ }
+
+ if (cmd->recfmt != htole16(0)) {
+ na_error(na, "Unsupported CONNECT record format %u",
+ le16toh(cmd->recfmt));
+ rc = nvmf_simple_response(cc, NVME_SCT_COMMAND_SPECIFIC,
+ NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT);
+ goto error;
+ }
+
+ qsize = le16toh(cmd->sqsize) + 1;
+ if (cmd->qid == 0) {
+ /* Admin queue limits. */
+ if (qsize < NVME_MIN_ADMIN_ENTRIES ||
+ qsize > NVME_MAX_ADMIN_ENTRIES ||
+ qsize > na->na_params.max_admin_qsize) {
+ na_error(na, "Invalid queue size %u", qsize);
+ nvmf_connect_invalid_parameters(cc, false,
+ offsetof(struct nvmf_fabric_connect_cmd, sqsize));
+ goto error;
+ }
+ qp->nq_admin = true;
+ } else {
+ /* I/O queues not allowed for discovery. */
+ if (na->na_params.max_io_qsize == 0) {
+ na_error(na, "I/O queue on discovery controller");
+ nvmf_connect_invalid_parameters(cc, false,
+ offsetof(struct nvmf_fabric_connect_cmd, qid));
+ goto error;
+ }
+
+ /* I/O queue limits. */
+ if (qsize < NVME_MIN_IO_ENTRIES ||
+ qsize > NVME_MAX_IO_ENTRIES ||
+ qsize > na->na_params.max_io_qsize) {
+ na_error(na, "Invalid queue size %u", qsize);
+ nvmf_connect_invalid_parameters(cc, false,
+ offsetof(struct nvmf_fabric_connect_cmd, sqsize));
+ goto error;
+ }
+
+ /* KATO is reserved for I/O queues. */
+ if (cmd->kato != 0) {
+ na_error(na,
+ "KeepAlive timeout specified for I/O queue");
+ nvmf_connect_invalid_parameters(cc, false,
+ offsetof(struct nvmf_fabric_connect_cmd, kato));
+ goto error;
+ }
+ qp->nq_admin = false;
+ }
+ qp->nq_qsize = qsize;
+
+ /* Fetch CONNECT data. */
+ if (nvmf_capsule_data_len(cc) != sizeof(*data)) {
+ na_error(na, "Invalid data payload length for CONNECT: %zu",
+ nvmf_capsule_data_len(cc));
+ nvmf_connect_invalid_parameters(cc, false,
+ offsetof(struct nvmf_fabric_connect_cmd, sgl1));
+ goto error;
+ }
+
+ error = nvmf_receive_controller_data(cc, 0, data, sizeof(*data));
+ if (error != 0) {
+ na_error(na, "Failed to read data for CONNECT: %s",
+ strerror(error));
+ rc = nvmf_simple_response(cc, NVME_SCT_GENERIC,
+ NVME_SC_DATA_TRANSFER_ERROR);
+ goto error;
+ }
+
+ /* The hostid must be non-zero. */
+ if (memcmp(data->hostid, hostid_zero, sizeof(hostid_zero)) == 0) {
+ na_error(na, "HostID in CONNECT data is zero");
+ nvmf_connect_invalid_parameters(cc, true,
+ offsetof(struct nvmf_fabric_connect_data, hostid));
+ goto error;
+ }
+
+ cntlid = le16toh(data->cntlid);
+ if (cmd->qid == 0) {
+ if (na->na_params.dynamic_controller_model) {
+ if (cntlid != NVMF_CNTLID_DYNAMIC) {
+ na_error(na, "Invalid controller ID %#x",
+ cntlid);
+ nvmf_connect_invalid_parameters(cc, true,
+ offsetof(struct nvmf_fabric_connect_data,
+ cntlid));
+ goto error;
+ }
+ } else {
+ if (cntlid > NVMF_CNTLID_STATIC_MAX &&
+ cntlid != NVMF_CNTLID_STATIC_ANY) {
+ na_error(na, "Invalid controller ID %#x",
+ cntlid);
+ nvmf_connect_invalid_parameters(cc, true,
+ offsetof(struct nvmf_fabric_connect_data,
+ cntlid));
+ goto error;
+ }
+ }
+ } else {
+ /* Wildcard Controller IDs are only valid on an Admin queue. */
+ if (cntlid > NVMF_CNTLID_STATIC_MAX) {
+ na_error(na, "Invalid controller ID %#x", cntlid);
+ nvmf_connect_invalid_parameters(cc, true,
+ offsetof(struct nvmf_fabric_connect_data, cntlid));
+ goto error;
+ }
+ }
+
+ /* Simple validation of each NQN. */
+ if (!nvmf_nqn_valid(data->subnqn)) {
+ na_error(na, "Invalid SubNQN %.*s", (int)sizeof(data->subnqn),
+ data->subnqn);
+ nvmf_connect_invalid_parameters(cc, true,
+ offsetof(struct nvmf_fabric_connect_data, subnqn));
+ goto error;
+ }
+ if (!nvmf_nqn_valid(data->hostnqn)) {
+ na_error(na, "Invalid HostNQN %.*s", (int)sizeof(data->hostnqn),
+ data->hostnqn);
+ nvmf_connect_invalid_parameters(cc, true,
+ offsetof(struct nvmf_fabric_connect_data, hostnqn));
+ goto error;
+ }
+
+ if (na->na_params.sq_flow_control ||
+ (cmd->cattr & NVMF_CONNECT_ATTR_DISABLE_SQ_FC) == 0)
+ qp->nq_flow_control = true;
+ else
+ qp->nq_flow_control = false;
+ qp->nq_sqhd = 0;
+ qp->nq_kato = le32toh(cmd->kato);
+ *ccp = cc;
+ return (qp);
+error:
+ if (rc != NULL) {
+ nvmf_transmit_capsule(rc);
+ nvmf_free_capsule(rc);
+ }
+ if (cc != NULL)
+ nvmf_free_capsule(cc);
+ if (qp != NULL)
+ nvmf_free_qpair(qp);
+ return (NULL);
+}
+
+int
+nvmf_finish_accept(const struct nvmf_capsule *cc, uint16_t cntlid)
+{
+ struct nvmf_fabric_connect_rsp rsp;
+ struct nvmf_qpair *qp = cc->nc_qpair;
+ struct nvmf_capsule *rc;
+ int error;
+
+ nvmf_init_cqe(&rsp, cc, 0);
+ if (qp->nq_flow_control)
+ rsp.sqhd = htole16(qp->nq_sqhd);
+ else
+ rsp.sqhd = htole16(0xffff);
+ rsp.status_code_specific.success.cntlid = htole16(cntlid);
+ rc = nvmf_allocate_response(qp, &rsp);
+ if (rc == NULL)
+ return (ENOMEM);
+ error = nvmf_transmit_capsule(rc);
+ nvmf_free_capsule(rc);
+ if (error == 0)
+ qp->nq_cntlid = cntlid;
+ return (error);
+}
+
+uint64_t
+nvmf_controller_cap(struct nvmf_qpair *qp)
+{
+ const struct nvmf_association *na = qp->nq_association;
+
+ return (_nvmf_controller_cap(na->na_params.max_io_qsize,
+ NVMF_CC_EN_TIMEOUT));
+}
+
+bool
+nvmf_validate_cc(struct nvmf_qpair *qp, uint64_t cap, uint32_t old_cc,
+ uint32_t new_cc)
+{
+ const struct nvmf_association *na = qp->nq_association;
+
+ return (_nvmf_validate_cc(na->na_params.max_io_qsize, cap, old_cc,
+ new_cc));
+}
+
+void
+nvmf_init_discovery_controller_data(struct nvmf_qpair *qp,
+ struct nvme_controller_data *cdata)
+{
+ const struct nvmf_association *na = qp->nq_association;
+ struct utsname utsname;
+ char *cp;
+
+ memset(cdata, 0, sizeof(*cdata));
+
+ /*
+ * 5.2 Figure 37 states model name and serial are reserved,
+ * but Linux includes them. Don't bother with serial, but
+ * do set model name.
+ */
+ uname(&utsname);
+ nvmf_strpad(cdata->mn, utsname.sysname, sizeof(cdata->mn));
+ nvmf_strpad(cdata->fr, utsname.release, sizeof(cdata->fr));
+ cp = memchr(cdata->fr, '-', sizeof(cdata->fr));
+ if (cp != NULL)
+ memset(cp, ' ', sizeof(cdata->fr) - (cp - (char *)cdata->fr));
+
+ cdata->ctrlr_id = htole16(qp->nq_cntlid);
+ cdata->ver = htole32(NVME_REV(1, 4));
+ cdata->cntrltype = 2;
+
+ cdata->lpa = NVMEF(NVME_CTRLR_DATA_LPA_EXT_DATA, 1);
+ cdata->elpe = 0;
+
+ cdata->maxcmd = htole16(na->na_params.max_admin_qsize);
+
+ /* Transport-specific? */
+ cdata->sgls = htole32(
+ NVMEF(NVME_CTRLR_DATA_SGLS_TRANSPORT_DATA_BLOCK, 1) |
+ NVMEF(NVME_CTRLR_DATA_SGLS_ADDRESS_AS_OFFSET, 1) |
+ NVMEF(NVME_CTRLR_DATA_SGLS_NVM_COMMAND_SET, 1));
+
+ strlcpy(cdata->subnqn, NVMF_DISCOVERY_NQN, sizeof(cdata->subnqn));
+}
+
+void
+nvmf_init_io_controller_data(struct nvmf_qpair *qp, const char *serial,
+ const char *subnqn, int nn, uint32_t ioccsz,
+ struct nvme_controller_data *cdata)
+{
+ const struct nvmf_association *na = qp->nq_association;
+ struct utsname utsname;
+
+ uname(&utsname);
+
+ _nvmf_init_io_controller_data(qp->nq_cntlid, na->na_params.max_io_qsize,
+ serial, utsname.sysname, utsname.release, subnqn, nn, ioccsz,
+ sizeof(struct nvme_completion), cdata);
+}
+
+uint8_t
+nvmf_get_log_page_id(const struct nvme_command *cmd)
+{
+ assert(cmd->opc == NVME_OPC_GET_LOG_PAGE);
+ return (le32toh(cmd->cdw10) & 0xff);
+}
+
+uint64_t
+nvmf_get_log_page_length(const struct nvme_command *cmd)
+{
+ uint32_t numd;
+
+ assert(cmd->opc == NVME_OPC_GET_LOG_PAGE);
+ numd = le32toh(cmd->cdw10) >> 16 | (le32toh(cmd->cdw11) & 0xffff) << 16;
+ return ((numd + 1) * 4);
+}
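+
+/*
+ * Example of the NUMD decoding above (informational): a 4096-byte
+ * GET_LOG_PAGE request is encoded by the host as the 0's based dword
+ * count 4096 / 4 - 1 = 1023, with the low 16 bits in CDW10[31:16] and
+ * the high 16 bits in CDW11[15:0], so the length recovered here is
+ * (1023 + 1) * 4 = 4096 bytes.
+ */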
+
+uint64_t
+nvmf_get_log_page_offset(const struct nvme_command *cmd)
+{
+ assert(cmd->opc == NVME_OPC_GET_LOG_PAGE);
+ return (le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32);
+}
+
+int
+nvmf_handoff_controller_qpair(struct nvmf_qpair *qp,
+ struct nvmf_handoff_controller_qpair *h)
+{
+ h->trtype = qp->nq_association->na_trtype;
+ return (nvmf_kernel_handoff_params(qp, &h->params));
+}
diff --git a/lib/libnvmf/nvmf_host.c b/lib/libnvmf/nvmf_host.c
new file mode 100644
--- /dev/null
+++ b/lib/libnvmf/nvmf_host.c
@@ -0,0 +1,911 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/ioctl.h>
+#include <sys/sysctl.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <uuid.h>
+
+#include "libnvmf.h"
+#include "internal.h"
+
+static void
+nvmf_init_sqe(void *sqe, uint8_t opcode)
+{
+ struct nvme_command *cmd = sqe;
+
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->opc = opcode;
+}
+
+static void
+nvmf_init_fabrics_sqe(void *sqe, uint8_t fctype)
+{
+ struct nvmf_capsule_cmd *cmd = sqe;
+
+ nvmf_init_sqe(sqe, NVME_OPC_FABRICS_COMMANDS);
+ cmd->fctype = fctype;
+}
+
+struct nvmf_qpair *
+nvmf_connect(struct nvmf_association *na,
+ const struct nvmf_qpair_params *params, uint16_t qid, u_int queue_size,
+ const uint8_t hostid[16], uint16_t cntlid, const char *subnqn,
+ const char *hostnqn, uint32_t kato)
+{
+ struct nvmf_fabric_connect_cmd cmd;
+ struct nvmf_fabric_connect_data data;
+ const struct nvmf_fabric_connect_rsp *rsp;
+ struct nvmf_qpair *qp;
+ struct nvmf_capsule *cc, *rc;
+ int error;
+ uint16_t sqhd, status;
+
+ qp = NULL;
+ cc = NULL;
+ rc = NULL;
+ na_clear_error(na);
+ if (na->na_controller) {
+ na_error(na, "Cannot connect on a controller");
+ goto error;
+ }
+
+ if (params->admin != (qid == 0)) {
+ na_error(na, "Admin queue must use Queue ID 0");
+ goto error;
+ }
+
+ if (qid == 0) {
+ if (queue_size < NVME_MIN_ADMIN_ENTRIES ||
+ queue_size > NVME_MAX_ADMIN_ENTRIES) {
+ na_error(na, "Invalid queue size %u", queue_size);
+ goto error;
+ }
+ } else {
+ if (queue_size < NVME_MIN_IO_ENTRIES ||
+ queue_size > NVME_MAX_IO_ENTRIES) {
+ na_error(na, "Invalid queue size %u", queue_size);
+ goto error;
+ }
+
+ /* KATO is only for Admin queues. */
+ if (kato != 0) {
+ na_error(na, "Cannot set KATO on I/O queues");
+ goto error;
+ }
+ }
+
+ qp = nvmf_allocate_qpair(na, params);
+ if (qp == NULL)
+ goto error;
+
+ nvmf_init_fabrics_sqe(&cmd, NVMF_FABRIC_COMMAND_CONNECT);
+ cmd.recfmt = 0;
+ cmd.qid = htole16(qid);
+
+ /* N.B. sqsize is 0's based. */
+ cmd.sqsize = htole16(queue_size - 1);
+ if (!na->na_params.sq_flow_control)
+ cmd.cattr |= NVMF_CONNECT_ATTR_DISABLE_SQ_FC;
+ cmd.kato = htole32(kato);
+
+ cc = nvmf_allocate_command(qp, &cmd);
+ if (cc == NULL) {
+ na_error(na, "Failed to allocate command capsule: %s",
+ strerror(errno));
+ goto error;
+ }
+
+ memset(&data, 0, sizeof(data));
+ memcpy(data.hostid, hostid, sizeof(data.hostid));
+ data.cntlid = htole16(cntlid);
+ strlcpy(data.subnqn, subnqn, sizeof(data.subnqn));
+ strlcpy(data.hostnqn, hostnqn, sizeof(data.hostnqn));
+
+ error = nvmf_capsule_append_data(cc, &data, sizeof(data), true);
+ if (error != 0) {
+ na_error(na, "Failed to append data to CONNECT capsule: %s",
+ strerror(error));
+ goto error;
+ }
+
+ error = nvmf_transmit_capsule(cc);
+ if (error != 0) {
+ na_error(na, "Failed to transmit CONNECT capsule: %s",
+ strerror(errno));
+ goto error;
+ }
+
+ error = nvmf_receive_capsule(qp, &rc);
+ if (error != 0) {
+ na_error(na, "Failed to receive CONNECT response: %s",
+ strerror(error));
+ goto error;
+ }
+
+ rsp = (const struct nvmf_fabric_connect_rsp *)&rc->nc_cqe;
+ status = le16toh(rc->nc_cqe.status);
+ if (status != 0) {
+ if (NVME_STATUS_GET_SC(status) == NVMF_FABRIC_SC_INVALID_PARAM)
+ na_error(na,
+ "CONNECT invalid parameter IATTR: %#x IPO: %#x",
+ rsp->status_code_specific.invalid.iattr,
+ rsp->status_code_specific.invalid.ipo);
+ else
+ na_error(na, "CONNECT failed, status %#x", status);
+ goto error;
+ }
+
+ if (rc->nc_cqe.cid != cmd.cid) {
+ na_error(na, "Mismatched CID in CONNECT response");
+ goto error;
+ }
+
+ if (!rc->nc_sqhd_valid) {
+ na_error(na, "CONNECT response without valid SQHD");
+ goto error;
+ }
+
+ sqhd = le16toh(rsp->sqhd);
+ if (sqhd == 0xffff) {
+ if (na->na_params.sq_flow_control) {
+ na_error(na, "Controller disabled SQ flow control");
+ goto error;
+ }
+ qp->nq_flow_control = false;
+ } else {
+ qp->nq_flow_control = true;
+ qp->nq_sqhd = sqhd;
+ qp->nq_sqtail = sqhd;
+ }
+
+ if (rsp->status_code_specific.success.authreq) {
+ na_error(na, "CONNECT response requests authentication\n");
+ goto error;
+ }
+
+ qp->nq_qsize = queue_size;
+ qp->nq_cntlid = le16toh(rsp->status_code_specific.success.cntlid);
+ qp->nq_kato = kato;
+ /* XXX: Save qid in qp? */
+ return (qp);
+
+error:
+ if (rc != NULL)
+ nvmf_free_capsule(rc);
+ if (cc != NULL)
+ nvmf_free_capsule(cc);
+ if (qp != NULL)
+ nvmf_free_qpair(qp);
+ return (NULL);
+}
+
+uint16_t
+nvmf_cntlid(struct nvmf_qpair *qp)
+{
+ return (qp->nq_cntlid);
+}
+
+int
+nvmf_host_transmit_command(struct nvmf_capsule *nc)
+{
+ struct nvmf_qpair *qp = nc->nc_qpair;
+ uint16_t new_sqtail;
+ int error;
+
+ /* Fail if the queue is full. */
+ new_sqtail = (qp->nq_sqtail + 1) % qp->nq_qsize;
+ if (new_sqtail == qp->nq_sqhd)
+ return (EBUSY);
+
+ nc->nc_sqe.cid = htole16(qp->nq_cid);
+
+ /* 4.2 Skip CID of 0xFFFF. */
+ qp->nq_cid++;
+ if (qp->nq_cid == 0xFFFF)
+ qp->nq_cid = 0;
+
+ error = nvmf_transmit_capsule(nc);
+ if (error != 0)
+ return (error);
+
+ qp->nq_sqtail = new_sqtail;
+ return (0);
+}
+
+/* Receive a single capsule and update SQ FC accounting. */
+static int
+nvmf_host_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp)
+{
+ struct nvmf_capsule *nc;
+ int error;
+
+ /* If the SQ is empty, there is no response to wait for. */
+ if (qp->nq_sqhd == qp->nq_sqtail)
+ return (EWOULDBLOCK);
+
+ error = nvmf_receive_capsule(qp, &nc);
+ if (error != 0)
+ return (error);
+
+ if (qp->nq_flow_control) {
+ if (nc->nc_sqhd_valid)
+ qp->nq_sqhd = le16toh(nc->nc_cqe.sqhd);
+ } else {
+ /*
+ * If SQ FC is disabled, just advance the head for
+ * each response capsule received so that we track the
+ * number of outstanding commands.
+ */
+ qp->nq_sqhd = (qp->nq_sqhd + 1) % qp->nq_qsize;
+ }
+ *ncp = nc;
+ return (0);
+}
+
+int
+nvmf_host_receive_response(struct nvmf_qpair *qp, struct nvmf_capsule **ncp)
+{
+ struct nvmf_capsule *nc;
+
+ /* Return the oldest previously received response. */
+ if (!TAILQ_EMPTY(&qp->nq_rx_capsules)) {
+ nc = TAILQ_FIRST(&qp->nq_rx_capsules);
+ TAILQ_REMOVE(&qp->nq_rx_capsules, nc, nc_link);
+ *ncp = nc;
+ return (0);
+ }
+
+ return (nvmf_host_receive_capsule(qp, ncp));
+}
+
+int
+nvmf_host_wait_for_response(struct nvmf_capsule *cc,
+ struct nvmf_capsule **rcp)
+{
+ struct nvmf_qpair *qp = cc->nc_qpair;
+ struct nvmf_capsule *rc;
+ int error;
+
+ /* Check if a response was already received. */
+ TAILQ_FOREACH(rc, &qp->nq_rx_capsules, nc_link) {
+ if (rc->nc_cqe.cid == cc->nc_sqe.cid) {
+ TAILQ_REMOVE(&qp->nq_rx_capsules, rc, nc_link);
+ *rcp = rc;
+ return (0);
+ }
+ }
+
+ /* Wait for a response. */
+ for (;;) {
+ error = nvmf_host_receive_capsule(qp, &rc);
+ if (error != 0)
+ return (error);
+
+ if (rc->nc_cqe.cid != cc->nc_sqe.cid) {
+ TAILQ_INSERT_TAIL(&qp->nq_rx_capsules, rc, nc_link);
+ continue;
+ }
+
+ *rcp = rc;
+ return (0);
+ }
+}
+
+struct nvmf_capsule *
+nvmf_keepalive(struct nvmf_qpair *qp)
+{
+ struct nvme_command cmd;
+
+ if (!qp->nq_admin) {
+ errno = EINVAL;
+ return (NULL);
+ }
+
+ nvmf_init_sqe(&cmd, NVME_OPC_KEEP_ALIVE);
+
+ return (nvmf_allocate_command(qp, &cmd));
+}
+
+static struct nvmf_capsule *
+nvmf_get_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size)
+{
+ struct nvmf_fabric_prop_get_cmd cmd;
+
+ nvmf_init_fabrics_sqe(&cmd, NVMF_FABRIC_COMMAND_PROPERTY_GET);
+ switch (size) {
+ case 4:
+ cmd.attrib.size = NVMF_PROP_SIZE_4;
+ break;
+ case 8:
+ cmd.attrib.size = NVMF_PROP_SIZE_8;
+ break;
+ default:
+ errno = EINVAL;
+ return (NULL);
+ }
+ cmd.ofst = htole32(offset);
+
+ return (nvmf_allocate_command(qp, &cmd));
+}
+
+int
+nvmf_read_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size,
+ uint64_t *value)
+{
+ struct nvmf_capsule *cc, *rc;
+ const struct nvmf_fabric_prop_get_rsp *rsp;
+ uint16_t status;
+ int error;
+
+ if (!qp->nq_admin)
+ return (EINVAL);
+
+ cc = nvmf_get_property(qp, offset, size);
+ if (cc == NULL)
+ return (errno);
+
+ error = nvmf_host_transmit_command(cc);
+ if (error != 0) {
+ nvmf_free_capsule(cc);
+ return (error);
+ }
+
+ error = nvmf_host_wait_for_response(cc, &rc);
+ nvmf_free_capsule(cc);
+ if (error != 0)
+ return (error);
+
+ rsp = (const struct nvmf_fabric_prop_get_rsp *)&rc->nc_cqe;
+ status = le16toh(rc->nc_cqe.status);
+ if (status != 0) {
+ printf("NVMF: PROPERTY_GET failed, status %#x\n", status);
+ nvmf_free_capsule(rc);
+ return (EIO);
+ }
+
+ if (size == 8)
+ *value = le64toh(rsp->value.u64);
+ else
+ *value = le32toh(rsp->value.u32.low);
+ nvmf_free_capsule(rc);
+ return (0);
+}
+
+static struct nvmf_capsule *
+nvmf_set_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size,
+ uint64_t value)
+{
+ struct nvmf_fabric_prop_set_cmd cmd;
+
+ nvmf_init_fabrics_sqe(&cmd, NVMF_FABRIC_COMMAND_PROPERTY_SET);
+ switch (size) {
+ case 4:
+ cmd.attrib.size = NVMF_PROP_SIZE_4;
+ cmd.value.u32.low = htole32(value);
+ break;
+ case 8:
+ cmd.attrib.size = NVMF_PROP_SIZE_8;
+ cmd.value.u64 = htole64(value);
+ break;
+ default:
+ errno = EINVAL;
+ return (NULL);
+ }
+ cmd.ofst = htole32(offset);
+
+ return (nvmf_allocate_command(qp, &cmd));
+}
+
+int
+nvmf_write_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size,
+ uint64_t value)
+{
+ struct nvmf_capsule *cc, *rc;
+ uint16_t status;
+ int error;
+
+ if (!qp->nq_admin)
+ return (EINVAL);
+
+ cc = nvmf_set_property(qp, offset, size, value);
+ if (cc == NULL)
+ return (errno);
+
+ error = nvmf_host_transmit_command(cc);
+ if (error != 0) {
+ nvmf_free_capsule(cc);
+ return (error);
+ }
+
+ error = nvmf_host_wait_for_response(cc, &rc);
+ nvmf_free_capsule(cc);
+ if (error != 0)
+ return (error);
+
+ status = le16toh(rc->nc_cqe.status);
+ if (status != 0) {
+ printf("NVMF: PROPERTY_SET failed, status %#x\n", status);
+ nvmf_free_capsule(rc);
+ return (EIO);
+ }
+
+ nvmf_free_capsule(rc);
+ return (0);
+}
+
+int
+nvmf_hostid_from_hostuuid(uint8_t hostid[16])
+{
+ char hostuuid_str[64];
+ uuid_t hostuuid;
+ size_t len;
+ uint32_t status;
+
+ len = sizeof(hostuuid_str);
+ if (sysctlbyname("kern.hostuuid", hostuuid_str, &len, NULL, 0) != 0)
+ return (errno);
+
+ uuid_from_string(hostuuid_str, &hostuuid, &status);
+ switch (status) {
+ case uuid_s_ok:
+ break;
+ case uuid_s_no_memory:
+ return (ENOMEM);
+ default:
+ return (EINVAL);
+ }
+
+ uuid_enc_le(hostid, &hostuuid);
+ return (0);
+}
+
+int
+nvmf_nqn_from_hostuuid(char nqn[NVMF_NQN_MAX_LEN])
+{
+ char hostuuid_str[64];
+ size_t len;
+
+ len = sizeof(hostuuid_str);
+ if (sysctlbyname("kern.hostuuid", hostuuid_str, &len, NULL, 0) != 0)
+ return (errno);
+
+ strlcpy(nqn, NVMF_NQN_UUID_PRE, NVMF_NQN_MAX_LEN);
+ strlcat(nqn, hostuuid_str, NVMF_NQN_MAX_LEN);
+ return (0);
+}
+
+int
+nvmf_host_identify_controller(struct nvmf_qpair *qp,
+ struct nvme_controller_data *cdata)
+{
+ struct nvme_command cmd;
+ struct nvmf_capsule *cc, *rc;
+ int error;
+ uint16_t status;
+
+ if (!qp->nq_admin)
+ return (EINVAL);
+
+ nvmf_init_sqe(&cmd, NVME_OPC_IDENTIFY);
+
+ /* 5.15.1 Use CNS of 0x01 for controller data. */
+ cmd.cdw10 = htole32(1);
+
+ cc = nvmf_allocate_command(qp, &cmd);
+ if (cc == NULL)
+ return (errno);
+
+ error = nvmf_capsule_append_data(cc, cdata, sizeof(*cdata), false);
+ if (error != 0) {
+ nvmf_free_capsule(cc);
+ return (error);
+ }
+
+ error = nvmf_host_transmit_command(cc);
+ if (error != 0) {
+ nvmf_free_capsule(cc);
+ return (error);
+ }
+
+ error = nvmf_host_wait_for_response(cc, &rc);
+ nvmf_free_capsule(cc);
+ if (error != 0)
+ return (error);
+
+ status = le16toh(rc->nc_cqe.status);
+ if (status != 0) {
+ printf("NVMF: IDENTIFY failed, status %#x\n", status);
+ nvmf_free_capsule(rc);
+ return (EIO);
+ }
+
+ nvmf_free_capsule(rc);
+ return (0);
+}
+
+int
+nvmf_host_identify_namespace(struct nvmf_qpair *qp, uint32_t nsid,
+ struct nvme_namespace_data *nsdata)
+{
+ struct nvme_command cmd;
+ struct nvmf_capsule *cc, *rc;
+ int error;
+ uint16_t status;
+
+ if (!qp->nq_admin)
+ return (EINVAL);
+
+ nvmf_init_sqe(&cmd, NVME_OPC_IDENTIFY);
+
+ /* 5.15.1 Use CNS of 0x00 for namespace data. */
+ cmd.cdw10 = htole32(0);
+ cmd.nsid = htole32(nsid);
+
+ cc = nvmf_allocate_command(qp, &cmd);
+ if (cc == NULL)
+ return (errno);
+
+ error = nvmf_capsule_append_data(cc, nsdata, sizeof(*nsdata), false);
+ if (error != 0) {
+ nvmf_free_capsule(cc);
+ return (error);
+ }
+
+ error = nvmf_host_transmit_command(cc);
+ if (error != 0) {
+ nvmf_free_capsule(cc);
+ return (error);
+ }
+
+ error = nvmf_host_wait_for_response(cc, &rc);
+ nvmf_free_capsule(cc);
+ if (error != 0)
+ return (error);
+
+ status = le16toh(rc->nc_cqe.status);
+ if (status != 0) {
+ printf("NVMF: IDENTIFY failed, status %#x\n", status);
+ nvmf_free_capsule(rc);
+ return (EIO);
+ }
+
+ nvmf_free_capsule(rc);
+ return (0);
+}
+
+static int
+nvmf_get_discovery_log_page(struct nvmf_qpair *qp, uint64_t offset, void *buf,
+ size_t len)
+{
+ struct nvme_command cmd;
+ struct nvmf_capsule *cc, *rc;
+ size_t numd;
+ int error;
+ uint16_t status;
+
+ if (len % 4 != 0 || len == 0 || offset % 4 != 0)
+ return (EINVAL);
+
+ numd = (len / 4) - 1;
+ nvmf_init_sqe(&cmd, NVME_OPC_GET_LOG_PAGE);
+ cmd.cdw10 = htole32(numd << 16 | NVME_LOG_DISCOVERY);
+ cmd.cdw11 = htole32(numd >> 16);
+ cmd.cdw12 = htole32(offset);
+ cmd.cdw13 = htole32(offset >> 32);
+
+ cc = nvmf_allocate_command(qp, &cmd);
+ if (cc == NULL)
+ return (errno);
+
+ error = nvmf_capsule_append_data(cc, buf, len, false);
+ if (error != 0) {
+ nvmf_free_capsule(cc);
+ return (error);
+ }
+
+ error = nvmf_host_transmit_command(cc);
+ if (error != 0) {
+ nvmf_free_capsule(cc);
+ return (error);
+ }
+
+ error = nvmf_host_wait_for_response(cc, &rc);
+ nvmf_free_capsule(cc);
+ if (error != 0)
+ return (error);
+
+ status = le16toh(rc->nc_cqe.status);
+ if (NVMEV(NVME_STATUS_SC, status) ==
+ NVMF_FABRIC_SC_LOG_RESTART_DISCOVERY) {
+ nvmf_free_capsule(rc);
+ return (EAGAIN);
+ }
+ if (status != 0) {
+ printf("NVMF: GET_LOG_PAGE failed, status %#x\n", status);
+ nvmf_free_capsule(rc);
+ return (EIO);
+ }
+
+ nvmf_free_capsule(rc);
+ return (0);
+}
+
+int
+nvmf_host_fetch_discovery_log_page(struct nvmf_qpair *qp,
+ struct nvme_discovery_log **logp)
+{
+ struct nvme_discovery_log hdr, *log;
+ size_t payload_len;
+ int error;
+
+ if (!qp->nq_admin)
+ return (EINVAL);
+
+ log = NULL;
+ for (;;) {
+ error = nvmf_get_discovery_log_page(qp, 0, &hdr, sizeof(hdr));
+ if (error != 0)
+ return (error);
+ nvme_discovery_log_swapbytes(&hdr);
+
+ if (hdr.recfmt != 0) {
+ printf("NVMF: Unsupported discovery log format: %d\n",
+ hdr.recfmt);
+ return (EINVAL);
+ }
+
+ if (hdr.numrec > 1024) {
+ printf("NVMF: Too many discovery log entries: %ju\n",
+ (uintmax_t)hdr.numrec);
+ return (EFBIG);
+ }
+
+ payload_len = sizeof(log->entries[0]) * hdr.numrec;
+ log = reallocf(log, sizeof(*log) + payload_len);
+ if (log == NULL)
+ return (ENOMEM);
+ *log = hdr;
+ if (hdr.numrec == 0)
+ break;
+
+ error = nvmf_get_discovery_log_page(qp, sizeof(hdr),
+ log->entries, payload_len);
+ if (error == EAGAIN)
+ continue;
+ if (error != 0) {
+ free(log);
+ return (error);
+ }
+
+ /* Re-read the header and check the generation count. */
+ error = nvmf_get_discovery_log_page(qp, 0, &hdr, sizeof(hdr));
+ if (error != 0) {
+ free(log);
+ return (error);
+ }
+ nvme_discovery_log_swapbytes(&hdr);
+
+ if (log->genctr != hdr.genctr)
+ continue;
+
+ for (u_int i = 0; i < log->numrec; i++)
+ nvme_discovery_log_entry_swapbytes(&log->entries[i]);
+ break;
+ }
+ *logp = log;
+ return (0);
+}
+
+int
+nvmf_host_request_queues(struct nvmf_qpair *qp, u_int requested, u_int *actual)
+{
+ struct nvme_command cmd;
+ struct nvmf_capsule *cc, *rc;
+ int error;
+ uint16_t status;
+
+ if (!qp->nq_admin || requested < 1 || requested > 65535)
+ return (EINVAL);
+
+ /* The number of queues is 0's based. */
+ requested--;
+
+ nvmf_init_sqe(&cmd, NVME_OPC_SET_FEATURES);
+ cmd.cdw10 = htole32(NVME_FEAT_NUMBER_OF_QUEUES);
+
+ /* Same number of completion and submission queues. */
+ cmd.cdw11 = htole32((requested << 16) | requested);
+
+ cc = nvmf_allocate_command(qp, &cmd);
+ if (cc == NULL)
+ return (errno);
+
+ error = nvmf_host_transmit_command(cc);
+ if (error != 0) {
+ nvmf_free_capsule(cc);
+ return (error);
+ }
+
+ error = nvmf_host_wait_for_response(cc, &rc);
+ nvmf_free_capsule(cc);
+ if (error != 0)
+ return (error);
+
+ status = le16toh(rc->nc_cqe.status);
+ if (status != 0) {
+ printf("NVMF: SET_FEATURES failed, status %#x\n", status);
+ nvmf_free_capsule(rc);
+ return (EIO);
+ }
+
+ *actual = (le32toh(rc->nc_cqe.cdw0) & 0xffff) + 1;
+ nvmf_free_capsule(rc);
+ return (0);
+}
+
+static bool
+is_queue_pair_idle(struct nvmf_qpair *qp)
+{
+ if (qp->nq_sqhd != qp->nq_sqtail)
+ return (false);
+ if (!TAILQ_EMPTY(&qp->nq_rx_capsules))
+ return (false);
+ return (true);
+}
+
+static int
+prepare_queues_for_handoff(struct nvmf_handoff_host *hh,
+ struct nvmf_qpair *admin_qp, u_int num_queues,
+ struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata)
+{
+ struct nvmf_handoff_qpair_params *io;
+ u_int i;
+ int error;
+
+ memset(hh, 0, sizeof(*hh));
+
+ /* All queue pairs must be idle. */
+ if (!is_queue_pair_idle(admin_qp))
+ return (EBUSY);
+ for (i = 0; i < num_queues; i++) {
+ if (!is_queue_pair_idle(io_queues[i]))
+ return (EBUSY);
+ }
+
+ /* First, the admin queue. */
+ hh->trtype = admin_qp->nq_association->na_trtype;
+ hh->kato = admin_qp->nq_kato;
+ error = nvmf_kernel_handoff_params(admin_qp, &hh->admin);
+ if (error)
+ return (error);
+
+ /* Next, the I/O queues. */
+ hh->num_io_queues = num_queues;
+ io = calloc(num_queues, sizeof(*io));
+ if (io == NULL)
+ return (ENOMEM);
+ for (i = 0; i < num_queues; i++) {
+ error = nvmf_kernel_handoff_params(io_queues[i], &io[i]);
+ if (error) {
+ free(io);
+ return (error);
+ }
+ }
+
+ hh->io = io;
+ hh->cdata = cdata;
+ return (0);
+}
+
+int
+nvmf_handoff_host(struct nvmf_qpair *admin_qp, u_int num_queues,
+ struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata)
+{
+ struct nvmf_handoff_host hh;
+ u_int i;
+ int error, fd;
+
+ fd = open("/dev/nvmf", O_RDWR);
+ if (fd == -1) {
+ error = errno;
+ goto out;
+ }
+
+ error = prepare_queues_for_handoff(&hh, admin_qp, num_queues, io_queues,
+ cdata);
+ if (error != 0)
+ goto out;
+
+ if (ioctl(fd, NVMF_HANDOFF_HOST, &hh) == -1)
+ error = errno;
+ free(hh.io);
+
+out:
+ if (fd >= 0)
+ close(fd);
+ for (i = 0; i < num_queues; i++)
+ (void)nvmf_free_qpair(io_queues[i]);
+ (void)nvmf_free_qpair(admin_qp);
+ return (error);
+}
+
+int
+nvmf_disconnect_host(const char *host)
+{
+ int error, fd;
+
+ error = 0;
+ fd = open("/dev/nvmf", O_RDWR);
+ if (fd == -1) {
+ error = errno;
+ goto out;
+ }
+
+ if (ioctl(fd, NVMF_DISCONNECT_HOST, &host) == -1)
+ error = errno;
+
+out:
+ if (fd >= 0)
+ close(fd);
+ return (error);
+}
+
+int
+nvmf_disconnect_all(void)
+{
+ int error, fd;
+
+ error = 0;
+ fd = open("/dev/nvmf", O_RDWR);
+ if (fd == -1) {
+ error = errno;
+ goto out;
+ }
+
+ if (ioctl(fd, NVMF_DISCONNECT_ALL) == -1)
+ error = errno;
+
+out:
+ if (fd >= 0)
+ close(fd);
+ return (error);
+}
+
+int
+nvmf_reconnect_params(int fd, struct nvmf_reconnect_params *rparams)
+{
+ if (ioctl(fd, NVMF_RECONNECT_PARAMS, rparams) == -1)
+ return (errno);
+ return (0);
+}
+
+int
+nvmf_reconnect_host(int fd, struct nvmf_qpair *admin_qp, u_int num_queues,
+ struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata)
+{
+ struct nvmf_handoff_host hh;
+ u_int i;
+ int error;
+
+ error = prepare_queues_for_handoff(&hh, admin_qp, num_queues, io_queues,
+ cdata);
+ if (error != 0)
+ goto out;
+
+ if (ioctl(fd, NVMF_RECONNECT_HOST, &hh) == -1)
+ error = errno;
+ free(hh.io);
+
+out:
+ for (i = 0; i < num_queues; i++)
+ (void)nvmf_free_qpair(io_queues[i]);
+ (void)nvmf_free_qpair(admin_qp);
+ return (error);
+}
diff --git a/lib/libnvmf/nvmf_tcp.c b/lib/libnvmf/nvmf_tcp.c
new file mode 100644
--- /dev/null
+++ b/lib/libnvmf/nvmf_tcp.c
@@ -0,0 +1,1474 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/endian.h>
+#include <sys/gsb_crc32.h>
+#include <sys/queue.h>
+#include <sys/uio.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "libnvmf.h"
+#include "internal.h"
+#include "nvmf_tcp.h"
+
+struct nvmf_tcp_qpair;
+
+struct nvmf_tcp_command_buffer {
+ struct nvmf_tcp_qpair *qp;
+
+ void *data;
+ size_t data_len;
+ size_t data_xfered;
+ uint32_t data_offset;
+
+ uint16_t cid;
+ uint16_t ttag;
+
+ LIST_ENTRY(nvmf_tcp_command_buffer) link;
+};
+
+LIST_HEAD(nvmf_tcp_command_buffer_list, nvmf_tcp_command_buffer);
+
+struct nvmf_tcp_association {
+ struct nvmf_association na;
+
+ uint32_t ioccsz;
+};
+
+struct nvmf_tcp_rxpdu {
+ struct nvme_tcp_common_pdu_hdr *hdr;
+ uint32_t data_len;
+};
+
+struct nvmf_tcp_capsule {
+ struct nvmf_capsule nc;
+
+ struct nvmf_tcp_rxpdu rx_pdu;
+ struct nvmf_tcp_command_buffer *cb;
+
+ TAILQ_ENTRY(nvmf_tcp_capsule) link;
+};
+
+struct nvmf_tcp_qpair {
+ struct nvmf_qpair qp;
+ int s;
+
+ uint8_t txpda;
+ uint8_t rxpda;
+ bool header_digests;
+ bool data_digests;
+ uint32_t maxr2t;
+ uint32_t maxh2cdata;
+ uint32_t max_icd; /* Host only */
+ uint16_t next_ttag; /* Controller only */
+
+ struct nvmf_tcp_command_buffer_list tx_buffers;
+ struct nvmf_tcp_command_buffer_list rx_buffers;
+ TAILQ_HEAD(, nvmf_tcp_capsule) rx_capsules;
+};
+
+#define TASSOC(na) ((struct nvmf_tcp_association *)(na))
+#define TCAP(nc) ((struct nvmf_tcp_capsule *)(nc))
+#define CTCAP(nc) ((const struct nvmf_tcp_capsule *)(nc))
+#define TQP(qp) ((struct nvmf_tcp_qpair *)(qp))
+
+static const char zero_padding[NVME_TCP_PDU_PDO_MAX_OFFSET];
+
+static uint32_t
+compute_digest(const void *buf, size_t len)
+{
+ return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
+}
+
+static struct nvmf_tcp_command_buffer *
+tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp, void *data,
+ uint32_t data_offset, size_t data_len, uint16_t cid, uint16_t ttag,
+ bool receive)
+{
+ struct nvmf_tcp_command_buffer *cb;
+
+ cb = malloc(sizeof(*cb));
+ cb->qp = qp;
+ cb->data = data;
+ cb->data_offset = data_offset;
+ cb->data_len = data_len;
+ cb->data_xfered = 0;
+ cb->cid = cid;
+ cb->ttag = ttag;
+
+ if (receive)
+ LIST_INSERT_HEAD(&qp->rx_buffers, cb, link);
+ else
+ LIST_INSERT_HEAD(&qp->tx_buffers, cb, link);
+ return (cb);
+}
+
+static struct nvmf_tcp_command_buffer *
+tcp_find_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
+ bool receive)
+{
+ struct nvmf_tcp_command_buffer_list *list;
+ struct nvmf_tcp_command_buffer *cb;
+
+ list = receive ? &qp->rx_buffers : &qp->tx_buffers;
+ LIST_FOREACH(cb, list, link) {
+ if (cb->cid == cid && cb->ttag == ttag)
+ return (cb);
+ }
+ return (NULL);
+}
+
+static void
+tcp_purge_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
+ bool receive)
+{
+ struct nvmf_tcp_command_buffer *cb;
+
+ cb = tcp_find_command_buffer(qp, cid, ttag, receive);
+ if (cb != NULL)
+ LIST_REMOVE(cb, link);
+}
+
+static void
+tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
+{
+ LIST_REMOVE(cb, link);
+ free(cb);
+}
+
+static int
+nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, const void *pdu, size_t len)
+{
+ ssize_t nwritten;
+ const char *cp;
+
+ cp = pdu;
+ while (len != 0) {
+ nwritten = write(qp->s, cp, len);
+ if (nwritten < 0)
+ return (errno);
+ len -= nwritten;
+ cp += nwritten;
+ }
+ return (0);
+}
+
+static int
+nvmf_tcp_write_pdu_iov(struct nvmf_tcp_qpair *qp, struct iovec *iov,
+ u_int iovcnt, size_t len)
+{
+ ssize_t nwritten;
+
+ for (;;) {
+ nwritten = writev(qp->s, iov, iovcnt);
+ if (nwritten < 0)
+ return (errno);
+
+ len -= nwritten;
+ if (len == 0)
+ return (0);
+
+ while (iov->iov_len <= (size_t)nwritten) {
+ nwritten -= iov->iov_len;
+ iovcnt--;
+ iov++;
+ }
+
+ iov->iov_base = (char *)iov->iov_base + nwritten;
+ iov->iov_len -= nwritten;
+ }
+}
+
+static void
+nvmf_tcp_report_error(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
+ uint16_t fes, uint32_t fei, const void *rx_pdu, size_t pdu_len, u_int hlen)
+{
+ struct nvme_tcp_term_req_hdr hdr;
+ struct iovec iov[2];
+
+ if (hlen != 0) {
+ if (hlen > NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE)
+ hlen = NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE;
+ if (hlen > pdu_len)
+ hlen = pdu_len;
+ }
+
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.common.pdu_type = na->na_controller ?
+ NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
+ hdr.common.hlen = sizeof(hdr);
+ hdr.common.plen = sizeof(hdr) + hlen;
+ hdr.fes = htole16(fes);
+ le32enc(hdr.fei, fei);
+ iov[0].iov_base = &hdr;
+ iov[0].iov_len = sizeof(hdr);
+ iov[1].iov_base = __DECONST(void *, rx_pdu);
+ iov[1].iov_len = hlen;
+
+ (void)nvmf_tcp_write_pdu_iov(qp, iov, nitems(iov), sizeof(hdr) + hlen);
+ close(qp->s);
+ qp->s = -1;
+}
+
+static int
+nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu,
+ size_t pdu_len)
+{
+ const struct nvme_tcp_common_pdu_hdr *ch;
+ uint32_t data_len, fei, plen;
+ uint32_t digest, rx_digest;
+ u_int hlen;
+ int error;
+ uint16_t fes;
+
+ /* Determine how large of a PDU header to return for errors. */
+ ch = pdu->hdr;
+ hlen = ch->hlen;
+ plen = le32toh(ch->plen);
+ if (hlen < sizeof(*ch) || hlen > plen)
+ hlen = sizeof(*ch);
+
+ error = nvmf_tcp_validate_pdu_header(ch,
+ qp->qp.nq_association->na_controller, qp->header_digests,
+ qp->data_digests, qp->rxpda, &data_len, &fes, &fei);
+ if (error != 0) {
+ if (error == ECONNRESET) {
+ close(qp->s);
+ qp->s = -1;
+ } else {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ fes, fei, ch, pdu_len, hlen);
+ }
+ return (error);
+ }
+
+ /* Check header digest if present. */
+ if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
+ digest = compute_digest(ch, ch->hlen);
+ memcpy(&rx_digest, (const char *)ch + ch->hlen,
+ sizeof(rx_digest));
+ if (digest != rx_digest) {
+ printf("NVMe/TCP: Header digest mismatch\n");
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, ch,
+ pdu_len, hlen);
+ return (EBADMSG);
+ }
+ }
+
+ /* Check data digest if present. */
+ if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
+ digest = compute_digest((const char *)ch + ch->pdo, data_len);
+ memcpy(&rx_digest, (const char *)ch + plen - sizeof(rx_digest),
+ sizeof(rx_digest));
+ if (digest != rx_digest) {
+ printf("NVMe/TCP: Data digest mismatch\n");
+ return (EBADMSG);
+ }
+ }
+
+ pdu->data_len = data_len;
+ return (0);
+}
+
+/*
+ * Read data from a socket, retrying until the data has been fully
+ * read or an error occurs.
+ */
+static int
+nvmf_tcp_read_buffer(int s, void *buf, size_t len)
+{
+ ssize_t nread;
+ char *cp;
+
+ cp = buf;
+ while (len != 0) {
+ nread = read(s, cp, len);
+ if (nread < 0)
+ return (errno);
+ if (nread == 0)
+ return (ECONNRESET);
+ len -= nread;
+ cp += nread;
+ }
+ return (0);
+}
+
+static int
+nvmf_tcp_read_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+ struct nvme_tcp_common_pdu_hdr ch;
+ uint32_t plen;
+ int error;
+
+ memset(pdu, 0, sizeof(*pdu));
+ error = nvmf_tcp_read_buffer(qp->s, &ch, sizeof(ch));
+ if (error != 0)
+ return (error);
+
+ plen = le32toh(ch.plen);
+
+ /*
+ * Validate a header with garbage lengths to trigger
+ * an error message without reading more.
+ */
+ if (plen < sizeof(ch) || ch.hlen > plen) {
+ pdu->hdr = &ch;
+ error = nvmf_tcp_validate_pdu(qp, pdu, sizeof(ch));
+ pdu->hdr = NULL;
+ assert(error != 0);
+ return (error);
+ }
+
+ /* Read the rest of the PDU. */
+ pdu->hdr = malloc(plen);
+ memcpy(pdu->hdr, &ch, sizeof(ch));
+ error = nvmf_tcp_read_buffer(qp->s, pdu->hdr + 1, plen - sizeof(ch));
+ if (error != 0) {
+ free(pdu->hdr);
+ pdu->hdr = NULL;
+ return (error);
+ }
+ error = nvmf_tcp_validate_pdu(qp, pdu, plen);
+ if (error != 0) {
+ free(pdu->hdr);
+ pdu->hdr = NULL;
+ }
+ return (error);
+}
+
+static void
+nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
+{
+ free(pdu->hdr);
+ pdu->hdr = NULL;
+}
+
+static int
+nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
+{
+ struct nvme_tcp_term_req_hdr *hdr;
+
+ hdr = (void *)pdu->hdr;
+
+ printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
+ le16toh(hdr->fes), le32dec(hdr->fei));
+ nvmf_tcp_free_pdu(pdu);
+ return (ECONNRESET);
+}
+
+static int
+nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
+ struct nvmf_tcp_rxpdu *pdu)
+{
+ struct nvme_tcp_cmd *cmd;
+ struct nvmf_capsule *nc;
+ struct nvmf_tcp_capsule *tc;
+
+ cmd = (void *)pdu->hdr;
+
+ nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe);
+ if (nc == NULL)
+ return (ENOMEM);
+
+ tc = TCAP(nc);
+ tc->rx_pdu = *pdu;
+
+ TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
+ return (0);
+}
+
+static int
+nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
+ struct nvmf_tcp_rxpdu *pdu)
+{
+ struct nvme_tcp_rsp *rsp;
+ struct nvmf_capsule *nc;
+ struct nvmf_tcp_capsule *tc;
+
+ rsp = (void *)pdu->hdr;
+
+ nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe);
+ if (nc == NULL)
+ return (ENOMEM);
+
+ nc->nc_sqhd_valid = true;
+ tc = TCAP(nc);
+ tc->rx_pdu = *pdu;
+
+ TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
+
+ /*
+ * Once the CQE has been received, no further transfers to the
+ * command buffer for the associated CID can occur.
+ */
+ tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, true);
+ tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, false);
+
+ return (0);
+}
+
+/*
+ * Construct and send a PDU that contains an optional data payload.
+ * This includes dealing with digests and the length fields in the
+ * common header.
+ */
+static int
+nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
+ void *data, uint32_t data_len)
+{
+ struct nvme_tcp_common_pdu_hdr *ch;
+ struct iovec iov[5];
+ u_int iovcnt;
+ uint32_t header_digest, data_digest, pad, pdo, plen;
+
+ plen = hlen;
+ if (qp->header_digests)
+ plen += sizeof(header_digest);
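+ /*
+ * Any data begins at an offset rounded up to the transmit
+ * PDU data alignment (PDA).
+ */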
+ if (data_len != 0) {
+ pdo = roundup2(plen, qp->txpda);
+ pad = pdo - plen;
+ plen = pdo + data_len;
+ if (qp->data_digests)
+ plen += sizeof(data_digest);
+ } else {
+ assert(data == NULL);
+ pdo = 0;
+ pad = 0;
+ }
+
+ ch = hdr;
+ ch->hlen = hlen;
+ if (qp->header_digests)
+ ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
+ if (qp->data_digests && data_len != 0)
+ ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
+ ch->pdo = pdo;
+ ch->plen = htole32(plen);
+
+ /* CH + PSH */
+ iov[0].iov_base = hdr;
+ iov[0].iov_len = hlen;
+ iovcnt = 1;
+
+ /* HDGST */
+ if (qp->header_digests) {
+ header_digest = compute_digest(hdr, hlen);
+ iov[iovcnt].iov_base = &header_digest;
+ iov[iovcnt].iov_len = sizeof(header_digest);
+ iovcnt++;
+ }
+
+ if (pad != 0) {
+ /* PAD */
+ iov[iovcnt].iov_base = __DECONST(char *, zero_padding);
+ iov[iovcnt].iov_len = pad;
+ iovcnt++;
+ }
+
+ if (data_len != 0) {
+ /* DATA */
+ iov[iovcnt].iov_base = data;
+ iov[iovcnt].iov_len = data_len;
+ iovcnt++;
+
+ /* DDGST */
+ if (qp->data_digests) {
+ data_digest = compute_digest(data, data_len);
+ iov[iovcnt].iov_base = &data_digest;
+ iov[iovcnt].iov_len = sizeof(data_digest);
+ iovcnt++;
+ }
+ }
+
+ return (nvmf_tcp_write_pdu_iov(qp, iov, iovcnt, plen));
+}
+
+static int
+nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+ struct nvme_tcp_h2c_data_hdr *h2c;
+ struct nvmf_tcp_command_buffer *cb;
+ uint32_t data_len, data_offset;
+ const char *icd;
+
+ h2c = (void *)pdu->hdr;
+ if (le32toh(h2c->datal) > qp->maxh2cdata) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
+ pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ cb = tcp_find_command_buffer(qp, h2c->cccid, h2c->ttag, true);
+ if (cb == NULL) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+ offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->hdr,
+ le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ data_len = le32toh(h2c->datal);
+ if (data_len != pdu->data_len) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+ offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->hdr,
+ le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ data_offset = le32toh(h2c->datao);
+ if (data_offset < cb->data_offset ||
+ data_offset + data_len > cb->data_offset + cb->data_len) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
+ pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ if (data_offset != cb->data_offset + cb->data_xfered) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
+ le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
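+ /* LAST_PDU must be set if and only if this PDU completes the transfer. */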
+ if ((cb->data_xfered + data_len == cb->data_len) !=
+ ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
+ le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ cb->data_xfered += data_len;
+ data_offset -= cb->data_offset;
+ icd = (const char *)pdu->hdr + pdu->hdr->pdo;
+ memcpy((char *)cb->data + data_offset, icd, data_len);
+
+ nvmf_tcp_free_pdu(pdu);
+ return (0);
+}
+
+static int
+nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+ struct nvme_tcp_c2h_data_hdr *c2h;
+ struct nvmf_tcp_command_buffer *cb;
+ uint32_t data_len, data_offset;
+ const char *icd;
+
+ c2h = (void *)pdu->hdr;
+
+ cb = tcp_find_command_buffer(qp, c2h->cccid, 0, true);
+ if (cb == NULL) {
+ /*
+ * XXX: Could be PDU sequence error if cccid is for a
+ * command that doesn't use a command buffer.
+ */
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+ offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->hdr,
+ le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ data_len = le32toh(c2h->datal);
+ if (data_len != pdu->data_len) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+ offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->hdr,
+ le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ data_offset = le32toh(c2h->datao);
+ if (data_offset < cb->data_offset ||
+ data_offset + data_len > cb->data_offset + cb->data_len) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
+ pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ if (data_offset != cb->data_offset + cb->data_xfered) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
+ le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
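+ /* LAST_PDU must be set if and only if this PDU completes the transfer. */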
+ if ((cb->data_xfered + data_len == cb->data_len) !=
+ ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
+ le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ cb->data_xfered += data_len;
+ data_offset -= cb->data_offset;
+ icd = (const char *)pdu->hdr + pdu->hdr->pdo;
+ memcpy((char *)cb->data + data_offset, icd, data_len);
+
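+ /*
+ * SUCCESS means the controller will not send a response capsule,
+ * so synthesize a completion for the caller.
+ */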
+ if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
+ struct nvme_completion cqe;
+ struct nvmf_tcp_capsule *tc;
+ struct nvmf_capsule *nc;
+
+ memset(&cqe, 0, sizeof(cqe));
+ cqe.cid = cb->cid;
+
+ nc = nvmf_allocate_response(&qp->qp, &cqe);
+ if (nc == NULL) {
+ nvmf_tcp_free_pdu(pdu);
+ return (ENOMEM);
+ }
+ nc->nc_sqhd_valid = false;
+
+ tc = TCAP(nc);
+ TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
+ }
+
+ nvmf_tcp_free_pdu(pdu);
+ return (0);
+}
+
+ /* NB: cid and ttag are little-endian already. */
+static int
+tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
+ uint32_t data_offset, void *buf, size_t len, bool last_pdu)
+{
+ struct nvme_tcp_h2c_data_hdr h2c;
+
+ memset(&h2c, 0, sizeof(h2c));
+ h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
+ if (last_pdu)
+ h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
+ h2c.cccid = cid;
+ h2c.ttag = ttag;
+ h2c.datao = htole32(data_offset);
+ h2c.datal = htole32(len);
+
+ return (nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), buf, len));
+}
+
+/* Sends one or more H2C_DATA PDUs, subject to MAXH2CDATA. */
+static int
+tcp_send_h2c_pdus(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
+ uint32_t data_offset, void *buf, size_t len, bool last_pdu)
+{
+ char *p;
+
+ p = buf;
+ while (len != 0) {
+ size_t todo;
+ int error;
+
+ todo = len;
+ if (todo > qp->maxh2cdata)
+ todo = qp->maxh2cdata;
+ error = tcp_send_h2c_pdu(qp, cid, ttag, data_offset, p, todo,
+ last_pdu && todo == len);
+ if (error != 0)
+ return (error);
+ p += todo;
+ data_offset += todo;
+ len -= todo;
+ }
+ return (0);
+}
+
+static int
+nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+ struct nvmf_tcp_command_buffer *cb;
+ struct nvme_tcp_r2t_hdr *r2t;
+ uint32_t data_len, data_offset;
+ int error;
+
+ r2t = (void *)pdu->hdr;
+
+ cb = tcp_find_command_buffer(qp, r2t->cccid, 0, false);
+ if (cb == NULL) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+ offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->hdr,
+ le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ data_offset = le32toh(r2t->r2to);
+ if (data_offset != cb->data_xfered) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
+ le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ /*
+ * XXX: The spec does not specify how to handle R2T transfers
+ * out of range of the original command.
+ */
+ data_len = le32toh(r2t->r2tl);
+ if (data_offset + data_len > cb->data_len) {
+ nvmf_tcp_report_error(qp->qp.nq_association, qp,
+ NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
+ pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
+ nvmf_tcp_free_pdu(pdu);
+ return (EBADMSG);
+ }
+
+ cb->data_xfered += data_len;
+
+ /*
+ * Write out one or more H2C_DATA PDUs containing the
+ * requested data.
+ */
+ error = tcp_send_h2c_pdus(qp, r2t->cccid, r2t->ttag,
+ data_offset, (char *)cb->data + data_offset, data_len, true);
+
+ nvmf_tcp_free_pdu(pdu);
+ return (error);
+}
+
+static int
+nvmf_tcp_receive_pdu(struct nvmf_tcp_qpair *qp)
+{
+ struct nvmf_tcp_rxpdu pdu;
+ int error;
+
+ error = nvmf_tcp_read_pdu(qp, &pdu);
+ if (error != 0)
+ return (error);
+
+ switch (pdu.hdr->pdu_type) {
+ default:
+ __unreachable();
+ break;
+ case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
+ case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
+ return (nvmf_tcp_handle_term_req(&pdu));
+ case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
+ return (nvmf_tcp_save_command_capsule(qp, &pdu));
+ case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
+ return (nvmf_tcp_save_response_capsule(qp, &pdu));
+ case NVME_TCP_PDU_TYPE_H2C_DATA:
+ return (nvmf_tcp_handle_h2c_data(qp, &pdu));
+ case NVME_TCP_PDU_TYPE_C2H_DATA:
+ return (nvmf_tcp_handle_c2h_data(qp, &pdu));
+ case NVME_TCP_PDU_TYPE_R2T:
+ return (nvmf_tcp_handle_r2t(qp, &pdu));
+ }
+}
+
+static bool
+nvmf_tcp_validate_ic_pdu(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
+ const struct nvme_tcp_common_pdu_hdr *ch, size_t pdu_len)
+{
+ const struct nvme_tcp_ic_req *pdu;
+ uint32_t plen;
+ u_int hlen;
+
+ /* Determine how large of a PDU header to return for errors. */
+ hlen = ch->hlen;
+ plen = le32toh(ch->plen);
+ if (hlen < sizeof(*ch) || hlen > plen)
+ hlen = sizeof(*ch);
+
+ /*
+ * Errors must be reported for the lowest incorrect field
+ * first, so validate fields in order.
+ */
+
+ /* Validate pdu_type. */
+
+ /* Controllers only receive PDUs with a PDU direction of 0. */
+ if (na->na_controller != ((ch->pdu_type & 0x01) == 0)) {
+ na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
+ nvmf_tcp_report_error(na, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
+ hlen);
+ return (false);
+ }
+
+ switch (ch->pdu_type) {
+ case NVME_TCP_PDU_TYPE_IC_REQ:
+ case NVME_TCP_PDU_TYPE_IC_RESP:
+ break;
+ default:
+ na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
+ nvmf_tcp_report_error(na, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
+ hlen);
+ return (false);
+ }
+
+ /* Validate flags. */
+ if (ch->flags != 0) {
+ na_error(na, "NVMe/TCP: Invalid PDU header flags %#x",
+ ch->flags);
+ nvmf_tcp_report_error(na, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1, ch, pdu_len,
+ hlen);
+ return (false);
+ }
+
+ /* Validate hlen. */
+ if (ch->hlen != 128) {
+ na_error(na, "NVMe/TCP: Invalid PDU header length %u",
+ ch->hlen);
+ nvmf_tcp_report_error(na, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 2, ch, pdu_len,
+ hlen);
+ return (false);
+ }
+
+ /* Validate pdo. */
+ if (ch->pdo != 0) {
+ na_error(na, "NVMe/TCP: Invalid PDU data offset %u", ch->pdo);
+ nvmf_tcp_report_error(na, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 3, ch, pdu_len,
+ hlen);
+ return (false);
+ }
+
+ /* Validate plen. */
+ if (plen != 128) {
+ na_error(na, "NVMe/TCP: Invalid PDU length %u", plen);
+ nvmf_tcp_report_error(na, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 4, ch, pdu_len,
+ hlen);
+ return (false);
+ }
+
+ /* Validate fields common to both ICReq and ICResp. */
+ pdu = (const struct nvme_tcp_ic_req *)ch;
+ if (le16toh(pdu->pfv) != 0) {
+ na_error(na, "NVMe/TCP: Unsupported PDU version %u",
+ le16toh(pdu->pfv));
+ nvmf_tcp_report_error(na, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
+ 8, ch, pdu_len, hlen);
+ return (false);
+ }
+
+ if (pdu->hpda > NVME_TCP_HPDA_MAX) {
+ na_error(na, "NVMe/TCP: Unsupported PDA %u", pdu->hpda);
+ nvmf_tcp_report_error(na, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 10, ch, pdu_len,
+ hlen);
+ return (false);
+ }
+
+ if (pdu->dgst.bits.reserved != 0) {
+ na_error(na, "NVMe/TCP: Invalid digest settings");
+ nvmf_tcp_report_error(na, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 11, ch, pdu_len,
+ hlen);
+ return (false);
+ }
+
+ return (true);
+}
+
+static bool
+nvmf_tcp_read_ic_req(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
+ struct nvme_tcp_ic_req *pdu)
+{
+ int error;
+
+ error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
+ if (error != 0) {
+ na_error(na, "NVMe/TCP: Failed to read IC request: %s",
+ strerror(error));
+ return (false);
+ }
+
+ return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
+}
+
+static bool
+nvmf_tcp_read_ic_resp(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
+ struct nvme_tcp_ic_resp *pdu)
+{
+ int error;
+
+ error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
+ if (error != 0) {
+ na_error(na, "NVMe/TCP: Failed to read IC response: %s",
+ strerror(error));
+ return (false);
+ }
+
+ return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
+}
+
+static struct nvmf_association *
+tcp_allocate_association(bool controller __unused,
+ const struct nvmf_association_params *params __unused)
+{
+ struct nvmf_tcp_association *ta;
+
+ ta = calloc(1, sizeof(*ta));
+
+ return (&ta->na);
+}
+
+static void
+tcp_update_association(struct nvmf_association *na,
+ const struct nvme_controller_data *cdata)
+{
+ struct nvmf_tcp_association *ta = TASSOC(na);
+
+ ta->ioccsz = le32toh(cdata->ioccsz);
+}
+
+static void
+tcp_free_association(struct nvmf_association *na)
+{
+ free(na);
+}
+
+static bool
+tcp_connect(struct nvmf_tcp_qpair *qp, struct nvmf_association *na, bool admin)
+{
+ const struct nvmf_association_params *params = &na->na_params;
+ struct nvmf_tcp_association *ta = TASSOC(na);
+ struct nvme_tcp_ic_req ic_req;
+ struct nvme_tcp_ic_resp ic_resp;
+ int error;
+
+ if (!admin) {
+ if (ta->ioccsz == 0) {
+ na_error(na, "TCP I/O queues require cdata");
+ return (false);
+ }
+ if (ta->ioccsz < 4) {
+ na_error(na, "Invalid IOCCSZ %u", ta->ioccsz);
+ return (false);
+ }
+ }
+
+ memset(&ic_req, 0, sizeof(ic_req));
+ ic_req.common.pdu_type = NVME_TCP_PDU_TYPE_IC_REQ;
+ ic_req.common.hlen = sizeof(ic_req);
+ ic_req.common.plen = htole32(sizeof(ic_req));
+ ic_req.pfv = htole16(0);
+ ic_req.hpda = params->tcp.pda;
+ if (params->tcp.header_digests)
+ ic_req.dgst.bits.hdgst_enable = 1;
+ if (params->tcp.data_digests)
+ ic_req.dgst.bits.ddgst_enable = 1;
+ ic_req.maxr2t = htole32(params->tcp.maxr2t);
+
+ error = nvmf_tcp_write_pdu(qp, &ic_req, sizeof(ic_req));
+ if (error != 0) {
+ na_error(na, "Failed to write IC request: %s", strerror(error));
+ return (false);
+ }
+
+ if (!nvmf_tcp_read_ic_resp(na, qp, &ic_resp))
+ return (false);
+
+ /* Ensure the controller didn't enable digests we didn't request. */
+ if ((!params->tcp.header_digests &&
+ ic_resp.dgst.bits.hdgst_enable != 0) ||
+ (!params->tcp.data_digests &&
+ ic_resp.dgst.bits.ddgst_enable != 0)) {
+ na_error(na, "Controller enabled unrequested digests");
+ nvmf_tcp_report_error(na, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
+ 11, &ic_resp, sizeof(ic_resp), sizeof(ic_resp));
+ return (false);
+ }
+
+ /*
+ * XXX: Is there an upper-bound to enforce here? Perhaps pick
+ * some large value and report larger values as an unsupported
+ * parameter?
+ */
+ if (le32toh(ic_resp.maxh2cdata) < 4096) {
+ na_error(na, "Invalid MAXH2CDATA %u",
+ le32toh(ic_resp.maxh2cdata));
+ nvmf_tcp_report_error(na, qp,
+ NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 12, &ic_resp,
+ sizeof(ic_resp), sizeof(ic_resp));
+ return (false);
+ }
+
+ qp->txpda = (params->tcp.pda + 1) * 4;
+ qp->rxpda = (ic_resp.cpda + 1) * 4;
+ qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
+ qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
+ qp->maxr2t = params->tcp.maxr2t;
+ qp->maxh2cdata = le32toh(ic_resp.maxh2cdata);
+ if (admin)
+ /* 7.4.3: admin queues allow at most 8192 bytes of in-capsule data. */
+ qp->max_icd = 8192;
+ else
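+ /* IOCCSZ is in 16-byte units and includes the 64-byte SQE. */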
+ qp->max_icd = (ta->ioccsz - 4) * 16;
+
+ return (true);
+}
+
+static bool
+tcp_accept(struct nvmf_tcp_qpair *qp, struct nvmf_association *na)
+{
+ const struct nvmf_association_params *params = &na->na_params;
+ struct nvme_tcp_ic_req ic_req;
+ struct nvme_tcp_ic_resp ic_resp;
+ int error;
+
+ if (!nvmf_tcp_read_ic_req(na, qp, &ic_req))
+ return (false);
+
+ memset(&ic_resp, 0, sizeof(ic_resp));
+ ic_resp.common.pdu_type = NVME_TCP_PDU_TYPE_IC_RESP;
+ ic_resp.common.hlen = sizeof(ic_req);
+ ic_resp.common.plen = htole32(sizeof(ic_req));
+ ic_resp.pfv = htole16(0);
+ ic_resp.cpda = params->tcp.pda;
+ if (params->tcp.header_digests && ic_req.dgst.bits.hdgst_enable != 0)
+ ic_resp.dgst.bits.hdgst_enable = 1;
+ if (params->tcp.data_digests && ic_req.dgst.bits.ddgst_enable != 0)
+ ic_resp.dgst.bits.ddgst_enable = 1;
+ ic_resp.maxh2cdata = htole32(params->tcp.maxh2cdata);
+
+ error = nvmf_tcp_write_pdu(qp, &ic_resp, sizeof(ic_resp));
+ if (error != 0) {
+ na_error(na, "Failed to write IC response: %s",
+ strerror(error));
+ return (false);
+ }
+
+ qp->txpda = (params->tcp.pda + 1) * 4;
+ qp->rxpda = (ic_req.hpda + 1) * 4;
+ qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
+ qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
+ qp->maxr2t = le32toh(ic_req.maxr2t);
+ qp->maxh2cdata = params->tcp.maxh2cdata;
+ qp->max_icd = 0; /* XXX */
+ return (true);
+}
+
+static struct nvmf_qpair *
+tcp_allocate_qpair(struct nvmf_association *na,
+ const struct nvmf_qpair_params *qparams)
+{
+ const struct nvmf_association_params *aparams = &na->na_params;
+ struct nvmf_tcp_qpair *qp;
+ bool connected;
+
+ if (aparams->tcp.pda > NVME_TCP_CPDA_MAX) {
+ na_error(na, "Invalid PDA");
+ return (NULL);
+ }
+
+ qp = calloc(1, sizeof(*qp));
+ qp->s = qparams->tcp.fd;
+ LIST_INIT(&qp->rx_buffers);
+ LIST_INIT(&qp->tx_buffers);
+ TAILQ_INIT(&qp->rx_capsules);
+ if (na->na_controller)
+ connected = tcp_accept(qp, na);
+ else
+ connected = tcp_connect(qp, na, qparams->admin);
+ if (!connected) {
+ free(qp);
+ return (NULL);
+ }
+
+ return (&qp->qp);
+}
+
+static void
+tcp_free_qpair(struct nvmf_qpair *nq)
+{
+ struct nvmf_tcp_qpair *qp = TQP(nq);
+ struct nvmf_tcp_capsule *ntc, *tc;
+ struct nvmf_tcp_command_buffer *ncb, *cb;
+
+ TAILQ_FOREACH_SAFE(tc, &qp->rx_capsules, link, ntc) {
+ TAILQ_REMOVE(&qp->rx_capsules, tc, link);
+ nvmf_free_capsule(&tc->nc);
+ }
+ LIST_FOREACH_SAFE(cb, &qp->rx_buffers, link, ncb) {
+ tcp_free_command_buffer(cb);
+ }
+ LIST_FOREACH_SAFE(cb, &qp->tx_buffers, link, ncb) {
+ tcp_free_command_buffer(cb);
+ }
+ free(qp);
+}
+
+static int
+tcp_kernel_handoff_params(struct nvmf_qpair *nq,
+ struct nvmf_handoff_qpair_params *qparams)
+{
+ struct nvmf_tcp_qpair *qp = TQP(nq);
+
+ qparams->tcp.fd = qp->s;
+ qparams->tcp.rxpda = qp->rxpda;
+ qparams->tcp.txpda = qp->txpda;
+ qparams->tcp.header_digests = qp->header_digests;
+ qparams->tcp.data_digests = qp->data_digests;
+ qparams->tcp.maxr2t = qp->maxr2t;
+ qparams->tcp.maxh2cdata = qp->maxh2cdata;
+ qparams->tcp.max_icd = qp->max_icd;
+
+ return (0);
+}
+
+static struct nvmf_capsule *
+tcp_allocate_capsule(struct nvmf_qpair *qp __unused)
+{
+ struct nvmf_tcp_capsule *nc;
+
+ nc = calloc(1, sizeof(*nc));
+ return (&nc->nc);
+}
+
+static void
+tcp_free_capsule(struct nvmf_capsule *nc)
+{
+ struct nvmf_tcp_capsule *tc = TCAP(nc);
+
+ nvmf_tcp_free_pdu(&tc->rx_pdu);
+ if (tc->cb != NULL)
+ tcp_free_command_buffer(tc->cb);
+ free(tc);
+}
+
+static int
+tcp_transmit_command(struct nvmf_capsule *nc)
+{
+ struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
+ struct nvmf_tcp_capsule *tc = TCAP(nc);
+ struct nvme_tcp_cmd cmd;
+ struct nvme_sgl_descriptor *sgl;
+ int error;
+ bool use_icd;
+
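+ /* Send the data inside the capsule when it fits within the ICD limit. */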
+ use_icd = false;
+ if (nc->nc_data_len != 0 && nc->nc_send_data &&
+ nc->nc_data_len <= qp->max_icd)
+ use_icd = true;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
+ cmd.ccsqe = nc->nc_sqe;
+
+ /* Populate SGL in SQE. */
+ sgl = &cmd.ccsqe.sgl;
+ memset(sgl, 0, sizeof(*sgl));
+ sgl->address = 0;
+ sgl->length = htole32(nc->nc_data_len);
+ if (use_icd) {
+ /* Use in-capsule data. */
+ sgl->type = NVME_SGL_TYPE_ICD;
+ } else {
+ /* Use a command buffer. */
+ sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
+ }
+
+ /* Send command capsule. */
+ error = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), use_icd ?
+ nc->nc_data : NULL, use_icd ? nc->nc_data_len : 0);
+ if (error != 0)
+ return (error);
+
+ /*
+ * If data will be transferred using a command buffer, allocate a
+ * buffer structure and queue it.
+ */
+ if (nc->nc_data_len != 0 && !use_icd)
+ tc->cb = tcp_alloc_command_buffer(qp, nc->nc_data, 0,
+ nc->nc_data_len, cmd.ccsqe.cid, 0, !nc->nc_send_data);
+
+ return (0);
+}
+
+static int
+tcp_transmit_response(struct nvmf_capsule *nc)
+{
+ struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
+ struct nvme_tcp_rsp rsp;
+
+ memset(&rsp, 0, sizeof(rsp));
+ rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
+ rsp.rccqe = nc->nc_cqe;
+
+ return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
+}
+
+static int
+tcp_transmit_capsule(struct nvmf_capsule *nc)
+{
+ if (nc->nc_qe_len == sizeof(struct nvme_command))
+ return (tcp_transmit_command(nc));
+ else
+ return (tcp_transmit_response(nc));
+}
+
+static int
+tcp_receive_capsule(struct nvmf_qpair *nq, struct nvmf_capsule **ncp)
+{
+ struct nvmf_tcp_qpair *qp = TQP(nq);
+ struct nvmf_tcp_capsule *tc;
+ int error;
+
+ while (TAILQ_EMPTY(&qp->rx_capsules)) {
+ error = nvmf_tcp_receive_pdu(qp);
+ if (error != 0)
+ return (error);
+ }
+ tc = TAILQ_FIRST(&qp->rx_capsules);
+ TAILQ_REMOVE(&qp->rx_capsules, tc, link);
+ *ncp = &tc->nc;
+ return (0);
+}
+
+static uint8_t
+tcp_validate_command_capsule(const struct nvmf_capsule *nc)
+{
+ const struct nvmf_tcp_capsule *tc = CTCAP(nc);
+ const struct nvme_sgl_descriptor *sgl;
+
+ assert(tc->rx_pdu.hdr != NULL);
+
+ sgl = &nc->nc_sqe.sgl;
+ switch (sgl->type) {
+ case NVME_SGL_TYPE_ICD:
+ if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
+ printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
+ return (NVME_SC_DATA_SGL_LENGTH_INVALID);
+ }
+ break;
+ case NVME_SGL_TYPE_COMMAND_BUFFER:
+ if (tc->rx_pdu.data_len != 0) {
+ printf("NVMe/TCP: Command Buffer SGL with ICD\n");
+ return (NVME_SC_INVALID_FIELD);
+ }
+ break;
+ default:
+ printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
+ return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
+ }
+
+ if (sgl->address != 0) {
+ printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
+ return (NVME_SC_SGL_OFFSET_INVALID);
+ }
+
+ return (NVME_SC_SUCCESS);
+}
+
+static size_t
+tcp_capsule_data_len(const struct nvmf_capsule *nc)
+{
+ assert(nc->nc_qe_len == sizeof(struct nvme_command));
+ return (le32toh(nc->nc_sqe.sgl.length));
+}
+
+/* NB: cid and ttag are both little-endian already. */
+static int
+tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
+ uint32_t data_offset, uint32_t data_len)
+{
+ struct nvme_tcp_r2t_hdr r2t;
+
+ memset(&r2t, 0, sizeof(r2t));
+ r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
+ r2t.cccid = cid;
+ r2t.ttag = ttag;
+ r2t.r2to = htole32(data_offset);
+ r2t.r2tl = htole32(data_len);
+
+ return (nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0));
+}
+
+static int
+tcp_receive_r2t_data(const struct nvmf_capsule *nc, uint32_t data_offset,
+ void *buf, size_t len)
+{
+ struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
+ struct nvmf_tcp_command_buffer *cb;
+ int error;
+ uint16_t ttag;
+
+ /*
+ * Don't bother byte-swapping ttag as it is just a cookie
+ * value returned by the other end as-is.
+ */
+ ttag = qp->next_ttag++;
+
+ error = tcp_send_r2t(qp, nc->nc_sqe.cid, ttag, data_offset, len);
+ if (error != 0)
+ return (error);
+
+ cb = tcp_alloc_command_buffer(qp, buf, data_offset, len,
+ nc->nc_sqe.cid, ttag, true);
+
+ /* Parse received PDUs until the data transfer is complete. */
+ while (cb->data_xfered < cb->data_len) {
+ error = nvmf_tcp_receive_pdu(qp);
+ if (error != 0)
+ break;
+ }
+ tcp_free_command_buffer(cb);
+ return (error);
+}
+
+static int
+tcp_receive_icd_data(const struct nvmf_capsule *nc, uint32_t data_offset,
+ void *buf, size_t len)
+{
+ const struct nvmf_tcp_capsule *tc = CTCAP(nc);
+ const char *icd;
+
+ icd = (const char *)tc->rx_pdu.hdr + tc->rx_pdu.hdr->pdo + data_offset;
+ memcpy(buf, icd, len);
+ return (0);
+}
+
+static int
+tcp_receive_controller_data(const struct nvmf_capsule *nc, uint32_t data_offset,
+ void *buf, size_t len)
+{
+ struct nvmf_association *na = nc->nc_qpair->nq_association;
+ const struct nvme_sgl_descriptor *sgl;
+ size_t data_len;
+
+ if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
+ return (EINVAL);
+
+ sgl = &nc->nc_sqe.sgl;
+ data_len = le32toh(sgl->length);
+ if (data_offset + len > data_len)
+ return (EFBIG);
+
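+ /* In-capsule data is copied directly; otherwise request it via R2T. */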
+ if (sgl->type == NVME_SGL_TYPE_ICD)
+ return (tcp_receive_icd_data(nc, data_offset, buf, len));
+ else
+ return (tcp_receive_r2t_data(nc, data_offset, buf, len));
+}
+
+/* NB: cid is little-endian already. */
+static int
+tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid,
+ uint32_t data_offset, const void *buf, size_t len, bool last_pdu,
+ bool success)
+{
+ struct nvme_tcp_c2h_data_hdr c2h;
+
+ memset(&c2h, 0, sizeof(c2h));
+ c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
+ if (last_pdu)
+ c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
+ if (success)
+ c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
+ c2h.cccid = cid;
+ c2h.datao = htole32(data_offset);
+ c2h.datal = htole32(len);
+
+ return (nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h),
+ __DECONST(void *, buf), len));
+}
+
+static int
+tcp_send_controller_data(const struct nvmf_capsule *nc, const void *buf,
+ size_t len)
+{
+ struct nvmf_association *na = nc->nc_qpair->nq_association;
+ struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
+ const struct nvme_sgl_descriptor *sgl;
+ const char *src;
+ size_t todo;
+ uint32_t data_len, data_offset;
+ int error;
+ bool last_pdu, send_success_flag;
+
+ if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
+ return (EINVAL);
+
+ sgl = &nc->nc_sqe.sgl;
+ data_len = le32toh(sgl->length);
+ if (len != data_len) {
+ nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
+ return (EFBIG);
+ }
+
+ if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
+ nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
+ return (EINVAL);
+ }
+
+ /* Use the SUCCESS flag if SQ flow control is disabled. */
+ send_success_flag = !qp->qp.nq_flow_control;
+
+ /*
+ * Write out one or more C2H_DATA PDUs containing the data.
+ * Each PDU is arbitrarily capped at 256k.
+ */
+ data_offset = 0;
+ src = buf;
+ while (len > 0) {
+ if (len > 256 * 1024) {
+ todo = 256 * 1024;
+ last_pdu = false;
+ } else {
+ todo = len;
+ last_pdu = true;
+ }
+ error = tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset,
+ src, todo, last_pdu, last_pdu && send_success_flag);
+ if (error != 0) {
+ nvmf_send_generic_error(nc,
+ NVME_SC_TRANSIENT_TRANSPORT_ERROR);
+ return (error);
+ }
+ data_offset += todo;
+ src += todo;
+ len -= todo;
+ }
+ if (!send_success_flag)
+ nvmf_send_success(nc);
+ return (0);
+}
+
+struct nvmf_transport_ops tcp_ops = {
+ .allocate_association = tcp_allocate_association,
+ .update_association = tcp_update_association,
+ .free_association = tcp_free_association,
+ .allocate_qpair = tcp_allocate_qpair,
+ .free_qpair = tcp_free_qpair,
+ .kernel_handoff_params = tcp_kernel_handoff_params,
+ .allocate_capsule = tcp_allocate_capsule,
+ .free_capsule = tcp_free_capsule,
+ .transmit_capsule = tcp_transmit_capsule,
+ .receive_capsule = tcp_receive_capsule,
+ .validate_command_capsule = tcp_validate_command_capsule,
+ .capsule_data_len = tcp_capsule_data_len,
+ .receive_controller_data = tcp_receive_controller_data,
+ .send_controller_data = tcp_send_controller_data,
+};
diff --git a/lib/libnvmf/nvmf_transport.c b/lib/libnvmf/nvmf_transport.c
new file mode 100644
--- /dev/null
+++ b/lib/libnvmf/nvmf_transport.c
@@ -0,0 +1,269 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/refcount.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libnvmf.h"
+#include "internal.h"
+
+struct nvmf_association *
+nvmf_allocate_association(enum nvmf_trtype trtype, bool controller,
+ const struct nvmf_association_params *params)
+{
+ struct nvmf_transport_ops *ops;
+ struct nvmf_association *na;
+
+ switch (trtype) {
+ case NVMF_TRTYPE_TCP:
+ ops = &tcp_ops;
+ break;
+ default:
+ errno = EINVAL;
+ return (NULL);
+ }
+
+ na = ops->allocate_association(controller, params);
+ if (na == NULL)
+ return (NULL);
+
+ na->na_ops = ops;
+ na->na_trtype = trtype;
+ na->na_controller = controller;
+ na->na_params = *params;
+ na->na_last_error = NULL;
+ refcount_init(&na->na_refs, 1);
+ return (na);
+}
+
+void
+nvmf_update_assocation(struct nvmf_association *na,
+ const struct nvme_controller_data *cdata)
+{
+ na->na_ops->update_association(na, cdata);
+}
+
+void
+nvmf_free_association(struct nvmf_association *na)
+{
+ if (refcount_release(&na->na_refs)) {
+ free(na->na_last_error);
+ na->na_ops->free_association(na);
+ }
+}
+
+const char *
+nvmf_association_error(const struct nvmf_association *na)
+{
+ return (na->na_last_error);
+}
+
+void
+na_clear_error(struct nvmf_association *na)
+{
+ free(na->na_last_error);
+ na->na_last_error = NULL;
+}
+
+void
+na_error(struct nvmf_association *na, const char *fmt, ...)
+{
+ va_list ap;
+ char *str;
+
+ if (na->na_last_error != NULL)
+ return;
+ va_start(ap, fmt);
+ vasprintf(&str, fmt, ap);
+ va_end(ap);
+ na->na_last_error = str;
+}
+
+struct nvmf_qpair *
+nvmf_allocate_qpair(struct nvmf_association *na,
+ const struct nvmf_qpair_params *params)
+{
+ struct nvmf_qpair *qp;
+
+ na_clear_error(na);
+ qp = na->na_ops->allocate_qpair(na, params);
+ if (qp == NULL)
+ return (NULL);
+
+ refcount_acquire(&na->na_refs);
+ qp->nq_association = na;
+ qp->nq_admin = params->admin;
+ TAILQ_INIT(&qp->nq_rx_capsules);
+ return (qp);
+}
+
+void
+nvmf_free_qpair(struct nvmf_qpair *qp)
+{
+ struct nvmf_association *na;
+ struct nvmf_capsule *nc, *tc;
+
+ TAILQ_FOREACH_SAFE(nc, &qp->nq_rx_capsules, nc_link, tc) {
+ TAILQ_REMOVE(&qp->nq_rx_capsules, nc, nc_link);
+ nvmf_free_capsule(nc);
+ }
+ na = qp->nq_association;
+ na->na_ops->free_qpair(qp);
+ nvmf_free_association(na);
+}
+
+struct nvmf_capsule *
+nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe)
+{
+ struct nvmf_capsule *nc;
+
+ nc = qp->nq_association->na_ops->allocate_capsule(qp);
+ if (nc == NULL)
+ return (NULL);
+
+ nc->nc_qpair = qp;
+ nc->nc_qe_len = sizeof(struct nvme_command);
+ memcpy(&nc->nc_sqe, sqe, nc->nc_qe_len);
+
+ /* 4.2 of NVMe base spec: Fabrics always uses SGL. */
+ nc->nc_sqe.fuse &= ~NVMEM(NVME_CMD_PSDT);
+ nc->nc_sqe.fuse |= NVMEF(NVME_CMD_PSDT, NVME_PSDT_SGL);
+ return (nc);
+}
+
+struct nvmf_capsule *
+nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe)
+{
+ struct nvmf_capsule *nc;
+
+ nc = qp->nq_association->na_ops->allocate_capsule(qp);
+ if (nc == NULL)
+ return (NULL);
+
+ nc->nc_qpair = qp;
+ nc->nc_qe_len = sizeof(struct nvme_completion);
+ memcpy(&nc->nc_cqe, cqe, nc->nc_qe_len);
+ return (nc);
+}
+
+int
+nvmf_capsule_append_data(struct nvmf_capsule *nc, void *buf, size_t len,
+ bool send)
+{
+ if (nc->nc_qe_len == sizeof(struct nvme_completion))
+ return (EINVAL);
+ if (nc->nc_data_len != 0)
+ return (EBUSY);
+
+ nc->nc_data = buf;
+ nc->nc_data_len = len;
+ nc->nc_send_data = send;
+ return (0);
+}
+
+void
+nvmf_free_capsule(struct nvmf_capsule *nc)
+{
+ nc->nc_qpair->nq_association->na_ops->free_capsule(nc);
+}
+
+int
+nvmf_transmit_capsule(struct nvmf_capsule *nc)
+{
+ return (nc->nc_qpair->nq_association->na_ops->transmit_capsule(nc));
+}
+
+int
+nvmf_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp)
+{
+ return (qp->nq_association->na_ops->receive_capsule(qp, ncp));
+}
+
+const void *
+nvmf_capsule_sqe(const struct nvmf_capsule *nc)
+{
+ assert(nc->nc_qe_len == sizeof(struct nvme_command));
+ return (&nc->nc_sqe);
+}
+
+const void *
+nvmf_capsule_cqe(const struct nvmf_capsule *nc)
+{
+ assert(nc->nc_qe_len == sizeof(struct nvme_completion));
+ return (&nc->nc_cqe);
+}
+
+uint8_t
+nvmf_validate_command_capsule(const struct nvmf_capsule *nc)
+{
+ assert(nc->nc_qe_len == sizeof(struct nvme_command));
+
+ if (NVMEV(NVME_CMD_PSDT, nc->nc_sqe.fuse) != NVME_PSDT_SGL)
+ return (NVME_SC_INVALID_FIELD);
+
+ return (nc->nc_qpair->nq_association->na_ops->validate_command_capsule(nc));
+}
+
+size_t
+nvmf_capsule_data_len(const struct nvmf_capsule *nc)
+{
+ return (nc->nc_qpair->nq_association->na_ops->capsule_data_len(nc));
+}
+
+int
+nvmf_receive_controller_data(const struct nvmf_capsule *nc,
+ uint32_t data_offset, void *buf, size_t len)
+{
+ return (nc->nc_qpair->nq_association->na_ops->receive_controller_data(nc,
+ data_offset, buf, len));
+}
+
+int
+nvmf_send_controller_data(const struct nvmf_capsule *nc, const void *buf,
+ size_t len)
+{
+ return (nc->nc_qpair->nq_association->na_ops->send_controller_data(nc,
+ buf, len));
+}
+
+int
+nvmf_kernel_handoff_params(struct nvmf_qpair *qp,
+ struct nvmf_handoff_qpair_params *qparams)
+{
+ memset(qparams, 0, sizeof(*qparams));
+ qparams->admin = qp->nq_admin;
+ qparams->sq_flow_control = qp->nq_flow_control;
+ qparams->qsize = qp->nq_qsize;
+ qparams->sqhd = qp->nq_sqhd;
+ qparams->sqtail = qp->nq_sqtail;
+ return (qp->nq_association->na_ops->kernel_handoff_params(qp, qparams));
+}
+
+const char *
+nvmf_transport_type(uint8_t trtype)
+{
+ static _Thread_local char buf[8];
+
+ switch (trtype) {
+ case NVMF_TRTYPE_RDMA:
+ return ("RDMA");
+ case NVMF_TRTYPE_FC:
+ return ("Fibre Channel");
+ case NVMF_TRTYPE_TCP:
+ return ("TCP");
+ case NVMF_TRTYPE_INTRA_HOST:
+ return ("Intra-host");
+ default:
+ snprintf(buf, sizeof(buf), "0x%02x", trtype);
+ return (buf);
+ }
+}
diff --git a/share/mk/src.libnames.mk b/share/mk/src.libnames.mk
--- a/share/mk/src.libnames.mk
+++ b/share/mk/src.libnames.mk
@@ -56,6 +56,7 @@
netbsd \
ntp \
ntpevent \
+ nvmf \
openbsd \
opts \
parse \
@@ -599,6 +600,9 @@
LIBISCSIUTILDIR= ${_LIB_OBJTOP}/lib/libiscsiutil
LIBISCSIUTIL?= ${LIBISCSIUTILDIR}/libiscsiutil${PIE_SUFFIX}.a
+LIBNVMFDIR= ${_LIB_OBJTOP}/lib/libnvmf
+LIBNVMF?= ${LIBNVMFDIR}/libnvmf${PIE_SUFFIX}.a
+
LIBTELNETDIR= ${_LIB_OBJTOP}/lib/libtelnet
LIBTELNET?= ${LIBTELNETDIR}/libtelnet${PIE_SUFFIX}.a
