Page MenuHomeFreeBSD

D44711.diff
No OneTemporary

D44711.diff

diff --git a/sys/dev/nvmf/nvmf_transport.h b/sys/dev/nvmf/nvmf_transport.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/nvmf_transport.h
@@ -0,0 +1,140 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __NVMF_TRANSPORT_H__
+#define __NVMF_TRANSPORT_H__
+
+/*
+ * Interface used by the Fabrics host (initiator) and controller
+ * (target) to send and receive capsules and associated data.
+ */
+
+#include <sys/sysctl.h>
+#include <dev/nvmf/nvmf_proto.h>
+
+struct mbuf;
+struct memdesc;
+struct nvmf_capsule;
+struct nvmf_connection;
+struct nvmf_qpair;
+struct nvmf_handoff_qpair_params;
+
+SYSCTL_DECL(_kern_nvmf);
+
+/*
+ * Callback to invoke when an error occurs on a qpair. The last
+ * parameter is an error value. If the error value is zero, the qpair
+ * has been closed at the transport level rather than a transport
+ * error occurring.
+ */
+typedef void nvmf_qpair_error_t(void *, int);
+
+/* Callback to invoke when a capsule is received. */
+typedef void nvmf_capsule_receive_t(void *, struct nvmf_capsule *);
+
+/*
+ * Callback to invoke when an I/O request has completed. The second
+ * parameter is the amount of data transferred. The last parameter is
+ * an error value which is non-zero if the request did not complete
+ * successfully. A request with an error may complete partially.
+ */
+typedef void nvmf_io_complete_t(void *, size_t, int);
+
+/*
+ * A queue pair represents either an Admin or I/O
+ * submission/completion queue pair. The params contains negotiated
+ * values passed in from userland.
+ *
+ * Unlike libnvmf in userland, the kernel transport interface does not
+ * have any notion of an association. Instead, qpairs are
+ * independent.
+ */
+struct nvmf_qpair *nvmf_allocate_qpair(enum nvmf_trtype trtype,
+ bool controller, const struct nvmf_handoff_qpair_params *params,
+ nvmf_qpair_error_t *error_cb, void *error_cb_arg,
+ nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg);
+void nvmf_free_qpair(struct nvmf_qpair *qp);
+
+/*
+ * Capsules are either commands (host -> controller) or responses
+ * (controller -> host). A data buffer may be associated with a
+ * command capsule. Transmitted data is not copied by this API but
+ * instead must be preserved until the completion callback is invoked
+ * to indicate capsule transmission has completed.
+ */
+struct nvmf_capsule *nvmf_allocate_command(struct nvmf_qpair *qp,
+ const void *sqe, int how);
+struct nvmf_capsule *nvmf_allocate_response(struct nvmf_qpair *qp,
+ const void *cqe, int how);
+void nvmf_free_capsule(struct nvmf_capsule *nc);
+int nvmf_capsule_append_data(struct nvmf_capsule *nc,
+ struct memdesc *mem, size_t len, bool send,
+ nvmf_io_complete_t *complete_cb, void *cb_arg);
+int nvmf_transmit_capsule(struct nvmf_capsule *nc);
+void nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error);
+void *nvmf_capsule_sqe(struct nvmf_capsule *nc);
+void *nvmf_capsule_cqe(struct nvmf_capsule *nc);
+
+/* Controller-specific APIs. */
+
+/*
+ * A controller calls this function to check for any
+ * transport-specific errors (invalid fields) in a received command
+ * capsule. The function returns a generic command status value:
+ * NVME_SC_SUCCESS if no error is found.
+ */
+uint8_t nvmf_validate_command_capsule(struct nvmf_capsule *nc);
+
+/*
+ * A controller calls this function to query the amount of data
+ * associated with a command capsule.
+ */
+size_t nvmf_capsule_data_len(const struct nvmf_capsule *cc);
+
+/*
+ * A controller calls this function to receive data associated with a
+ * command capsule (e.g. the data for a WRITE command). This can
+ * either return in-capsule data or fetch data from the host
+ * (e.g. using a R2T PDU over TCP). The received command capsule
+ * should be passed in 'nc'. The received data is stored in 'mem'.
+ * If this function returns success, then the callback will be invoked
+ * once the operation has completed. Note that the callback might be
+ * invoked before this function returns.
+ */
+int nvmf_receive_controller_data(struct nvmf_capsule *nc,
+ uint32_t data_offset, struct memdesc *mem, size_t len,
+ nvmf_io_complete_t *complete_cb, void *cb_arg);
+
+/*
+ * A controller calls this function to send data in response to a
+ * command prior to sending a response capsule. If an error occurs,
+ * the function returns a generic status completion code to be sent in
+ * the following CQE. Note that the transfer might send a subset of
+ * the data requested by nc. If the transfer succeeds, this function
+ * can return one of the following values:
+ *
+ * - NVME_SC_SUCCESS: The transfer has completed successfully and the
+ * caller should send a success CQE in a response capsule.
+ *
+ * - NVMF_SUCCESS_SENT: The transfer has completed successfully and
+ * the transport layer has sent an implicit success CQE to the
+ * remote host (e.g. the SUCCESS flag for TCP). The caller should
+ * not send a response capsule.
+ *
+ * - NVMF_MORE: The transfer has completed successfully, but the
+ * transfer did not complete the data buffer.
+ *
+ * The mbuf chain in 'm' is consumed by this function even if an error
+ * is returned.
+ */
+u_int nvmf_send_controller_data(struct nvmf_capsule *nc,
+ uint32_t data_offset, struct mbuf *m, size_t len);
+
+#define NVMF_SUCCESS_SENT 0x100
+#define NVMF_MORE 0x101
+
+#endif /* !__NVMF_TRANSPORT_H__ */
diff --git a/sys/dev/nvmf/nvmf_transport.c b/sys/dev/nvmf/nvmf_transport.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/nvmf_transport.c
@@ -0,0 +1,344 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/refcount.h>
+#include <sys/sysctl.h>
+#include <sys/sx.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/nvmf_transport_internal.h>
+
+/* Transport-independent support for fabrics queue pairs and commands. */
+
+/*
+ * A registered transport implementation. One is allocated for each
+ * transport module in the MOD_LOAD case of
+ * nvmf_transport_module_handler() and linked into the
+ * nvmf_transports[] list for the transport type it implements.
+ */
+struct nvmf_transport {
+ /* Operations vector supplied by the transport module. */
+ struct nvmf_transport_ops *nt_ops;
+
+ /*
+  * Count of qpairs allocated from this transport: incremented in
+  * nvmf_allocate_qpair() and dropped in nvmf_free_qpair(). Module
+  * unload waits for it to reach zero.
+  */
+ volatile u_int nt_active_qpairs;
+ /* Linkage in nvmf_transports[trtype]. */
+ SLIST_ENTRY(nvmf_transport) nt_link;
+};
+
+/* nvmf_transports[nvmf_trtype] is sorted by priority */
+static SLIST_HEAD(, nvmf_transport) nvmf_transports[NVMF_TRTYPE_TCP + 1];
+static struct sx nvmf_transports_lock;
+
+static MALLOC_DEFINE(M_NVMF_TRANSPORT, "nvmf_xport",
+ "NVMe over Fabrics transport");
+
+SYSCTL_NODE(_kern, OID_AUTO, nvmf, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "NVMe over Fabrics");
+
+/*
+ * Report whether 'trtype' is a valid index into nvmf_transports[].
+ */
+static bool
+nvmf_supported_trtype(enum nvmf_trtype trtype)
+{
+	const bool in_range = trtype < nitems(nvmf_transports);
+
+	return (in_range);
+}
+
+/*
+ * Allocate a queue pair for transport type 'trtype'.
+ *
+ * Each transport registered for 'trtype' is asked in list (priority)
+ * order to allocate the qpair; the first one that succeeds owns it and
+ * has its active qpair count bumped. The transport-independent fields
+ * of the qpair (callbacks, role, admin flag) are then filled in here.
+ *
+ * Returns NULL if 'trtype' is out of range, if no transport is
+ * registered for it, or if every registered transport declines.
+ */
+struct nvmf_qpair *
+nvmf_allocate_qpair(enum nvmf_trtype trtype, bool controller,
+    const struct nvmf_handoff_qpair_params *params,
+    nvmf_qpair_error_t *error_cb, void *error_cb_arg,
+    nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg)
+{
+	struct nvmf_transport *nt;
+	struct nvmf_qpair *qp;
+
+	if (!nvmf_supported_trtype(trtype))
+		return (NULL);
+
+	/*
+	 * Start from NULL: if the list for this trtype is empty the
+	 * loop body never runs and qp would otherwise be read
+	 * uninitialized below.
+	 */
+	qp = NULL;
+
+	sx_slock(&nvmf_transports_lock);
+	SLIST_FOREACH(nt, &nvmf_transports[trtype], nt_link) {
+		qp = nt->nt_ops->allocate_qpair(controller, params);
+		if (qp != NULL) {
+			/* Taken under the lock so unload sees the qpair. */
+			refcount_acquire(&nt->nt_active_qpairs);
+			break;
+		}
+	}
+	sx_sunlock(&nvmf_transports_lock);
+	if (qp == NULL)
+		return (NULL);
+
+	qp->nq_transport = nt;
+	qp->nq_ops = nt->nt_ops;
+	qp->nq_controller = controller;
+	qp->nq_error = error_cb;
+	qp->nq_error_arg = error_cb_arg;
+	qp->nq_receive = receive_cb;
+	qp->nq_receive_arg = receive_cb_arg;
+	qp->nq_admin = params->admin;
+	return (qp);
+}
+
+/*
+ * Release a qpair via its transport and drop the transport's active
+ * qpair count, waking any thread sleeping on the transport when the
+ * count reaches zero.
+ */
+void
+nvmf_free_qpair(struct nvmf_qpair *qp)
+{
+	/*
+	 * Fetch the transport before calling free_qpair; presumably qp
+	 * is no longer valid once the transport has released it.
+	 */
+	struct nvmf_transport *transport = qp->nq_transport;
+
+	qp->nq_ops->free_qpair(qp);
+	if (!refcount_release(&transport->nt_active_qpairs))
+		return;
+	wakeup(transport);
+}
+
+/*
+ * Allocate a command capsule on 'qp' and copy the submission queue
+ * entry 'sqe' into it. 'how' must be M_WAITOK or M_NOWAIT. Returns
+ * NULL if the transport fails to allocate a capsule.
+ */
+struct nvmf_capsule *
+nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe, int how)
+{
+	struct nvmf_capsule *cmd;
+
+	KASSERT(how == M_WAITOK || how == M_NOWAIT,
+	    ("%s: invalid how", __func__));
+	cmd = qp->nq_ops->allocate_capsule(qp, how);
+	if (cmd != NULL) {
+		cmd->nc_qpair = qp;
+		cmd->nc_qe_len = sizeof(struct nvme_command);
+		memcpy(&cmd->nc_sqe, sqe, sizeof(struct nvme_command));
+
+		/* 4.2 of NVMe base spec: Fabrics always uses SGL. */
+		cmd->nc_sqe.fuse &= ~NVMEM(NVME_CMD_PSDT);
+		cmd->nc_sqe.fuse |= NVMEF(NVME_CMD_PSDT, NVME_PSDT_SGL);
+	}
+	return (cmd);
+}
+
+/*
+ * Allocate a response capsule on 'qp' and copy the completion queue
+ * entry 'cqe' into it. 'how' must be M_WAITOK or M_NOWAIT. Returns
+ * NULL if the transport fails to allocate a capsule.
+ */
+struct nvmf_capsule *
+nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe, int how)
+{
+	struct nvmf_capsule *rsp;
+
+	KASSERT(how == M_WAITOK || how == M_NOWAIT,
+	    ("%s: invalid how", __func__));
+	rsp = qp->nq_ops->allocate_capsule(qp, how);
+	if (rsp != NULL) {
+		rsp->nc_qpair = qp;
+		rsp->nc_qe_len = sizeof(struct nvme_completion);
+		memcpy(&rsp->nc_cqe, cqe, sizeof(struct nvme_completion));
+	}
+	return (rsp);
+}
+
+/*
+ * Attach a data buffer and completion callback to a capsule. The
+ * memory descriptor is copied by value. Fails with EBUSY if the
+ * capsule already has a non-zero data length recorded; returns 0
+ * otherwise.
+ */
+int
+nvmf_capsule_append_data(struct nvmf_capsule *nc, struct memdesc *mem,
+    size_t len, bool send, nvmf_io_complete_t *complete_cb,
+    void *cb_arg)
+{
+	struct nvmf_io_request *io = &nc->nc_data;
+
+	if (io->io_len != 0)
+		return (EBUSY);
+
+	nc->nc_send_data = send;
+	io->io_mem = *mem;
+	io->io_len = len;
+	io->io_complete = complete_cb;
+	io->io_complete_arg = cb_arg;
+	return (0);
+}
+
+/* Hand a capsule back to the transport that owns its qpair. */
+void
+nvmf_free_capsule(struct nvmf_capsule *nc)
+{
+	struct nvmf_transport_ops *ops = nc->nc_qpair->nq_ops;
+
+	ops->free_capsule(nc);
+}
+
+/*
+ * Pass a capsule to its qpair's transport for transmission and return
+ * the transport's result.
+ */
+int
+nvmf_transmit_capsule(struct nvmf_capsule *nc)
+{
+	struct nvmf_transport_ops *ops = nc->nc_qpair->nq_ops;
+
+	return (ops->transmit_capsule(nc));
+}
+
+/*
+ * If a data buffer is attached to the capsule (non-zero length),
+ * invoke its completion callback reporting zero bytes transferred and
+ * the supplied error. Does nothing otherwise.
+ */
+void
+nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error)
+{
+	struct nvmf_io_request *io = &nc->nc_data;
+
+	if (io->io_len == 0)
+		return;
+	nvmf_complete_io_request(io, 0, error);
+}
+
+/*
+ * Return a pointer to the submission queue entry stored in a command
+ * capsule. Asserts that the capsule was set up as a command.
+ */
+void *
+nvmf_capsule_sqe(struct nvmf_capsule *nc)
+{
+	struct nvme_command *sqe = &nc->nc_sqe;
+
+	KASSERT(nc->nc_qe_len == sizeof(struct nvme_command),
+	    ("%s: capsule %p is not a command capsule", __func__, nc));
+	return (sqe);
+}
+
+/*
+ * Return a pointer to the completion queue entry stored in a response
+ * capsule. Asserts that the capsule was set up as a response.
+ */
+void *
+nvmf_capsule_cqe(struct nvmf_capsule *nc)
+{
+	struct nvme_completion *cqe = &nc->nc_cqe;
+
+	KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion),
+	    ("%s: capsule %p is not a response capsule", __func__, nc));
+	return (cqe);
+}
+
+/*
+ * Validate a received command capsule. A command whose PSDT field
+ * does not select SGLs is rejected here with NVME_SC_INVALID_FIELD;
+ * anything else is passed to the transport's own validation and its
+ * status is returned.
+ */
+uint8_t
+nvmf_validate_command_capsule(struct nvmf_capsule *nc)
+{
+	KASSERT(nc->nc_qe_len == sizeof(struct nvme_command),
+	    ("%s: capsule %p is not a command capsule", __func__, nc));
+
+	if (NVMEV(NVME_CMD_PSDT, nc->nc_sqe.fuse) == NVME_PSDT_SGL)
+		return (nc->nc_qpair->nq_ops->validate_command_capsule(nc));
+
+	return (NVME_SC_INVALID_FIELD);
+}
+
+/* Ask the capsule's transport for the capsule's data length. */
+size_t
+nvmf_capsule_data_len(const struct nvmf_capsule *nc)
+{
+	struct nvmf_transport_ops *ops = nc->nc_qpair->nq_ops;
+
+	return (ops->capsule_data_len(nc));
+}
+
+/*
+ * Bundle the destination buffer and completion callback into an I/O
+ * request and hand it, together with the command capsule and data
+ * offset, to the transport. Returns the transport's result.
+ */
+int
+nvmf_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
+    struct memdesc *mem, size_t len, nvmf_io_complete_t *complete_cb,
+    void *cb_arg)
+{
+	struct nvmf_transport_ops *ops = nc->nc_qpair->nq_ops;
+	struct nvmf_io_request io = {
+		.io_mem = *mem,
+		.io_len = len,
+		.io_complete = complete_cb,
+		.io_complete_arg = cb_arg,
+	};
+
+	return (ops->receive_controller_data(nc, data_offset, &io));
+}
+
+/*
+ * Forward an mbuf chain of controller data to the transport. 'len'
+ * must equal the total length of the chain (checked with MPASS).
+ * Returns the transport's status value.
+ */
+u_int
+nvmf_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
+    struct mbuf *m, size_t len)
+{
+	struct nvmf_transport_ops *ops = nc->nc_qpair->nq_ops;
+
+	MPASS(m_length(m, NULL) == len);
+	return (ops->send_controller_data(nc, data_offset, m, len));
+}
+
+/*
+ * Module event handler shared by transport modules; 'arg' is the
+ * module's struct nvmf_transport_ops.
+ *
+ * MOD_LOAD: allocate a struct nvmf_transport for 'ops' and insert it
+ * into nvmf_transports[ops->trtype], keeping the list ordered so that
+ * higher priority values come first. Fails with EINVAL if the
+ * transport type is out of range.
+ *
+ * MOD_QUIESCE: return EBUSY if the transport is registered and still
+ * has active qpairs, 0 otherwise.
+ *
+ * MOD_UNLOAD: unlink the transport, wait (interruptibly) for its
+ * active qpair count to drain, then free it. Returns 0 if the
+ * transport was never registered, or the sx_sleep() error if the wait
+ * was interrupted.
+ *
+ * Any other event returns EOPNOTSUPP.
+ */
+int
+nvmf_transport_module_handler(struct module *mod, int what, void *arg)
+{
+	struct nvmf_transport_ops *ops = arg;
+	struct nvmf_transport *nt, *nt2, *prev;
+	int error;
+
+	switch (what) {
+	case MOD_LOAD:
+		if (!nvmf_supported_trtype(ops->trtype)) {
+			printf("NVMF: Unsupported transport %u\n",
+			    ops->trtype);
+			return (EINVAL);
+		}
+
+		nt = malloc(sizeof(*nt), M_NVMF_TRANSPORT, M_WAITOK | M_ZERO);
+		nt->nt_ops = ops;
+
+		sx_xlock(&nvmf_transports_lock);
+
+		/*
+		 * Find the last entry whose priority is not lower than
+		 * ours. On an empty list the loop does not run and prev
+		 * stays NULL, so the new entry becomes the head.
+		 */
+		prev = NULL;
+		SLIST_FOREACH(nt2, &nvmf_transports[ops->trtype], nt_link) {
+			if (ops->priority > nt2->nt_ops->priority)
+				break;
+			prev = nt2;
+		}
+		if (prev == NULL)
+			SLIST_INSERT_HEAD(&nvmf_transports[ops->trtype], nt,
+			    nt_link);
+		else
+			SLIST_INSERT_AFTER(prev, nt, nt_link);
+		sx_xunlock(&nvmf_transports_lock);
+		return (0);
+
+	case MOD_QUIESCE:
+		if (!nvmf_supported_trtype(ops->trtype))
+			return (0);
+
+		sx_slock(&nvmf_transports_lock);
+		SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) {
+			if (nt->nt_ops == ops)
+				break;
+		}
+		/* nt is NULL here if the transport was never registered. */
+		if (nt != NULL && nt->nt_active_qpairs != 0)
+			error = EBUSY;
+		else
+			error = 0;
+		sx_sunlock(&nvmf_transports_lock);
+		return (error);
+
+	case MOD_UNLOAD:
+		if (!nvmf_supported_trtype(ops->trtype))
+			return (0);
+
+		sx_xlock(&nvmf_transports_lock);
+		prev = NULL;
+		SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) {
+			if (nt->nt_ops == ops)
+				break;
+			prev = nt;
+		}
+		if (nt == NULL) {
+			/*
+			 * Not registered: nothing to tear down. nt must
+			 * not be dereferenced on this path.
+			 */
+			sx_xunlock(&nvmf_transports_lock);
+			return (0);
+		}
+
+		/* Unlink first so no new qpairs can be allocated from it. */
+		if (prev == NULL)
+			SLIST_REMOVE_HEAD(&nvmf_transports[ops->trtype],
+			    nt_link);
+		else
+			SLIST_REMOVE_AFTER(prev, nt_link);
+
+		/*
+		 * nvmf_free_qpair() calls wakeup(nt) when the last qpair
+		 * goes away.
+		 *
+		 * NOTE(review): if the sleep is interrupted we return the
+		 * error with nt already unlinked but not freed; a later
+		 * unload attempt will not find it. Confirm this is the
+		 * intended behavior.
+		 */
+		error = 0;
+		while (nt->nt_active_qpairs != 0 && error == 0)
+			error = sx_sleep(nt, &nvmf_transports_lock, PCATCH,
+			    "nftunld", 0);
+		sx_xunlock(&nvmf_transports_lock);
+		if (error != 0)
+			return (error);
+		free(nt, M_NVMF_TRANSPORT);
+		return (0);
+
+	default:
+		return (EOPNOTSUPP);
+	}
+}
+
+/*
+ * Module event handler for nvmf_transport itself. On MOD_LOAD the
+ * per-type transport lists and the lock protecting them are
+ * initialized; every other event is rejected with EOPNOTSUPP.
+ */
+static int
+nvmf_transport_modevent(module_t mod __unused, int what, void *arg __unused)
+{
+	u_int idx;
+
+	if (what != MOD_LOAD)
+		return (EOPNOTSUPP);
+
+	for (idx = 0; idx < nitems(nvmf_transports); idx++)
+		SLIST_INIT(&nvmf_transports[idx]);
+	sx_init(&nvmf_transports_lock, "nvmf transports");
+	return (0);
+}
+
+/* Module glue for the transport-independent nvmf_transport module. */
+static moduledata_t nvmf_transport_mod = {
+	.name = "nvmf_transport",
+	.evhand = nvmf_transport_modevent,
+	.priv = 0,
+};
+
+DECLARE_MODULE(nvmf_transport, nvmf_transport_mod, SI_SUB_DRIVERS,
+    SI_ORDER_FIRST);
+MODULE_VERSION(nvmf_transport, 1);
diff --git a/sys/dev/nvmf/nvmf_transport_internal.h b/sys/dev/nvmf/nvmf_transport_internal.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/nvmf_transport_internal.h
@@ -0,0 +1,128 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __NVMF_TRANSPORT_INTERNAL_H__
+#define __NVMF_TRANSPORT_INTERNAL_H__
+
+#include <sys/memdesc.h>
+
+/*
+ * Interface between the transport-independent APIs in
+ * nvmf_transport.c and individual transports.
+ */
+
+struct module;
+struct nvmf_io_request;
+
+/*
+ * Operations vector implemented by each transport. The
+ * transport-independent wrappers in nvmf_transport.c dispatch through
+ * these pointers via the qpair's nq_ops.
+ */
+struct nvmf_transport_ops {
+ /* Queue pair management. */
+ struct nvmf_qpair *(*allocate_qpair)(bool controller,
+ const struct nvmf_handoff_qpair_params *params);
+ void (*free_qpair)(struct nvmf_qpair *qp);
+
+ /* Capsule operations. */
+ struct nvmf_capsule *(*allocate_capsule)(struct nvmf_qpair *qp,
+ int how);
+ void (*free_capsule)(struct nvmf_capsule *nc);
+ int (*transmit_capsule)(struct nvmf_capsule *nc);
+ uint8_t (*validate_command_capsule)(struct nvmf_capsule *nc);
+
+ /* Transferring controller data. */
+ size_t (*capsule_data_len)(const struct nvmf_capsule *nc);
+ int (*receive_controller_data)(struct nvmf_capsule *nc,
+ uint32_t data_offset, struct nvmf_io_request *io);
+ u_int (*send_controller_data)(struct nvmf_capsule *nc,
+ uint32_t data_offset, struct mbuf *m, size_t len);
+
+ /* Transport type; selects which per-type list this is registered on. */
+ enum nvmf_trtype trtype;
+ /*
+  * Ordering among transports of the same type: entries with a
+  * higher value are placed earlier in the list and so are tried
+  * first by nvmf_allocate_qpair().
+  */
+ int priority;
+};
+
+/*
+ * Either an Admin or I/O Submission/Completion Queue pair.
+ *
+ * All of the fields below are filled in by nvmf_allocate_qpair() after
+ * the transport's allocate_qpair method returns the structure.
+ */
+struct nvmf_qpair {
+ /* Transport the qpair was allocated from, and its operations. */
+ struct nvmf_transport *nq_transport;
+ struct nvmf_transport_ops *nq_ops;
+ /* Copy of the 'controller' argument to nvmf_allocate_qpair(). */
+ bool nq_controller;
+
+ /* Callback to invoke for a received capsule. */
+ nvmf_capsule_receive_t *nq_receive;
+ void *nq_receive_arg;
+
+ /* Callback to invoke for an error. */
+ nvmf_qpair_error_t *nq_error;
+ void *nq_error_arg;
+
+ /* Copied from params->admin in nvmf_allocate_qpair(). */
+ bool nq_admin;
+};
+
+/*
+ * A data transfer request: a buffer description plus the callback to
+ * invoke (via nvmf_complete_io_request()) when the transfer finishes.
+ */
+struct nvmf_io_request {
+ /*
+ * Data buffer contains io_len bytes in the backing store
+ * described by io_mem.
+ */
+ struct memdesc io_mem;
+ size_t io_len;
+ /* Completion callback and the opaque argument passed to it. */
+ nvmf_io_complete_t *io_complete;
+ void *io_complete_arg;
+};
+
+/*
+ * Fabrics Command and Response Capsules. The Fabrics host
+ * (initiator) and controller (target) drivers work with capsules that
+ * are transmitted and received by a specific transport.
+ */
+struct nvmf_capsule {
+ /* Queue pair the capsule was allocated on. */
+ struct nvmf_qpair *nc_qpair;
+
+ /* Either a SQE or CQE. */
+ union {
+ struct nvme_command nc_sqe;
+ struct nvme_completion nc_cqe;
+ };
+ /*
+  * Size of the queue entry held in the union above:
+  * sizeof(struct nvme_command) for a command capsule,
+  * sizeof(struct nvme_completion) for a response capsule. The
+  * accessors assert on this to check the capsule kind.
+  */
+ int nc_qe_len;
+
+ /*
+ * Is SQHD in received capsule valid? False for locally-
+ * synthesized responses.
+ */
+ bool nc_sqhd_valid;
+
+ /*
+  * Data buffer attached by nvmf_capsule_append_data();
+  * nc_send_data records that call's 'send' argument (presumably the
+  * transfer direction -- confirm against the transports).
+  */
+ bool nc_send_data;
+ struct nvmf_io_request nc_data;
+};
+
+static void __inline
+nvmf_qpair_error(struct nvmf_qpair *nq, int error)
+{
+ nq->nq_error(nq->nq_error_arg, error);
+}
+
+/* Deliver a received capsule to the qpair's receive callback. */
+static void __inline
+nvmf_capsule_received(struct nvmf_qpair *nq, struct nvmf_capsule *nc)
+{
+	nvmf_capsule_receive_t *handler = nq->nq_receive;
+
+	handler(nq->nq_receive_arg, nc);
+}
+
+/*
+ * Invoke an I/O request's completion callback with the number of
+ * bytes transferred and an error value.
+ */
+static void __inline
+nvmf_complete_io_request(struct nvmf_io_request *io, size_t xfered, int error)
+{
+	nvmf_io_complete_t *done = io->io_complete;
+
+	done(io->io_complete_arg, xfered, error);
+}
+
+int nvmf_transport_module_handler(struct module *, int, void *);
+
+/*
+ * Declare a transport module. 'name' is used to build the module
+ * name ("nvmf/<name>") and identifiers; 'ops' is the transport's
+ * struct nvmf_transport_ops, passed by address as the argument to
+ * nvmf_transport_module_handler(). The module is declared to depend
+ * on version 1 of nvmf_transport.
+ */
+#define NVMF_TRANSPORT(name, ops) \
+static moduledata_t nvmf_transport_##name##_mod = { \
+ "nvmf/" #name, \
+ nvmf_transport_module_handler, \
+ &(ops) \
+}; \
+DECLARE_MODULE(nvmf_transport_##name, nvmf_transport_##name##_mod, \
+ SI_SUB_DRIVERS, SI_ORDER_ANY); \
+MODULE_DEPEND(nvmf_transport_##name, nvmf_transport, 1, 1, 1)
+
+#endif /* !__NVMF_TRANSPORT_INTERNAL_H__ */
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -296,6 +296,7 @@
nvd \
${_nvdimm} \
nvme \
+ nvmf \
${_nvram} \
oce \
${_ocs_fc} \
diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/nvmf/Makefile
@@ -0,0 +1,3 @@
+SUBDIR= nvmf_transport
+
+.include <bsd.subdir.mk>
diff --git a/sys/modules/nvmf/nvmf_transport/Makefile b/sys/modules/nvmf/nvmf_transport/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/nvmf/nvmf_transport/Makefile
@@ -0,0 +1,9 @@
+.PATH: ${SRCTOP}/sys/dev/nvmf
+
+KMOD= nvmf_transport
+
+SRCS= nvmf_transport.c
+
+EXPORT_SYMS= YES
+
+.include <bsd.kmod.mk>

File Metadata

Mime Type
text/plain
Expires
Mon, Jan 13, 3:13 PM (16 h, 56 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
15783405
Default Alt Text
D44711.diff (18 KB)

Event Timeline