Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F102661125
D36382.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
13 KB
Referenced Files
None
Subscribers
None
D36382.diff
View Options
diff --git a/share/man/man4/divert.4 b/share/man/man4/divert.4
--- a/share/man/man4/divert.4
+++ b/share/man/man4/divert.4
@@ -159,10 +159,9 @@
Packets written as incoming and having incorrect checksums will be dropped.
Otherwise, all header fields are unchanged (and therefore in network order).
.Pp
-Binding to port numbers less than 1024 requires super-user access, as does
-creating a
+Creating a
.Nm
-socket.
+socket requires super-user access.
.Sh ERRORS
Writing to a divert socket can return these errors, along with
the usual errors possible when writing raw packets:
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -86,6 +86,14 @@
#define DIVSNDQ (65536 + 100)
#define DIVRCVQ (65536 + 100)
+/*
+ * Usually a system has very few divert ports. The previous implementation
+ * used a linked list.
+ */
+#define DIVHASHSIZE (1 << 3) /* 8 entries, one cache line. */
+#define DIVHASH(port) (port % DIVHASHSIZE)
+#define DCBHASH(dcb) ((dcb)->dcb_port % DIVHASHSIZE)
+
/*
* Divert sockets work in conjunction with ipfw or other packet filters,
* see the divert(4) manpage for features.
@@ -124,9 +132,6 @@
#define DIVSTAT_INC(name) \
VNET_PCPUSTAT_ADD(struct divstat, divstat, div_ ## name, 1)
-VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo);
-#define V_divcbinfo VNET(divcbinfo)
-
static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */
static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */
@@ -134,39 +139,31 @@
struct sockaddr_in *sin);
static int div_output_outbound(int family, struct socket *so, struct mbuf *m);
-/*
- * Initialize divert connection block queue.
- */
-INPCBSTORAGE_DEFINE(divcbstor, "divinp", "divcb", "div", "divhash");
-
-static void
-div_init(void *arg __unused)
-{
-
- /*
- * XXX We don't use the hash list for divert IP, but it's easier to
- * allocate one-entry hash lists than it is to check all over the
- * place for hashbase == NULL.
- */
- in_pcbinfo_init(&V_divcbinfo, &divcbstor, 1, 1);
-}
-VNET_SYSINIT(div_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_init, NULL);
-
-static void
-div_destroy(void *unused __unused)
-{
+struct divcb {
+ union {
+ SLIST_ENTRY(divcb) dcb_next;
+ intptr_t dcb_bound;
+#define DCB_UNBOUND ((intptr_t)-1)
+ };
+ struct socket *dcb_socket;
+ uint16_t dcb_port;
+ uint64_t dcb_gencnt;
+ struct epoch_context dcb_epochctx;
+};
- in_pcbinfo_destroy(&V_divcbinfo);
-}
-VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_destroy, NULL);
+SLIST_HEAD(divhashhead, divcb);
-static bool
-div_port_match(const struct inpcb *inp, void *v)
-{
- uint16_t nport = *(uint16_t *)v;
+VNET_DEFINE_STATIC(struct divhashhead, divhash[DIVHASHSIZE]) = {};
+#define V_divhash VNET(divhash)
+VNET_DEFINE_STATIC(uint64_t, dcb_count) = 0;
+#define V_dcb_count VNET(dcb_count)
+VNET_DEFINE_STATIC(uint64_t, dcb_gencnt) = 0;
+#define V_dcb_gencnt VNET(dcb_gencnt)
- return (inp->inp_lport == nport);
-}
+static struct mtx divert_mtx;
+MTX_SYSINIT(divert, &divert_mtx, "divert(4) socket pcb lists", MTX_DEF);
+#define DIVERT_LOCK() mtx_lock(&divert_mtx)
+#define DIVERT_UNLOCK() mtx_unlock(&divert_mtx)
/*
* Divert a packet by passing it up to the divert socket at port 'port'.
@@ -177,12 +174,9 @@
#if defined(SCTP) || defined(SCTP_SUPPORT)
struct ip *ip;
#endif
- struct inpcb *inp;
- struct socket *sa;
+ struct divcb *dcb;
u_int16_t nport;
struct sockaddr_in divsrc;
- struct inpcb_iterator inpi = INP_ITERATOR(&V_divcbinfo,
- INPLOOKUP_RLOCKPCB, div_port_match, &nport);
struct m_tag *mtag;
NET_EPOCH_ASSERT();
@@ -275,27 +269,26 @@
}
/* Put packet on socket queue, if any */
- sa = NULL;
- /* nport is inp_next's context. */
- nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
- while ((inp = inp_next(&inpi)) != NULL) {
- sa = inp->inp_socket;
+ nport = htons((uint16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
+ SLIST_FOREACH(dcb, &V_divhash[DIVHASH(nport)], dcb_next)
+ if (dcb->dcb_port == nport)
+ break;
+
+ if (dcb != NULL) {
+ struct socket *sa = dcb->dcb_socket;
+
SOCKBUF_LOCK(&sa->so_rcv);
if (sbappendaddr_locked(&sa->so_rcv,
(struct sockaddr *)&divsrc, m, NULL) == 0) {
soroverflow_locked(sa);
- sa = NULL; /* force mbuf reclaim below */
+ m_freem(m);
} else {
sorwakeup_locked(sa);
DIVSTAT_INC(diverted);
}
- /* XXX why does only one socket match? */
- INP_RUNLOCK(inp);
- break;
- }
- if (sa == NULL) {
- m_freem(m);
+ } else {
DIVSTAT_INC(noport);
+ m_freem(m);
}
}
@@ -422,23 +415,12 @@
div_output_outbound(int family, struct socket *so, struct mbuf *m)
{
struct ip *const ip = mtod(m, struct ip *);
- struct mbuf *options;
- struct inpcb *inp;
int error;
- inp = sotoinpcb(so);
- INP_RLOCK(inp);
switch (family) {
case AF_INET:
- /*
- * Don't allow both user specified and setsockopt
- * options, and don't allow packet length sizes that
- * will crash.
- */
- if ((((ip->ip_hl << 2) != sizeof(struct ip)) &&
- inp->inp_options != NULL) ||
- ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
- INP_RUNLOCK(inp);
+ /* Don't allow an ip_len larger than the packet; that will crash. */
+ if (((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
m_freem(m);
return (EINVAL);
}
@@ -450,7 +432,6 @@
/* Don't allow packet length sizes that will crash */
if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) {
- INP_RUNLOCK(inp);
m_freem(m);
return (EINVAL);
}
@@ -460,44 +441,13 @@
}
#ifdef MAC
- mac_inpcb_create_mbuf(inp, m);
+ mac_socket_create_mbuf(so, m);
#endif
- /*
- * Get ready to inject the packet into ip_output().
- * Just in case socket options were specified on the
- * divert socket, we duplicate them. This is done
- * to avoid having to hold the PCB locks over the call
- * to ip_output(), as doing this results in a number of
- * lock ordering complexities.
- *
- * Note that we set the multicast options argument for
- * ip_output() to NULL since it should be invariant that
- * they are not present.
- */
- KASSERT(inp->inp_moptions == NULL,
- ("multicast options set on a divert socket"));
- /*
- * XXXCSJP: It is unclear to me whether or not it makes
- * sense for divert sockets to have options. However,
- * for now we will duplicate them with the INP locks
- * held so we can use them in ip_output() without
- * requring a reference to the pcb.
- */
- options = NULL;
- if (inp->inp_options != NULL) {
- options = m_dup(inp->inp_options, M_NOWAIT);
- if (options == NULL) {
- INP_RUNLOCK(inp);
- m_freem(m);
- return (ENOBUFS);
- }
- }
- INP_RUNLOCK(inp);
error = 0;
switch (family) {
case AF_INET:
- error = ip_output(m, options, NULL,
+ error = ip_output(m, NULL, NULL,
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0)
| IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL);
break;
@@ -509,8 +459,6 @@
}
if (error == 0)
DIVSTAT_INC(outbound);
- if (options != NULL)
- m_freem(options);
return (error);
}
@@ -579,11 +527,9 @@
static int
div_attach(struct socket *so, int proto, struct thread *td)
{
- struct inpcb *inp;
+ struct divcb *dcb;
int error;
- inp = sotoinpcb(so);
- KASSERT(inp == NULL, ("div_attach: inp != NULL"));
if (td != NULL) {
error = priv_check(td, PRIV_NETINET_DIVERT);
if (error)
@@ -592,85 +538,90 @@
error = soreserve(so, div_sendspace, div_recvspace);
if (error)
return error;
- error = in_pcballoc(so, &V_divcbinfo);
- if (error)
- return error;
- inp = (struct inpcb *)so->so_pcb;
- inp->inp_ip_p = proto;
- inp->inp_flags |= INP_HDRINCL;
- INP_WUNLOCK(inp);
- return 0;
+ dcb = malloc(sizeof(*dcb), M_PCB, M_WAITOK);
+ dcb->dcb_bound = DCB_UNBOUND;
+ dcb->dcb_socket = so;
+ DIVERT_LOCK();
+ V_dcb_count++;
+ dcb->dcb_gencnt = ++V_dcb_gencnt;
+ DIVERT_UNLOCK();
+ so->so_pcb = dcb;
+
+ return (0);
}
static void
-div_detach(struct socket *so)
+div_free(epoch_context_t ctx)
{
- struct inpcb *inp;
+ struct divcb *dcb = __containerof(ctx, struct divcb, dcb_epochctx);
+
+ free(dcb, M_PCB);
+}
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("div_detach: inp == NULL"));
- INP_WLOCK(inp);
- in_pcbdetach(inp);
- in_pcbfree(inp);
+static void
+div_detach(struct socket *so)
+{
+ struct divcb *dcb = so->so_pcb;
+
+ so->so_pcb = NULL;
+ DIVERT_LOCK();
+ if (dcb->dcb_bound != DCB_UNBOUND)
+ SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next);
+ V_dcb_count--;
+ V_dcb_gencnt++;
+ DIVERT_UNLOCK();
+ NET_EPOCH_CALL(div_free, &dcb->dcb_epochctx);
}
static int
div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
- struct inpcb *inp;
- int error;
+ struct divcb *dcb;
+ uint16_t port;
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("div_bind: inp == NULL"));
- /* in_pcbbind assumes that nam is a sockaddr_in
- * and in_pcbbind requires a valid address. Since divert
- * sockets don't we need to make sure the address is
- * filled in properly.
- * XXX -- divert should not be abusing in_pcbind
- * and should probably have its own family.
- */
if (nam->sa_family != AF_INET)
return EAFNOSUPPORT;
if (nam->sa_len != sizeof(struct sockaddr_in))
return EINVAL;
- ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
- INP_WLOCK(inp);
- INP_HASH_WLOCK(&V_divcbinfo);
- error = in_pcbbind(inp, nam, td->td_ucred);
- INP_HASH_WUNLOCK(&V_divcbinfo);
- INP_WUNLOCK(inp);
- return error;
+ port = ((struct sockaddr_in *)nam)->sin_port;
+ DIVERT_LOCK();
+ SLIST_FOREACH(dcb, &V_divhash[DIVHASH(port)], dcb_next)
+ if (dcb->dcb_port == port) {
+ DIVERT_UNLOCK();
+ return (EADDRINUSE);
+ }
+ dcb = so->so_pcb;
+ if (dcb->dcb_bound != DCB_UNBOUND)
+ SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next);
+ dcb->dcb_port = port;
+ SLIST_INSERT_HEAD(&V_divhash[DIVHASH(port)], dcb, dcb_next);
+ DIVERT_UNLOCK();
+
+ return (0);
}
static int
div_shutdown(struct socket *so)
{
- struct inpcb *inp;
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("div_shutdown: inp == NULL"));
- INP_WLOCK(inp);
socantsendmore(so);
- INP_WUNLOCK(inp);
return 0;
}
static int
div_pcblist(SYSCTL_HANDLER_ARGS)
{
- struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_divcbinfo,
- INPLOOKUP_RLOCKPCB);
struct xinpgen xig;
- struct inpcb *inp;
+ struct divcb *dcb;
int error;
if (req->newptr != 0)
return EPERM;
if (req->oldptr == 0) {
- int n;
+ u_int n;
- n = V_divcbinfo.ipi_count;
+ n = V_dcb_count;
n += imax(n / 8, 10);
req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
return 0;
@@ -681,39 +632,45 @@
bzero(&xig, sizeof(xig));
xig.xig_len = sizeof xig;
- xig.xig_count = V_divcbinfo.ipi_count;
- xig.xig_gen = V_divcbinfo.ipi_gencnt;
+ xig.xig_count = V_dcb_count;
+ xig.xig_gen = V_dcb_gencnt;
xig.xig_sogen = so_gencnt;
error = SYSCTL_OUT(req, &xig, sizeof xig);
if (error)
return error;
- while ((inp = inp_next(&inpi)) != NULL) {
- if (inp->inp_gencnt <= xig.xig_gen) {
- struct xinpcb xi;
-
- in_pcbtoxinpcb(inp, &xi);
- error = SYSCTL_OUT(req, &xi, sizeof xi);
- if (error) {
- INP_RUNLOCK(inp);
- break;
+ DIVERT_LOCK();
+ for (int i = 0; i < DIVHASHSIZE; i++)
+ SLIST_FOREACH(dcb, &V_divhash[i], dcb_next) {
+ if (dcb->dcb_gencnt <= xig.xig_gen) {
+ struct xinpcb xi;
+
+ bzero(&xi, sizeof(xi));
+ xi.xi_len = sizeof(struct xinpcb);
+ sotoxsocket(dcb->dcb_socket, &xi.xi_socket);
+ xi.inp_gencnt = dcb->dcb_gencnt;
+ xi.inp_vflag = INP_IPV4; /* XXX: netstat(1) */
+ xi.inp_inc.inc_ie.ie_lport = dcb->dcb_port;
+ error = SYSCTL_OUT(req, &xi, sizeof xi);
+ if (error)
+ goto errout;
}
}
- }
- if (!error) {
- /*
- * Give the user an updated idea of our state.
- * If the generation differs from what we told
- * her before, she knows that something happened
- * while we were processing this request, and it
- * might be necessary to retry.
- */
- xig.xig_gen = V_divcbinfo.ipi_gencnt;
- xig.xig_sogen = so_gencnt;
- xig.xig_count = V_divcbinfo.ipi_count;
- error = SYSCTL_OUT(req, &xig, sizeof xig);
- }
+ /*
+ * Give the user an updated idea of our state.
+ * If the generation differs from what we told
+ * her before, she knows that something happened
+ * while we were processing this request, and it
+ * might be necessary to retry.
+ */
+ xig.xig_gen = V_dcb_gencnt;
+ xig.xig_sogen = so_gencnt;
+ xig.xig_count = V_dcb_count;
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+
+errout:
+ DIVERT_UNLOCK();
return (error);
}
@@ -726,13 +683,9 @@
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_attach = div_attach,
.pr_bind = div_bind,
- .pr_control = in_control,
.pr_detach = div_detach,
- .pr_peeraddr = in_getpeeraddr,
.pr_send = div_send,
.pr_shutdown = div_shutdown,
- .pr_sockaddr = in_getsockaddr,
- .pr_sosetlabel = in_pcbsosetlabel
};
static struct domain divertdomain = {
@@ -775,18 +728,15 @@
* XXXGL: One more reason this code is incorrect is that it
* checks only the current vnet.
*/
- INP_INFO_WLOCK(&V_divcbinfo);
- if (V_divcbinfo.ipi_count != 0) {
+ DIVERT_LOCK();
+ if (V_dcb_count != 0) {
+ DIVERT_UNLOCK();
err = EBUSY;
- INP_INFO_WUNLOCK(&V_divcbinfo);
break;
}
+ DIVERT_UNLOCK();
ip_divert_ptr = NULL;
domain_remove(&divertdomain);
- INP_INFO_WUNLOCK(&V_divcbinfo);
-#ifndef VIMAGE
- div_destroy(NULL);
-#endif
break;
default:
err = EOPNOTSUPP;
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sat, Nov 16, 12:18 PM (19 h, 48 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14658579
Default Alt Text
D36382.diff (13 KB)
Attached To
Mode
D36382: divert(4): maintain own cb database and stop using inpcb KPI
Attached
Detach File
Event Timeline
Log In to Comment