Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F102619997
D11003.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
30 KB
Referenced Files
None
Subscribers
None
D11003.diff
View Options
Index: head/cddl/lib/libdtrace/tcp.d
===================================================================
--- head/cddl/lib/libdtrace/tcp.d
+++ head/cddl/lib/libdtrace/tcp.d
@@ -192,12 +192,12 @@
tcps_rport = p == NULL ? 0 : ntohs(p->t_inpcb->inp_inc.inc_ie.ie_fport);
tcps_laddr = p == NULL ? 0 :
p->t_inpcb->inp_vflag == INP_IPV4 ?
- inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.ie46_local.ia46_addr4.s_addr) :
- inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.ie6_local);
+ inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.id46_addr.ia46_addr4.s_addr) :
+ inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.id6_addr);
tcps_raddr = p == NULL ? 0 :
p->t_inpcb->inp_vflag == INP_IPV4 ?
- inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.ie46_foreign.ia46_addr4.s_addr) :
- inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.ie6_foreign);
+ inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.id46_addr.ia46_addr4.s_addr) :
+ inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.id6_addr);
tcps_state = p == NULL ? -1 : p->t_state;
tcps_iss = p == NULL ? 0 : p->iss;
tcps_irs = p == NULL ? 0 : p->irs;
Index: head/lib/libc/sys/getsockopt.2
===================================================================
--- head/lib/libc/sys/getsockopt.2
+++ head/lib/libc/sys/getsockopt.2
@@ -152,6 +152,7 @@
.It Dv SO_DEBUG Ta "enables recording of debugging information"
.It Dv SO_REUSEADDR Ta "enables local address reuse"
.It Dv SO_REUSEPORT Ta "enables duplicate address and port bindings"
+.It Dv SO_REUSEPORT_LB Ta "enables duplicate address and port bindings with load balancing"
.It Dv SO_KEEPALIVE Ta "enables keep connections alive"
.It Dv SO_DONTROUTE Ta "enables routing bypass for outgoing messages"
.It Dv SO_LINGER Ta "linger on close if data present"
@@ -207,6 +208,15 @@
before binding the port.
This option permits multiple instances of a program to each
receive UDP/IP multicast or broadcast datagrams destined for the bound port.
+.Pp
+.Dv SO_REUSEPORT_LB
+allows completely duplicate bindings by multiple processes
+if they all set
+.Dv SO_REUSEPORT_LB
+before binding the port.
+Incoming TCP and UDP connections are distributed among the sharing
+processes based on a hash function of local port number, foreign IP
+address and port number. A maximum of 256 processes can share one socket.
.Pp
.Dv SO_KEEPALIVE
enables the
Index: head/sys/kern/uipc_debug.c
===================================================================
--- head/sys/kern/uipc_debug.c
+++ head/sys/kern/uipc_debug.c
@@ -77,7 +77,7 @@
}
static void
-db_print_sooptions(short so_options)
+db_print_sooptions(int so_options)
{
int comma;
@@ -120,6 +120,10 @@
}
if (so_options & SO_REUSEPORT) {
db_printf("%sSO_REUSEPORT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_REUSEPORT_LB) {
+ db_printf("%sSO_REUSEPORT_LB", comma ? ", " : "");
comma = 1;
}
if (so_options & SO_TIMESTAMP) {
Index: head/sys/kern/uipc_socket.c
===================================================================
--- head/sys/kern/uipc_socket.c
+++ head/sys/kern/uipc_socket.c
@@ -2776,6 +2776,7 @@
case SO_BROADCAST:
case SO_REUSEADDR:
case SO_REUSEPORT:
+ case SO_REUSEPORT_LB:
case SO_OOBINLINE:
case SO_TIMESTAMP:
case SO_BINTIME:
@@ -2994,6 +2995,7 @@
case SO_KEEPALIVE:
case SO_REUSEADDR:
case SO_REUSEPORT:
+ case SO_REUSEPORT_LB:
case SO_BROADCAST:
case SO_OOBINLINE:
case SO_ACCEPTCONN:
Index: head/sys/netinet/in_pcb.h
===================================================================
--- head/sys/netinet/in_pcb.h
+++ head/sys/netinet/in_pcb.h
@@ -79,6 +79,11 @@
struct in_addr ia46_addr4;
};
+union in_dependaddr {
+ struct in_addr_4in6 id46_addr;
+ struct in6_addr id6_addr;
+};
+
/*
* NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has
* some extra padding to accomplish this.
@@ -89,22 +94,14 @@
u_int16_t ie_fport; /* foreign port */
u_int16_t ie_lport; /* local port */
/* protocol dependent part, local and foreign addr */
- union {
- /* foreign host table entry */
- struct in_addr_4in6 ie46_foreign;
- struct in6_addr ie6_foreign;
- } ie_dependfaddr;
- union {
- /* local host table entry */
- struct in_addr_4in6 ie46_local;
- struct in6_addr ie6_local;
- } ie_dependladdr;
+ union in_dependaddr ie_dependfaddr; /* foreign host table entry */
+ union in_dependaddr ie_dependladdr; /* local host table entry */
+#define ie_faddr ie_dependfaddr.id46_addr.ia46_addr4
+#define ie_laddr ie_dependladdr.id46_addr.ia46_addr4
+#define ie6_faddr ie_dependfaddr.id6_addr
+#define ie6_laddr ie_dependladdr.id6_addr
u_int32_t ie6_zoneid; /* scope zone id */
};
-#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4
-#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4
-#define ie6_faddr ie_dependfaddr.ie6_foreign
-#define ie6_laddr ie_dependladdr.ie6_local
/*
* XXX The defines for inc_* are hacks and should be changed to direct
@@ -508,6 +505,13 @@
u_long ipi_wildmask; /* (p) */
/*
+ * Load balance groups used for the SO_REUSEPORT_LB option,
+ * hashed by local port.
+ */
+ struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (h) */
+ u_long ipi_lbgrouphashmask; /* (h) */
+
+ /*
* Pointer to network stack instance
*/
struct vnet *ipi_vnet; /* (c) */
@@ -549,6 +553,27 @@
struct mtx ipg_lock;
} __aligned(CACHE_LINE_SIZE);
+/*
+ * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
+ * (or unique address:port combination) can be re-used at most
+ * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which
+ * is dynamically resized as processes bind/unbind to that specific group.
+ */
+struct inpcblbgroup {
+ LIST_ENTRY(inpcblbgroup) il_list;
+ uint16_t il_lport; /* (c) */
+ u_char il_vflag; /* (c) */
+ u_char il_pad;
+ uint32_t il_pad2;
+ union in_dependaddr il_dependladdr; /* (c) */
+#define il_laddr il_dependladdr.id46_addr.ia46_addr4
+#define il6_laddr il_dependladdr.id6_addr
+ uint32_t il_inpsiz; /* max count in il_inp[] (h) */
+ uint32_t il_inpcnt; /* cur count in il_inp[] (h) */
+ struct inpcb *il_inp[]; /* (h) */
+};
+LIST_HEAD(inpcblbgrouphead, inpcblbgroup);
+
#define INP_LOCK_INIT(inp, d, t) \
rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK)
#define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock)
@@ -593,7 +618,7 @@
inp_inpcbtotcpcb(struct inpcb *inp);
void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
uint32_t *faddr, uint16_t *fp);
-short inp_so_options(const struct inpcb *inp);
+int inp_so_options(const struct inpcb *inp);
#endif /* _KERNEL */
@@ -656,6 +681,10 @@
(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
#define INP_PCBPORTHASH(lport, mask) \
(ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PORTHASH(lport, mask) \
+ (ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
+ ((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport)))
#define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3])
/*
@@ -724,6 +753,7 @@
#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */
+#define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */
/*
* Flags passed to in_pcblookup*() functions.
Index: head/sys/netinet/in_pcb.c
===================================================================
--- head/sys/netinet/in_pcb.c
+++ head/sys/netinet/in_pcb.c
@@ -108,6 +108,9 @@
#include <security/mac/mac_framework.h>
+#define INPCBLBGROUP_SIZMIN 8
+#define INPCBLBGROUP_SIZMAX 256
+
static struct callout ipport_tick_callout;
/*
@@ -217,7 +220,214 @@
* functions often modify hash chains or addresses in pcbs.
*/
+static struct inpcblbgroup *
+in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
+ uint16_t port, const union in_dependaddr *addr, int size)
+{
+ struct inpcblbgroup *grp;
+ size_t bytes;
+
+ bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
+ grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
+ if (!grp)
+ return (NULL);
+ grp->il_vflag = vflag;
+ grp->il_lport = port;
+ grp->il_dependladdr = *addr;
+ grp->il_inpsiz = size;
+ LIST_INSERT_HEAD(hdr, grp, il_list);
+ return (grp);
+}
+
+static void
+in_pcblbgroup_free(struct inpcblbgroup *grp)
+{
+
+ LIST_REMOVE(grp, il_list);
+ free(grp, M_TEMP);
+}
+
+static struct inpcblbgroup *
+in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
+ struct inpcblbgroup *old_grp, int size)
+{
+ struct inpcblbgroup *grp;
+ int i;
+
+ grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
+ old_grp->il_lport, &old_grp->il_dependladdr, size);
+ if (!grp)
+ return (NULL);
+
+ KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
+ ("invalid new local group size %d and old local group count %d",
+ grp->il_inpsiz, old_grp->il_inpcnt));
+
+ for (i = 0; i < old_grp->il_inpcnt; ++i)
+ grp->il_inp[i] = old_grp->il_inp[i];
+ grp->il_inpcnt = old_grp->il_inpcnt;
+ in_pcblbgroup_free(old_grp);
+ return (grp);
+}
+
/*
+ * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
+ * and shrink group if possible.
+ */
+static void
+in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
+ int i)
+{
+ struct inpcblbgroup *grp = *grpp;
+
+ for (; i + 1 < grp->il_inpcnt; ++i)
+ grp->il_inp[i] = grp->il_inp[i + 1];
+ grp->il_inpcnt--;
+
+ if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
+ grp->il_inpcnt <= (grp->il_inpsiz / 4)) {
+ /* Shrink this group. */
+ struct inpcblbgroup *new_grp =
+ in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
+ if (new_grp)
+ *grpp = new_grp;
+ }
+ return;
+}
+
+/*
+ * Add PCB to load balance group for SO_REUSEPORT_LB option.
+ */
+static int
+in_pcbinslbgrouphash(struct inpcb *inp)
+{
+ struct inpcbinfo *pcbinfo;
+ struct inpcblbgrouphead *hdr;
+ struct inpcblbgroup *grp;
+ uint16_t hashmask, lport;
+ uint32_t group_index;
+ struct ucred *cred;
+ static int limit_logged = 0;
+
+ pcbinfo = inp->inp_pcbinfo;
+
+ INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK_ASSERT(pcbinfo);
+
+ if (pcbinfo->ipi_lbgrouphashbase == NULL)
+ return (0);
+
+ hashmask = pcbinfo->ipi_lbgrouphashmask;
+ lport = inp->inp_lport;
+ group_index = INP_PCBLBGROUP_PORTHASH(lport, hashmask);
+ hdr = &pcbinfo->ipi_lbgrouphashbase[group_index];
+
+ /*
+ * Don't allow jailed socket to join local group.
+ */
+ if (inp->inp_socket != NULL)
+ cred = inp->inp_socket->so_cred;
+ else
+ cred = NULL;
+ if (cred != NULL && jailed(cred))
+ return (0);
+
+#ifdef INET6
+ /*
+ * Don't allow IPv4 mapped INET6 wild socket.
+ */
+ if ((inp->inp_vflag & INP_IPV4) &&
+ inp->inp_laddr.s_addr == INADDR_ANY &&
+ INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
+ return (0);
+ }
+#endif
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[
+ INP_PCBLBGROUP_PORTHASH(inp->inp_lport,
+ pcbinfo->ipi_lbgrouphashmask)];
+ LIST_FOREACH(grp, hdr, il_list) {
+ if (grp->il_vflag == inp->inp_vflag &&
+ grp->il_lport == inp->inp_lport &&
+ memcmp(&grp->il_dependladdr,
+ &inp->inp_inc.inc_ie.ie_dependladdr,
+ sizeof(grp->il_dependladdr)) == 0) {
+ break;
+ }
+ }
+ if (grp == NULL) {
+ /* Create new load balance group. */
+ grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
+ inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
+ INPCBLBGROUP_SIZMIN);
+ if (!grp)
+ return (ENOBUFS);
+ } else if (grp->il_inpcnt == grp->il_inpsiz) {
+ if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
+ if (!limit_logged) {
+ limit_logged = 1;
+ printf("lb group port %d, limit reached\n",
+ ntohs(grp->il_lport));
+ }
+ return (0);
+ }
+
+ /* Expand this local group. */
+ grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
+ if (!grp)
+ return (ENOBUFS);
+ }
+
+ KASSERT(grp->il_inpcnt < grp->il_inpsiz,
+ ("invalid local group size %d and count %d",
+ grp->il_inpsiz, grp->il_inpcnt));
+
+ grp->il_inp[grp->il_inpcnt] = inp;
+ grp->il_inpcnt++;
+ return (0);
+}
+
+/*
+ * Remove PCB from load balance group.
+ */
+static void
+in_pcbremlbgrouphash(struct inpcb *inp)
+{
+ struct inpcbinfo *pcbinfo;
+ struct inpcblbgrouphead *hdr;
+ struct inpcblbgroup *grp;
+ int i;
+
+ pcbinfo = inp->inp_pcbinfo;
+
+ INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK_ASSERT(pcbinfo);
+
+ if (pcbinfo->ipi_lbgrouphashbase == NULL)
+ return;
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[
+ INP_PCBLBGROUP_PORTHASH(inp->inp_lport,
+ pcbinfo->ipi_lbgrouphashmask)];
+
+ LIST_FOREACH(grp, hdr, il_list) {
+ for (i = 0; i < grp->il_inpcnt; ++i) {
+ if (grp->il_inp[i] != inp)
+ continue;
+
+ if (grp->il_inpcnt == 1) {
+ /* We are the last, free this local group. */
+ in_pcblbgroup_free(grp);
+ } else {
+ /* Pull up inpcbs, shrink group if possible. */
+ in_pcblbgroup_reorder(hdr, &grp, i);
+ }
+ return;
+ }
+ }
+}
+
+/*
* Different protocols initialize their inpcbs differently - giving
* different name to the lock. But they all are disposed the same.
*/
@@ -252,6 +462,8 @@
&pcbinfo->ipi_hashmask);
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_porthashmask);
+ pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB,
+ &pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
#endif
@@ -275,6 +487,8 @@
hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
pcbinfo->ipi_porthashmask);
+ hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
+ pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
in_pcbgroup_destroy(pcbinfo);
#endif
@@ -513,18 +727,20 @@
/*
* Return cached socket options.
*/
-short
+int
inp_so_options(const struct inpcb *inp)
{
- short so_options;
+ int so_options;
- so_options = 0;
+ so_options = 0;
- if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
- so_options |= SO_REUSEPORT;
- if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
- so_options |= SO_REUSEADDR;
- return (so_options);
+ if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
+ so_options |= SO_REUSEPORT_LB;
+ if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
+ so_options |= SO_REUSEPORT;
+ if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
+ so_options |= SO_REUSEADDR;
+ return (so_options);
}
#endif /* INET || INET6 */
@@ -581,6 +797,12 @@
int error;
/*
+ * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
+ * so that we don't have to add to the (already messy) code below.
+ */
+ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
+
+ /*
* No state changes, so read locks are sufficient here.
*/
INP_LOCK_ASSERT(inp);
@@ -591,7 +813,7 @@
laddr.s_addr = *laddrp;
if (nam != NULL && laddr.s_addr != INADDR_ANY)
return (EINVAL);
- if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
lookupflags = INPLOOKUP_WILDCARD;
if (nam == NULL) {
if ((error = prison_local_ip4(cred, &laddr)) != 0)
@@ -628,16 +850,23 @@
*/
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
reuseport = SO_REUSEADDR|SO_REUSEPORT;
+ /*
+ * XXX: How to deal with SO_REUSEPORT_LB here?
+ * Treat same as SO_REUSEPORT for now.
+ */
+ if ((so->so_options &
+ (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
+ reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
} else if (sin->sin_addr.s_addr != INADDR_ANY) {
sin->sin_port = 0; /* yech... */
bzero(&sin->sin_zero, sizeof(sin->sin_zero));
/*
- * Is the address a local IP address?
+ * Is the address a local IP address?
* If INP_BINDANY is set, then the socket may be bound
* to any endpoint address, local or not.
*/
if ((inp->inp_flags & INP_BINDANY) == 0 &&
- ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
+ ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
return (EADDRNOTAVAIL);
}
laddr = sin->sin_addr;
@@ -667,7 +896,8 @@
ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
(ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
- (t->inp_flags2 & INP_REUSEPORT) == 0) &&
+ (t->inp_flags2 & INP_REUSEPORT) ||
+ (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
(inp->inp_cred->cr_uid !=
t->inp_cred->cr_uid))
return (EADDRINUSE);
@@ -692,11 +922,15 @@
*/
tw = intotw(t);
if (tw == NULL ||
- (reuseport & tw->tw_so_options) == 0)
+ ((reuseport & tw->tw_so_options) == 0 &&
+ (reuseport_lb &
+ tw->tw_so_options) == 0)) {
return (EADDRINUSE);
+ }
} else if (t &&
- ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
- (reuseport & inp_so_options(t)) == 0) {
+ ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
+ (reuseport & inp_so_options(t)) == 0 &&
+ (reuseport_lb & inp_so_options(t)) == 0) {
#ifdef INET6
if (ntohl(sin->sin_addr.s_addr) !=
INADDR_ANY ||
@@ -705,7 +939,7 @@
(inp->inp_vflag & INP_IPV6PROTO) == 0 ||
(t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
- return (EADDRINUSE);
+ return (EADDRINUSE);
if (t && (! in_pcbbind_check_bindmulti(inp, t)))
return (EADDRINUSE);
}
@@ -1442,6 +1676,7 @@
struct inpcbport *phd = inp->inp_phd;
INP_HASH_WLOCK(inp->inp_pcbinfo);
+ in_pcbremlbgrouphash(inp);
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
@@ -1705,6 +1940,61 @@
}
#undef INP_LOOKUP_MAPPED_PCB_COST
+static struct inpcb *
+in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
+ const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
+ uint16_t fport, int lookupflags)
+{
+ struct inpcb *local_wild = NULL;
+ const struct inpcblbgrouphead *hdr;
+ struct inpcblbgroup *grp;
+ struct inpcblbgroup *grp_local_wild;
+
+ INP_HASH_LOCK_ASSERT(pcbinfo);
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[
+ INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
+
+ /*
+ * Order of socket selection:
+ * 1. non-wild.
+ * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
+ *
+ * NOTE:
+ * - Load balanced group does not contain jailed sockets
+ * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
+ */
+ LIST_FOREACH(grp, hdr, il_list) {
+#ifdef INET6
+ if (!(grp->il_vflag & INP_IPV4))
+ continue;
+#endif
+
+ if (grp->il_lport == lport) {
+
+ uint32_t idx = 0;
+ int pkt_hash = INP_PCBLBGROUP_PKTHASH(faddr->s_addr,
+ lport, fport);
+
+ idx = pkt_hash % grp->il_inpcnt;
+
+ if (grp->il_laddr.s_addr == laddr->s_addr) {
+ return (grp->il_inp[idx]);
+ } else {
+ if (grp->il_laddr.s_addr == INADDR_ANY &&
+ (lookupflags & INPLOOKUP_WILDCARD)) {
+ local_wild = grp->il_inp[idx];
+ grp_local_wild = grp;
+ }
+ }
+ }
+ }
+ if (local_wild != NULL) {
+ return (local_wild);
+ }
+ return (NULL);
+}
+
#ifdef PCBGROUP
/*
* Lookup PCB in hash list, using pcbgroup tables.
@@ -1984,6 +2274,18 @@
return (tmpinp);
/*
+ * Then look in lb group (for wildcard match).
+ */
+ if (pcbinfo->ipi_lbgrouphashbase != NULL &&
+ (lookupflags & INPLOOKUP_WILDCARD)) {
+ inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
+ fport, lookupflags);
+ if (inp != NULL) {
+ return (inp);
+ }
+ }
+
+ /*
* Then look for a wildcard match, if requested.
*/
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
@@ -2200,6 +2502,7 @@
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbport *phd;
u_int32_t hashkey_faddr;
+ int so_options;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
@@ -2221,6 +2524,19 @@
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
/*
+ * Add entry to load balance group.
+ * Only do this if SO_REUSEPORT_LB is set.
+ */
+ so_options = inp_so_options(inp);
+ if (so_options & SO_REUSEPORT_LB) {
+ int ret = in_pcbinslbgrouphash(inp);
+ if (ret) {
+ /* pcb lb group malloc fail (ret=ENOBUFS). */
+ return (ret);
+ }
+ }
+
+ /*
* Go through port list and look for a head for this lport.
*/
LIST_FOREACH(phd, pcbporthash, phd_hash) {
@@ -2346,6 +2662,10 @@
struct inpcbport *phd = inp->inp_phd;
INP_HASH_WLOCK(pcbinfo);
+
+ /* XXX: Only do if SO_REUSEPORT_LB set? */
+ in_pcbremlbgrouphash(inp);
+
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
Index: head/sys/netinet/ip_output.c
===================================================================
--- head/sys/netinet/ip_output.c
+++ head/sys/netinet/ip_output.c
@@ -992,6 +992,15 @@
INP_WUNLOCK(inp);
error = 0;
break;
+ case SO_REUSEPORT_LB:
+ INP_WLOCK(inp);
+ if ((so->so_options & SO_REUSEPORT_LB) != 0)
+ inp->inp_flags2 |= INP_REUSEPORT_LB;
+ else
+ inp->inp_flags2 &= ~INP_REUSEPORT_LB;
+ INP_WUNLOCK(inp);
+ error = 0;
+ break;
case SO_SETFIB:
INP_WLOCK(inp);
inp->inp_inc.inc_fibnum = so->so_fibnum;
Index: head/sys/netinet/udp_usrreq.c
===================================================================
--- head/sys/netinet/udp_usrreq.c
+++ head/sys/netinet/udp_usrreq.c
@@ -612,7 +612,7 @@
* will never clear these options after setting them.
*/
if ((last->inp_socket->so_options &
- (SO_REUSEPORT|SO_REUSEADDR)) == 0)
+ (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0)
break;
}
Index: head/sys/netinet6/in6_pcb.c
===================================================================
--- head/sys/netinet6/in6_pcb.c
+++ head/sys/netinet6/in6_pcb.c
@@ -125,6 +125,12 @@
int error, lookupflags = 0;
int reuseport = (so->so_options & SO_REUSEPORT);
+ /*
+ * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
+ * so that we don't have to add to the (already messy) code below.
+ */
+ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
+
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
@@ -132,7 +138,7 @@
return (EADDRNOTAVAIL);
if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
return (EINVAL);
- if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
lookupflags = INPLOOKUP_WILDCARD;
if (nam == NULL) {
if ((error = prison_local_ip6(cred, &inp->in6p_laddr,
@@ -166,6 +172,13 @@
*/
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
reuseport = SO_REUSEADDR|SO_REUSEPORT;
+ /*
+ * XXX: How to deal with SO_REUSEPORT_LB here?
+ * Treat same as SO_REUSEPORT for now.
+ */
+ if ((so->so_options &
+ (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
+ reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
} else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
struct ifaddr *ifa;
@@ -215,7 +228,8 @@
IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) &&
(!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
!IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) ||
- (t->inp_flags2 & INP_REUSEPORT) == 0) &&
+ (t->inp_flags2 & INP_REUSEPORT) ||
+ (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
(inp->inp_cred->cr_uid !=
t->inp_cred->cr_uid))
return (EADDRINUSE);
@@ -265,9 +279,11 @@
*/
tw = intotw(t);
if (tw == NULL ||
- (reuseport & tw->tw_so_options) == 0)
+ ((reuseport & tw->tw_so_options) == 0 &&
+ (reuseport_lb & tw->tw_so_options) == 0))
return (EADDRINUSE);
- } else if (t && (reuseport & inp_so_options(t)) == 0) {
+ } else if (t && (reuseport & inp_so_options(t)) == 0 &&
+ (reuseport_lb & inp_so_options(t)) == 0) {
return (EADDRINUSE);
}
#ifdef INET
@@ -277,22 +293,25 @@
in6_sin6_2_sin(&sin, sin6);
t = in_pcblookup_local(pcbinfo, sin.sin_addr,
- lport, lookupflags, cred);
+ lport, lookupflags, cred);
if (t && t->inp_flags & INP_TIMEWAIT) {
tw = intotw(t);
if (tw == NULL)
return (EADDRINUSE);
if ((reuseport & tw->tw_so_options) == 0
+ && (reuseport_lb & tw->tw_so_options) == 0
&& (ntohl(t->inp_laddr.s_addr) !=
- INADDR_ANY || ((inp->inp_vflag &
- INP_IPV6PROTO) ==
- (t->inp_vflag & INP_IPV6PROTO))))
+ INADDR_ANY || ((inp->inp_vflag &
+ INP_IPV6PROTO) ==
+ (t->inp_vflag & INP_IPV6PROTO))))
return (EADDRINUSE);
} else if (t &&
(reuseport & inp_so_options(t)) == 0 &&
+ (reuseport_lb & inp_so_options(t)) == 0 &&
(ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
- (t->inp_vflag & INP_IPV6PROTO) != 0))
+ (t->inp_vflag & INP_IPV6PROTO) != 0)) {
return (EADDRINUSE);
+ }
}
#endif
}
@@ -856,6 +875,56 @@
return inp;
}
+static struct inpcb *
+in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
+ const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr,
+ uint16_t fport, int lookupflags)
+{
+ struct inpcb *local_wild = NULL;
+ const struct inpcblbgrouphead *hdr;
+ struct inpcblbgroup *grp;
+ struct inpcblbgroup *grp_local_wild;
+ uint32_t idx;
+
+ INP_HASH_LOCK_ASSERT(pcbinfo);
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[INP_PCBLBGROUP_PORTHASH(
+ lport, pcbinfo->ipi_lbgrouphashmask)];
+
+ /*
+ * Order of socket selection:
+ * 1. non-wild.
+ * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
+ *
+ * NOTE:
+ * - Load balanced group does not contain jailed sockets.
+ * - Load balanced does not contain IPv4 mapped INET6 wild sockets.
+ */
+ LIST_FOREACH(grp, hdr, il_list) {
+ if (grp->il_lport == lport) {
+ idx = 0;
+ int pkt_hash = INP_PCBLBGROUP_PKTHASH(
+ INP6_PCBHASHKEY(faddr), lport, fport);
+
+ idx = pkt_hash % grp->il_inpcnt;
+
+ if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) {
+ return (grp->il_inp[idx]);
+ } else {
+ if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) &&
+ (lookupflags & INPLOOKUP_WILDCARD)) {
+ local_wild = grp->il_inp[idx];
+ grp_local_wild = grp;
+ }
+ }
+ }
+ }
+ if (local_wild != NULL) {
+ return (local_wild);
+ }
+ return (NULL);
+}
+
#ifdef PCBGROUP
/*
* Lookup PCB in hash list, using pcbgroup tables.
@@ -1101,6 +1170,18 @@
}
if (tmpinp != NULL)
return (tmpinp);
+
+ /*
+ * Then look in lb group (for wildcard match).
+ */
+ if (pcbinfo->ipi_lbgrouphashbase != NULL &&
+ (lookupflags & INPLOOKUP_WILDCARD)) {
+ inp = in6_pcblookup_lbgroup(pcbinfo, laddr, lport, faddr,
+ fport, lookupflags);
+ if (inp != NULL) {
+ return (inp);
+ }
+ }
/*
* Then look for a wildcard match, if requested.
Index: head/sys/netinet6/in6_src.c
===================================================================
--- head/sys/netinet6/in6_src.c
+++ head/sys/netinet6/in6_src.c
@@ -973,7 +973,7 @@
return(error);
/* XXX: this is redundant when called from in6_pcbbind */
- if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
lookupflags = INPLOOKUP_WILDCARD;
inp->inp_flags |= INP_ANONPORT;
Index: head/sys/netinet6/ip6_output.c
===================================================================
--- head/sys/netinet6/ip6_output.c
+++ head/sys/netinet6/ip6_output.c
@@ -1446,6 +1446,15 @@
INP_WUNLOCK(in6p);
error = 0;
break;
+ case SO_REUSEPORT_LB:
+ INP_WLOCK(in6p);
+ if ((so->so_options & SO_REUSEPORT_LB) != 0)
+ in6p->inp_flags2 |= INP_REUSEPORT_LB;
+ else
+ in6p->inp_flags2 &= ~INP_REUSEPORT_LB;
+ INP_WUNLOCK(in6p);
+ error = 0;
+ break;
case SO_SETFIB:
INP_WLOCK(in6p);
in6p->inp_inc.inc_fibnum = so->so_fibnum;
Index: head/sys/netinet6/udp6_usrreq.c
===================================================================
--- head/sys/netinet6/udp6_usrreq.c
+++ head/sys/netinet6/udp6_usrreq.c
@@ -399,7 +399,7 @@
* will never clear these options after setting them.
*/
if ((last->inp_socket->so_options &
- (SO_REUSEPORT|SO_REUSEADDR)) == 0)
+ (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0)
break;
}
Index: head/sys/sys/socket.h
===================================================================
--- head/sys/sys/socket.h
+++ head/sys/sys/socket.h
@@ -126,26 +126,27 @@
/*
* Option flags per-socket.
*/
-#define SO_DEBUG 0x0001 /* turn on debugging info recording */
-#define SO_ACCEPTCONN 0x0002 /* socket has had listen() */
-#define SO_REUSEADDR 0x0004 /* allow local address reuse */
-#define SO_KEEPALIVE 0x0008 /* keep connections alive */
-#define SO_DONTROUTE 0x0010 /* just use interface addresses */
-#define SO_BROADCAST 0x0020 /* permit sending of broadcast msgs */
+#define SO_DEBUG 0x00000001 /* turn on debugging info recording */
+#define SO_ACCEPTCONN 0x00000002 /* socket has had listen() */
+#define SO_REUSEADDR 0x00000004 /* allow local address reuse */
+#define SO_KEEPALIVE 0x00000008 /* keep connections alive */
+#define SO_DONTROUTE 0x00000010 /* just use interface addresses */
+#define SO_BROADCAST 0x00000020 /* permit sending of broadcast msgs */
#if __BSD_VISIBLE
-#define SO_USELOOPBACK 0x0040 /* bypass hardware when possible */
+#define SO_USELOOPBACK 0x00000040 /* bypass hardware when possible */
#endif
-#define SO_LINGER 0x0080 /* linger on close if data present */
-#define SO_OOBINLINE 0x0100 /* leave received OOB data in line */
+#define SO_LINGER 0x00000080 /* linger on close if data present */
+#define SO_OOBINLINE 0x00000100 /* leave received OOB data in line */
#if __BSD_VISIBLE
-#define SO_REUSEPORT 0x0200 /* allow local address & port reuse */
-#define SO_TIMESTAMP 0x0400 /* timestamp received dgram traffic */
-#define SO_NOSIGPIPE 0x0800 /* no SIGPIPE from EPIPE */
-#define SO_ACCEPTFILTER 0x1000 /* there is an accept filter */
-#define SO_BINTIME 0x2000 /* timestamp received dgram traffic */
+#define SO_REUSEPORT 0x00000200 /* allow local address & port reuse */
+#define SO_TIMESTAMP 0x00000400 /* timestamp received dgram traffic */
+#define SO_NOSIGPIPE 0x00000800 /* no SIGPIPE from EPIPE */
+#define SO_ACCEPTFILTER 0x00001000 /* there is an accept filter */
+#define SO_BINTIME 0x00002000 /* timestamp received dgram traffic */
#endif
-#define SO_NO_OFFLOAD 0x4000 /* socket cannot be offloaded */
-#define SO_NO_DDP 0x8000 /* disable direct data placement */
+#define SO_NO_OFFLOAD 0x00004000 /* socket cannot be offloaded */
+#define SO_NO_DDP 0x00008000 /* disable direct data placement */
+#define SO_REUSEPORT_LB 0x00010000 /* reuse with load balancing */
/*
* Additional options, not kept in so_options.
Index: head/sys/sys/socketvar.h
===================================================================
--- head/sys/sys/socketvar.h
+++ head/sys/sys/socketvar.h
@@ -84,7 +84,7 @@
struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */
struct selinfo so_wrsel; /* (b/cs) for so_snd */
short so_type; /* (a) generic type, see socket.h */
- short so_options; /* (b) from socket call, see socket.h */
+ int so_options; /* (b) from socket call, see socket.h */
short so_linger; /* time to linger close(2) */
short so_state; /* (b) internal state flags SS_* */
void *so_pcb; /* protocol control block */
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Nov 15, 10:04 PM (20 h, 7 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14648647
Default Alt Text
D11003.diff (30 KB)
Attached To
Mode
D11003: Load balance sockets with new SO_REUSEPORT_LB option
Attached
Detach File
Event Timeline
Log In to Comment