Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F102601573
D11003.id33740.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
30 KB
Referenced Files
None
Subscribers
None
D11003.id33740.diff
View Options
Index: cddl/lib/libdtrace/tcp.d
===================================================================
--- cddl/lib/libdtrace/tcp.d
+++ cddl/lib/libdtrace/tcp.d
@@ -192,12 +192,12 @@
tcps_rport = p == NULL ? 0 : ntohs(p->t_inpcb->inp_inc.inc_ie.ie_fport);
tcps_laddr = p == NULL ? 0 :
p->t_inpcb->inp_vflag == INP_IPV4 ?
- inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.ie46_local.ia46_addr4.s_addr) :
- inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.ie6_local);
+ inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.id46_addr.ia46_addr4.s_addr) :
+ inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.id6_addr);
tcps_raddr = p == NULL ? 0 :
p->t_inpcb->inp_vflag == INP_IPV4 ?
- inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.ie46_foreign.ia46_addr4.s_addr) :
- inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.ie6_foreign);
+ inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.id46_addr.ia46_addr4.s_addr) :
+ inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.id6_addr);
tcps_state = p == NULL ? -1 : p->t_state;
tcps_iss = p == NULL ? 0 : p->iss;
tcps_irs = p == NULL ? 0 : p->irs;
Index: sys/kern/uipc_debug.c
===================================================================
--- sys/kern/uipc_debug.c
+++ sys/kern/uipc_debug.c
@@ -75,7 +75,7 @@
}
static void
-db_print_sooptions(short so_options)
+db_print_sooptions(int so_options)
{
int comma;
@@ -120,6 +120,10 @@
db_printf("%sSO_REUSEPORT", comma ? ", " : "");
comma = 1;
}
+ if (so_options & SO_REUSEPORT_LB) {
+ db_printf("%sSO_REUSEPORT_LB", comma ? ", " : "");
+ comma = 1;
+ }
if (so_options & SO_TIMESTAMP) {
db_printf("%sSO_TIMESTAMP", comma ? ", " : "");
comma = 1;
Index: sys/kern/uipc_socket.c
===================================================================
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -1055,6 +1055,100 @@
sodealloc(so);
}
+/*
+ * Let socket in same load balance group (same port and address)
+ * inherit pending sockets of the closing socket.
+ *
+ * "so_inh" will inherit sockets from "so"
+ */
+void
+soinherit(struct socket *so, struct socket *so_inh)
+{
+ TAILQ_HEAD(, socket) comp, incomp;
+ struct socket *sp, *head, *head_inh;
+ int qlen, incqlen;
+
+ KASSERT(so->so_options & SO_ACCEPTCONN,
+ ("so does not accept connection"));
+ KASSERT(so_inh->so_options & SO_ACCEPTCONN,
+ ("so_inh does not accept connection"));
+
+
+restart:
+ SOCK_LOCK(so);
+ if ((head = so->so_listen) != NULL &&
+ __predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
+ SOCK_UNLOCK(so);
+ goto restart;
+ }
+
+restart_inh:
+ SOCK_LOCK(so_inh);
+ if ((head_inh = so_inh->so_listen) != NULL &&
+ __predict_false(SOLISTEN_TRYLOCK(head_inh) == 0)) {
+ SOCK_UNLOCK(so_inh);
+ goto restart_inh;
+ }
+
+ TAILQ_INIT(&comp);
+ TAILQ_INIT(&incomp);
+
+ /*
+ * Save completed queue and incompleted queue
+ */
+ TAILQ_CONCAT(&comp, &so->sol_comp, so_list);
+ qlen = so->sol_qlen;
+ so->sol_qlen = 0;
+
+ TAILQ_CONCAT(&incomp, &so->sol_incomp, so_list);
+ incqlen = so->sol_incqlen;
+ so->sol_incqlen = 0;
+
+ /*
+ * Append the saved completed queue and incompleted
+ * queue to the socket inherits them.
+ *
+ * XXX
+ * This may temporarily break the inheriting socket's
+ * so_qlimit.
+ */
+ TAILQ_FOREACH(sp, &comp, so_list) {
+ refcount_acquire(&so_inh->so_count);
+ sp->so_listen = so_inh;
+ crfree(sp->so_cred);
+ sp->so_cred = crhold(so_inh->so_cred);
+ }
+
+ TAILQ_FOREACH(sp, &incomp, so_list) {
+ refcount_acquire(&so_inh->so_count);
+ sp->so_listen = so_inh;
+ crfree(sp->so_cred);
+ sp->so_cred = crhold(so_inh->so_cred);
+ }
+
+ TAILQ_CONCAT(&so_inh->sol_comp, &comp, so_list);
+ so_inh->sol_qlen += qlen;
+
+ TAILQ_CONCAT(&so_inh->sol_incomp, &incomp, so_list);
+ so_inh->sol_incqlen += incqlen;
+
+ SOCK_UNLOCK(so);
+ if(head != NULL)
+ SOLISTEN_UNLOCK(head);
+
+ SOCK_UNLOCK(so_inh);
+ if(head_inh != NULL) {
+ if(qlen > 0) {
+ /*
+ * "New" connections have arrived
+ */
+ solisten_wakeup(head_inh);
+ } else {
+ SOLISTEN_UNLOCK(head_inh);
+ }
+ }
+}
+
/*
* Close a socket on last file table reference removal. Initiate disconnect
* if connected. Free socket when disconnect complete.
@@ -2775,6 +2869,7 @@
case SO_BROADCAST:
case SO_REUSEADDR:
case SO_REUSEPORT:
+ case SO_REUSEPORT_LB:
case SO_OOBINLINE:
case SO_TIMESTAMP:
case SO_BINTIME:
@@ -2993,6 +3088,7 @@
case SO_KEEPALIVE:
case SO_REUSEADDR:
case SO_REUSEPORT:
+ case SO_REUSEPORT_LB:
case SO_BROADCAST:
case SO_OOBINLINE:
case SO_ACCEPTCONN:
Index: sys/netinet/in_pcb.h
===================================================================
--- sys/netinet/in_pcb.h
+++ sys/netinet/in_pcb.h
@@ -76,6 +76,11 @@
struct in_addr ia46_addr4;
};
+union in_dependaddr {
+ struct in_addr_4in6 id46_addr;
+ struct in6_addr id6_addr;
+};
+
/*
* NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has
* some extra padding to accomplish this.
@@ -86,22 +91,14 @@
u_int16_t ie_fport; /* foreign port */
u_int16_t ie_lport; /* local port */
/* protocol dependent part, local and foreign addr */
- union {
- /* foreign host table entry */
- struct in_addr_4in6 ie46_foreign;
- struct in6_addr ie6_foreign;
- } ie_dependfaddr;
- union {
- /* local host table entry */
- struct in_addr_4in6 ie46_local;
- struct in6_addr ie6_local;
- } ie_dependladdr;
+ union in_dependaddr ie_dependfaddr; /* foreign host table entry */
+ union in_dependaddr ie_dependladdr; /* local host table entry */
+#define ie_faddr ie_dependfaddr.id46_addr.ia46_addr4
+#define ie_laddr ie_dependladdr.id46_addr.ia46_addr4
+#define ie6_faddr ie_dependfaddr.id6_addr
+#define ie6_laddr ie_dependladdr.id6_addr
u_int32_t ie6_zoneid; /* scope zone id */
};
-#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4
-#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4
-#define ie6_faddr ie_dependfaddr.ie6_foreign
-#define ie6_laddr ie_dependladdr.ie6_local
/*
* XXX The defines for inc_* are hacks and should be changed to direct
@@ -328,6 +325,21 @@
u_short phd_port;
};
+struct inpcblbgroup {
+ LIST_ENTRY(inpcblbgroup) il_list;
+ uint16_t il_lport;
+ u_char il_vflag;
+ u_char il_pad;
+ uint32_t il_pad2;
+ union in_dependaddr il_dependladdr;
+#define il_laddr il_dependladdr.id46_addr.ia46_addr4
+#define il6_laddr il_dependladdr.id6_addr
+ uint32_t il_inpsiz; /* size of il_inp[] */
+ uint32_t il_inpcnt; /* # of elem in il_inp[] */
+ struct inpcb *il_inp[];
+};
+LIST_HEAD(inpcblbgrouphead, inpcblbgroup);
+
/*-
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
@@ -420,6 +432,13 @@
struct inpcbhead *ipi_wildbase; /* (p) */
u_long ipi_wildmask; /* (p) */
+ /*
+ * Load balanced group used by the SO_REUSEPORT_LB option,
+ * hashed by local address and local port.
+ */
+ struct inpcblbgrouphead *ipi_lbgrouphashbase;
+ u_long ipi_lbgrouphashmask;
+
/*
* Pointer to network stack instance
*/
@@ -506,7 +525,7 @@
inp_inpcbtotcpcb(struct inpcb *inp);
void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
uint32_t *faddr, uint16_t *fp);
-short inp_so_options(const struct inpcb *inp);
+int inp_so_options(const struct inpcb *inp);
#endif /* _KERNEL */
@@ -569,6 +588,10 @@
(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
#define INP_PCBPORTHASH(lport, mask) \
(ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PORTHASH(lport, mask) \
+ (ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
+ ((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport)))
#define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3])
/*
@@ -636,6 +659,7 @@
#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
+#define INP_REUSEPORT_LB 0x00001000 /* SO_REUSEPORT_LB option is set */
/*
* Flags passed to in_pcblookup*() functions.
@@ -738,6 +762,8 @@
struct inpcb *
in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
struct in_addr, u_int, int, struct ifnet *);
+struct inpcb *
+ in_pcblookup_lbgroup_last(const struct inpcb *inp);
struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
struct in_addr, u_int, int, struct ifnet *, struct mbuf *);
Index: sys/netinet/in_pcb.c
===================================================================
--- sys/netinet/in_pcb.c
+++ sys/netinet/in_pcb.c
@@ -102,6 +102,9 @@
#include <security/mac/mac_framework.h>
+#define INPCBLBGROUP_SIZMIN 8
+#define INPCBLBGROUP_SIZMAX 256
+
static struct callout ipport_tick_callout;
/*
@@ -211,6 +214,185 @@
* functions often modify hash chains or addresses in pcbs.
*/
+static struct inpcblbgroup *
+in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
+ uint16_t port, const union in_dependaddr *addr, int size)
+{
+ struct inpcblbgroup *grp;
+
+ size_t bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
+ grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
+ if(!grp)
+ return NULL;
+ grp->il_vflag = vflag;
+ grp->il_lport = port;
+ grp->il_dependladdr = *addr;
+ grp->il_inpsiz = size;
+ LIST_INSERT_HEAD(hdr, grp, il_list);
+
+ return grp;
+}
+
+static void
+in_pcblbgroup_free(struct inpcblbgroup *grp)
+{
+ LIST_REMOVE(grp, il_list);
+ free(grp, M_TEMP);
+}
+
+static struct inpcblbgroup *
+in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
+ struct inpcblbgroup *old_grp, int size)
+{
+ struct inpcblbgroup *grp;
+ int i;
+
+ grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
+ old_grp->il_lport, &old_grp->il_dependladdr, size);
+ if(!grp)
+ return NULL;
+
+ KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
+ ("invalid new local group size %d and old local group count %d",
+ grp->il_inpsiz, old_grp->il_inpcnt));
+ for (i = 0; i < old_grp->il_inpcnt; ++i)
+ grp->il_inp[i] = old_grp->il_inp[i];
+ grp->il_inpcnt = old_grp->il_inpcnt;
+
+ in_pcblbgroup_free(old_grp);
+
+ return grp;
+}
+
+/*
+ * Add PCB to lb group (load balance used by SO_REUSEPORT_LB)
+ */
+static int
+in_pcbinslbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+{
+ struct inpcblbgrouphead *hdr;
+ struct inpcblbgroup *grp;
+
+ uint16_t hashmask = pcbinfo->ipi_lbgrouphashmask;
+ uint16_t lport = inp->inp_lport;
+ uint32_t group_index = INP_PCBLBGROUP_PORTHASH(lport, hashmask);
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[group_index];
+
+ struct ucred *cred;
+
+ if (pcbinfo->ipi_lbgrouphashbase == NULL)
+ return 0;
+
+ /*
+ * don't allow jailed socket to join local group
+ */
+ if (inp->inp_socket != NULL)
+ cred = inp->inp_socket->so_cred;
+ else
+ cred = NULL;
+ if (cred != NULL && jailed(cred))
+ return 0;
+
+#ifdef INET6
+ /*
+ * don't allow IPv4 mapped INET6 wild socket
+ */
+ if ((inp->inp_vflag & INP_IPV4) &&
+ inp->inp_laddr.s_addr == INADDR_ANY &&
+ INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
+ return 0;
+ }
+#endif
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[
+ INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+
+ LIST_FOREACH(grp, hdr, il_list) {
+ if (grp->il_vflag == inp->inp_vflag &&
+ grp->il_lport == inp->inp_lport &&
+ memcmp(&grp->il_dependladdr,
+ &inp->inp_inc.inc_ie.ie_dependladdr,
+ sizeof(grp->il_dependladdr)) == 0) {
+ break;
+ }
+ }
+ if (grp == NULL) {
+ /* Create new load balance group */
+ grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
+ inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
+ INPCBLBGROUP_SIZMIN);
+ if(!grp)
+ return (ENOBUFS);
+ } else if (grp->il_inpcnt == grp->il_inpsiz) {
+ if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
+ static int limit_logged = 0;
+
+ if (!limit_logged) {
+ limit_logged = 1;
+ printf("lb group port %d, "
+ "limit reached\n", ntohs(grp->il_lport));
+ }
+ return 0;
+ }
+
+ /* Expand this local group */
+ grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
+ if(!grp)
+ return (ENOBUFS);
+ }
+
+ KASSERT(grp->il_inpcnt < grp->il_inpsiz,
+ ("invalid local group size %d and count %d",
+ grp->il_inpsiz, grp->il_inpcnt));
+
+ grp->il_inp[grp->il_inpcnt] = inp;
+ grp->il_inpcnt++;
+ return 0;
+}
+
+static void
+in_pcbremlbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+{
+ struct inpcblbgrouphead *hdr;
+ struct inpcblbgroup *grp;
+
+ if (pcbinfo->ipi_lbgrouphashbase == NULL)
+ return;
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[
+ INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+
+ LIST_FOREACH(grp, hdr, il_list) {
+ int i;
+
+ for (i = 0; i < grp->il_inpcnt; ++i) {
+ if (grp->il_inp[i] != inp)
+ continue;
+
+ if (grp->il_inpcnt == 1) {
+ /* Free this local group */
+ in_pcblbgroup_free(grp);
+ } else {
+ /* Pull up inpcbs */
+ for (; i + 1 < grp->il_inpcnt; ++i)
+ grp->il_inp[i] = grp->il_inp[i + 1];
+ grp->il_inpcnt--;
+
+ if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
+ grp->il_inpcnt <= (grp->il_inpsiz / 4)) {
+ /* Shrink this local group */
+ struct inpcblbgroup *new_grp =
+ in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
+ if(new_grp)
+ grp = new_grp;
+ }
+ }
+ return;
+ }
+ }
+}
+
/*
* Different protocols initialize their inpcbs differently - giving
* different name to the lock. But they all are disposed the same.
@@ -246,6 +428,8 @@
&pcbinfo->ipi_hashmask);
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_porthashmask);
+ pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB,
+ &pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
#endif
@@ -269,6 +453,8 @@
hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
pcbinfo->ipi_porthashmask);
+ hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
+ pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
in_pcbgroup_destroy(pcbinfo);
#endif
@@ -507,18 +693,20 @@
/*
* Return cached socket options.
*/
-short
+int
inp_so_options(const struct inpcb *inp)
{
- short so_options;
+ int so_options;
- so_options = 0;
+ so_options = 0;
- if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
- so_options |= SO_REUSEPORT;
- if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
- so_options |= SO_REUSEADDR;
- return (so_options);
+ if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
+ so_options |= SO_REUSEPORT_LB;
+ if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
+ so_options |= SO_REUSEPORT;
+ if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
+ so_options |= SO_REUSEADDR;
+ return (so_options);
}
#endif /* INET || INET6 */
@@ -574,6 +762,12 @@
int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
int error;
+ /*
+ * XXX Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
+ * so that we don't have to add to the (already messy) code below
+ */
+ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
+
/*
* No state changes, so read locks are sufficient here.
*/
@@ -585,7 +779,7 @@
laddr.s_addr = *laddrp;
if (nam != NULL && laddr.s_addr != INADDR_ANY)
return (EINVAL);
- if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
lookupflags = INPLOOKUP_WILDCARD;
if (nam == NULL) {
if ((error = prison_local_ip4(cred, &laddr)) != 0)
@@ -622,16 +816,20 @@
*/
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
reuseport = SO_REUSEADDR|SO_REUSEPORT;
+ // XXX: How to deal with SO_REUSEPORT_LB here?
+ // Added equivalent treatment as SO_REUSEPORT here for now
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
+ reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
} else if (sin->sin_addr.s_addr != INADDR_ANY) {
sin->sin_port = 0; /* yech... */
bzero(&sin->sin_zero, sizeof(sin->sin_zero));
/*
- * Is the address a local IP address?
+ * Is the address a local IP address?
* If INP_BINDANY is set, then the socket may be bound
* to any endpoint address, local or not.
*/
if ((inp->inp_flags & INP_BINDANY) == 0 &&
- ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
+ ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
return (EADDRNOTAVAIL);
}
laddr = sin->sin_addr;
@@ -661,7 +859,8 @@
ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
(ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
- (t->inp_flags2 & INP_REUSEPORT) == 0) &&
+ (t->inp_flags2 & INP_REUSEPORT) ||
+ (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
(inp->inp_cred->cr_uid !=
t->inp_cred->cr_uid))
return (EADDRINUSE);
@@ -686,11 +885,14 @@
*/
tw = intotw(t);
if (tw == NULL ||
- (reuseport & tw->tw_so_options) == 0)
+ ((reuseport & tw->tw_so_options) == 0 &&
+ (reuseport_lb & tw->tw_so_options) == 0)) {
return (EADDRINUSE);
+ }
} else if (t &&
- ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
- (reuseport & inp_so_options(t)) == 0) {
+ ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
+ (reuseport & inp_so_options(t)) == 0 &&
+ (reuseport_lb & inp_so_options(t)) == 0) {
#ifdef INET6
if (ntohl(sin->sin_addr.s_addr) !=
INADDR_ANY ||
@@ -699,7 +901,7 @@
(inp->inp_vflag & INP_IPV6PROTO) == 0 ||
(t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
- return (EADDRINUSE);
+ return (EADDRINUSE);
if (t && (! in_pcbbind_check_bindmulti(inp, t)))
return (EADDRINUSE);
}
@@ -1360,6 +1562,7 @@
struct inpcbport *phd = inp->inp_phd;
INP_HASH_WLOCK(inp->inp_pcbinfo);
+ in_pcbremlbgrouphash(inp, inp->inp_pcbinfo);
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
@@ -1620,6 +1823,98 @@
}
#undef INP_LOOKUP_MAPPED_PCB_COST
+struct inpcb *
+in_pcblookup_lbgroup_last(const struct inpcb *inp)
+{
+ const struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+ const struct inpcblbgrouphead *hdr;
+ const struct inpcblbgroup *grp;
+ int i;
+
+ if (pcbinfo->ipi_lbgrouphashbase == NULL)
+ return NULL;
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[
+ INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+
+ LIST_FOREACH(grp, hdr, il_list) {
+ if (grp->il_vflag == inp->inp_vflag &&
+ grp->il_lport == inp->inp_lport &&
+ memcmp(&grp->il_dependladdr,
+ &inp->inp_inc.inc_ie.ie_dependladdr,
+ sizeof(grp->il_dependladdr)) == 0) {
+ break;
+ }
+ }
+ if (grp == NULL || grp->il_inpcnt == 1)
+ return NULL;
+
+ KASSERT(grp->il_inpcnt >= 2,
+ ("invalid lbgroup inp count %d", grp->il_inpcnt));
+ for (i = 0; i < grp->il_inpcnt; ++i) {
+ if (grp->il_inp[i] == inp) {
+ int last = grp->il_inpcnt - 1;
+
+ if (i == last)
+ last = grp->il_inpcnt - 2;
+ return grp->il_inp[last];
+ }
+ }
+ return NULL;
+}
+
+static struct inpcb *
+in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
+ const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
+ uint16_t fport, int lookupflags)
+{
+ struct inpcb *local_wild = NULL;
+ const struct inpcblbgrouphead *hdr;
+ struct inpcblbgroup *grp;
+ struct inpcblbgroup *grp_local_wild;
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[
+ INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
+
+ /*
+ * Order of socket selection:
+ * 1. non-wild.
+ * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
+ *
+ * NOTE:
+ * - Load balanced group does not contain jailed sockets
+ * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
+ */
+ LIST_FOREACH(grp, hdr, il_list) {
+#ifdef INET6
+ if (!(grp->il_vflag & INP_IPV4))
+ continue;
+#endif
+
+ if (grp->il_lport == lport) {
+
+ uint32_t idx = 0;
+ int pkt_hash = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport);
+
+ idx = pkt_hash % grp->il_inpcnt;
+
+ if (grp->il_laddr.s_addr == laddr->s_addr) {
+ return grp->il_inp[idx];
+ } else {
+ if (grp->il_laddr.s_addr == INADDR_ANY &&
+ (lookupflags & INPLOOKUP_WILDCARD)) {
+ local_wild = grp->il_inp[idx];
+ grp_local_wild = grp;
+ }
+ }
+ }
+ }
+ if (local_wild != NULL) {
+ return local_wild;
+ }
+ return NULL;
+}
+
#ifdef PCBGROUP
/*
* Lookup PCB in hash list, using pcbgroup tables.
@@ -1883,6 +2178,16 @@
if (tmpinp != NULL)
return (tmpinp);
+ /*
+ * Then look in lb group
+ */
+ if (pcbinfo->ipi_lbgrouphashbase != NULL) {
+ inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, fport, lookupflags);
+ if (inp != NULL) {
+ return inp;
+ }
+ }
+
/*
* Then look for a wildcard match, if requested.
*/
@@ -2085,6 +2390,7 @@
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbport *phd;
u_int32_t hashkey_faddr;
+ int so_options;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
@@ -2105,6 +2411,20 @@
pcbporthash = &pcbinfo->ipi_porthashbase[
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
+
+ /*
+ * Add entry in lb group
+ * Only do this if SO_REUSEPORT_LB is set
+ */
+ so_options = inp_so_options(inp);
+ if(so_options & SO_REUSEPORT_LB) {
+ int ret = in_pcbinslbgrouphash(inp, pcbinfo);
+ if(ret) {
+ // pcb lb group malloc fail (ret=ENOBUFS)
+ return ret;
+ }
+ }
+
/*
* Go through port list and look for a head for this lport.
*/
@@ -2231,6 +2551,10 @@
struct inpcbport *phd = inp->inp_phd;
INP_HASH_WLOCK(pcbinfo);
+
+ // XXX Only do if SO_REUSEPORT_LB set?
+ in_pcbremlbgrouphash(inp, pcbinfo);
+
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -987,6 +987,15 @@
INP_WUNLOCK(inp);
error = 0;
break;
+ case SO_REUSEPORT_LB:
+ INP_WLOCK(inp);
+ if ((so->so_options & SO_REUSEPORT_LB) != 0)
+ inp->inp_flags2 |= INP_REUSEPORT_LB;
+ else
+ inp->inp_flags2 &= ~INP_REUSEPORT_LB;
+ INP_WUNLOCK(inp);
+ error = 0;
+ break;
case SO_SETFIB:
INP_WLOCK(inp);
inp->inp_inc.inc_fibnum = so->so_fibnum;
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -1633,10 +1633,28 @@
{
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
+ struct inpcb *inp_inh = NULL;
+ int listen = tp->t_state & TCPS_LISTEN;
INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
+ if (listen) {
+ /*
+ * Pending socket/syncache inheritance
+ *
+ * If this is a listen(2) socket, find another listen(2)
+ * socket in the same local group, which could inherit
+ * the syncache and sockets pending on the completion
+ * and incompletion queues.
+ *
+ * NOTE:
+ * Currently the inheritance could only happen on the
+ * listen(2) sockets with SO_REUSEPORT_LB set.
+ */
+ inp_inh = in_pcblookup_lbgroup_last(inp);
+ }
+
#ifdef TCP_OFFLOAD
if (tp->t_state == TCPS_LISTEN)
tcp_offload_listen_stop(tp);
@@ -1658,7 +1676,16 @@
tcp_state_change(tp, TCPS_CLOSED);
KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
so = inp->inp_socket;
+
soisdisconnected(so);
+
+ if(listen)
+ {
+ if(inp_inh != NULL && inp_inh->inp_socket != NULL) {
+ soinherit(so, inp_inh->inp_socket);
+ }
+ }
+
if (inp->inp_flags & INP_SOCKREF) {
KASSERT(so->so_state & SS_PROTOREF,
("tcp_close: !SS_PROTOREF"));
Index: sys/netinet6/in6_pcb.c
===================================================================
--- sys/netinet6/in6_pcb.c
+++ sys/netinet6/in6_pcb.c
@@ -123,6 +123,12 @@
int error, lookupflags = 0;
int reuseport = (so->so_options & SO_REUSEPORT);
+ /*
+ * XXX Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
+ * so that we don't have to add to the (already messy) code below
+ */
+ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
+
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
@@ -130,7 +136,7 @@
return (EADDRNOTAVAIL);
if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
return (EINVAL);
- if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
lookupflags = INPLOOKUP_WILDCARD;
if (nam == NULL) {
if ((error = prison_local_ip6(cred, &inp->in6p_laddr,
@@ -164,6 +170,10 @@
*/
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
reuseport = SO_REUSEADDR|SO_REUSEPORT;
+ // XXX: How to deal with SO_REUSEPORT_LB here?
+ // Added equivalent treatment as SO_REUSEPORT here for now
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
+ reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
} else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
struct ifaddr *ifa;
@@ -212,7 +222,8 @@
IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) &&
(!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
!IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) ||
- (t->inp_flags2 & INP_REUSEPORT) == 0) &&
+ (t->inp_flags2 & INP_REUSEPORT) ||
+ (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
(inp->inp_cred->cr_uid !=
t->inp_cred->cr_uid))
return (EADDRINUSE);
@@ -262,34 +273,39 @@
*/
tw = intotw(t);
if (tw == NULL ||
- (reuseport & tw->tw_so_options) == 0)
+ ((reuseport & tw->tw_so_options) == 0 &&
+ (reuseport_lb & tw->tw_so_options) == 0))
return (EADDRINUSE);
- } else if (t && (reuseport & inp_so_options(t)) == 0) {
+ } else if (t && (reuseport & inp_so_options(t)) == 0 &&
+ (reuseport_lb & inp_so_options(t)) == 0) {
return (EADDRINUSE);
}
#ifdef INET
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
- IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+ IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
struct sockaddr_in sin;
in6_sin6_2_sin(&sin, sin6);
t = in_pcblookup_local(pcbinfo, sin.sin_addr,
- lport, lookupflags, cred);
+ lport, lookupflags, cred);
if (t && t->inp_flags & INP_TIMEWAIT) {
tw = intotw(t);
if (tw == NULL)
return (EADDRINUSE);
if ((reuseport & tw->tw_so_options) == 0
- && (ntohl(t->inp_laddr.s_addr) !=
- INADDR_ANY || ((inp->inp_vflag &
- INP_IPV6PROTO) ==
- (t->inp_vflag & INP_IPV6PROTO))))
+ && (reuseport_lb & tw->tw_so_options) == 0
+ && (ntohl(t->inp_laddr.s_addr) !=
+ INADDR_ANY || ((inp->inp_vflag &
+ INP_IPV6PROTO) ==
+ (t->inp_vflag & INP_IPV6PROTO))))
return (EADDRINUSE);
} else if (t &&
- (reuseport & inp_so_options(t)) == 0 &&
- (ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
- (t->inp_vflag & INP_IPV6PROTO) != 0))
+ (reuseport & inp_so_options(t)) == 0 &&
+ (reuseport_lb & inp_so_options(t)) == 0 &&
+ (ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
+ (t->inp_vflag & INP_IPV6PROTO) != 0)) {
return (EADDRINUSE);
+ }
}
#endif
}
@@ -854,6 +870,54 @@
return inp;
}
+static struct inpcb *
+in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
+ const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr,
+ uint16_t fport, int lookupflags)
+{
+ struct inpcb *local_wild = NULL;
+ const struct inpcblbgrouphead *hdr;
+ struct inpcblbgroup *grp;
+ struct inpcblbgroup *grp_local_wild;
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[
+ INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
+
+ /*
+ * Order of socket selection:
+ * 1. non-wild.
+ * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
+ *
+ * NOTE:
+ * - Load balanced group does not contain jailed sockets
+ * - Load balanced does not contain IPv4 mapped INET6 wild sockets
+ */
+ LIST_FOREACH(grp, hdr, il_list) {
+
+ if (grp->il_lport == lport) {
+ uint32_t idx = 0;
+ int pkt_hash = INP_PCBLBGROUP_PKTHASH(
+ INP6_PCBHASHKEY(faddr), lport, fport);
+
+ idx = pkt_hash % grp->il_inpcnt;
+
+ if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) {
+ return grp->il_inp[idx];
+ } else {
+ if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) &&
+ (lookupflags & INPLOOKUP_WILDCARD)) {
+ local_wild = grp->il_inp[idx];
+ grp_local_wild = grp;
+ }
+ }
+ }
+ }
+ if (local_wild != NULL) {
+ return local_wild;
+ }
+ return NULL;
+}
+
#ifdef PCBGROUP
/*
* Lookup PCB in hash list, using pcbgroup tables.
@@ -1040,6 +1104,8 @@
}
#endif /* PCBGROUP */
+
+
/*
* Lookup PCB in hash list.
*/
@@ -1085,6 +1151,17 @@
if (tmpinp != NULL)
return (tmpinp);
+ /*
+ * Then look in lb group
+ */
+ if (pcbinfo->ipi_lbgrouphashbase != NULL) {
+ inp = in6_pcblookup_lbgroup(pcbinfo, laddr, lport, faddr,
+ fport, lookupflags);
+ if (inp != NULL) {
+ return inp;
+ }
+ }
+
/*
* Then look for a wildcard match, if requested.
*/
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -1452,6 +1452,15 @@
INP_WUNLOCK(in6p);
error = 0;
break;
+ case SO_REUSEPORT_LB:
+ INP_WLOCK(in6p);
+ if ((so->so_options & SO_REUSEPORT_LB) != 0)
+ in6p->inp_flags2 |= INP_REUSEPORT_LB;
+ else
+ in6p->inp_flags2 &= ~INP_REUSEPORT_LB;
+ INP_WUNLOCK(in6p);
+ error = 0;
+ break;
case SO_SETFIB:
INP_WLOCK(in6p);
in6p->inp_inc.inc_fibnum = so->so_fibnum;
Index: sys/sys/socket.h
===================================================================
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -145,6 +145,9 @@
#define SO_NO_OFFLOAD 0x4000 /* socket cannot be offloaded */
#define SO_NO_DDP 0x8000 /* disable direct data placement */
+// XXX: so_options was only 16 bit, now globally increased to 32 bit
+#define SO_REUSEPORT_LB 0x00010000 /* reuse with load balancing */
+
/*
* Additional options, not kept in so_options.
*/
Index: sys/sys/socketvar.h
===================================================================
--- sys/sys/socketvar.h
+++ sys/sys/socketvar.h
@@ -78,7 +78,7 @@
struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */
struct selinfo so_wrsel; /* (b/cs) for so_snd */
short so_type; /* (a) generic type, see socket.h */
- short so_options; /* (b) from socket call, see socket.h */
+ int so_options; /* (b) from socket call, see socket.h */
short so_linger; /* time to linger close(2) */
short so_state; /* (b) internal state flags SS_* */
void *so_pcb; /* protocol control block */
@@ -200,7 +200,7 @@
size_t xso_len; /* length of this structure */
struct socket *xso_so; /* makes a convenient handle sometimes */
short so_type;
- short so_options;
+ int so_options;
short so_linger;
short so_state;
caddr_t so_pcb; /* another convenient handle */
@@ -386,6 +386,7 @@
int sodisconnect(struct socket *so);
struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags);
void sofree(struct socket *so);
+void soinherit(struct socket *so, struct socket *so_inh);
void sohasoutofband(struct socket *so);
int solisten(struct socket *so, int backlog, struct thread *td);
void solisten_proto(struct socket *so, int backlog);
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Nov 15, 3:40 PM (17 h, 20 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14644770
Default Alt Text
D11003.id33740.diff (30 KB)
Attached To
Mode
D11003: Load balance sockets with new SO_REUSEPORT_LB option
Attached
Detach File
Event Timeline
Log In to Comment