Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F115465237
D11003.id30483.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
41 KB
Referenced Files
None
Subscribers
None
D11003.id30483.diff
View Options
Index: sys/kern/uipc_debug.c
===================================================================
--- sys/kern/uipc_debug.c
+++ sys/kern/uipc_debug.c
@@ -75,7 +75,7 @@
}
static void
-db_print_sooptions(short so_options)
+db_print_sooptions(int so_options)
{
int comma;
@@ -120,6 +120,10 @@
db_printf("%sSO_REUSEPORT", comma ? ", " : "");
comma = 1;
}
+ if (so_options & SO_REUSEPORT_LB) {
+ db_printf("%sSO_REUSEPORT_LB", comma ? ", " : "");
+ comma = 1;
+ }
if (so_options & SO_TIMESTAMP) {
db_printf("%sSO_TIMESTAMP", comma ? ", " : "");
comma = 1;
Index: sys/kern/uipc_socket.c
===================================================================
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -280,7 +280,7 @@
static void
socket_hhook_register(int subtype)
{
-
+
if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
&V_socket_hhh[subtype],
HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
@@ -290,7 +290,7 @@
static void
socket_hhook_deregister(int subtype)
{
-
+
if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
printf("%s: WARNING: unable to deregister hook\n", __func__);
}
@@ -448,6 +448,8 @@
static void
sodealloc(struct socket *so)
{
+ if(so->inherit)
+ printf("%s] dealloc inherited socket %p\n", __func__, so);
KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
@@ -936,6 +938,9 @@
SOCK_UNLOCK(so);
sorele(head);
+ if(so->inherit)
+ printf("%s] dequeueing inherited socket %p from socket %p\n", __func__, so, head);
+
*ret = so;
return (0);
}
@@ -963,6 +968,9 @@
void
sofree(struct socket *so)
{
+ if(so->inherit)
+ printf("%s] inherited socket %p\n", __func__, so);
+
struct protosw *pr = so->so_proto;
SOCK_LOCK_ASSERT(so);
@@ -1005,6 +1013,7 @@
TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
sol->sol_incqlen--;
/* This is guarenteed not to be the last. */
+ printf("%s] calling refcount_release\n", __func__);
refcount_release(&sol->so_count);
so->so_qstate = SQ_NONE;
so->so_listen = NULL;
@@ -1053,6 +1062,114 @@
}
/*
+ * Let a socket in the same load balance group (same port and address)
+ * inherit the pending sockets of the closing socket: every entry on "so"'s
+ * completed and incomplete queues is moved to "so_inh", which gains one
+ * so_count reference per entry and whose credential each entry then holds.
+ */
+void
+soinherit(struct socket *so, struct socket *so_inh)
+{
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ int pid = p->p_pid;
+ printf("%s] pid %d\n", __func__, pid); /* debug trace: pid of curthread's process */
+
+ TAILQ_HEAD(, socket) comp, incomp;
+ struct socket *sp, *head, *head_inh;
+ int qlen, incqlen;
+
+ KASSERT(so->so_options & SO_ACCEPTCONN,
+ ("so does not accept connection"));
+ KASSERT(so_inh->so_options & SO_ACCEPTCONN,
+ ("so_inh does not accept connection"));
+ /* Lock so, then so_inh; each loop retries until the socket and its so_listen (when set) are both held. */
+ // XXX: Do we need to lock head?
+restart:
+ SOCK_LOCK(so);
+ if ((head = so->so_listen) != NULL &&
+ __predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
+ SOCK_UNLOCK(so);
+ goto restart;
+ }
+ /* NOTE(review): so stays locked while so_inh is acquired and no lock order between the two is visible here -- confirm this cannot deadlock. */
+restart_inh:
+ SOCK_LOCK(so_inh);
+ if ((head_inh = so_inh->so_listen) != NULL &&
+ __predict_false(SOLISTEN_TRYLOCK(head_inh) == 0)) {
+ SOCK_UNLOCK(so_inh);
+ goto restart_inh;
+ }
+
+ TAILQ_INIT(&comp);
+ TAILQ_INIT(&incomp);
+
+ /*
+ * Move so's completed and incomplete queues onto local lists
+ */
+ TAILQ_CONCAT(&comp, &so->sol_comp, so_list);
+ qlen = so->sol_qlen;
+ so->sol_qlen = 0;
+
+ TAILQ_CONCAT(&incomp, &so->sol_incomp, so_list);
+ incqlen = so->sol_incqlen;
+ so->sol_incqlen = 0;
+
+ printf("%s] got closing socket qlen %d\n", __func__, qlen);
+ printf("%s] got closing socket incqlen %d\n", __func__, incqlen);
+
+ /*
+ * Re-parent every saved socket to so_inh, then append the saved
+ * completed and incomplete queues to so_inh's own queues.
+ *
+ * XXX:
+ * This may temporarily break the inheriting socket's
+ * so_qlimit.
+ */
+ TAILQ_FOREACH(sp, &comp, so_list) {
+ /* XXX: got a problem with negative refcount, is this the correct solution?  NOTE(review): presumably each entry also held a reference on so; no release of so->so_count is visible here -- confirm. */
+ refcount_acquire(&so_inh->so_count);
+ sp->so_listen = so_inh;
+ sp->inherit = 1; // for debugging
+ crfree(sp->so_cred);
+ sp->so_cred = crhold(so_inh->so_cred);
+ // XXX: Something more we need to do here?
+ printf("%s] listening socket %p is inheriting comp socket %p\n", __func__, so_inh, sp);
+ }
+
+ TAILQ_FOREACH(sp, &incomp, so_list) {
+ /* XXX: got a problem with negative refcount, is this the correct solution? */
+ refcount_acquire(&so_inh->so_count);
+ sp->inherit = 1; // for debugging
+ sp->so_listen = so_inh;
+ crfree(sp->so_cred);
+ sp->so_cred = crhold(so_inh->so_cred);
+ // XXX: Something more we need to do here?
+ printf("%s] listening socket %p is inheriting incomp socket %p\n", __func__, so_inh, sp);
+ }
+
+ TAILQ_CONCAT(&so_inh->sol_comp, &comp, so_list);
+ so_inh->sol_qlen += qlen;
+
+ TAILQ_CONCAT(&so_inh->sol_incomp, &incomp, so_list);
+ so_inh->sol_incqlen += incqlen;
+ /* Release the locks taken above, closing socket first. */
+ SOCK_UNLOCK(so);
+ if(head != NULL)
+ SOLISTEN_UNLOCK(head);
+ /* NOTE(review): presumably solisten_wakeup() drops head_inh's lock, since only the else branch unlocks it; nothing is woken when head_inh is NULL -- confirm. */
+ SOCK_UNLOCK(so_inh);
+ if(head_inh != NULL) {
+ if(qlen > 0) {
+ /* "New" connections have arrived */
+ solisten_wakeup(head_inh);
+ } else {
+ SOLISTEN_UNLOCK(head_inh);
+ }
+ }
+}
+
+/*
* Close a socket on last file table reference removal. Initiate disconnect
* if connected. Free socket when disconnect complete.
*
@@ -1063,6 +1180,9 @@
int
soclose(struct socket *so)
{
+ if(so->inherit)
+ printf("%s] inherited socket %p\n", __func__, so);
+
struct accept_queue lqueue;
bool listening;
int error = 0;
@@ -1114,6 +1234,7 @@
sp->so_listen = NULL;
SOCK_UNLOCK(sp);
/* Guaranteed not to be the last. */
+ printf("%s] calling refcount_release\n", __func__);
refcount_release(&so->so_count);
}
}
@@ -1192,6 +1313,8 @@
int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ if(so->inherit)
+ printf("%s] connecting inherited socket %p\n", __func__, so);
return (soconnectat(AT_FDCWD, so, nam, td));
}
@@ -1247,6 +1370,9 @@
int
sodisconnect(struct socket *so)
{
+ if(so->inherit)
+ printf("%s] disconnecting inherited socket %p\n", __func__, so);
+
int error;
if ((so->so_state & SS_ISCONNECTED) == 0)
@@ -1429,6 +1555,8 @@
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
+ if(so->inherit)
+ printf("%s] send_generic on inherited socket %p\n", __func__, so);
long space;
ssize_t resid;
int clen = 0, error, dontroute;
@@ -1610,6 +1738,14 @@
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
+	if (so->inherit) {
+		/*
+		 * Debug trace.  Read the pid straight from curthread: a local
+		 * named "td" here would shadow sosend()'s own td parameter.
+		 */
+		printf("%s] pid %d\n", __func__, curthread->td_proc->p_pid);
+		printf("%s] send on inherited socket %p\n", __func__, so);
+	}
int error;
CURVNET_SET(so->so_vnet);
@@ -2547,6 +2683,9 @@
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
+ if(so->inherit)
+ printf("%s] receiving on inherited socket %p\n", __func__, so);
+
int error;
CURVNET_SET(so->so_vnet);
@@ -2772,6 +2911,7 @@
case SO_BROADCAST:
case SO_REUSEADDR:
case SO_REUSEPORT:
+ case SO_REUSEPORT_LB:
case SO_OOBINLINE:
case SO_TIMESTAMP:
case SO_BINTIME:
@@ -3021,6 +3161,7 @@
case SO_KEEPALIVE:
case SO_REUSEADDR:
case SO_REUSEPORT:
+ case SO_REUSEPORT_LB:
case SO_BROADCAST:
case SO_OOBINLINE:
case SO_ACCEPTCONN:
Index: sys/netinet/in_pcb.h
===================================================================
--- sys/netinet/in_pcb.h
+++ sys/netinet/in_pcb.h
@@ -76,6 +76,11 @@
struct in_addr ia46_addr4;
};
+union in_dependaddr { /* protocol-dependent address storage, shared by in_endpoints and inpcblbgroup */
+ struct in_addr_4in6 id46_addr; /* IPv4 view; the address itself is in ia46_addr4 */
+ struct in6_addr id6_addr; /* IPv6 view */
+};
+
/*
* NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has
* some extra padding to accomplish this.
@@ -86,22 +91,14 @@
u_int16_t ie_fport; /* foreign port */
u_int16_t ie_lport; /* local port */
/* protocol dependent part, local and foreign addr */
- union {
- /* foreign host table entry */
- struct in_addr_4in6 ie46_foreign;
- struct in6_addr ie6_foreign;
- } ie_dependfaddr;
- union {
- /* local host table entry */
- struct in_addr_4in6 ie46_local;
- struct in6_addr ie6_local;
- } ie_dependladdr;
+ union in_dependaddr ie_dependfaddr; /* foreign host table entry */
+ union in_dependaddr ie_dependladdr; /* local host table entry */
+#define ie_faddr ie_dependfaddr.id46_addr.ia46_addr4
+#define ie_laddr ie_dependladdr.id46_addr.ia46_addr4
+#define ie6_faddr ie_dependfaddr.id6_addr
+#define ie6_laddr ie_dependladdr.id6_addr
u_int32_t ie6_zoneid; /* scope zone id */
};
-#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4
-#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4
-#define ie6_faddr ie_dependfaddr.ie6_foreign
-#define ie6_laddr ie_dependladdr.ie6_local
/*
* XXX The defines for inc_* are hacks and should be changed to direct
@@ -328,6 +325,21 @@
u_short phd_port;
};
+struct inpcblbgroup { /* one SO_REUSEPORT_LB load-balance group */
+ LIST_ENTRY(inpcblbgroup) il_list; /* linkage on an ipi_lbgrouphashbase chain */
+ uint16_t il_lport; /* local port, same byte order as inp_lport */
+ u_char il_vflag; /* inp_vflag shared by the members */
+ u_char il_pad; /* presumably padding -- TODO confirm */
+ uint32_t il_pad2; /* presumably padding -- TODO confirm */
+ union in_dependaddr il_dependladdr; /* local address shared by the members */
+#define il_laddr il_dependladdr.id46_addr.ia46_addr4
+#define il6_laddr il_dependladdr.id6_addr
+ uint32_t il_inpsiz; /* size of il_inp[] */
+ uint32_t il_inpcnt; /* # of elem in il_inp[] */
+ struct inpcb *il_inp[]; /* member inpcbs; the first il_inpcnt slots are in use */
+};
+LIST_HEAD(inpcblbgrouphead, inpcblbgroup); /* head type of one hash chain */
+
/*-
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
@@ -421,6 +433,13 @@
u_long ipi_wildmask; /* (p) */
/*
+ * Load-balance groups used by the SO_REUSEPORT_LB option; the table is
+ * hashed by local port, and groups on a chain differ by local address.
+ */
+ struct inpcblbgrouphead *ipi_lbgrouphashbase;
+ u_long ipi_lbgrouphashmask;
+
+ /*
* Pointer to network stack instance
*/
struct vnet *ipi_vnet; /* (c) */
@@ -506,7 +525,7 @@
inp_inpcbtotcpcb(struct inpcb *inp);
void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
uint32_t *faddr, uint16_t *fp);
-short inp_so_options(const struct inpcb *inp);
+int inp_so_options(const struct inpcb *inp);
#endif /* _KERNEL */
@@ -569,6 +588,10 @@
(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
#define INP_PCBPORTHASH(lport, mask) \
(ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PORTHASH(lport, mask) \
+ (ntohs((lport)) & (mask)) /* lb-group hash chain index derived from the local port */
+#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
+ ((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) /* per-flow hash; the caller reduces it modulo the group size */
#define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3])
/*
@@ -624,11 +647,11 @@
/*
* Flags for inp_flags2.
*/
-#define INP_LLE_VALID 0x00000001 /* cached lle is valid */
+#define INP_LLE_VALID 0x00000001 /* cached lle is valid */
#define INP_RT_VALID 0x00000002 /* cached rtentry is valid */
#define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */
#define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */
-#define INP_FREED 0x00000010 /* inp itself is not valid */
+#define INP_FREED 0x00000010 /* inp itself is not valid */
#define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */
#define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */
#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
@@ -636,6 +659,7 @@
#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
+#define INP_REUSEPORT_LB 0x00001000 /* SO_REUSEPORT_LB option is set */
/*
* Flags passed to in_pcblookup*() functions.
@@ -739,6 +763,8 @@
in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
struct in_addr, u_int, int, struct ifnet *);
struct inpcb *
+ in_pcblookup_lbgroup_last(const struct inpcb *inp);
+struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
struct in_addr, u_int, int, struct ifnet *, struct mbuf *);
void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
Index: sys/netinet/in_pcb.c
===================================================================
--- sys/netinet/in_pcb.c
+++ sys/netinet/in_pcb.c
@@ -102,6 +102,9 @@
#include <security/mac/mac_framework.h>
+#define INPCBLBGROUP_SIZMIN 8
+#define INPCBLBGROUP_SIZMAX 256
+
static struct callout ipport_tick_callout;
/*
@@ -211,6 +214,173 @@
* functions often modify hash chains or addresses in pcbs.
*/
+/*
+ * Allocate a group with room for "size" inpcbs and link it into "hdr".
+ * Callers hold the pcbinfo hash lock, so this must not sleep; may return NULL.
+ */
+static struct inpcblbgroup *
+in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
+    uint16_t port, const union in_dependaddr *addr, int size)
+{
+	struct inpcblbgroup *grp;
+	size_t bytes;
+
+	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
+	grp = malloc(bytes, M_PCB, M_NOWAIT | M_ZERO);
+	if (grp == NULL)
+		return (NULL);
+	grp->il_vflag = vflag;
+	grp->il_lport = port;
+	grp->il_dependladdr = *addr;
+	grp->il_inpsiz = size;
+	LIST_INSERT_HEAD(hdr, grp, il_list);
+	return (grp);
+}
+
+/* Unlink a group from its hash chain and free it with its M_PCB malloc type. */
+static void
+in_pcblbgroup_free(struct inpcblbgroup *grp)
+{
+	LIST_REMOVE(grp, il_list);
+	free(grp, M_PCB);
+}
+
+/*
+ * Replace "old_grp" by a group of "size" slots holding the same members.
+ * If the allocation fails, "old_grp" is left untouched and NULL is returned.
+ */
+static struct inpcblbgroup *
+in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
+    struct inpcblbgroup *old_grp, int size)
+{
+	struct inpcblbgroup *grp;
+	uint32_t i;
+
+	grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
+	    old_grp->il_lport, &old_grp->il_dependladdr, size);
+	if (grp == NULL)
+		return (NULL);
+	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
+	    ("invalid new local group size %u and old local group count %u",
+	    grp->il_inpsiz, old_grp->il_inpcnt));
+	for (i = 0; i < old_grp->il_inpcnt; ++i)
+		grp->il_inp[i] = old_grp->il_inp[i];
+	grp->il_inpcnt = old_grp->il_inpcnt;
+	in_pcblbgroup_free(old_grp);
+	return (grp);
+}
+
+/*
+ * Add PCB to lb group (load balance used by SO_REUSEPORT_LB).  Best effort:
+ * the inpcb is left out when it is jailed, is an IPv4-mapped INET6 wildcard
+ * socket, the group is at INPCBLBGROUP_SIZMAX, or memory is unavailable.
+ */
+static void
+in_pcbinslbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+{
+	struct inpcblbgrouphead *hdr;
+	struct inpcblbgroup *grp;
+	struct ucred *cred;
+
+	if (pcbinfo->ipi_lbgrouphashbase == NULL)
+		return;
+
+	/* don't allow jailed socket to join local group */
+	if (inp->inp_socket != NULL)
+		cred = inp->inp_socket->so_cred;
+	else
+		cred = NULL;
+	if (cred != NULL && jailed(cred))
+		return;
+
+#ifdef INET6
+	/* don't allow IPv4 mapped INET6 wild socket */
+	if ((inp->inp_vflag & INP_IPV4) &&
+	    inp->inp_laddr.s_addr == INADDR_ANY &&
+	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6))
+		return;
+#endif
+
+	hdr = &pcbinfo->ipi_lbgrouphashbase[
+	    INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+
+	LIST_FOREACH(grp, hdr, il_list) {
+		if (grp->il_vflag == inp->inp_vflag &&
+		    grp->il_lport == inp->inp_lport &&
+		    memcmp(&grp->il_dependladdr,
+		    &inp->inp_inc.inc_ie.ie_dependladdr,
+		    sizeof(grp->il_dependladdr)) == 0) {
+			break;
+		}
+	}
+	if (grp == NULL) {
+		/* Create new local group */
+		grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
+		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
+		    INPCBLBGROUP_SIZMIN);
+	} else if (grp->il_inpcnt == grp->il_inpsiz) {
+		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
+			static int limit_logged = 0;
+			if (!limit_logged) {
+				limit_logged = 1;
+				printf("lb group port %d, "
+				    "limit reached\n", ntohs(grp->il_lport));
+			}
+			return;
+		}
+		/* Expand this local group */
+		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
+	}
+	if (grp == NULL)
+		return;		/* out of memory; see in_pcblbgroup_alloc() */
+
+	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
+	    ("invalid local group size %u and count %u",
+	    grp->il_inpsiz, grp->il_inpcnt));
+	grp->il_inp[grp->il_inpcnt] = inp;
+	grp->il_inpcnt++;
+}
+
+/*
+ * Remove "inp" from the load-balance group that lists it, if any.  The group
+ * is freed with its last member and halved once it is at most 1/4 full.
+ */
+static void
+in_pcbremlbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+{
+	struct inpcblbgrouphead *chain;
+	struct inpcblbgroup *g;
+	int slot;
+
+	if (pcbinfo->ipi_lbgrouphashbase == NULL)
+		return;
+	chain = &pcbinfo->ipi_lbgrouphashbase[
+	    INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+
+	LIST_FOREACH(g, chain, il_list) {
+		/* Locate inp's slot in this group, if it has one. */
+		for (slot = 0; slot < g->il_inpcnt; ++slot)
+			if (g->il_inp[slot] == inp)
+				break;
+		if (slot >= g->il_inpcnt)
+			continue;
+		if (g->il_inpcnt == 1) {
+			/* inp was the only member: drop the whole group. */
+			in_pcblbgroup_free(g);
+			return;
+		}
+		/* Close the gap left by inp. */
+		for (; slot + 1 < g->il_inpcnt; ++slot)
+			g->il_inp[slot] = g->il_inp[slot + 1];
+		g->il_inpcnt--;
+		/* Halve a group above the minimum size that is <= 1/4 full. */
+		if (g->il_inpsiz > INPCBLBGROUP_SIZMIN &&
+		    g->il_inpcnt <= (g->il_inpsiz / 4))
+			(void)in_pcblbgroup_resize(chain, g, g->il_inpsiz / 2);
+		return;
+	}
+}
+
/*
* Different protocols initialize their inpcbs differently - giving
* different name to the lock. But they all are disposed the same.
@@ -246,6 +416,8 @@
&pcbinfo->ipi_hashmask);
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_porthashmask);
+ pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB,
+ &pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
#endif
@@ -269,6 +441,8 @@
hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
pcbinfo->ipi_porthashmask);
+ hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
+ pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
in_pcbgroup_destroy(pcbinfo);
#endif
@@ -507,18 +681,20 @@
/*
* Return cached socket options.
*/
-short
+int
inp_so_options(const struct inpcb *inp)
{
- short so_options;
+ int so_options;
- so_options = 0;
+ so_options = 0;
- if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
- so_options |= SO_REUSEPORT;
- if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
- so_options |= SO_REUSEADDR;
- return (so_options);
+ if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
+ so_options |= SO_REUSEPORT_LB;
+ if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
+ so_options |= SO_REUSEPORT;
+ if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
+ so_options |= SO_REUSEADDR;
+ return (so_options);
}
#endif /* INET || INET6 */
@@ -575,6 +751,12 @@
int error;
/*
+ * XXX Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
+ * so that we don't have to add to the (already messy) code below
+ */
+ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
+
+ /*
* No state changes, so read locks are sufficient here.
*/
INP_LOCK_ASSERT(inp);
@@ -585,7 +767,7 @@
laddr.s_addr = *laddrp;
if (nam != NULL && laddr.s_addr != INADDR_ANY)
return (EINVAL);
- if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
lookupflags = INPLOOKUP_WILDCARD;
if (nam == NULL) {
if ((error = prison_local_ip4(cred, &laddr)) != 0)
@@ -620,18 +802,23 @@
* and a multicast address is bound on both
* new and duplicated sockets.
*/
+
+ // XXX: How to deal with SO_REUSEPORT_LB here?
+ // Added equivalent treatment as SO_REUSEPORT here for now
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
+ reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
reuseport = SO_REUSEADDR|SO_REUSEPORT;
} else if (sin->sin_addr.s_addr != INADDR_ANY) {
sin->sin_port = 0; /* yech... */
bzero(&sin->sin_zero, sizeof(sin->sin_zero));
/*
- * Is the address a local IP address?
+ * Is the address a local IP address?
* If INP_BINDANY is set, then the socket may be bound
* to any endpoint address, local or not.
*/
if ((inp->inp_flags & INP_BINDANY) == 0 &&
- ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
+ ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
return (EADDRNOTAVAIL);
}
laddr = sin->sin_addr;
@@ -661,7 +848,8 @@
ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
(ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
- (t->inp_flags2 & INP_REUSEPORT) == 0) &&
+ (t->inp_flags2 & INP_REUSEPORT) ||
+ (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
(inp->inp_cred->cr_uid !=
t->inp_cred->cr_uid))
return (EADDRINUSE);
@@ -686,11 +874,14 @@
*/
tw = intotw(t);
if (tw == NULL ||
- (reuseport & tw->tw_so_options) == 0)
+ ((reuseport & tw->tw_so_options) == 0 &&
+ (reuseport_lb & tw->tw_so_options) == 0)) {
return (EADDRINUSE);
+ }
} else if (t &&
- ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
- (reuseport & inp_so_options(t)) == 0) {
+ ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
+ (reuseport & inp_so_options(t)) == 0 &&
+ (reuseport_lb & inp_so_options(t)) == 0) {
#ifdef INET6
if (ntohl(sin->sin_addr.s_addr) !=
INADDR_ANY ||
@@ -699,7 +890,7 @@
(inp->inp_vflag & INP_IPV6PROTO) == 0 ||
(t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
- return (EADDRINUSE);
+ return (EADDRINUSE);
if (t && (! in_pcbbind_check_bindmulti(inp, t)))
return (EADDRINUSE);
}
@@ -816,7 +1007,7 @@
/*
* If we found a route, use the address corresponding to
* the outgoing interface.
- *
+ *
* Otherwise assume faddr is reachable on a directly connected
* network and try to find a corresponding interface to take
* the source address from.
@@ -1360,6 +1551,7 @@
struct inpcbport *phd = inp->inp_phd;
INP_HASH_WLOCK(inp->inp_pcbinfo);
+ in_pcbremlbgrouphash(inp, inp->inp_pcbinfo);
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
@@ -1620,6 +1812,100 @@
}
#undef INP_LOOKUP_MAPPED_PCB_COST
+/*
+ * Return the last member of inp's load-balance group (the one before it when
+ * inp is itself last); NULL if no group matches, inp is alone or not listed.
+ */
+struct inpcb *
+in_pcblookup_lbgroup_last(const struct inpcb *inp)
+{
+	const struct inpcbinfo *info = inp->inp_pcbinfo;
+	const struct inpcblbgrouphead *chain;
+	const struct inpcblbgroup *g;
+	int pos, pick;
+
+	if (info->ipi_lbgrouphashbase == NULL)
+		return NULL;
+	chain = &info->ipi_lbgrouphashbase[
+	    INP_PCBLBGROUP_PORTHASH(inp->inp_lport, info->ipi_lbgrouphashmask)];
+	LIST_FOREACH(g, chain, il_list) {
+		if (g->il_vflag != inp->inp_vflag ||
+		    g->il_lport != inp->inp_lport)
+			continue;
+		if (memcmp(&g->il_dependladdr,
+		    &inp->inp_inc.inc_ie.ie_dependladdr,
+		    sizeof(g->il_dependladdr)) == 0)
+			break;
+	}
+	if (g == NULL || g->il_inpcnt == 1)
+		return NULL;
+	KASSERT(g->il_inpcnt >= 2,
+	    ("invalid lbgroup inp count %d", g->il_inpcnt));
+	for (pos = 0; pos < g->il_inpcnt && g->il_inp[pos] != inp; ++pos)
+		continue;
+	if (pos >= g->il_inpcnt) {
+		printf("%s] returning NULL\n", __func__);
+		return NULL;
+	}
+	pick = g->il_inpcnt - 1;
+	if (pos == pick)
+		pick = g->il_inpcnt - 2;
+	printf("%s] returning inp at index %d (last)\n", __func__, pick);
+	return g->il_inp[pick];
+}
+
+/*
+ * Select the IPv4 load-balance group member that should receive a packet.
+ * The groups on lport's hash chain are scanned and a hash of the foreign
+ * address and both ports picks the slot inside a group.  A group bound to
+ * laddr is returned at once; a group bound to INADDR_ANY is only a fallback,
+ * and only when lookupflags contains INPLOOKUP_WILDCARD.  Returns NULL when
+ * nothing matches.
+ */
+static struct inpcb *
+in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
+    const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
+    uint16_t fport, int lookupflags)
+{
+	struct inpcb *local_wild = NULL;
+	const struct inpcblbgrouphead *hdr;
+	struct inpcblbgroup *grp;
+	uint32_t idx, pkt_hash;
+
+	hdr = &pcbinfo->ipi_lbgrouphashbase[
+	    INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
+	/* The flow hash is the same for every group, so compute it once. */
+	pkt_hash = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport);
+
+	/*
+	 * Order of socket selection:
+	 * 1. non-wild.
+	 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
+	 *
+	 * NOTE:
+	 * - Local group does not contain jailed sockets
+	 * - Local group does not contain IPv4 mapped INET6 wild sockets
+	 */
+	LIST_FOREACH(grp, hdr, il_list) {
+#ifdef INET6
+		if (!(grp->il_vflag & INP_IPV4))
+			continue;
+#endif
+		if (grp->il_lport != lport)
+			continue;
+
+		idx = pkt_hash % grp->il_inpcnt;
+		if (grp->il_laddr.s_addr == laddr->s_addr)
+			return (grp->il_inp[idx]);
+		/* Remember a wildcard group; an exact match may still follow. */
+		if (grp->il_laddr.s_addr == INADDR_ANY &&
+		    (lookupflags & INPLOOKUP_WILDCARD))
+			local_wild = grp->il_inp[idx];
+	}
+
+	return (local_wild);
+}
+
#ifdef PCBGROUP
/*
* Lookup PCB in hash list, using pcbgroup tables.
@@ -1884,6 +2170,16 @@
return (tmpinp);
/*
+ * Then look in lb group
+ */
+ if (pcbinfo->ipi_lbgrouphashbase != NULL) {
+ inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, fport, lookupflags);
+ if (inp != NULL) {
+ return inp;
+ }
+ }
+
+ /*
* Then look for a wildcard match, if requested.
*/
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
@@ -2085,6 +2381,7 @@
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbport *phd;
u_int32_t hashkey_faddr;
+ int so_options;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
@@ -2105,6 +2402,16 @@
pcbporthash = &pcbinfo->ipi_porthashbase[
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
+
+ /*
+ * Add entry in lb group
+ * Only do this if SO_REUSEPORT_LB is set
+ */
+ so_options = inp_so_options(inp);
+ if(so_options & SO_REUSEPORT_LB) {
+ in_pcbinslbgrouphash(inp, pcbinfo);
+ }
+
/*
* Go through port list and look for a head for this lport.
*/
@@ -2231,6 +2538,10 @@
struct inpcbport *phd = inp->inp_phd;
INP_HASH_WLOCK(pcbinfo);
+
+ // XXX Only do if SO_REUSEPORT_LB set?
+ in_pcbremlbgrouphash(inp, pcbinfo);
+
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
@@ -2319,7 +2630,7 @@
callout_stop(&ipport_tick_callout);
}
-/*
+/*
* The ipport_callout should start running at about the time we attach the
* inet or inet6 domains.
*/
@@ -2333,7 +2644,7 @@
EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
}
-SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
+SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
ipport_tick_init, NULL);
void
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -993,6 +993,15 @@
INP_WUNLOCK(inp);
error = 0;
break;
+ case SO_REUSEPORT_LB:
+ INP_WLOCK(inp);
+ if ((so->so_options & SO_REUSEPORT_LB) != 0)
+ inp->inp_flags2 |= INP_REUSEPORT_LB;
+ else
+ inp->inp_flags2 &= ~INP_REUSEPORT_LB;
+ INP_WUNLOCK(inp);
+ error = 0;
+ break;
case SO_SETFIB:
INP_WLOCK(inp);
inp->inp_inc.inc_fibnum = so->so_fibnum;
Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c
+++ sys/netinet/tcp_input.c
@@ -334,7 +334,7 @@
}
}
-void
+void
cc_conn_init(struct tcpcb *tp)
{
struct hc_metrics_lite metrics;
@@ -437,7 +437,7 @@
EXIT_RECOVERY(tp->t_flags);
if (CC_ALGO(tp)->cong_signal == NULL) {
/*
- * RFC5681 Section 3.1
+ * RFC5681 Section 3.1
* ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
*/
tp->snd_ssthresh =
@@ -1387,9 +1387,11 @@
TCP_PROBE3(debug__input, tp, th, m);
tcp_dooptions(&to, optp, optlen, TO_SYN);
#ifdef TCP_RFC7413
+ printf("%s] inp %p (TCP_RFC7413)\n", __func__, inp);
if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL))
goto tfo_socket_result;
#else
+ /* printf("%s] inp %p\n", __func__, inp); */
syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL);
#endif
/*
@@ -1564,7 +1566,7 @@
#ifdef TCP_RFC7413
int tfo_syn;
#endif
-
+
#ifdef TCPDEBUG
/*
* The size of tcp_saveipgen must be the size of the max ip header,
@@ -1770,7 +1772,7 @@
th->th_seq == tp->rcv_nxt &&
(thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
tp->snd_nxt == tp->snd_max &&
- tiwin && tiwin == tp->snd_wnd &&
+ tiwin && tiwin == tp->snd_wnd &&
((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
LIST_EMPTY(&tp->t_segq) &&
((to.to_flags & TOF_TS) == 0 ||
@@ -1850,7 +1852,7 @@
if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
-
+
/*
* Let the congestion control algorithm update
* congestion control related information. This
@@ -1999,7 +2001,7 @@
goto dropwithreset;
} else if (thflags & TH_SYN) {
/* non-initial SYN is ignored */
- if ((tcp_timer_active(tp, TT_DELACK) ||
+ if ((tcp_timer_active(tp, TT_DELACK) ||
tcp_timer_active(tp, TT_REXMT)))
goto drop;
} else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) {
@@ -2065,7 +2067,7 @@
tp->t_flags |= TF_ECN_PERMIT;
TCPSTAT_INC(tcps_ecn_shs);
}
-
+
/*
* Received <SYN,ACK> in SYN_SENT[*] state.
* Transitions:
@@ -2383,14 +2385,14 @@
/*
* If last ACK falls within this segment's sequence numbers,
* record its timestamp.
- * NOTE:
+ * NOTE:
* 1) That the test incorporates suggestions from the latest
* proposal of the tcplw@cray.com list (Braden 1993/04/26).
* 2) That updating only on newer timestamps interferes with
* our earlier PAWS tests, so this check should be solely
* predicated on the sequence space of this segment.
- * 3) That we modify the segment boundary check to be
- * Last.ACK.Sent <= SEG.SEQ + SEG.Len
+ * 3) That we modify the segment boundary check to be
+ * Last.ACK.Sent <= SEG.SEQ + SEG.Len
* instead of RFC1323's
* Last.ACK.Sent < SEG.SEQ + SEG.Len,
* This modified check allows us to overcome RFC1323's
@@ -2469,7 +2471,7 @@
/*
* Account for the ACK of our SYN prior to
* regular ACK processing below.
- */
+ */
tp->snd_una++;
}
/*
@@ -2598,10 +2600,10 @@
if ((tp->t_flags & TF_SACK_PERMIT) &&
IN_FASTRECOVERY(tp->t_flags)) {
int awnd;
-
+
/*
* Compute the amount of data in flight first.
- * We can inject new data into the pipe iff
+ * We can inject new data into the pipe iff
* we have less than 1/2 the original window's
* worth of data in flight.
*/
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -245,7 +245,7 @@
tcp_do_segment,
tcp_default_ctloutput,
NULL,
- NULL,
+ NULL,
NULL,
NULL,
NULL,
@@ -305,11 +305,11 @@
find_and_ref_tcp_functions(struct tcp_function_set *fs)
{
struct tcp_function_block *blk;
-
- rw_rlock(&tcp_function_lock);
+
+ rw_rlock(&tcp_function_lock);
blk = find_tcp_functions_locked(fs);
if (blk)
- refcount_acquire(&blk->tfb_refcnt);
+ refcount_acquire(&blk->tfb_refcnt);
rw_runlock(&tcp_function_lock);
return(blk);
}
@@ -318,10 +318,10 @@
find_and_ref_tcp_fb(struct tcp_function_block *blk)
{
struct tcp_function_block *rblk;
-
- rw_rlock(&tcp_function_lock);
+
+ rw_rlock(&tcp_function_lock);
rblk = find_tcp_fb_locked(blk, NULL);
- if (rblk)
+ if (rblk)
refcount_acquire(&rblk->tfb_refcnt);
rw_runlock(&tcp_function_lock);
return(rblk);
@@ -343,7 +343,7 @@
strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
fs.pcbcnt = blk->tfb_refcnt;
}
- rw_runlock(&tcp_function_lock);
+ rw_runlock(&tcp_function_lock);
error = sysctl_handle_string(oidp, fs.function_set_name,
sizeof(fs.function_set_name), req);
@@ -354,8 +354,8 @@
rw_wlock(&tcp_function_lock);
blk = find_tcp_functions_locked(&fs);
if ((blk == NULL) ||
- (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) {
- error = ENOENT;
+ (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) {
+ error = ENOENT;
goto done;
}
tcp_func_set_ptr = blk;
@@ -397,7 +397,7 @@
bufsz -= linesz;
outsz = linesz;
- rw_rlock(&tcp_function_lock);
+ rw_rlock(&tcp_function_lock);
TAILQ_FOREACH(f, &t_functions, tf_next) {
alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name);
linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n",
@@ -537,7 +537,7 @@
(blk->tfb_tcp_do_segment == NULL) ||
(blk->tfb_tcp_ctloutput == NULL) ||
(strlen(blk->tfb_tcp_block_name) == 0)) {
- /*
+ /*
* These functions are required and you
* need a name.
*/
@@ -549,7 +549,7 @@
blk->tfb_tcp_timer_active ||
blk->tfb_tcp_timer_stop) {
/*
- * If you define one timer function you
+ * If you define one timer function you
* must have them all.
*/
if ((blk->tfb_tcp_timer_stop_all == NULL) ||
@@ -651,7 +651,7 @@
{
struct tcp_function *f;
int error=ENOENT;
-
+
if (strcmp(blk->tfb_tcp_block_name, "default") == 0) {
/* You can't un-register the default */
return (EPERM);
@@ -665,7 +665,7 @@
if (blk->tfb_refcnt) {
/* Still tcb attached, mark it. */
blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
- rw_wunlock(&tcp_function_lock);
+ rw_wunlock(&tcp_function_lock);
return (EBUSY);
}
while (find_tcp_fb_locked(blk, &f) != NULL) {
@@ -1069,7 +1069,7 @@
m = n;
} else {
/*
- * reuse the mbuf.
+ * reuse the mbuf.
* XXX MRT We inherit the FIB, which is lucky.
*/
m_freem(m->m_next);
@@ -1439,6 +1439,9 @@
{
struct socket *so = tp->t_inpcb->inp_socket;
+ if(so->inherit)
+ printf("%s] inherited socket %p\n", __func__, so);
+
INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -1481,12 +1484,12 @@
tcp_timer_stop(tp, TT_2MSL);
tcp_timer_stop(tp, TT_DELACK);
if (tp->t_fb->tfb_tcp_timer_stop_all) {
- /*
- * Call the stop-all function of the methods,
+ /*
+ * Call the stop-all function of the methods,
* this function should call the tcp_timer_stop()
* method with each of the function specific timeouts.
* That stop will be called via the tfb_tcp_timer_stop()
- * which should use the async drain function of the
+ * which should use the async drain function of the
* callout system (see tcp_var.h).
*/
tp->t_fb->tfb_tcp_timer_stop_all(tp);
@@ -1556,7 +1559,7 @@
if (tp->t_flags & TF_TOE)
tcp_offload_detach(tp);
#endif
-
+
tcp_free_sackholes(tp);
#ifdef TCPPCAP
@@ -1594,7 +1597,7 @@
{
struct inpcb *inp;
struct tcpcb *tp;
-
+
tp = (struct tcpcb *)ptp;
CURVNET_SET(tp->t_vnet);
INP_INFO_RLOCK(&V_tcbinfo);
@@ -1633,10 +1636,38 @@
{
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
+ struct inpcb *inp_inh = NULL;
+ int listen = tp->t_state & TCPS_LISTEN;
+
+ printf("%s] inp %p\n", __func__, inp);
INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
+ if (listen) {
+ /*
+ * Pending socket inheritance
+ *
+ * If this is a listen(2) socket, find another listen(2)
+ * socket in the same local group that can inherit the
+ * syncache and the sockets pending on the completed and
+ * incomplete connection queues.
+ *
+ * NOTE:
+ * Currently, inheritance can only happen between
+ * listen(2) sockets with SO_REUSEPORT_LB set.
+ */
+
+ // XXX: How to handle this?
+ // ASSERT_IN_NETISR(0);
+
+ inp_inh = in_pcblookup_lbgroup_last(inp);
+ if (inp_inh != NULL)
+ printf("%s] inp %p will inherit from inp %p\n", __func__, inp_inh, inp);
+ else
+ printf("%s] there is none that can inherit from inp %p\n", __func__, inp);
+ }
+
#ifdef TCP_OFFLOAD
if (tp->t_state == TCPS_LISTEN)
tcp_offload_listen_stop(tp);
@@ -1658,7 +1689,33 @@
tcp_state_change(tp, TCPS_CLOSED);
KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
so = inp->inp_socket;
+
+ if(so->inherit)
+ printf("%s] inherited socket %p\n", __func__, so);
+
soisdisconnected(so);
+
+ // Socket inherit
+ if(listen)
+ {
+ // XXX What do we do with the syncache?
+ // Syncache entries seem to be stored separately from sockets/inpcbs,
+ // so possibly nothing needs to be done here?
+
+ // From the DragonFly BSD implementation:
+ // syncache_destroy(tp, tp_inh);
+
+
+ if(inp_inh == NULL) {
+ printf("%s] inp_inh is NULL, can't inherit\n", __func__);
+ } else if(inp_inh->inp_socket == NULL) {
+ printf("%s] inp_inh->inp_socket is NULL, can't inherit\n", __func__);
+ } else {
+ soinherit(so, inp_inh->inp_socket);
+ }
+ }
+
+
if (inp->inp_flags & INP_SOCKREF) {
KASSERT(so->so_state & SS_PROTOREF,
("tcp_close: !SS_PROTOREF"));
@@ -2023,7 +2080,7 @@
if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc_notify;
else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
- cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
+ cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
cmd == PRC_TIMXCEED_INTRANS) && ip)
notify = tcp_drop_syn_sent;
@@ -2159,7 +2216,7 @@
if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc_notify;
else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
- cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
+ cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
cmd == PRC_TIMXCEED_INTRANS) && ip6 != NULL)
notify = tcp_drop_syn_sent;
@@ -2437,7 +2494,7 @@
KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
tcp_mss_update(tp, -1, mtuoffer, NULL, NULL);
-
+
so = inp->inp_socket;
SOCKBUF_LOCK(&so->so_snd);
/* If the mss is larger than the socket buffer, decrease the mss. */
Index: sys/netinet/tcp_syncache.c
===================================================================
--- sys/netinet/tcp_syncache.c
+++ sys/netinet/tcp_syncache.c
@@ -372,6 +372,7 @@
static void
syncache_drop(struct syncache *sc, struct syncache_head *sch)
{
+ printf("%s]\n", __func__);
SCH_LOCK_ASSERT(sch);
@@ -794,7 +795,7 @@
struct sockaddr_in sin;
inp->inp_options = (m) ? ip_srcroute(m) : NULL;
-
+
if (inp->inp_options == NULL) {
inp->inp_options = sc->sc_ipopts;
sc->sc_ipopts = NULL;
@@ -838,11 +839,11 @@
if (V_functions_inherit_listen_socket_stack && blk != tp->t_fb) {
/*
* Our parents t_fb was not the default,
- * we need to release our ref on tp->t_fb and
+ * we need to release our ref on tp->t_fb and
* pickup one on the new entry.
*/
struct tcp_function_block *rblk;
-
+
rblk = find_and_ref_tcp_fb(blk);
KASSERT(rblk != NULL,
("cannot find blk %p out of syncache?", blk));
@@ -853,7 +854,7 @@
if (tp->t_fb->tfb_tcp_fb_init) {
(*tp->t_fb->tfb_tcp_fb_init)(tp);
}
- }
+ }
tp->snd_wl1 = sc->sc_irs;
tp->snd_max = tp->iss + 1;
tp->snd_nxt = tp->iss + 1;
@@ -1066,7 +1067,7 @@
#endif /* TCP_SIGNATURE */
/*
* Pull out the entry to unlock the bucket row.
- *
+ *
* NOTE: We must decrease TCPS_SYN_RECEIVED count here, not
* tcp_state_change(). The tcpcb is not existent at this
* moment. A new one will be allocated via syncache_socket->
@@ -1231,6 +1232,7 @@
struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod,
void *todctx)
{
+ /* printf("%s] inp %p\n", __func__, inp); */
struct tcpcb *tp;
struct socket *so;
struct syncache *sc = NULL;
@@ -2046,7 +2048,7 @@
}
static struct syncache *
-syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
+syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
struct socket *lso)
{
@@ -2084,7 +2086,7 @@
sc->sc_flags = 0;
bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
sc->sc_ipopts = NULL;
-
+
sc->sc_irs = seq;
sc->sc_iss = ack;
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -1460,6 +1460,15 @@
INP_WUNLOCK(in6p);
error = 0;
break;
+ case SO_REUSEPORT_LB:
+ INP_WLOCK(in6p);
+ if ((so->so_options & SO_REUSEPORT_LB) != 0)
+ in6p->inp_flags2 |= INP_REUSEPORT_LB;
+ else
+ in6p->inp_flags2 &= ~INP_REUSEPORT_LB;
+ INP_WUNLOCK(in6p);
+ error = 0;
+ break;
case SO_SETFIB:
INP_WLOCK(in6p);
in6p->inp_inc.inc_fibnum = so->so_fibnum;
Index: sys/sys/socket.h
===================================================================
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -145,6 +145,9 @@
#define SO_NO_OFFLOAD 0x4000 /* socket cannot be offloaded */
#define SO_NO_DDP 0x8000 /* disable direct data placement */
+// XXX: so_options was only 16 bits wide; it is widened to int so this flag fits.
+#define SO_REUSEPORT_LB 0x00010000 /* reuse with load balancing */
+
/*
* Additional options, not kept in so_options.
*/
Index: sys/sys/socketvar.h
===================================================================
--- sys/sys/socketvar.h
+++ sys/sys/socketvar.h
@@ -73,12 +73,13 @@
*/
TAILQ_HEAD(accept_queue, socket);
struct socket {
+ uint32_t inherit; /* temporarily added for debugging */
struct mtx so_lock;
volatile u_int so_count; /* (b / refcount) */
struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */
struct selinfo so_wrsel; /* (b/cs) for so_snd */
short so_type; /* (a) generic type, see socket.h */
- short so_options; /* (b) from socket call, see socket.h */
+ int so_options; /* (b) from socket call, see socket.h */
short so_linger; /* time to linger close(2) */
short so_state; /* (b) internal state flags SS_* */
void *so_pcb; /* protocol control block */
@@ -200,12 +201,12 @@
size_t xso_len; /* length of this structure */
struct socket *xso_so; /* makes a convenient handle sometimes */
short so_type;
- short so_options;
+ int so_options;
short so_linger;
short so_state;
caddr_t so_pcb; /* another convenient handle */
- int xso_protocol;
- int xso_family;
+ int xso_protocol;
+ int xso_family;
u_int so_qlen;
u_int so_incqlen;
u_int so_qlimit;
@@ -386,6 +387,7 @@
int sodisconnect(struct socket *so);
struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags);
void sofree(struct socket *so);
+void soinherit(struct socket *so, struct socket *so_inh);
void sohasoutofband(struct socket *so);
int solisten(struct socket *so, int backlog, struct thread *td);
void solisten_proto(struct socket *so, int backlog);
@@ -431,7 +433,6 @@
void solisten_wakeup(struct socket *);
int selsocket(struct socket *so, int events, struct timeval *tv,
struct thread *td);
-
/*
* Accept filter functions (duh).
*/
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Apr 25, 3:29 AM (11 h, 59 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
17774478
Default Alt Text
D11003.id30483.diff (41 KB)
Attached To
Mode
D11003: Load balance sockets with new SO_REUSEPORT_LB option
Attached
Detach File
Event Timeline
Log In to Comment