Page MenuHomeFreeBSD

D11003.id30483.diff
No OneTemporary

D11003.id30483.diff

Index: sys/kern/uipc_debug.c
===================================================================
--- sys/kern/uipc_debug.c
+++ sys/kern/uipc_debug.c
@@ -75,7 +75,7 @@
}
static void
-db_print_sooptions(short so_options)
+db_print_sooptions(int so_options)
{
int comma;
@@ -120,6 +120,10 @@
db_printf("%sSO_REUSEPORT", comma ? ", " : "");
comma = 1;
}
+ if (so_options & SO_REUSEPORT_LB) {
+ db_printf("%sSO_REUSEPORT_LB", comma ? ", " : "");
+ comma = 1;
+ }
if (so_options & SO_TIMESTAMP) {
db_printf("%sSO_TIMESTAMP", comma ? ", " : "");
comma = 1;
Index: sys/kern/uipc_socket.c
===================================================================
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -280,7 +280,7 @@
static void
socket_hhook_register(int subtype)
{
-
+
if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
&V_socket_hhh[subtype],
HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
@@ -290,7 +290,7 @@
static void
socket_hhook_deregister(int subtype)
{
-
+
if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
printf("%s: WARNING: unable to deregister hook\n", __func__);
}
@@ -448,6 +448,8 @@
static void
sodealloc(struct socket *so)
{
+ if(so->inherit)
+ printf("%s] dealloc inherited socket %p\n", __func__, so);
KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
@@ -936,6 +938,9 @@
SOCK_UNLOCK(so);
sorele(head);
+ if(so->inherit)
+ printf("%s] dequeueing inherited socket %p from socket %p\n", __func__, so, head);
+
*ret = so;
return (0);
}
@@ -963,6 +968,9 @@
void
sofree(struct socket *so)
{
+ if(so->inherit)
+ printf("%s] inherited socket %p\n", __func__, so);
+
struct protosw *pr = so->so_proto;
SOCK_LOCK_ASSERT(so);
@@ -1005,6 +1013,7 @@
TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
sol->sol_incqlen--;
/* This is guarenteed not to be the last. */
+ printf("%s] calling refcount_release\n", __func__);
refcount_release(&sol->so_count);
so->so_qstate = SQ_NONE;
so->so_listen = NULL;
@@ -1053,6 +1062,114 @@
}
/*
+ * Let socket in same load balance group (same port and address)
+ * inherit pending sockets of the closing socket.
+ *
+ * "so_inh" will inherit sockets from "so"
+ */
+void
+soinherit(struct socket *so, struct socket *so_inh)
+{
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ int pid = p->p_pid;	/* only feeds the debug printf below */
+ printf("%s] pid %d\n", __func__, pid);
+
+ TAILQ_HEAD(, socket) comp, incomp;
+ struct socket *sp, *head, *head_inh;
+ int qlen, incqlen;
+
+ KASSERT(so->so_options & SO_ACCEPTCONN,
+ ("so does not accept connection"));
+ KASSERT(so_inh->so_options & SO_ACCEPTCONN,
+ ("so_inh does not accept connection"));
+
+ // XXX: Do we need to lock head?  Retries until so and its so_listen head (if any) are both locked.
+restart:
+ SOCK_LOCK(so);
+ if ((head = so->so_listen) != NULL &&
+ __predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
+ SOCK_UNLOCK(so);
+ goto restart;
+ }
+
+restart_inh:	/* same lock-and-retry sequence for the inheriting socket */
+ SOCK_LOCK(so_inh);
+ if ((head_inh = so_inh->so_listen) != NULL &&
+ __predict_false(SOLISTEN_TRYLOCK(head_inh) == 0)) {
+ SOCK_UNLOCK(so_inh);
+ goto restart_inh;
+ }
+
+ TAILQ_INIT(&comp);
+ TAILQ_INIT(&incomp);
+
+ /*
+ * Detach so's completed and incomplete queues into local lists
+ */
+ TAILQ_CONCAT(&comp, &so->sol_comp, so_list);
+ qlen = so->sol_qlen;
+ so->sol_qlen = 0;
+
+ TAILQ_CONCAT(&incomp, &so->sol_incomp, so_list);
+ incqlen = so->sol_incqlen;
+ so->sol_incqlen = 0;
+
+ printf("%s] got closing socket qlen %d\n", __func__, qlen);
+ printf("%s] got closing socket incqlen %d\n", __func__, incqlen);
+
+ /*
+ * Re-home every saved socket on so_inh (reference, so_listen,
+ * credential), then append both lists to so_inh's queues.
+ *
+ * XXX:
+ * This may temporarily exceed the inheriting socket's
+ * so_qlimit.
+ */
+ TAILQ_FOREACH(sp, &comp, so_list) {
+ /* XXX: so_inh's so_count went negative without this extra reference; is this the correct solution? */
+ refcount_acquire(&so_inh->so_count);
+ sp->so_listen = so_inh;
+ sp->inherit = 1; // for debugging
+ crfree(sp->so_cred);
+ sp->so_cred = crhold(so_inh->so_cred);
+ // XXX: Something more we need to do here?
+ printf("%s] listening socket %p is inheriting comp socket %p\n", __func__, so_inh, sp);
+ }
+
+ TAILQ_FOREACH(sp, &incomp, so_list) {
+ /* XXX: so_inh's so_count went negative without this extra reference; is this the correct solution? */
+ refcount_acquire(&so_inh->so_count);
+ sp->inherit = 1; // for debugging
+ sp->so_listen = so_inh;
+ crfree(sp->so_cred);
+ sp->so_cred = crhold(so_inh->so_cred);
+ // XXX: Something more we need to do here?
+ printf("%s] listening socket %p is inheriting incomp socket %p\n", __func__, so_inh, sp);
+ }
+
+ TAILQ_CONCAT(&so_inh->sol_comp, &comp, so_list);
+ so_inh->sol_qlen += qlen;
+
+ TAILQ_CONCAT(&so_inh->sol_incomp, &incomp, so_list);
+ so_inh->sol_incqlen += incqlen;
+
+ SOCK_UNLOCK(so);
+ if(head != NULL)
+ SOLISTEN_UNLOCK(head);
+
+ SOCK_UNLOCK(so_inh);
+ if(head_inh != NULL) {
+ if(qlen > 0) {
+ /* "New" connections have arrived; presumably solisten_wakeup() also drops the listen lock -- TODO confirm */
+ solisten_wakeup(head_inh);
+ } else {
+ SOLISTEN_UNLOCK(head_inh);
+ }
+ }
+}
+
+/*
* Close a socket on last file table reference removal. Initiate disconnect
* if connected. Free socket when disconnect complete.
*
@@ -1063,6 +1180,9 @@
int
soclose(struct socket *so)
{
+ if(so->inherit)
+ printf("%s] inherited socket %p\n", __func__, so);
+
struct accept_queue lqueue;
bool listening;
int error = 0;
@@ -1114,6 +1234,7 @@
sp->so_listen = NULL;
SOCK_UNLOCK(sp);
/* Guaranteed not to be the last. */
+ printf("%s] calling refcount_release\n", __func__);
refcount_release(&so->so_count);
}
}
@@ -1192,6 +1313,8 @@
int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ if(so->inherit)
+ printf("%s] connecting inherited socket %p\n", __func__, so);
return (soconnectat(AT_FDCWD, so, nam, td));
}
@@ -1247,6 +1370,9 @@
int
sodisconnect(struct socket *so)
{
+ if(so->inherit)
+ printf("%s] disconnecting inherited socket %p\n", __func__, so);
+
int error;
if ((so->so_state & SS_ISCONNECTED) == 0)
@@ -1429,6 +1555,8 @@
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
+ if(so->inherit)
+ printf("%s] send_generic on inherited socket %p\n", __func__, so);
long space;
ssize_t resid;
int clen = 0, error, dontroute;
@@ -1610,6 +1738,14 @@
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
+ if(so->inherit) {
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ int pid = p->p_pid;
+ printf("%s] pid %d\n", __func__, pid);
+
+ printf("%s] send on inherited socket %p\n", __func__, so);
+ }
int error;
CURVNET_SET(so->so_vnet);
@@ -2547,6 +2683,9 @@
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
+ if(so->inherit)
+ printf("%s] receiving on inherited socket %p\n", __func__, so);
+
int error;
CURVNET_SET(so->so_vnet);
@@ -2772,6 +2911,7 @@
case SO_BROADCAST:
case SO_REUSEADDR:
case SO_REUSEPORT:
+ case SO_REUSEPORT_LB:
case SO_OOBINLINE:
case SO_TIMESTAMP:
case SO_BINTIME:
@@ -3021,6 +3161,7 @@
case SO_KEEPALIVE:
case SO_REUSEADDR:
case SO_REUSEPORT:
+ case SO_REUSEPORT_LB:
case SO_BROADCAST:
case SO_OOBINLINE:
case SO_ACCEPTCONN:
Index: sys/netinet/in_pcb.h
===================================================================
--- sys/netinet/in_pcb.h
+++ sys/netinet/in_pcb.h
@@ -76,6 +76,11 @@
struct in_addr ia46_addr4;
};
+union in_dependaddr {	/* protocol-dependent address shared by the endpoint and lb-group keys */
+ struct in_addr_4in6 id46_addr;	/* IPv4 view; the address itself is id46_addr.ia46_addr4 */
+ struct in6_addr id6_addr;	/* IPv6 view */
+};
+
/*
* NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has
* some extra padding to accomplish this.
@@ -86,22 +91,14 @@
u_int16_t ie_fport; /* foreign port */
u_int16_t ie_lport; /* local port */
/* protocol dependent part, local and foreign addr */
- union {
- /* foreign host table entry */
- struct in_addr_4in6 ie46_foreign;
- struct in6_addr ie6_foreign;
- } ie_dependfaddr;
- union {
- /* local host table entry */
- struct in_addr_4in6 ie46_local;
- struct in6_addr ie6_local;
- } ie_dependladdr;
+ union in_dependaddr ie_dependfaddr; /* foreign host table entry */
+ union in_dependaddr ie_dependladdr; /* local host table entry */
+#define ie_faddr ie_dependfaddr.id46_addr.ia46_addr4
+#define ie_laddr ie_dependladdr.id46_addr.ia46_addr4
+#define ie6_faddr ie_dependfaddr.id6_addr
+#define ie6_laddr ie_dependladdr.id6_addr
u_int32_t ie6_zoneid; /* scope zone id */
};
-#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4
-#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4
-#define ie6_faddr ie_dependfaddr.ie6_foreign
-#define ie6_laddr ie_dependladdr.ie6_local
/*
* XXX The defines for inc_* are hacks and should be changed to direct
@@ -328,6 +325,21 @@
u_short phd_port;
};
+struct inpcblbgroup {
+ LIST_ENTRY(inpcblbgroup) il_list;	/* linkage on an inpcblbgrouphead hash chain */
+ uint16_t il_lport;	/* local port of the group; printed via ntohs(), so network byte order */
+ u_char il_vflag;	/* copy of the members' inp_vflag, part of the group key */
+ u_char il_pad;	/* padding */
+ uint32_t il_pad2;	/* padding */
+ union in_dependaddr il_dependladdr;	/* local address of the group, part of the group key */
+#define il_laddr il_dependladdr.id46_addr.ia46_addr4
+#define il6_laddr il_dependladdr.id6_addr
+ uint32_t il_inpsiz; /* capacity of il_inp[], in elements */
+ uint32_t il_inpcnt; /* # of elements of il_inp[] in use */
+ struct inpcb *il_inp[];	/* member inpcbs; sized at allocation time */
+};
+LIST_HEAD(inpcblbgrouphead, inpcblbgroup);	/* head type of one lb-group hash bucket */
+
/*-
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
@@ -421,6 +433,13 @@
u_long ipi_wildmask; /* (p) */
/*
+ * Load balanced group used by the SO_REUSEPORT_LB option,
+ * hashed by local address and local port.
+ */
+ struct inpcblbgrouphead *ipi_lbgrouphashbase;
+ u_long ipi_lbgrouphashmask;
+
+ /*
* Pointer to network stack instance
*/
struct vnet *ipi_vnet; /* (c) */
@@ -506,7 +525,7 @@
inp_inpcbtotcpcb(struct inpcb *inp);
void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
uint32_t *faddr, uint16_t *fp);
-short inp_so_options(const struct inpcb *inp);
+int inp_so_options(const struct inpcb *inp);
#endif /* _KERNEL */
@@ -569,6 +588,10 @@
(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
#define INP_PCBPORTHASH(lport, mask) \
(ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PORTHASH(lport, mask) \
+ (ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
+ ((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport)))
#define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3])
/*
@@ -624,11 +647,11 @@
/*
* Flags for inp_flags2.
*/
-#define INP_LLE_VALID 0x00000001 /* cached lle is valid */
+#define INP_LLE_VALID 0x00000001 /* cached lle is valid */
#define INP_RT_VALID 0x00000002 /* cached rtentry is valid */
#define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */
#define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */
-#define INP_FREED 0x00000010 /* inp itself is not valid */
+#define INP_FREED 0x00000010 /* inp itself is not valid */
#define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */
#define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */
#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
@@ -636,6 +659,7 @@
#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
+#define INP_REUSEPORT_LB 0x00001000 /* SO_REUSEPORT_LB option is set */
/*
* Flags passed to in_pcblookup*() functions.
@@ -739,6 +763,8 @@
in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
struct in_addr, u_int, int, struct ifnet *);
struct inpcb *
+ in_pcblookup_lbgroup_last(const struct inpcb *inp);
+struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
struct in_addr, u_int, int, struct ifnet *, struct mbuf *);
void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
Index: sys/netinet/in_pcb.c
===================================================================
--- sys/netinet/in_pcb.c
+++ sys/netinet/in_pcb.c
@@ -102,6 +102,9 @@
#include <security/mac/mac_framework.h>
+#define INPCBLBGROUP_SIZMIN 8
+#define INPCBLBGROUP_SIZMAX 256
+
static struct callout ipport_tick_callout;
/*
@@ -211,6 +214,173 @@
* functions often modify hash chains or addresses in pcbs.
*/
+/*
+ * Allocate a load-balance group with room for "size" inpcbs, fill in its
+ * key (vflag, local port, local address) and link it at the head of the
+ * hash chain "hdr".
+ *
+ * The insertion path runs with the pcbinfo hash lock write-held, so the
+ * allocation must not sleep: M_NOWAIT is used and NULL is returned when no
+ * memory is available.  Nothing is linked on failure.
+ */
+static struct inpcblbgroup *
+in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
+    uint16_t port, const union in_dependaddr *addr, int size)
+{
+	struct inpcblbgroup *grp;
+	size_t bytes;
+
+	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
+	grp = malloc(bytes, M_PCB, M_NOWAIT | M_ZERO);
+	if (grp == NULL)
+		return (NULL);
+	grp->il_vflag = vflag;
+	grp->il_lport = port;
+	grp->il_dependladdr = *addr;
+	grp->il_inpsiz = size;
+	LIST_INSERT_HEAD(hdr, grp, il_list);
+
+	return (grp);
+}
+
+/*
+ * Unlink a group from its hash chain and release it.  Groups are allocated
+ * from M_PCB, so they have to be freed to M_PCB as well.
+ */
+static void
+in_pcblbgroup_free(struct inpcblbgroup *grp)
+{
+	LIST_REMOVE(grp, il_list);
+	free(grp, M_PCB);
+}
+
+/*
+ * Replace "old_grp" with a group of capacity "size" that carries the same
+ * key and the same members.  On success the old group is freed and the new
+ * one is returned.  If the allocation fails, NULL is returned and
+ * "old_grp" is left linked and untouched.
+ */
+static struct inpcblbgroup *
+in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
+    struct inpcblbgroup *old_grp, int size)
+{
+	struct inpcblbgroup *grp;
+	int i;
+
+	grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
+	    old_grp->il_lport, &old_grp->il_dependladdr, size);
+	if (grp == NULL)
+		return (NULL);
+
+	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
+	    ("invalid new local group size %u and old local group count %u",
+	    grp->il_inpsiz, old_grp->il_inpcnt));
+	for (i = 0; i < old_grp->il_inpcnt; ++i)
+		grp->il_inp[i] = old_grp->il_inp[i];
+	grp->il_inpcnt = old_grp->il_inpcnt;
+
+	in_pcblbgroup_free(old_grp);
+
+	return (grp);
+}
+
+/*
+ * Add PCB to lb group (load balance used by SO_REUSEPORT_LB)
+ *
+ * The inpcb is appended to the group matching its vflag, local port and
+ * local address; the group is created, or doubled in size, as needed.
+ * The inpcb is silently left out of any group when:
+ *  - the pcbinfo has no lb group hash table,
+ *  - the owning socket is jailed,
+ *  - it is an IPv4-mapped INET6 wildcard socket,
+ *  - the group already holds INPCBLBGROUP_SIZMAX members, or
+ *  - memory for a new/larger group cannot be allocated.
+ */
+static void
+in_pcbinslbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+{
+	struct inpcblbgrouphead *hdr;
+	struct inpcblbgroup *grp, *newgrp;
+	struct ucred *cred;
+
+	/* Check for the table before forming any pointer into it. */
+	if (pcbinfo->ipi_lbgrouphashbase == NULL)
+		return;
+
+	/*
+	 * don't allow jailed socket to join local group
+	 */
+	if (inp->inp_socket != NULL)
+		cred = inp->inp_socket->so_cred;
+	else
+		cred = NULL;
+	if (cred != NULL && jailed(cred))
+		return;
+
+#ifdef INET6
+	/*
+	 * don't allow IPv4 mapped INET6 wild socket
+	 */
+	if ((inp->inp_vflag & INP_IPV4) &&
+	    inp->inp_laddr.s_addr == INADDR_ANY &&
+	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6))
+		return;
+#endif
+
+	hdr = &pcbinfo->ipi_lbgrouphashbase[
+	    INP_PCBLBGROUP_PORTHASH(inp->inp_lport,
+	    pcbinfo->ipi_lbgrouphashmask)];
+
+	/* Look for an existing group with the same key. */
+	LIST_FOREACH(grp, hdr, il_list) {
+		if (grp->il_vflag == inp->inp_vflag &&
+		    grp->il_lport == inp->inp_lport &&
+		    memcmp(&grp->il_dependladdr,
+		    &inp->inp_inc.inc_ie.ie_dependladdr,
+		    sizeof(grp->il_dependladdr)) == 0) {
+			break;
+		}
+	}
+	if (grp == NULL) {
+		/* Create new local group */
+		grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
+		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
+		    INPCBLBGROUP_SIZMIN);
+		if (grp == NULL)
+			return;
+	} else if (grp->il_inpcnt == grp->il_inpsiz) {
+		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
+			static int limit_logged = 0;
+
+			/* Complain only once, not per rejected socket. */
+			if (!limit_logged) {
+				limit_logged = 1;
+				printf("lb group port %d, "
+				    "limit reached\n", ntohs(grp->il_lport));
+			}
+			return;
+		}
+
+		/* Expand this local group */
+		newgrp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
+		if (newgrp == NULL)
+			return;
+		grp = newgrp;
+	}
+
+	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
+	    ("invalid local group size %u and count %u",
+	    grp->il_inpsiz, grp->il_inpcnt));
+
+	grp->il_inp[grp->il_inpcnt] = inp;
+	grp->il_inpcnt++;
+}
+
+/*
+ * Remove "inp" from the load-balance group it belongs to, if any.
+ *
+ * Only the hash bucket selected by the inpcb's local port is searched.
+ * Removing the last member frees the group; otherwise the remaining
+ * members are shifted down and the group is halved once it is at most a
+ * quarter full (never below INPCBLBGROUP_SIZMIN).
+ */
+static void
+in_pcbremlbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+{
+	struct inpcblbgrouphead *bucket;
+	struct inpcblbgroup *grp;
+	int pos, k;
+
+	if (pcbinfo->ipi_lbgrouphashbase == NULL)
+		return;
+
+	bucket = &pcbinfo->ipi_lbgrouphashbase[
+	    INP_PCBLBGROUP_PORTHASH(inp->inp_lport,
+	    pcbinfo->ipi_lbgrouphashmask)];
+
+	LIST_FOREACH(grp, bucket, il_list) {
+		/* Locate inp inside this group; skip the group if absent. */
+		for (pos = 0; pos < grp->il_inpcnt; ++pos) {
+			if (grp->il_inp[pos] == inp)
+				break;
+		}
+		if (pos == grp->il_inpcnt)
+			continue;
+
+		if (grp->il_inpcnt == 1) {
+			/* Free this local group */
+			in_pcblbgroup_free(grp);
+			return;
+		}
+
+		/* Pull up inpcbs */
+		for (k = pos; k + 1 < grp->il_inpcnt; ++k)
+			grp->il_inp[k] = grp->il_inp[k + 1];
+		grp->il_inpcnt--;
+
+		if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
+		    grp->il_inpcnt <= (grp->il_inpsiz / 4)) {
+			/* Shrink this local group */
+			grp = in_pcblbgroup_resize(bucket, grp,
+			    grp->il_inpsiz / 2);
+		}
+		return;
+	}
+}
+
/*
* Different protocols initialize their inpcbs differently - giving
* different name to the lock. But they all are disposed the same.
@@ -246,6 +416,8 @@
&pcbinfo->ipi_hashmask);
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_porthashmask);
+ pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB,
+ &pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
#endif
@@ -269,6 +441,8 @@
hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
pcbinfo->ipi_porthashmask);
+ hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
+ pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
in_pcbgroup_destroy(pcbinfo);
#endif
@@ -507,18 +681,20 @@
/*
* Return cached socket options.
*/
-short
+int
inp_so_options(const struct inpcb *inp)
{
- short so_options;
+ int so_options;
- so_options = 0;
+ so_options = 0;
- if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
- so_options |= SO_REUSEPORT;
- if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
- so_options |= SO_REUSEADDR;
- return (so_options);
+ if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
+ so_options |= SO_REUSEPORT_LB;
+ if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
+ so_options |= SO_REUSEPORT;
+ if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
+ so_options |= SO_REUSEADDR;
+ return (so_options);
}
#endif /* INET || INET6 */
@@ -575,6 +751,12 @@
int error;
/*
+ * XXX Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
+ * so that we don't have to add to the (already messy) code below
+ */
+ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
+
+ /*
* No state changes, so read locks are sufficient here.
*/
INP_LOCK_ASSERT(inp);
@@ -585,7 +767,7 @@
laddr.s_addr = *laddrp;
if (nam != NULL && laddr.s_addr != INADDR_ANY)
return (EINVAL);
- if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
lookupflags = INPLOOKUP_WILDCARD;
if (nam == NULL) {
if ((error = prison_local_ip4(cred, &laddr)) != 0)
@@ -620,18 +802,23 @@
* and a multicast address is bound on both
* new and duplicated sockets.
*/
+
+ // XXX: How to deal with SO_REUSEPORT_LB here?
+ // Added equivalent treatment as SO_REUSEPORT here for now
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
+ reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
reuseport = SO_REUSEADDR|SO_REUSEPORT;
} else if (sin->sin_addr.s_addr != INADDR_ANY) {
sin->sin_port = 0; /* yech... */
bzero(&sin->sin_zero, sizeof(sin->sin_zero));
/*
- * Is the address a local IP address?
+ * Is the address a local IP address?
* If INP_BINDANY is set, then the socket may be bound
* to any endpoint address, local or not.
*/
if ((inp->inp_flags & INP_BINDANY) == 0 &&
- ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
+ ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
return (EADDRNOTAVAIL);
}
laddr = sin->sin_addr;
@@ -661,7 +848,8 @@
ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
(ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
- (t->inp_flags2 & INP_REUSEPORT) == 0) &&
+ (t->inp_flags2 & INP_REUSEPORT) ||
+ (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
(inp->inp_cred->cr_uid !=
t->inp_cred->cr_uid))
return (EADDRINUSE);
@@ -686,11 +874,14 @@
*/
tw = intotw(t);
if (tw == NULL ||
- (reuseport & tw->tw_so_options) == 0)
+ ((reuseport & tw->tw_so_options) == 0 &&
+ (reuseport_lb & tw->tw_so_options) == 0)) {
return (EADDRINUSE);
+ }
} else if (t &&
- ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
- (reuseport & inp_so_options(t)) == 0) {
+ ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
+ (reuseport & inp_so_options(t)) == 0 &&
+ (reuseport_lb & inp_so_options(t)) == 0) {
#ifdef INET6
if (ntohl(sin->sin_addr.s_addr) !=
INADDR_ANY ||
@@ -699,7 +890,7 @@
(inp->inp_vflag & INP_IPV6PROTO) == 0 ||
(t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
- return (EADDRINUSE);
+ return (EADDRINUSE);
if (t && (! in_pcbbind_check_bindmulti(inp, t)))
return (EADDRINUSE);
}
@@ -816,7 +1007,7 @@
/*
* If we found a route, use the address corresponding to
* the outgoing interface.
- *
+ *
* Otherwise assume faddr is reachable on a directly connected
* network and try to find a corresponding interface to take
* the source address from.
@@ -1360,6 +1551,7 @@
struct inpcbport *phd = inp->inp_phd;
INP_HASH_WLOCK(inp->inp_pcbinfo);
+ in_pcbremlbgrouphash(inp, inp->inp_pcbinfo);
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
@@ -1620,6 +1812,100 @@
}
#undef INP_LOOKUP_MAPPED_PCB_COST
+/*
+ * Return another member of the load-balance group that "inp" belongs to:
+ * the last entry of the group, or the second-to-last one when "inp" itself
+ * is the last entry.  Returns NULL when there is no lb group table, no
+ * group matches inp's vflag/local port/local address, the group has a
+ * single member, or "inp" is not found in the matching group.
+ */
+struct inpcb *
+in_pcblookup_lbgroup_last(const struct inpcb *inp)
+{
+	const struct inpcbinfo *info = inp->inp_pcbinfo;
+	const struct inpcblbgrouphead *bucket;
+	const struct inpcblbgroup *grp;
+	int pos, pick;
+
+	if (info->ipi_lbgrouphashbase == NULL)
+		return (NULL);
+
+	bucket = &info->ipi_lbgrouphashbase[
+	    INP_PCBLBGROUP_PORTHASH(inp->inp_lport,
+	    info->ipi_lbgrouphashmask)];
+
+	/* Find the group keyed like inp; grp ends up NULL if none matches. */
+	LIST_FOREACH(grp, bucket, il_list) {
+		if (grp->il_vflag != inp->inp_vflag)
+			continue;
+		if (grp->il_lport != inp->inp_lport)
+			continue;
+		if (memcmp(&grp->il_dependladdr,
+		    &inp->inp_inc.inc_ie.ie_dependladdr,
+		    sizeof(grp->il_dependladdr)) == 0)
+			break;
+	}
+	if (grp == NULL || grp->il_inpcnt == 1)
+		return (NULL);
+
+	KASSERT(grp->il_inpcnt >= 2,
+	    ("invalid lbgroup inp count %d", grp->il_inpcnt));
+
+	/* inp must itself be a member before a sibling is handed out. */
+	for (pos = 0; pos < grp->il_inpcnt; ++pos) {
+		if (grp->il_inp[pos] == inp)
+			break;
+	}
+	if (pos == grp->il_inpcnt) {
+		printf("%s] returning NULL\n", __func__);
+		return (NULL);
+	}
+
+	pick = grp->il_inpcnt - 1;
+	if (pos == pick)
+		pick = grp->il_inpcnt - 2;
+	printf("%s] returning inp at index %d (last)\n", __func__, pick);
+	return (grp->il_inp[pick]);
+}
+
+/*
+ * Select the load-balance group member that should receive a packet for
+ * the IPv4 tuple (laddr, lport, faddr, fport).
+ *
+ * Within a group the member is chosen by hashing the foreign address and
+ * the two ports, so the same tuple always maps to the same slot for a
+ * given group size.
+ *
+ * Order of socket selection:
+ * 1. non-wild (group bound to exactly laddr) - returned immediately.
+ * 2. wild (group bound to INADDR_ANY), only if lookupflags contains
+ *    INPLOOKUP_WILDCARD - returned if no non-wild group matched.
+ *
+ * NOTE:
+ * - Local group does not contain jailed sockets
+ * - Local group does not contain IPv4 mapped INET6 wild sockets
+ *
+ * Returns NULL when no group matches.  The caller is expected to have
+ * checked that the lb group hash table exists.
+ */
+static struct inpcb *
+in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
+    const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
+    uint16_t fport, int lookupflags)
+{
+	const struct inpcblbgrouphead *hdr;
+	const struct inpcblbgroup *grp;
+	struct inpcb *local_wild;
+	uint32_t pkt_hash, idx;
+
+	hdr = &pcbinfo->ipi_lbgrouphashbase[
+	    INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
+
+	/* The hash depends only on the packet, not on the group. */
+	pkt_hash = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport);
+	local_wild = NULL;
+
+	LIST_FOREACH(grp, hdr, il_list) {
+#ifdef INET6
+		if (!(grp->il_vflag & INP_IPV4))
+			continue;
+#endif
+		if (grp->il_lport != lport)
+			continue;
+
+		idx = pkt_hash % grp->il_inpcnt;
+
+		if (grp->il_laddr.s_addr == laddr->s_addr)
+			return (grp->il_inp[idx]);
+
+		/* Remember a wildcard candidate; an exact match still wins. */
+		if (grp->il_laddr.s_addr == INADDR_ANY &&
+		    (lookupflags & INPLOOKUP_WILDCARD) != 0)
+			local_wild = grp->il_inp[idx];
+	}
+
+	return (local_wild);
+}
+
#ifdef PCBGROUP
/*
* Lookup PCB in hash list, using pcbgroup tables.
@@ -1884,6 +2170,16 @@
return (tmpinp);
/*
+ * Then look in lb group
+ */
+ if (pcbinfo->ipi_lbgrouphashbase != NULL) {
+ inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, fport, lookupflags);
+ if (inp != NULL) {
+ return inp;
+ }
+ }
+
+ /*
* Then look for a wildcard match, if requested.
*/
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
@@ -2085,6 +2381,7 @@
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbport *phd;
u_int32_t hashkey_faddr;
+ int so_options;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
@@ -2105,6 +2402,16 @@
pcbporthash = &pcbinfo->ipi_porthashbase[
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
+
+ /*
+ * Add entry in lb group
+ * Only do this if SO_REUSEPORT_LB is set
+ */
+ so_options = inp_so_options(inp);
+ if(so_options & SO_REUSEPORT_LB) {
+ in_pcbinslbgrouphash(inp, pcbinfo);
+ }
+
/*
* Go through port list and look for a head for this lport.
*/
@@ -2231,6 +2538,10 @@
struct inpcbport *phd = inp->inp_phd;
INP_HASH_WLOCK(pcbinfo);
+
+ // XXX Only do if SO_REUSEPORT_LB set?
+ in_pcbremlbgrouphash(inp, pcbinfo);
+
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
@@ -2319,7 +2630,7 @@
callout_stop(&ipport_tick_callout);
}
-/*
+/*
* The ipport_callout should start running at about the time we attach the
* inet or inet6 domains.
*/
@@ -2333,7 +2644,7 @@
EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
}
-SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
+SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
ipport_tick_init, NULL);
void
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -993,6 +993,15 @@
INP_WUNLOCK(inp);
error = 0;
break;
+ case SO_REUSEPORT_LB:
+ INP_WLOCK(inp);
+ if ((so->so_options & SO_REUSEPORT_LB) != 0)
+ inp->inp_flags2 |= INP_REUSEPORT_LB;
+ else
+ inp->inp_flags2 &= ~INP_REUSEPORT_LB;
+ INP_WUNLOCK(inp);
+ error = 0;
+ break;
case SO_SETFIB:
INP_WLOCK(inp);
inp->inp_inc.inc_fibnum = so->so_fibnum;
Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c
+++ sys/netinet/tcp_input.c
@@ -334,7 +334,7 @@
}
}
-void
+void
cc_conn_init(struct tcpcb *tp)
{
struct hc_metrics_lite metrics;
@@ -437,7 +437,7 @@
EXIT_RECOVERY(tp->t_flags);
if (CC_ALGO(tp)->cong_signal == NULL) {
/*
- * RFC5681 Section 3.1
+ * RFC5681 Section 3.1
* ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
*/
tp->snd_ssthresh =
@@ -1387,9 +1387,11 @@
TCP_PROBE3(debug__input, tp, th, m);
tcp_dooptions(&to, optp, optlen, TO_SYN);
#ifdef TCP_RFC7413
+ printf("%s] inp %p (TCP_RFC7413)\n", __func__, inp);
if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL))
goto tfo_socket_result;
#else
+ /* printf("%s] inp %p\n", __func__, inp); */
syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL);
#endif
/*
@@ -1564,7 +1566,7 @@
#ifdef TCP_RFC7413
int tfo_syn;
#endif
-
+
#ifdef TCPDEBUG
/*
* The size of tcp_saveipgen must be the size of the max ip header,
@@ -1770,7 +1772,7 @@
th->th_seq == tp->rcv_nxt &&
(thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
tp->snd_nxt == tp->snd_max &&
- tiwin && tiwin == tp->snd_wnd &&
+ tiwin && tiwin == tp->snd_wnd &&
((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
LIST_EMPTY(&tp->t_segq) &&
((to.to_flags & TOF_TS) == 0 ||
@@ -1850,7 +1852,7 @@
if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
-
+
/*
* Let the congestion control algorithm update
* congestion control related information. This
@@ -1999,7 +2001,7 @@
goto dropwithreset;
} else if (thflags & TH_SYN) {
/* non-initial SYN is ignored */
- if ((tcp_timer_active(tp, TT_DELACK) ||
+ if ((tcp_timer_active(tp, TT_DELACK) ||
tcp_timer_active(tp, TT_REXMT)))
goto drop;
} else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) {
@@ -2065,7 +2067,7 @@
tp->t_flags |= TF_ECN_PERMIT;
TCPSTAT_INC(tcps_ecn_shs);
}
-
+
/*
* Received <SYN,ACK> in SYN_SENT[*] state.
* Transitions:
@@ -2383,14 +2385,14 @@
/*
* If last ACK falls within this segment's sequence numbers,
* record its timestamp.
- * NOTE:
+ * NOTE:
* 1) That the test incorporates suggestions from the latest
* proposal of the tcplw@cray.com list (Braden 1993/04/26).
* 2) That updating only on newer timestamps interferes with
* our earlier PAWS tests, so this check should be solely
* predicated on the sequence space of this segment.
- * 3) That we modify the segment boundary check to be
- * Last.ACK.Sent <= SEG.SEQ + SEG.Len
+ * 3) That we modify the segment boundary check to be
+ * Last.ACK.Sent <= SEG.SEQ + SEG.Len
* instead of RFC1323's
* Last.ACK.Sent < SEG.SEQ + SEG.Len,
* This modified check allows us to overcome RFC1323's
@@ -2469,7 +2471,7 @@
/*
* Account for the ACK of our SYN prior to
* regular ACK processing below.
- */
+ */
tp->snd_una++;
}
/*
@@ -2598,10 +2600,10 @@
if ((tp->t_flags & TF_SACK_PERMIT) &&
IN_FASTRECOVERY(tp->t_flags)) {
int awnd;
-
+
/*
* Compute the amount of data in flight first.
- * We can inject new data into the pipe iff
+ * We can inject new data into the pipe iff
* we have less than 1/2 the original window's
* worth of data in flight.
*/
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -245,7 +245,7 @@
tcp_do_segment,
tcp_default_ctloutput,
NULL,
- NULL,
+ NULL,
NULL,
NULL,
NULL,
@@ -305,11 +305,11 @@
find_and_ref_tcp_functions(struct tcp_function_set *fs)
{
struct tcp_function_block *blk;
-
- rw_rlock(&tcp_function_lock);
+
+ rw_rlock(&tcp_function_lock);
blk = find_tcp_functions_locked(fs);
if (blk)
- refcount_acquire(&blk->tfb_refcnt);
+ refcount_acquire(&blk->tfb_refcnt);
rw_runlock(&tcp_function_lock);
return(blk);
}
@@ -318,10 +318,10 @@
find_and_ref_tcp_fb(struct tcp_function_block *blk)
{
struct tcp_function_block *rblk;
-
- rw_rlock(&tcp_function_lock);
+
+ rw_rlock(&tcp_function_lock);
rblk = find_tcp_fb_locked(blk, NULL);
- if (rblk)
+ if (rblk)
refcount_acquire(&rblk->tfb_refcnt);
rw_runlock(&tcp_function_lock);
return(rblk);
@@ -343,7 +343,7 @@
strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
fs.pcbcnt = blk->tfb_refcnt;
}
- rw_runlock(&tcp_function_lock);
+ rw_runlock(&tcp_function_lock);
error = sysctl_handle_string(oidp, fs.function_set_name,
sizeof(fs.function_set_name), req);
@@ -354,8 +354,8 @@
rw_wlock(&tcp_function_lock);
blk = find_tcp_functions_locked(&fs);
if ((blk == NULL) ||
- (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) {
- error = ENOENT;
+ (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) {
+ error = ENOENT;
goto done;
}
tcp_func_set_ptr = blk;
@@ -397,7 +397,7 @@
bufsz -= linesz;
outsz = linesz;
- rw_rlock(&tcp_function_lock);
+ rw_rlock(&tcp_function_lock);
TAILQ_FOREACH(f, &t_functions, tf_next) {
alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name);
linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n",
@@ -537,7 +537,7 @@
(blk->tfb_tcp_do_segment == NULL) ||
(blk->tfb_tcp_ctloutput == NULL) ||
(strlen(blk->tfb_tcp_block_name) == 0)) {
- /*
+ /*
* These functions are required and you
* need a name.
*/
@@ -549,7 +549,7 @@
blk->tfb_tcp_timer_active ||
blk->tfb_tcp_timer_stop) {
/*
- * If you define one timer function you
+ * If you define one timer function you
* must have them all.
*/
if ((blk->tfb_tcp_timer_stop_all == NULL) ||
@@ -651,7 +651,7 @@
{
struct tcp_function *f;
int error=ENOENT;
-
+
if (strcmp(blk->tfb_tcp_block_name, "default") == 0) {
/* You can't un-register the default */
return (EPERM);
@@ -665,7 +665,7 @@
if (blk->tfb_refcnt) {
/* Still tcb attached, mark it. */
blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
- rw_wunlock(&tcp_function_lock);
+ rw_wunlock(&tcp_function_lock);
return (EBUSY);
}
while (find_tcp_fb_locked(blk, &f) != NULL) {
@@ -1069,7 +1069,7 @@
m = n;
} else {
/*
- * reuse the mbuf.
+ * reuse the mbuf.
* XXX MRT We inherit the FIB, which is lucky.
*/
m_freem(m->m_next);
@@ -1439,6 +1439,9 @@
{
struct socket *so = tp->t_inpcb->inp_socket;
+ if(so->inherit)
+ printf("%s] inherited socket %p\n", __func__, so);
+
INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -1481,12 +1484,12 @@
tcp_timer_stop(tp, TT_2MSL);
tcp_timer_stop(tp, TT_DELACK);
if (tp->t_fb->tfb_tcp_timer_stop_all) {
- /*
- * Call the stop-all function of the methods,
+ /*
+ * Call the stop-all function of the methods,
* this function should call the tcp_timer_stop()
* method with each of the function specific timeouts.
* That stop will be called via the tfb_tcp_timer_stop()
- * which should use the async drain function of the
+ * which should use the async drain function of the
* callout system (see tcp_var.h).
*/
tp->t_fb->tfb_tcp_timer_stop_all(tp);
@@ -1556,7 +1559,7 @@
if (tp->t_flags & TF_TOE)
tcp_offload_detach(tp);
#endif
-
+
tcp_free_sackholes(tp);
#ifdef TCPPCAP
@@ -1594,7 +1597,7 @@
{
struct inpcb *inp;
struct tcpcb *tp;
-
+
tp = (struct tcpcb *)ptp;
CURVNET_SET(tp->t_vnet);
INP_INFO_RLOCK(&V_tcbinfo);
@@ -1633,10 +1636,38 @@
{
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
+ struct inpcb *inp_inh = NULL;
+ int listen = tp->t_state & TCPS_LISTEN;
+
+ printf("%s] inp %p\n", __func__, inp);
INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
+ if (listen) {
+ /*
+ * Pending socket inheritance
+ *
+ * If this is a listen(2) socket, find another listen(2)
+ * socket in the same local group, which could inherit
+ * the syncache and the sockets pending on the completed
+ * and incomplete connection queues.
+ *
+ * NOTE:
+ * Currently, inheritance can only happen on
+ * listen(2) sockets with SO_REUSEPORT_LB set.
+ */
+
+ // XXX: How to handle this?
+ // ASSERT_IN_NETISR(0);
+
+ inp_inh = in_pcblookup_lbgroup_last(inp);
+ if (inp_inh != NULL)
+ printf("%s] inp %p will inherit from inp %p\n", __func__, inp_inh, inp);
+ else
+ printf("%s] there is none that can inherit from inp %p\n", __func__, inp);
+ }
+
#ifdef TCP_OFFLOAD
if (tp->t_state == TCPS_LISTEN)
tcp_offload_listen_stop(tp);
@@ -1658,7 +1689,33 @@
tcp_state_change(tp, TCPS_CLOSED);
KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
so = inp->inp_socket;
+
+ if(so->inherit)
+ printf("%s] inherited socket %p\n", __func__, so);
+
soisdisconnected(so);
+
+ // Socket inherit
+ if(listen)
+ {
+ // XXX What do we do with syncache?
+ // syncache seems to be stored separately from sockets/inps,
+ // no need to do anything??
+
+ // from dflybsd impl:
+ // syncache_destroy(tp, tp_inh);
+
+
+ if(inp_inh == NULL) {
+ printf("%s] inp_inh is NULL, can't inherit\n", __func__);
+ } else if(inp_inh->inp_socket == NULL) {
+ printf("%s] inp_inh->inp_socket is NULL, can't inherit\n", __func__);
+ } else {
+ soinherit(so, inp_inh->inp_socket);
+ }
+ }
+
+
if (inp->inp_flags & INP_SOCKREF) {
KASSERT(so->so_state & SS_PROTOREF,
("tcp_close: !SS_PROTOREF"));
@@ -2023,7 +2080,7 @@
if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc_notify;
else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
- cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
+ cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
cmd == PRC_TIMXCEED_INTRANS) && ip)
notify = tcp_drop_syn_sent;
@@ -2159,7 +2216,7 @@
if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc_notify;
else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
- cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
+ cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
cmd == PRC_TIMXCEED_INTRANS) && ip6 != NULL)
notify = tcp_drop_syn_sent;
@@ -2437,7 +2494,7 @@
KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
tcp_mss_update(tp, -1, mtuoffer, NULL, NULL);
-
+
so = inp->inp_socket;
SOCKBUF_LOCK(&so->so_snd);
/* If the mss is larger than the socket buffer, decrease the mss. */
Index: sys/netinet/tcp_syncache.c
===================================================================
--- sys/netinet/tcp_syncache.c
+++ sys/netinet/tcp_syncache.c
@@ -372,6 +372,7 @@
static void
syncache_drop(struct syncache *sc, struct syncache_head *sch)
{
+ printf("%s]\n", __func__);
SCH_LOCK_ASSERT(sch);
@@ -794,7 +795,7 @@
struct sockaddr_in sin;
inp->inp_options = (m) ? ip_srcroute(m) : NULL;
-
+
if (inp->inp_options == NULL) {
inp->inp_options = sc->sc_ipopts;
sc->sc_ipopts = NULL;
@@ -838,11 +839,11 @@
if (V_functions_inherit_listen_socket_stack && blk != tp->t_fb) {
/*
* Our parents t_fb was not the default,
- * we need to release our ref on tp->t_fb and
+ * we need to release our ref on tp->t_fb and
* pickup one on the new entry.
*/
struct tcp_function_block *rblk;
-
+
rblk = find_and_ref_tcp_fb(blk);
KASSERT(rblk != NULL,
("cannot find blk %p out of syncache?", blk));
@@ -853,7 +854,7 @@
if (tp->t_fb->tfb_tcp_fb_init) {
(*tp->t_fb->tfb_tcp_fb_init)(tp);
}
- }
+ }
tp->snd_wl1 = sc->sc_irs;
tp->snd_max = tp->iss + 1;
tp->snd_nxt = tp->iss + 1;
@@ -1066,7 +1067,7 @@
#endif /* TCP_SIGNATURE */
/*
* Pull out the entry to unlock the bucket row.
- *
+ *
* NOTE: We must decrease TCPS_SYN_RECEIVED count here, not
* tcp_state_change(). The tcpcb is not existent at this
* moment. A new one will be allocated via syncache_socket->
@@ -1231,6 +1232,7 @@
struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod,
void *todctx)
{
+ /* printf("%s] inp %p\n", __func__, inp); */
struct tcpcb *tp;
struct socket *so;
struct syncache *sc = NULL;
@@ -2046,7 +2048,7 @@
}
static struct syncache *
-syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
+syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
struct socket *lso)
{
@@ -2084,7 +2086,7 @@
sc->sc_flags = 0;
bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
sc->sc_ipopts = NULL;
-
+
sc->sc_irs = seq;
sc->sc_iss = ack;
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -1460,6 +1460,15 @@
INP_WUNLOCK(in6p);
error = 0;
break;
+ case SO_REUSEPORT_LB:
+ INP_WLOCK(in6p);
+ if ((so->so_options & SO_REUSEPORT_LB) != 0)
+ in6p->inp_flags2 |= INP_REUSEPORT_LB;
+ else
+ in6p->inp_flags2 &= ~INP_REUSEPORT_LB;
+ INP_WUNLOCK(in6p);
+ error = 0;
+ break;
case SO_SETFIB:
INP_WLOCK(in6p);
in6p->inp_inc.inc_fibnum = so->so_fibnum;
Index: sys/sys/socket.h
===================================================================
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -145,6 +145,9 @@
#define SO_NO_OFFLOAD 0x4000 /* socket cannot be offloaded */
#define SO_NO_DDP 0x8000 /* disable direct data placement */
+/* XXX: so_options was only 16 bits wide; it has been widened to 32 bits to fit this flag. */
+#define SO_REUSEPORT_LB 0x00010000 /* reuse with load balancing */
+
/*
* Additional options, not kept in so_options.
*/
Index: sys/sys/socketvar.h
===================================================================
--- sys/sys/socketvar.h
+++ sys/sys/socketvar.h
@@ -73,12 +73,13 @@
*/
TAILQ_HEAD(accept_queue, socket);
struct socket {
+ uint32_t inherit; /* temporarily added for debugging */
struct mtx so_lock;
volatile u_int so_count; /* (b / refcount) */
struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */
struct selinfo so_wrsel; /* (b/cs) for so_snd */
short so_type; /* (a) generic type, see socket.h */
- short so_options; /* (b) from socket call, see socket.h */
+ int so_options; /* (b) from socket call, see socket.h */
short so_linger; /* time to linger close(2) */
short so_state; /* (b) internal state flags SS_* */
void *so_pcb; /* protocol control block */
@@ -200,12 +201,12 @@
size_t xso_len; /* length of this structure */
struct socket *xso_so; /* makes a convenient handle sometimes */
short so_type;
- short so_options;
+ int so_options;
short so_linger;
short so_state;
caddr_t so_pcb; /* another convenient handle */
- int xso_protocol;
- int xso_family;
+ int xso_protocol;
+ int xso_family;
u_int so_qlen;
u_int so_incqlen;
u_int so_qlimit;
@@ -386,6 +387,7 @@
int sodisconnect(struct socket *so);
struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags);
void sofree(struct socket *so);
+void soinherit(struct socket *so, struct socket *so_inh);
void sohasoutofband(struct socket *so);
int solisten(struct socket *so, int backlog, struct thread *td);
void solisten_proto(struct socket *so, int backlog);
@@ -431,7 +433,6 @@
void solisten_wakeup(struct socket *);
int selsocket(struct socket *so, int events, struct timeval *tv,
struct thread *td);
-
/*
* Accept filter functions (duh).
*/

File Metadata

Mime Type
text/plain
Expires
Fri, Apr 25, 3:29 AM (11 h, 59 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
17774478
Default Alt Text
D11003.id30483.diff (41 KB)

Event Timeline