Page MenuHomeFreeBSD

D32585.id97510.diff
No OneTemporary

D32585.id97510.diff

Index: sys/kern/subr_witness.c
===================================================================
--- sys/kern/subr_witness.c
+++ sys/kern/subr_witness.c
@@ -564,15 +564,15 @@
/*
* UDP/IP
*/
- { "udp", &lock_class_mtx_sleep },
{ "udpinp", &lock_class_rw },
+ { "udp", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* TCP/IP
*/
- { "tcp", &lock_class_mtx_sleep },
{ "tcpinp", &lock_class_rw },
+ { "tcp", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
Index: sys/kern/uipc_ktls.c
===================================================================
--- sys/kern/uipc_ktls.c
+++ sys/kern/uipc_ktls.c
@@ -810,10 +810,6 @@
inp = so->so_pcb;
INP_WLOCK(inp);
- if (inp->inp_flags2 & INP_FREED) {
- INP_WUNLOCK(inp);
- return (ECONNRESET);
- }
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_WUNLOCK(inp);
return (ECONNRESET);
@@ -865,10 +861,6 @@
int error;
INP_RLOCK(inp);
- if (inp->inp_flags2 & INP_FREED) {
- INP_RUNLOCK(inp);
- return (ECONNRESET);
- }
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_RUNLOCK(inp);
return (ECONNRESET);
@@ -2476,8 +2468,7 @@
INP_WLOCK(inp);
so = inp->inp_socket;
MPASS(so != NULL);
- if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
- (inp->inp_flags2 & INP_FREED)) {
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
goto out;
}
@@ -2489,7 +2480,6 @@
counter_u64_add(ktls_ifnet_disable_ok, 1);
/* ktls_set_tx_mode() drops inp wlock, so recheck flags */
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
- (inp->inp_flags2 & INP_FREED) == 0 &&
(tp = intotcpcb(inp)) != NULL &&
tp->t_fb->tfb_hwtls_change != NULL)
(*tp->t_fb->tfb_hwtls_change)(tp, 0);
Index: sys/netinet/in_pcb.h
===================================================================
--- sys/netinet/in_pcb.h
+++ sys/netinet/in_pcb.h
@@ -49,7 +49,9 @@
#ifdef _KERNEL
#include <sys/lock.h>
+#include <sys/proc.h>
#include <sys/rwlock.h>
+#include <sys/smr.h>
#include <net/vnet.h>
#include <vm/uma.h>
#endif
@@ -133,32 +135,19 @@
* struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
* IPv6 sockets. In the case of TCP and UDP, further per-connection state is
* hung off of inp_ppcb most of the time. Almost all fields of struct inpcb
- * are static after creation or protected by a per-inpcb rwlock, inp_lock. A
- * few fields are protected by multiple locks as indicated in the locking notes
- * below. For these fields, all of the listed locks must be write-locked for
- * any modifications. However, these fields can be safely read while any one of
- * the listed locks are read-locked. This model can permit greater concurrency
- * for read operations. For example, connections can be looked up while only
- * holding a read lock on the global pcblist lock. This is important for
- * performance when attempting to find the connection for a packet given its IP
- * and port tuple.
+ * are static after creation or protected by a per-inpcb rwlock, inp_lock.
*
- * One noteworthy exception is that the global pcbinfo lock follows a different
- * set of rules in relation to the inp_list field. Rather than being
- * write-locked for modifications and read-locked for list iterations, it must
- * be read-locked during modifications and write-locked during list iterations.
- * This ensures that the relatively rare global list iterations safely walk a
- * stable snapshot of connections while allowing more common list modifications
- * to safely grab the pcblist lock just while adding or removing a connection
- * from the global list.
+ * An inpcb database is indexed by an addresses/ports hash as well as a list of
+ * all pcbs that belong to a certain proto. Database lookups or list traversals
+ * are performed inside an SMR section. Once the desired PCB is found, its own
+ * lock is to be obtained and the SMR section exited.
*
* Key:
* (b) - Protected by the hpts lock.
* (c) - Constant after initialization
- * (e) - Protected by the net_epoch_prempt epoch
+ * (e) - Protected by the SMR section
* (i) - Protected by the inpcb lock
* (p) - Protected by the pcbinfo lock for the inpcb
- * (l) - Protected by the pcblist lock for the inpcb
* (h) - Protected by the pcbhash lock for the inpcb
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
@@ -219,17 +208,13 @@
* socket has been freed), or there may be close(2)-related races.
*
* The inp_vflag field is overloaded, and would otherwise ideally be (c).
- *
- * TODO: Currently only the TCP stack is leveraging the global pcbinfo lock
- * read-lock usage during modification, this model can be applied to other
- * protocols (especially SCTP).
*/
struct icmp6_filter;
struct inpcbpolicy;
struct m_snd_tag;
struct inpcb {
/* Cache line #1 (amd64) */
- CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */
+ CK_LIST_ENTRY(inpcb) inp_hash; /* (w:h/r:e) hash list */
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
#define inp_start_zero inp_hpts
@@ -311,8 +296,8 @@
int in6p_cksum;
short in6p_hops;
};
- CK_LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */
- struct inpcbport *inp_phd; /* (i/h) head of this list */
+ CK_LIST_ENTRY(inpcb) inp_portlist; /* (r:e/w:h) port list */
+ struct inpcbport *inp_phd; /* (r:e/w:h) head of this list */
inp_gen_t inp_gencnt; /* (c) generation count */
void *spare_ptr; /* Spare pointer. */
rt_gen_t inp_rt_cookie; /* generation for route entry */
@@ -320,10 +305,7 @@
struct route inp_route;
struct route_in6 inp_route6;
};
- CK_LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */
- /* (e[r]) for list iteration */
- /* (p[w]/l) for addition/removal */
- struct epoch_context inp_epoch_ctx;
+ CK_LIST_ENTRY(inpcb) inp_list; /* (r:e/w:p) all PCBs for proto */
};
#endif /* _KERNEL */
@@ -396,80 +378,58 @@
#endif
#endif /* _SYS_SOCKETVAR_H_ */
-struct inpcbport {
- struct epoch_context phd_epoch_ctx;
- CK_LIST_ENTRY(inpcbport) phd_hash;
- struct inpcbhead phd_pcblist;
- u_short phd_port;
-};
-
-/*-
+#ifdef _KERNEL
+/*
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
*
- * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and
- * ipi_list_lock:
- * - ipi_lock covering the global pcb list stability during loop iteration,
- * - ipi_hash_lock covering the hashed lookup tables,
- * - ipi_list_lock covering mutable global fields (such as the global
- * pcb list)
- *
- * The lock order is:
- *
- * ipi_lock (before)
- * inpcb locks (before)
- * ipi_list locks (before)
+ * The pcbs are protected with SMR section and thus all lists in inpcbinfo
+ * are CK-lists. Locking is required to insert a pcb into database. Two
+ * locks are provided: one for the hash and one for the global list of pcbs,
+ * as well as overall count and generation count.
*
* Locking key:
*
* (c) Constant or nearly constant after initialisation
- * (e) - Protected by the net_epoch_prempt epoch
+ * (e) Protected by SMR section
* (g) Locked by ipi_lock
- * (l) Locked by ipi_list_lock
- * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock
- * (x) Synchronisation properties poorly defined
+ * (h) Locked by ipi_hash_lock
*/
struct inpcbinfo {
/*
* Global lock protecting inpcb list modification
*/
struct mtx ipi_lock;
-
- /*
- * Global list of inpcbs on the protocol.
- */
- struct inpcbhead *ipi_listhead; /* [r](e) [w](g/l) */
- u_int ipi_count; /* (l) */
+ struct inpcbhead ipi_listhead; /* (r:e/w:g) */
+ u_int ipi_count; /* (g) */
/*
* Generation count -- incremented each time a connection is allocated
* or freed.
*/
- u_quad_t ipi_gencnt; /* (l) */
+ u_quad_t ipi_gencnt; /* (g) */
/*
* Fields associated with port lookup and allocation.
*/
- u_short ipi_lastport; /* (x) */
- u_short ipi_lastlow; /* (x) */
- u_short ipi_lasthi; /* (x) */
+ u_short ipi_lastport; /* (h) */
+ u_short ipi_lastlow; /* (h) */
+ u_short ipi_lasthi; /* (h) */
/*
* UMA zone from which inpcbs are allocated for this protocol.
*/
- struct uma_zone *ipi_zone; /* (c) */
-
- /*
- * Global lock protecting modification hash lookup tables.
- */
- struct mtx ipi_hash_lock;
+ uma_zone_t ipi_zone; /* (c) */
+ uma_zone_t ipi_portzone; /* (c) */
+ smr_t ipi_smr; /* (c) */
/*
* Global hash of inpcbs, hashed by local and foreign addresses and
* port numbers.
*/
- struct inpcbhead *ipi_hashbase; /* (h) */
- u_long ipi_hashmask; /* (h) */
+ struct mtx ipi_hash_lock;
+ struct inpcbhead *ipi_hashbase; /* (r:e/w:h) */
+ u_long ipi_hashmask; /* (c) */
/*
* Global hash of inpcbs, hashed by only local port number.
@@ -481,26 +441,15 @@
* Load balance groups used for the SO_REUSEPORT_LB option,
* hashed by local port.
*/
- struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (h) */
+ struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (r:e/w:h) */
u_long ipi_lbgrouphashmask; /* (h) */
/*
* Pointer to network stack instance
*/
struct vnet *ipi_vnet; /* (c) */
-
- /*
- * general use 2
- */
- void *ipi_pspare[2];
-
- /*
- * Global lock protecting global inpcb list, inpcb count, etc.
- */
- struct rwlock ipi_list_lock;
};
-#ifdef _KERNEL
/*
* Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
* (or unique address:port combination) can be re-used at most
@@ -571,51 +520,22 @@
#endif /* _KERNEL */
-#define INP_INFO_LOCK_INIT(ipi, d) \
- mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE)
-#define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
+#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
#define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock)
#define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock)
#define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock)
-#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock))
+#define INP_INFO_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \
+ mtx_owned(&(ipi)->ipi_lock))
#define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
#define INP_INFO_WUNLOCK_ASSERT(ipi) \
- mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED)
-
-#define INP_LIST_LOCK_INIT(ipi, d) \
- rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
-#define INP_LIST_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_list_lock)
-#define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock)
-#define INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock)
-#define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock)
-#define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock)
-#define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock)
-#define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock)
-#define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock)
-#define INP_LIST_LOCK_ASSERT(ipi) \
- rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED)
-#define INP_LIST_RLOCK_ASSERT(ipi) \
- rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED)
-#define INP_LIST_WLOCK_ASSERT(ipi) \
- rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED)
-#define INP_LIST_UNLOCK_ASSERT(ipi) \
- rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
-
-#define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF)
-#define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock)
+ mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED)
+
#define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock)
#define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock))
-#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED);
-
-#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \
- MTX_DEF | MTX_DUPOK)
-#define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock)
-
-#define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock)
-#define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED)
-#define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock)
+#define INP_HASH_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \
+ mtx_owned(&(ipi)->ipi_hash_lock))
+#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, \
+ MA_OWNED)
#define INP_PCBHASH(faddr, lport, fport, mask) \
(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
@@ -644,7 +564,7 @@
#define INP_ANONPORT 0x00000040 /* port chosen for user */
#define INP_RECVIF 0x00000080 /* receive incoming interface */
#define INP_MTUDISC 0x00000100 /* user can do MTU discovery */
- /* 0x000200 unused: was INP_FAITH */
+/* INP_FREED 0x00000200 private to in_pcb.c */
#define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */
#define INP_DONTFRAG 0x00000800 /* don't fragment packet */
#define INP_BINDANY 0x00001000 /* allow bind to any address */
@@ -682,7 +602,7 @@
#define INP_MBUF_ACKCMP 0x00000002 /* TCP mbuf ack compression ok */
/* 0x00000004 */
#define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */
-#define INP_FREED 0x00000010 /* inp itself is not valid */
+/* 0x00000010 */
#define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */
#define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */
#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
@@ -702,15 +622,18 @@
#define INP_2PCP_BASE INP_2PCP_BIT0
#define INP_2PCP_MASK (INP_2PCP_BIT0 | INP_2PCP_BIT1 | INP_2PCP_BIT2)
#define INP_2PCP_SHIFT 18 /* shift PCP field in/out of inp_flags2 */
+
/*
- * Flags passed to in_pcblookup*() functions.
+ * Flags passed to in_pcblookup*(), inp_smr_lock() and inp_next().
*/
-#define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */
-#define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */
-#define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */
+typedef enum {
+ INPLOOKUP_WILDCARD = 0x00000001, /* Allow wildcard sockets. */
+ INPLOOKUP_RLOCKPCB = 0x00000002, /* Return inpcb read-locked. */
+ INPLOOKUP_WLOCKPCB = 0x00000004, /* Return inpcb write-locked. */
+} inp_lookup_t;
#define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
INPLOOKUP_WLOCKPCB)
+#define INPLOOKUP_LOCKMASK (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)
#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb)
@@ -718,13 +641,6 @@
#define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af)
-/*
- * Constants for pcbinfo.ipi_hashfields.
- */
-#define IPI_HASHFIELDS_NONE 0
-#define IPI_HASHFIELDS_2TUPLE 1
-#define IPI_HASHFIELDS_4TUPLE 2
-
#ifdef _KERNEL
VNET_DECLARE(int, ipport_reservedhigh);
VNET_DECLARE(int, ipport_reservedlow);
@@ -755,8 +671,8 @@
#define V_ipport_tcpallocs VNET(ipport_tcpallocs)
void in_pcbinfo_destroy(struct inpcbinfo *);
-void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *,
- int, int, char *, uma_init, u_int);
+void in_pcbinfo_init(struct inpcbinfo *, const char *, u_int, int, char *,
+ uma_init);
int in_pcbbind_check_bindmulti(const struct inpcb *ni,
const struct inpcb *oi);
@@ -788,8 +704,37 @@
int, struct inpcb *(*)(struct inpcb *, int));
void in_pcbref(struct inpcb *);
void in_pcbrehash(struct inpcb *);
-int in_pcbrele_rlocked(struct inpcb *);
-int in_pcbrele_wlocked(struct inpcb *);
+bool in_pcbrele_rlocked(struct inpcb *);
+bool in_pcbrele_wlocked(struct inpcb *);
+
+typedef bool inp_match_t(const struct inpcb *, void *);
+struct inpcb_iterator {
+ const struct inpcbinfo *ipi;
+ struct inpcb *inp;
+ inp_match_t *match;
+ void *ctx;
+ int hash;
+#define INP_ALL_LIST -1
+ const inp_lookup_t lock;
+};
+
+/* Note: sparse initializers guarantee .inp = NULL. */
+#define INP_ITERATOR(_ipi, _lock, _match, _ctx) \
+ { \
+ .ipi = (_ipi), \
+ .lock = (_lock), \
+ .hash = INP_ALL_LIST, \
+ .match = (_match), \
+ .ctx = (_ctx), \
+ }
+#define INP_ALL_ITERATOR(_ipi, _lock) \
+ { \
+ .ipi = (_ipi), \
+ .lock = (_lock), \
+ .hash = INP_ALL_LIST, \
+ }
+
+struct inpcb *inp_next(struct inpcb_iterator *);
void in_losing(struct inpcb *);
void in_pcbsetsolabel(struct socket *so);
int in_getpeeraddr(struct socket *so, struct sockaddr **nam);
Index: sys/netinet/in_pcb.c
===================================================================
--- sys/netinet/in_pcb.c
+++ sys/netinet/in_pcb.c
@@ -114,6 +114,7 @@
#define INPCBLBGROUP_SIZMIN 8
#define INPCBLBGROUP_SIZMAX 256
+#define INP_FREED 0x00000200 /* See in_pcb.h. */
static struct callout ipport_tick_callout;
@@ -146,7 +147,6 @@
#define V_ipport_tcplastcount VNET(ipport_tcplastcount)
-static void in_pcbremlists(struct inpcb *inp);
#ifdef INET
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
struct in_addr faddr, u_int fport_arg,
@@ -515,38 +515,43 @@
INP_LOCK_DESTROY(inp);
}
+/* Make sure it is safe to use hashinit(9) on CK_LIST. */
+CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
+
/*
* Initialize an inpcbinfo -- we should be able to reduce the number of
* arguments in time.
*/
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
- struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
- char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
+ u_int hash_nelements, int porthash_nelements, char *inpcbzone_name,
+ uma_init inpcbzone_init)
{
- porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
-
- INP_INFO_LOCK_INIT(pcbinfo, name);
- INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */
- INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
+ mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF);
+ mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF);
#ifdef VIMAGE
pcbinfo->ipi_vnet = curvnet;
#endif
- pcbinfo->ipi_listhead = listhead;
- CK_LIST_INIT(pcbinfo->ipi_listhead);
+ CK_LIST_INIT(&pcbinfo->ipi_listhead);
pcbinfo->ipi_count = 0;
pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
&pcbinfo->ipi_hashmask);
+ porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_porthashmask);
pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_lbgrouphashmask);
pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
- NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
+ NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR,
+ UMA_ZONE_SMR);
uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
uma_zone_set_warning(pcbinfo->ipi_zone,
"kern.ipc.maxsockets limit reached");
+ pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
+ pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name,
+ sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr);
}
/*
@@ -565,9 +570,8 @@
hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
pcbinfo->ipi_lbgrouphashmask);
uma_zdestroy(pcbinfo->ipi_zone);
- INP_LIST_LOCK_DESTROY(pcbinfo);
- INP_HASH_LOCK_DESTROY(pcbinfo);
- INP_INFO_LOCK_DESTROY(pcbinfo);
+ mtx_destroy(&pcbinfo->ipi_hash_lock);
+ mtx_destroy(&pcbinfo->ipi_lock);
}
/*
@@ -581,7 +585,7 @@
int error;
error = 0;
- inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
+ inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
if (inp == NULL)
return (ENOBUFS);
bzero(&inp->inp_start_zero, inp_zero_size);
@@ -613,33 +617,38 @@
if (V_ip6_v6only)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
}
-#endif
- INP_WLOCK(inp);
- INP_LIST_WLOCK(pcbinfo);
- CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
- pcbinfo->ipi_count++;
- so->so_pcb = (caddr_t)inp;
-#ifdef INET6
if (V_ip6_auto_flowlabel)
inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
- inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
- refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */
-
/*
* Routes in inpcb's can cache L2 as well; they are guaranteed
* to be cleaned up.
*/
inp->inp_route.ro_flags = RT_LLE_CACHE;
- INP_LIST_WUNLOCK(pcbinfo);
+#ifdef TCPHPTS
+ /*
+ * If using hpts lets drop a random number in so
+ * not all new connections fall on the same CPU.
+ */
+ inp->inp_hpts_cpu = inp->inp_input_cpu = hpts_random_cpu(inp);
+#endif
+ refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */
+ INP_WLOCK(inp);
+ INP_INFO_WLOCK(pcbinfo);
+ pcbinfo->ipi_count++;
+ inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+ CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
+ INP_INFO_WUNLOCK(pcbinfo);
+ so->so_pcb = inp;
+
+ return (0);
+
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
- if (error != 0) {
- crfree(inp->inp_cred);
- uma_zfree(pcbinfo->ipi_zone, inp);
- }
-#endif
+ crfree(inp->inp_cred);
+ uma_zfree_smr(pcbinfo->ipi_zone, inp);
return (error);
+#endif
}
#ifdef INET
@@ -1350,7 +1359,6 @@
in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
struct inpcb **oinpp, struct ucred *cred)
{
- struct rm_priotracker in_ifa_tracker;
struct sockaddr_in *sin = (struct sockaddr_in *)nam;
struct in_ifaddr *ia;
struct inpcb *oinp;
@@ -1399,20 +1407,16 @@
* choose the broadcast address for that interface.
*/
if (faddr.s_addr == INADDR_ANY) {
- IN_IFADDR_RLOCK(&in_ifa_tracker);
faddr =
IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
- IN_IFADDR_RUNLOCK(&in_ifa_tracker);
if (cred != NULL &&
(error = prison_get_ip4(cred, &faddr)) != 0)
return (error);
} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
- IN_IFADDR_RLOCK(&in_ifa_tracker);
if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
IFF_BROADCAST)
faddr = satosin(&CK_STAILQ_FIRST(
&V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
- IN_IFADDR_RUNLOCK(&in_ifa_tracker);
}
}
if (laddr.s_addr == INADDR_ANY) {
@@ -1430,7 +1434,6 @@
imo = inp->inp_moptions;
if (imo->imo_multicast_ifp != NULL) {
ifp = imo->imo_multicast_ifp;
- IN_IFADDR_RLOCK(&in_ifa_tracker);
CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if ((ia->ia_ifp == ifp) &&
(cred == NULL ||
@@ -1444,7 +1447,6 @@
laddr = ia->ia_addr.sin_addr;
error = 0;
}
- IN_IFADDR_RUNLOCK(&in_ifa_tracker);
}
}
if (error)
@@ -1515,192 +1517,267 @@
}
/*
- * in_pcbref() bumps the reference count on an inpcb in order to maintain
- * stability of an inpcb pointer despite the inpcb lock being released. This
- * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
- * but where the inpcb lock may already held.
+ * inpcb hash lookups are protected by SMR section.
*
- * in_pcbref() should be used only to provide brief memory stability, and
- * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
- * garbage collect the inpcb if it has been in_pcbfree()'d from another
- * context. Until in_pcbrele() has returned that the inpcb is still valid,
- * lock and rele are the *only* safe operations that may be performed on the
- * inpcb.
- *
- * While the inpcb will not be freed, releasing the inpcb lock means that the
- * connection's state may change, so the caller should be careful to
- * revalidate any cached state on reacquiring the lock. Drop the reference
- * using in_pcbrele().
+ * Once desired pcb has been found, switching from SMR section to a pcb
+ * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
+ * here because SMR is a critical section.
+ * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
*/
-void
-in_pcbref(struct inpcb *inp)
+static inline void
+inp_lock(struct inpcb *inp, const inp_lookup_t lock)
+{
+
+ return (lock == INPLOOKUP_RLOCKPCB ? \
+ rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock));
+}
+
+static inline void
+inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
{
- KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+ return (lock == INPLOOKUP_RLOCKPCB ? \
+ rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock));
+}
- refcount_acquire(&inp->inp_refcount);
+static inline int
+inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
+{
+
+ return (lock == INPLOOKUP_RLOCKPCB ? \
+ rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
}
-/*
- * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
- * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
- * return a flag indicating whether or not the inpcb remains valid. If it is
- * valid, we return with the inpcb lock held.
- *
- * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
- * reference on an inpcb. Historically more work was done here (actually, in
- * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
- * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely
- * about memory stability (and continued use of the write lock).
- */
-int
-in_pcbrele_rlocked(struct inpcb *inp)
+static inline bool
+in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
{
- struct inpcbinfo *pcbinfo;
- KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+ return (lock == INPLOOKUP_RLOCKPCB ? \
+ in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
+}
- INP_RLOCK_ASSERT(inp);
+bool
+inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
+{
- if (refcount_release(&inp->inp_refcount) == 0) {
- /*
- * If the inpcb has been freed, let the caller know, even if
- * this isn't the last reference.
- */
- if (inp->inp_flags2 & INP_FREED) {
- INP_RUNLOCK(inp);
- return (1);
- }
- return (0);
+ MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
+ SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);
+
+ if (__predict_true(inp_trylock(inp, lock))) {
+ smr_exit(inp->inp_pcbinfo->ipi_smr);
+check_freed:
+ if (__predict_false(inp->inp_flags & INP_FREED)) {
+ inp_unlock(inp, lock);
+ return (false);
+ } else
+ return (true);
}
- KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-#ifdef TCPHPTS
- if (inp->inp_in_hpts || inp->inp_in_input) {
- struct tcp_hpts_entry *hpts;
- /*
- * We should not be on the hpts at
- * this point in any form. we must
- * get the lock to be sure.
- */
- hpts = tcp_hpts_lock(inp);
- if (inp->inp_in_hpts)
- panic("Hpts:%p inp:%p at free still on hpts",
- hpts, inp);
- mtx_unlock(&hpts->p_mtx);
- hpts = tcp_input_lock(inp);
- if (inp->inp_in_input)
- panic("Hpts:%p inp:%p at free still on input hpts",
- hpts, inp);
- mtx_unlock(&hpts->p_mtx);
+ if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
+ smr_exit(inp->inp_pcbinfo->ipi_smr);
+ inp_lock(inp, lock);
+ if (__predict_false(in_pcbrele(inp, lock)))
+ return (false);
+ else
+ goto check_freed;
+ } else {
+ smr_exit(inp->inp_pcbinfo->ipi_smr);
+ return (false);
}
-#endif
- INP_RUNLOCK(inp);
- pcbinfo = inp->inp_pcbinfo;
- uma_zfree(pcbinfo->ipi_zone, inp);
- return (1);
}
-int
-in_pcbrele_wlocked(struct inpcb *inp)
-{
- struct inpcbinfo *pcbinfo;
+/*
+ * inp_next() - inpcb hash/list traversal iterator
+ *
+ * Requires initialized struct inpcb_iterator for context.
+ * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
+ *
+ * - Iterator can have either write-lock or read-lock semantics, that can not
+ * be changed later.
+ * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
+ * a single hash slot. Note: only rip_input() does the latter.
+ * - Iterator may have optional bool matching function. The matching function
+ * will be executed for each inpcb in the SMR context, so it can not acquire
+ * locks and can safely access only immutable fields of inpcb.
+ *
+ * A fresh initialized iterator has NULL inpcb in its context and that
+ * means that inp_next() call would return the very first inpcb on the list
+ * locked with desired semantic. In all following calls the context pointer
+ * shall hold the current inpcb pointer. The KPI user is not supposed to
+ * unlock the current inpcb! Upon end of traversal inp_next() will return NULL
+ * and write NULL to its context. After end of traversal an iterator can be
+ * reused. Note: only rip_input() reuses iterator.
+ *
+ * List traversals have the following features/constraints:
+ * - New entries won't be seen, as they are always added to the head of a list.
+ * - Removed entries won't stop traversal as long as they are not added to
+ * a different list. This is violated by in_pcbrehash().
+ */
+#define II_LIST_FIRST(ipi, hash) \
+ (((hash) == INP_ALL_LIST) ? \
+ CK_LIST_FIRST(&(ipi)->ipi_listhead) : \
+ CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
+#define II_LIST_NEXT(inp, hash) \
+ (((hash) == INP_ALL_LIST) ? \
+ CK_LIST_NEXT((inp), inp_list) : \
+ CK_LIST_NEXT((inp), inp_hash))
+#define II_LOCK_ASSERT(inp, lock) \
+ rw_assert(&(inp)->inp_lock, \
+ (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
+struct inpcb *
+inp_next(struct inpcb_iterator *ii)
+{
+ struct inpcbhead freelist = CK_LIST_HEAD_INITIALIZER(freelist);
+ const struct inpcbinfo *ipi = ii->ipi;
+ inp_match_t *match = ii->match;
+ void *ctx = ii->ctx;
+ inp_lookup_t lock = ii->lock;
+ int hash = ii->hash;
+ struct inpcb *inp, *next, *tmp;
+
+ if (ii->inp == NULL) { /* First call. */
+ smr_enter(ipi->ipi_smr);
+ /* This is unrolled CK_LIST_FOREACH(). */
+ for (inp = II_LIST_FIRST(ipi, hash);
+ inp != NULL;
+ inp = II_LIST_NEXT(inp, hash)) {
+ if (match != NULL && (match)(inp, ctx) == false)
+ continue;
+ if (__predict_true(inp_smr_lock(inp, lock)))
+ break;
+ else {
+ smr_enter(ipi->ipi_smr);
+ MPASS(inp != II_LIST_FIRST(ipi, hash));
+ inp = II_LIST_FIRST(ipi, hash);
+ }
+ }
- KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+ if (inp == NULL)
+ smr_exit(ipi->ipi_smr);
+ else
+ ii->inp = inp;
- INP_WLOCK_ASSERT(inp);
+ return (inp);
+ }
- if (refcount_release(&inp->inp_refcount) == 0) {
- /*
- * If the inpcb has been freed, let the caller know, even if
- * this isn't the last reference.
- */
- if (inp->inp_flags2 & INP_FREED) {
- INP_WUNLOCK(inp);
- return (1);
+ inp = ii->inp;
+
+ II_LOCK_ASSERT(inp, lock);
+ smr_enter(ipi->ipi_smr);
+next:
+ next = II_LIST_NEXT(inp, hash);
+ inp_unlock(inp, lock);
+ inp = next;
+next1:
+ if (inp == NULL) {
+ smr_exit(ipi->ipi_smr);
+ goto found;
+ }
+
+ if (match != NULL && (match)(inp, ctx) == false) {
+ inp = II_LIST_NEXT(inp, hash);
+ goto next1;
+ }
+
+ if (__predict_true(inp_trylock(inp, lock))) {
+ if (__predict_false(inp->inp_flags & INP_FREED))
+ goto next;
+ else {
+ smr_exit(ipi->ipi_smr);
+ goto found;
}
- return (0);
}
- KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-#ifdef TCPHPTS
- if (inp->inp_in_hpts || inp->inp_in_input) {
- struct tcp_hpts_entry *hpts;
- /*
- * We should not be on the hpts at
- * this point in any form. we must
- * get the lock to be sure.
- */
- hpts = tcp_hpts_lock(inp);
- if (inp->inp_in_hpts)
- panic("Hpts:%p inp:%p at free still on hpts",
- hpts, inp);
- mtx_unlock(&hpts->p_mtx);
- hpts = tcp_input_lock(inp);
- if (inp->inp_in_input)
- panic("Hpts:%p inp:%p at free still on input hpts",
- hpts, inp);
- mtx_unlock(&hpts->p_mtx);
+ if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
+ smr_exit(ipi->ipi_smr);
+ inp_lock(inp, lock);
+ if (__predict_true(refcount_release(&inp->inp_refcount) == 0)) {
+ if (__predict_false(inp->inp_flags & INP_FREED)) {
+ smr_enter(ipi->ipi_smr);
+ goto next;
+ }
+ goto found;
+ }
+ MPASS(inp->inp_flags & INP_FREED);
+ smr_enter(ipi->ipi_smr);
+ next = II_LIST_NEXT(inp, hash);
+ inp_unlock(inp, lock);
+ if (hash == INP_ALL_LIST)
+ CK_LIST_INSERT_HEAD(&freelist, inp, inp_list);
+ else
+ CK_LIST_INSERT_HEAD(&freelist, inp, inp_hash);
+ inp = next;
+ goto next1;
+ } else
+ goto next;
+
+found:
+ if (__predict_false(CK_LIST_FIRST(&freelist) != NULL)) {
+ if (hash == INP_ALL_LIST)
+ CK_LIST_FOREACH_SAFE(next, &freelist, inp_list, tmp)
+ uma_zfree_smr(ipi->ipi_zone, next);
+ else
+ CK_LIST_FOREACH_SAFE(next, &freelist, inp_hash, tmp)
+ uma_zfree_smr(ipi->ipi_zone, next);
}
-#endif
- INP_WUNLOCK(inp);
- pcbinfo = inp->inp_pcbinfo;
- uma_zfree(pcbinfo->ipi_zone, inp);
- return (1);
+
+ return ((ii->inp = inp));
}
-static void
-inpcbport_free(epoch_context_t ctx)
+/*
+ * in_pcbref() bumps the reference count on an inpcb in order to maintain
+ * stability of an inpcb pointer despite the inpcb lock being released or
+ * SMR section exited.
+ *
+ * To free a reference later in_pcbrele_(r|w)locked() must be performed.
+ */
+void
+in_pcbref(struct inpcb *inp)
{
- struct inpcbport *phd;
+ u_int old __diagused;
- phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx);
- free(phd, M_PCB);
+ old = refcount_acquire(&inp->inp_refcount);
+ KASSERT(old > 0, ("%s: refcount 0", __func__));
}
-static void
-in_pcbfree_deferred(epoch_context_t ctx)
+/*
+ * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
+ * freeing the pcb, if the reference was very last.
+ */
+bool
+in_pcbrele_rlocked(struct inpcb *inp)
{
- struct inpcb *inp;
- int released __unused;
- inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);
+ INP_RLOCK_ASSERT(inp);
- INP_WLOCK(inp);
- CURVNET_SET(inp->inp_vnet);
-#ifdef INET
- struct ip_moptions *imo = inp->inp_moptions;
- inp->inp_moptions = NULL;
-#endif
- /* XXXRW: Do as much as possible here. */
-#if defined(IPSEC) || defined(IPSEC_SUPPORT)
- if (inp->inp_sp != NULL)
- ipsec_delete_pcbpolicy(inp);
-#endif
-#ifdef INET6
- struct ip6_moptions *im6o = NULL;
- if (inp->inp_vflag & INP_IPV6PROTO) {
- ip6_freepcbopts(inp->in6p_outputopts);
- im6o = inp->in6p_moptions;
- inp->in6p_moptions = NULL;
- }
-#endif
- if (inp->inp_options)
- (void)m_free(inp->inp_options);
- inp->inp_vflag = 0;
- crfree(inp->inp_cred);
-#ifdef MAC
- mac_inpcb_destroy(inp);
-#endif
- released = in_pcbrele_wlocked(inp);
- MPASS(released);
-#ifdef INET6
- ip6_freemoptions(im6o);
-#endif
-#ifdef INET
- inp_freemoptions(imo);
-#endif
- CURVNET_RESTORE();
+ if (refcount_release(&inp->inp_refcount) == 0)
+ return (false);
+
+ MPASS(inp->inp_flags & INP_FREED);
+ MPASS(inp->inp_socket == NULL);
+ MPASS(inp->inp_in_hpts == 0);
+ MPASS(inp->inp_in_input == 0);
+ INP_RUNLOCK(inp);
+ uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
+ return (true);
+}
+
+bool
+in_pcbrele_wlocked(struct inpcb *inp)
+{
+
+ INP_WLOCK_ASSERT(inp);
+
+ if (refcount_release(&inp->inp_refcount) == 0)
+ return (false);
+
+ MPASS(inp->inp_flags & INP_FREED);
+ MPASS(inp->inp_socket == NULL);
+ MPASS(inp->inp_in_hpts == 0);
+ MPASS(inp->inp_in_input == 0);
+ INP_WUNLOCK(inp);
+ uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
+ return (true);
}
/*
@@ -1708,32 +1785,81 @@
* reference count, which should occur only after the inpcb has been detached
* from its socket. If another thread holds a temporary reference (acquired
* using in_pcbref()) then the free is deferred until that reference is
- * released using in_pcbrele(), but the inpcb is still unlocked. Almost all
- * work, including removal from global lists, is done in this context, where
- * the pcbinfo lock is held.
+ * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
+ * Almost all work, including removal from global lists, is done in this
+ * context, where the pcbinfo lock is held.
*/
void
in_pcbfree(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+#ifdef INET
+ struct ip_moptions *imo;
+#endif
+#ifdef INET6
+ struct ip6_moptions *im6o;
+#endif
+ INP_WLOCK_ASSERT(inp);
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
- KASSERT((inp->inp_flags2 & INP_FREED) == 0,
+ KASSERT((inp->inp_flags & INP_FREED) == 0,
("%s: called twice for pcb %p", __func__, inp));
- if (inp->inp_flags2 & INP_FREED) {
- INP_WUNLOCK(inp);
- return;
+
+ inp->inp_flags |= INP_FREED;
+ INP_INFO_WLOCK(pcbinfo);
+ inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+ pcbinfo->ipi_count--;
+ CK_LIST_REMOVE(inp, inp_list);
+ INP_INFO_WUNLOCK(pcbinfo);
+
+ if (inp->inp_flags & INP_INHASHLIST) {
+ struct inpcbport *phd = inp->inp_phd;
+
+ INP_HASH_WLOCK(pcbinfo);
+ /* XXX: Only do if SO_REUSEPORT_LB set? */
+ in_pcbremlbgrouphash(inp);
+
+ CK_LIST_REMOVE(inp, inp_hash);
+ CK_LIST_REMOVE(inp, inp_portlist);
+ if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
+ CK_LIST_REMOVE(phd, phd_hash);
+ uma_zfree_smr(pcbinfo->ipi_portzone, phd);
+ }
+ INP_HASH_WUNLOCK(pcbinfo);
+ inp->inp_flags &= ~INP_INHASHLIST;
}
- INP_WLOCK_ASSERT(inp);
- INP_LIST_WLOCK(pcbinfo);
- in_pcbremlists(inp);
- INP_LIST_WUNLOCK(pcbinfo);
+ crfree(inp->inp_cred);
RO_INVALIDATE_CACHE(&inp->inp_route);
- /* mark as destruction in progress */
- inp->inp_flags2 |= INP_FREED;
- INP_WUNLOCK(inp);
- NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx);
+#ifdef MAC
+ mac_inpcb_destroy(inp);
+#endif
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ if (inp->inp_sp != NULL)
+ ipsec_delete_pcbpolicy(inp);
+#endif
+#ifdef INET
+ if (inp->inp_options)
+ (void)m_free(inp->inp_options);
+ imo = inp->inp_moptions;
+#endif
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO) {
+ ip6_freepcbopts(inp->in6p_outputopts);
+ im6o = inp->in6p_moptions;
+ } else
+ im6o = NULL;
+#endif
+
+ if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
+ INP_WUNLOCK(inp);
+ }
+#ifdef INET6
+ ip6_freemoptions(im6o);
+#endif
+#ifdef INET
+ inp_freemoptions(imo);
+#endif
}
/*
@@ -1774,7 +1900,7 @@
CK_LIST_REMOVE(inp, inp_portlist);
if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
CK_LIST_REMOVE(phd, phd_hash);
- NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
+ uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
}
INP_HASH_WUNLOCK(inp->inp_pcbinfo);
inp->inp_flags &= ~INP_INHASHLIST;
@@ -1845,7 +1971,7 @@
struct inpcb *inp, *inp_temp;
INP_INFO_WLOCK(pcbinfo);
- CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
+ CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) {
INP_WLOCK(inp);
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0) {
@@ -1864,49 +1990,57 @@
INP_INFO_WUNLOCK(pcbinfo);
}
+static bool
+inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
+{
+
+ if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
+ return (true);
+ else
+ return (false);
+}
+
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
+ struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
+ inp_v4_multi_match, NULL);
struct inpcb *inp;
struct in_multi *inm;
struct in_mfilter *imf;
struct ip_moptions *imo;
- INP_INFO_WLOCK(pcbinfo);
- CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
- INP_WLOCK(inp);
+ IN_MULTI_LOCK_ASSERT();
+
+ while ((inp = inp_next(&inpi)) != NULL) {
+ INP_WLOCK_ASSERT(inp);
+
imo = inp->inp_moptions;
- if ((inp->inp_vflag & INP_IPV4) &&
- imo != NULL) {
- /*
- * Unselect the outgoing interface if it is being
- * detached.
- */
- if (imo->imo_multicast_ifp == ifp)
- imo->imo_multicast_ifp = NULL;
+ /*
+ * Unselect the outgoing interface if it is being
+ * detached.
+ */
+ if (imo->imo_multicast_ifp == ifp)
+ imo->imo_multicast_ifp = NULL;
- /*
- * Drop multicast group membership if we joined
- * through the interface being detached.
- *
- * XXX This can all be deferred to an epoch_call
- */
+ /*
+ * Drop multicast group membership if we joined
+ * through the interface being detached.
+ *
+ * XXX This can all be deferred to an epoch_call
+ */
restart:
- IP_MFILTER_FOREACH(imf, &imo->imo_head) {
- if ((inm = imf->imf_inm) == NULL)
- continue;
- if (inm->inm_ifp != ifp)
- continue;
- ip_mfilter_remove(&imo->imo_head, imf);
- IN_MULTI_LOCK_ASSERT();
- in_leavegroup_locked(inm, NULL);
- ip_mfilter_free(imf);
- goto restart;
- }
+ IP_MFILTER_FOREACH(imf, &imo->imo_head) {
+ if ((inm = imf->imf_inm) == NULL)
+ continue;
+ if (inm->inm_ifp != ifp)
+ continue;
+ ip_mfilter_remove(&imo->imo_head, imf);
+ in_leavegroup_locked(inm, NULL);
+ ip_mfilter_free(imf);
+ goto restart;
}
- INP_WUNLOCK(inp);
}
- INP_INFO_WUNLOCK(pcbinfo);
}
/*
@@ -1928,7 +2062,6 @@
KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
("%s: invalid lookup flags %d", __func__, lookupflags));
-
INP_HASH_LOCK_ASSERT(pcbinfo);
if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
@@ -2091,8 +2224,9 @@
/*
* Lookup PCB in hash list, using pcbinfo tables. This variation assumes
- * that the caller has locked the hash list, and will not perform any further
- * locking or reference operations on either the hash list or the connection.
+ * that the caller has either locked the hash list, which usually happens
+ * for bind(2) operations, or is in an SMR section, which happens when sorting
+ * out incoming packets.
*/
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
@@ -2233,20 +2367,15 @@
{
struct inpcb *inp;
+ smr_enter(pcbinfo->ipi_smr);
inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
if (inp != NULL) {
- if (lookupflags & INPLOOKUP_WLOCKPCB) {
- INP_WLOCK(inp);
- } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
- INP_RLOCK(inp);
- } else
- panic("%s: locking bug", __func__);
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_UNLOCK(inp);
+ if (__predict_false(inp_smr_lock(inp,
+ (lookupflags & INPLOOKUP_LOCKMASK)) == false))
inp = NULL;
- }
- }
+ } else
+ smr_exit(pcbinfo->ipi_smr);
return (inp);
}
@@ -2341,11 +2470,10 @@
* If none exists, malloc one and tack it on.
*/
if (phd == NULL) {
- phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
+ phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
if (phd == NULL) {
return (ENOBUFS); /* XXX */
}
- bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context));
phd->phd_port = inp->inp_lport;
CK_LIST_INIT(&phd->phd_pcblist);
CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
@@ -2363,6 +2491,10 @@
* changed. NOTE: This does not handle the case of the lport changing (the
* hashed port list would have to be updated as well), so the lport must
* not change after in_pcbinshash() has been called.
+ *
+ * XXXGL: a race between this function and an SMR-protected hash iterator
+ * will lead to the iterator traversing a possibly wrong hash list. However,
+ * this race has existed since the change from rwlock to epoch.
*/
void
in_pcbrehash(struct inpcb *inp)
@@ -2391,39 +2523,6 @@
CK_LIST_INSERT_HEAD(head, inp, inp_hash);
}
-/*
- * Remove PCB from various lists.
- */
-static void
-in_pcbremlists(struct inpcb *inp)
-{
- struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
-
- INP_WLOCK_ASSERT(inp);
- INP_LIST_WLOCK_ASSERT(pcbinfo);
-
- inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
- if (inp->inp_flags & INP_INHASHLIST) {
- struct inpcbport *phd = inp->inp_phd;
-
- INP_HASH_WLOCK(pcbinfo);
-
- /* XXX: Only do if SO_REUSEPORT_LB set? */
- in_pcbremlbgrouphash(inp);
-
- CK_LIST_REMOVE(inp, inp_hash);
- CK_LIST_REMOVE(inp, inp_portlist);
- if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
- CK_LIST_REMOVE(phd, phd_hash);
- NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
- }
- INP_HASH_WUNLOCK(pcbinfo);
- inp->inp_flags &= ~INP_INHASHLIST;
- }
- CK_LIST_REMOVE(inp, inp_list);
- pcbinfo->ipi_count--;
-}
-
/*
* Check for alternatives when higher level complains
* about service problems. For now, invalidate cached
@@ -2558,15 +2657,12 @@
void
inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
{
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
+ INPLOOKUP_WLOCKPCB);
struct inpcb *inp;
- INP_INFO_WLOCK(&V_tcbinfo);
- CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
- INP_WLOCK(inp);
+ while ((inp = inp_next(&inpi)) != NULL)
func(inp, arg);
- INP_WUNLOCK(inp);
- }
- INP_INFO_WUNLOCK(&V_tcbinfo);
}
struct socket *
Index: sys/netinet/in_pcb_var.h
===================================================================
--- sys/netinet/in_pcb_var.h
+++ sys/netinet/in_pcb_var.h
@@ -44,6 +44,7 @@
* Definitions shared between netinet/in_pcb.c and netinet6/in6_pcb.c
*/
+bool inp_smr_lock(struct inpcb *, const inp_lookup_t);
int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *,
struct ucred *, int);
int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa,
@@ -52,4 +53,10 @@
struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short,
int, struct ucred *);
+struct inpcbport {
+ struct inpcbhead phd_pcblist;
+ CK_LIST_ENTRY(inpcbport) phd_hash;
+ u_short phd_port;
+};
+
#endif /* !_NETINET_IN_PCB_VAR_H_ */
Index: sys/netinet/ip_divert.c
===================================================================
--- sys/netinet/ip_divert.c
+++ sys/netinet/ip_divert.c
@@ -111,10 +111,7 @@
*/
/* Internal variables. */
-VNET_DEFINE_STATIC(struct inpcbhead, divcb);
VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo);
-
-#define V_divcb VNET(divcb)
#define V_divcbinfo VNET(divcbinfo)
static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */
@@ -154,8 +151,7 @@
* allocate one-entry hash lists than it is to check all over the
* place for hashbase == NULL.
*/
- in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb",
- div_inpcb_init, IPI_HASHFIELDS_NONE);
+ in_pcbinfo_init(&V_divcbinfo, "div", 1, 1, "divcb", div_inpcb_init);
}
static void
@@ -181,6 +177,14 @@
return (IPPROTO_DONE);
}
+static bool
+div_port_match(const struct inpcb *inp, void *v)
+{
+ uint16_t nport = *(uint16_t *)v;
+
+ return (inp->inp_lport == nport);
+}
+
/*
* Divert a packet by passing it up to the divert socket at port 'port'.
*
@@ -195,6 +199,8 @@
struct socket *sa;
u_int16_t nport;
struct sockaddr_in divsrc;
+ struct inpcb_iterator inpi = INP_ITERATOR(&V_divcbinfo,
+ INPLOOKUP_RLOCKPCB, div_port_match, &nport);
struct m_tag *mtag;
NET_EPOCH_ASSERT();
@@ -281,27 +287,20 @@
/* Put packet on socket queue, if any */
sa = NULL;
+ /* nport is inp_next's context. */
nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
- CK_LIST_FOREACH(inp, &V_divcb, inp_list) {
+ while ((inp = inp_next(&inpi)) != NULL) {
+ sa = inp->inp_socket;
+ SOCKBUF_LOCK(&sa->so_rcv);
+ if (sbappendaddr_locked(&sa->so_rcv,
+ (struct sockaddr *)&divsrc, m, NULL) == 0) {
+ soroverflow_locked(sa);
+ sa = NULL; /* force mbuf reclaim below */
+ } else
+ sorwakeup_locked(sa);
/* XXX why does only one socket match? */
- if (inp->inp_lport == nport) {
- INP_RLOCK(inp);
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_RUNLOCK(inp);
- continue;
- }
- sa = inp->inp_socket;
- SOCKBUF_LOCK(&sa->so_rcv);
- if (sbappendaddr_locked(&sa->so_rcv,
- (struct sockaddr *)&divsrc, m,
- (struct mbuf *)0) == 0) {
- soroverflow_locked(sa);
- sa = NULL; /* force mbuf reclaim below */
- } else
- sorwakeup_locked(sa);
- INP_RUNLOCK(inp);
- break;
- }
+ INP_RUNLOCK(inp);
+ break;
}
if (sa == NULL) {
m_freem(m);
@@ -596,14 +595,10 @@
error = soreserve(so, div_sendspace, div_recvspace);
if (error)
return error;
- INP_INFO_WLOCK(&V_divcbinfo);
error = in_pcballoc(so, &V_divcbinfo);
- if (error) {
- INP_INFO_WUNLOCK(&V_divcbinfo);
+ if (error)
return error;
- }
inp = (struct inpcb *)so->so_pcb;
- INP_INFO_WUNLOCK(&V_divcbinfo);
inp->inp_ip_p = proto;
inp->inp_vflag |= INP_IPV4;
inp->inp_flags |= INP_HDRINCL;
@@ -618,11 +613,9 @@
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("div_detach: inp == NULL"));
- INP_INFO_WLOCK(&V_divcbinfo);
INP_WLOCK(inp);
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(&V_divcbinfo);
}
static int
@@ -702,8 +695,9 @@
static int
div_pcblist(SYSCTL_HANDLER_ARGS)
{
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_divcbinfo,
+ INPLOOKUP_RLOCKPCB);
struct xinpgen xig;
- struct epoch_tracker et;
struct inpcb *inp;
int error;
@@ -731,21 +725,18 @@
if (error)
return error;
- NET_EPOCH_ENTER(et);
- for (inp = CK_LIST_FIRST(V_divcbinfo.ipi_listhead);
- inp != NULL;
- inp = CK_LIST_NEXT(inp, inp_list)) {
- INP_RLOCK(inp);
+ while ((inp = inp_next(&inpi)) != NULL) {
if (inp->inp_gencnt <= xig.xig_gen) {
struct xinpcb xi;
in_pcbtoxinpcb(inp, &xi);
- INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xi, sizeof xi);
- } else
- INP_RUNLOCK(inp);
+ if (error) {
+ INP_RUNLOCK(inp);
+ break;
+ }
+ }
}
- NET_EPOCH_EXIT(et);
if (!error) {
/*
Index: sys/netinet/ip_gre.c
===================================================================
--- sys/netinet/ip_gre.c
+++ sys/netinet/ip_gre.c
@@ -223,25 +223,11 @@
in_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp,
const struct sockaddr *sa, void *ctx)
{
- struct epoch_tracker et;
struct gre_socket *gs;
struct gre_softc *sc;
in_addr_t dst;
- NET_EPOCH_ENTER(et);
- /*
- * udp_append() holds reference to inp, it is safe to check
- * inp_flags2 without INP_RLOCK().
- * If socket was closed before we have entered NET_EPOCH section,
- * INP_FREED flag should be set. Otherwise it should be safe to
- * make access to ctx data, because gre_so will be freed by
- * gre_sofree() via NET_EPOCH_CALL().
- */
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- NET_EPOCH_EXIT(et);
- m_freem(m);
- return;
- }
+ NET_EPOCH_ASSERT();
gs = (struct gre_socket *)ctx;
dst = ((const struct sockaddr_in *)sa)->sin_addr.s_addr;
@@ -251,11 +237,9 @@
}
if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){
gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc);
- NET_EPOCH_EXIT(et);
return;
}
m_freem(m);
- NET_EPOCH_EXIT(et);
}
static int
Index: sys/netinet/raw_ip.c
===================================================================
--- sys/netinet/raw_ip.c
+++ sys/netinet/raw_ip.c
@@ -88,10 +88,7 @@
&VNET_NAME(ip_defttl), 0,
"Maximum TTL on IP packets");
-VNET_DEFINE(struct inpcbhead, ripcb);
VNET_DEFINE(struct inpcbinfo, ripcbinfo);
-
-#define V_ripcb VNET(ripcb)
#define V_ripcbinfo VNET(ripcbinfo)
/*
@@ -161,7 +158,7 @@
struct inpcbhead *pcbhash;
int hash;
- INP_INFO_WLOCK_ASSERT(pcbinfo);
+ INP_HASH_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
if (inp->inp_ip_p != 0 &&
@@ -179,7 +176,7 @@
rip_delhash(struct inpcb *inp)
{
- INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
+ INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
CK_LIST_REMOVE(inp, inp_hash);
@@ -213,8 +210,8 @@
rip_init(void)
{
- in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE,
- 1, "ripcb", rip_inpcb_init, IPI_HASHFIELDS_NONE);
+ in_pcbinfo_init(&V_ripcbinfo, "rip", INP_PCBHASH_RAW_SIZE, 1, "ripcb",
+ rip_inpcb_init);
EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
}
@@ -231,47 +228,90 @@
#ifdef INET
static int
-rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
+rip_append(struct inpcb *inp, struct ip *ip, struct mbuf *m,
struct sockaddr_in *ripsrc)
{
- int policyfail = 0;
+ struct socket *so = inp->inp_socket;
+ struct mbuf *n, *opts = NULL;
- INP_LOCK_ASSERT(last);
+ INP_LOCK_ASSERT(inp);
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/* check AH/ESP integrity. */
- if (IPSEC_ENABLED(ipv4)) {
- if (IPSEC_CHECK_POLICY(ipv4, n, last) != 0)
- policyfail = 1;
- }
+ if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0)
+ return (0);
#endif /* IPSEC */
#ifdef MAC
- if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
- policyfail = 1;
+ if (mac_inpcb_check_deliver(inp, m) != 0)
+ return (0);
#endif
/* Check the minimum TTL for socket. */
- if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
- policyfail = 1;
- if (!policyfail) {
- struct mbuf *opts = NULL;
- struct socket *so;
-
- so = last->inp_socket;
- if ((last->inp_flags & INP_CONTROLOPTS) ||
- (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
- ip_savecontrol(last, &opts, ip, n);
- SOCKBUF_LOCK(&so->so_rcv);
- if (sbappendaddr_locked(&so->so_rcv,
- (struct sockaddr *)ripsrc, n, opts) == 0) {
- soroverflow_locked(so);
- m_freem(n);
- if (opts)
- m_freem(opts);
- } else
- sorwakeup_locked(so);
- } else
+ if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
+ return (0);
+
+ if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL)
+ return (0);
+
+ if ((inp->inp_flags & INP_CONTROLOPTS) ||
+ (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
+ ip_savecontrol(inp, &opts, ip, n);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (sbappendaddr_locked(&so->so_rcv,
+ (struct sockaddr *)ripsrc, n, opts) == 0) {
+ soroverflow_locked(so);
m_freem(n);
- return (policyfail);
+ if (opts)
+ m_freem(opts);
+ return (0);
+ }
+ sorwakeup_locked(so);
+
+ return (1);
+}
+
+struct rip_inp_match_ctx {
+ struct ip *ip;
+ int proto;
+};
+
+static bool
+rip_inp_match1(const struct inpcb *inp, void *v)
+{
+ struct rip_inp_match_ctx *ctx = v;
+
+ if (inp->inp_ip_p != ctx->proto)
+ return (false);
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ return (false);
+#endif
+ if (inp->inp_laddr.s_addr != ctx->ip->ip_dst.s_addr)
+ return (false);
+ if (inp->inp_faddr.s_addr != ctx->ip->ip_src.s_addr)
+ return (false);
+ return (true);
+}
+
+static bool
+rip_inp_match2(const struct inpcb *inp, void *v)
+{
+ struct rip_inp_match_ctx *ctx = v;
+
+ if (inp->inp_ip_p && inp->inp_ip_p != ctx->proto)
+ return (false);
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ return (false);
+#endif
+ if (!in_nullhost(inp->inp_laddr) &&
+ !in_hosteq(inp->inp_laddr, ctx->ip->ip_dst))
+ return (false);
+ if (!in_nullhost(inp->inp_faddr) &&
+ !in_hosteq(inp->inp_faddr, ctx->ip->ip_src))
+ return (false);
+ return (true);
}
/*
@@ -281,102 +321,57 @@
int
rip_input(struct mbuf **mp, int *offp, int proto)
{
+ struct rip_inp_match_ctx ctx = {
+ .ip = mtod(*mp, struct ip *),
+ .proto = proto,
+ };
+ struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
+ INPLOOKUP_RLOCKPCB, rip_inp_match1, &ctx);
struct ifnet *ifp;
struct mbuf *m = *mp;
- struct ip *ip = mtod(m, struct ip *);
- struct inpcb *inp, *last;
+ struct inpcb *inp;
struct sockaddr_in ripsrc;
- int hash;
-
- NET_EPOCH_ASSERT();
+ int appended;
*mp = NULL;
+ appended = 0;
bzero(&ripsrc, sizeof(ripsrc));
ripsrc.sin_len = sizeof(ripsrc);
ripsrc.sin_family = AF_INET;
- ripsrc.sin_addr = ip->ip_src;
- last = NULL;
+ ripsrc.sin_addr = ctx.ip->ip_src;
ifp = m->m_pkthdr.rcvif;
- hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
- ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
- CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
- if (inp->inp_ip_p != proto)
- continue;
-#ifdef INET6
- /* XXX inp locking */
- if ((inp->inp_vflag & INP_IPV4) == 0)
- continue;
-#endif
- if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
- continue;
- if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
- continue;
- if (last != NULL) {
- struct mbuf *n;
-
- n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
- if (n != NULL)
- (void) rip_append(last, ip, n, &ripsrc);
- /* XXX count dropped packet */
- INP_RUNLOCK(last);
- last = NULL;
- }
- INP_RLOCK(inp);
- if (__predict_false(inp->inp_flags2 & INP_FREED))
- goto skip_1;
- if (jailed_without_vnet(inp->inp_cred)) {
+ inpi.hash = INP_PCBHASH_RAW(proto, ctx.ip->ip_src.s_addr,
+ ctx.ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
+ while ((inp = inp_next(&inpi)) != NULL) {
+ INP_RLOCK_ASSERT(inp);
+ if (jailed_without_vnet(inp->inp_cred) &&
+ prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0) {
/*
* XXX: If faddr was bound to multicast group,
* jailed raw socket will drop datagram.
*/
- if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
- goto skip_1;
- }
- last = inp;
- continue;
- skip_1:
- INP_RUNLOCK(inp);
- }
- CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
- if (inp->inp_ip_p && inp->inp_ip_p != proto)
- continue;
-#ifdef INET6
- /* XXX inp locking */
- if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
-#endif
- if (!in_nullhost(inp->inp_laddr) &&
- !in_hosteq(inp->inp_laddr, ip->ip_dst))
- continue;
- if (!in_nullhost(inp->inp_faddr) &&
- !in_hosteq(inp->inp_faddr, ip->ip_src))
- continue;
- if (last != NULL) {
- struct mbuf *n;
-
- n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
- if (n != NULL)
- (void) rip_append(last, ip, n, &ripsrc);
- /* XXX count dropped packet */
- INP_RUNLOCK(last);
- last = NULL;
}
- INP_RLOCK(inp);
- if (__predict_false(inp->inp_flags2 & INP_FREED))
- goto skip_2;
- if (jailed_without_vnet(inp->inp_cred)) {
+ appended += rip_append(inp, ctx.ip, m, &ripsrc);
+ }
+
+ inpi.hash = 0;
+ inpi.match = rip_inp_match2;
+ MPASS(inpi.inp == NULL);
+ while ((inp = inp_next(&inpi)) != NULL) {
+ INP_RLOCK_ASSERT(inp);
+ if (jailed_without_vnet(inp->inp_cred) &&
+ !IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr)) &&
+ prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0)
/*
* Allow raw socket in jail to receive multicast;
* assume process had PRIV_NETINET_RAW at attach,
* and fall through into normal filter path if so.
*/
- if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
- prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
- goto skip_2;
- }
+ continue;
/*
* If this raw socket has multicast state, and we
* have received a multicast, check if this socket
@@ -384,7 +379,7 @@
* the responsibility of the transport layer.
*/
if (inp->inp_moptions != NULL &&
- IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
+ IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr))) {
/*
* If the incoming datagram is for IGMP, allow it
* through unconditionally to the raw socket.
@@ -406,7 +401,7 @@
bzero(&group, sizeof(struct sockaddr_in));
group.sin_len = sizeof(struct sockaddr_in);
group.sin_family = AF_INET;
- group.sin_addr = ip->ip_dst;
+ group.sin_addr = ctx.ip->ip_dst;
blocked = imo_multi_filter(inp->inp_moptions,
ifp,
@@ -416,27 +411,18 @@
if (blocked != MCAST_PASS) {
IPSTAT_INC(ips_notmember);
- goto skip_2;
+ continue;
}
}
- last = inp;
- continue;
- skip_2:
- INP_RUNLOCK(inp);
- }
- if (last != NULL) {
- if (rip_append(last, ip, m, &ripsrc) != 0)
- IPSTAT_INC(ips_delivered);
- INP_RUNLOCK(last);
- } else {
- if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) {
- IPSTAT_INC(ips_noproto);
- IPSTAT_DEC(ips_delivered);
- icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0);
- } else {
- m_freem(m);
- }
+ appended += rip_append(inp, ctx.ip, m, &ripsrc);
}
+ if (appended == 0 &&
+ inetsw[ip_protox[ctx.ip->ip_p]].pr_input == rip_input) {
+ IPSTAT_INC(ips_noproto);
+ IPSTAT_DEC(ips_delivered);
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0);
+ } else
+ m_freem(m);
return (IPPROTO_DONE);
}
@@ -906,18 +892,16 @@
error = soreserve(so, rip_sendspace, rip_recvspace);
if (error)
return (error);
- INP_INFO_WLOCK(&V_ripcbinfo);
error = in_pcballoc(so, &V_ripcbinfo);
- if (error) {
- INP_INFO_WUNLOCK(&V_ripcbinfo);
+ if (error)
return (error);
- }
inp = (struct inpcb *)so->so_pcb;
inp->inp_vflag |= INP_IPV4;
inp->inp_ip_p = proto;
inp->inp_ip_ttl = V_ip_defttl;
+ INP_HASH_WLOCK(&V_ripcbinfo);
rip_inshash(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
+ INP_HASH_WUNLOCK(&V_ripcbinfo);
INP_WUNLOCK(inp);
return (0);
}
@@ -932,9 +916,10 @@
KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
("rip_detach: not closed"));
- INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
+ INP_HASH_WLOCK(&V_ripcbinfo);
rip_delhash(inp);
+ INP_HASH_WUNLOCK(&V_ripcbinfo);
if (so == V_ip_mrouter && ip_mrouter_done)
ip_mrouter_done();
if (ip_rsvp_force_done)
@@ -943,7 +928,6 @@
ip_rsvp_done();
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
}
static void
@@ -952,16 +936,16 @@
struct inpcbinfo *pcbinfo;
pcbinfo = inp->inp_pcbinfo;
- INP_INFO_WLOCK(pcbinfo);
INP_WLOCK(inp);
+ INP_HASH_WLOCK(pcbinfo);
rip_delhash(inp);
inp->inp_faddr.s_addr = INADDR_ANY;
rip_inshash(inp);
+ INP_HASH_WUNLOCK(pcbinfo);
SOCK_LOCK(so);
so->so_state &= ~SS_ISCONNECTED;
SOCK_UNLOCK(so);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(pcbinfo);
}
static void
@@ -1027,13 +1011,13 @@
ifa_ifwithaddr_check((struct sockaddr *)addr) == 0))
return (EADDRNOTAVAIL);
- INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
+ INP_HASH_WLOCK(&V_ripcbinfo);
rip_delhash(inp);
inp->inp_laddr = addr->sin_addr;
rip_inshash(inp);
+ INP_HASH_WUNLOCK(&V_ripcbinfo);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
@@ -1053,14 +1037,14 @@
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
- INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
+ INP_HASH_WLOCK(&V_ripcbinfo);
rip_delhash(inp);
inp->inp_faddr = addr->sin_addr;
rip_inshash(inp);
+ INP_HASH_WUNLOCK(&V_ripcbinfo);
soisconnected(so);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
@@ -1126,8 +1110,9 @@
static int
rip_pcblist(SYSCTL_HANDLER_ARGS)
{
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_ripcbinfo,
+ INPLOOKUP_RLOCKPCB);
struct xinpgen xig;
- struct epoch_tracker et;
struct inpcb *inp;
int error;
@@ -1155,24 +1140,19 @@
if (error)
return (error);
- NET_EPOCH_ENTER(et);
- for (inp = CK_LIST_FIRST(V_ripcbinfo.ipi_listhead);
- inp != NULL;
- inp = CK_LIST_NEXT(inp, inp_list)) {
- INP_RLOCK(inp);
+ while ((inp = inp_next(&inpi)) != NULL) {
if (inp->inp_gencnt <= xig.xig_gen &&
cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
struct xinpcb xi;
in_pcbtoxinpcb(inp, &xi);
- INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xi, sizeof xi);
- if (error)
+ if (error) {
+ INP_RUNLOCK(inp);
break;
- } else
- INP_RUNLOCK(inp);
+ }
+ }
}
- NET_EPOCH_EXIT(et);
if (!error) {
/*
Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c
+++ sys/netinet/tcp_input.c
@@ -238,8 +238,6 @@
&VNET_NAME(tcp_autorcvbuf_max), 0,
"Max size of automatic receive buffer");
-VNET_DEFINE(struct inpcbhead, tcb);
-#define tcb6 tcb /* for KAME src sync over BSD*'s */
VNET_DEFINE(struct inpcbinfo, tcbinfo);
/*
Index: sys/netinet/tcp_lro.c
===================================================================
--- sys/netinet/tcp_lro.c
+++ sys/netinet/tcp_lro.c
@@ -1301,8 +1301,7 @@
/* Check if the inp is dead, Jim. */
if (tp == NULL ||
- (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) ||
- (inp->inp_flags2 & INP_FREED)) {
+ (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
INP_WUNLOCK(inp);
return (TCP_LRO_CANNOT);
}
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -1352,7 +1352,7 @@
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
INP_INFO_WLOCK(&V_tcbinfo);
- CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
+ CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inp);
if (inp->inp_flags & INP_TIMEWAIT) {
INP_WUNLOCK(inp);
@@ -1454,8 +1454,8 @@
"clipped from %d to %d.\n", __func__, oldhashsize,
hashsize);
}
- in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize,
- "tcp_inpcb", tcp_inpcb_init, IPI_HASHFIELDS_4TUPLE);
+ in_pcbinfo_init(&V_tcbinfo, "tcp", hashsize, hashsize,
+ "tcp_inpcb", tcp_inpcb_init);
/*
* These have to be type stable for the benefit of the timers.
@@ -1565,9 +1565,9 @@
* Sleep to let all tcpcb timers really disappear and cleanup.
*/
for (;;) {
- INP_LIST_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
n = V_tcbinfo.ipi_count;
- INP_LIST_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
if (n == 0)
break;
pause("tcpdes", hz / 10);
@@ -2241,7 +2241,7 @@
* therefore don't enter the loop below until the connection
* list has stabilised.
*/
- CK_LIST_FOREACH(inp, &V_tcb, inp_list) {
+ CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inp);
/* Important to skip tcptw structs. */
if (!(inp->inp_flags & INP_TIMEWAIT) &&
@@ -2290,7 +2290,6 @@
struct socket *so = tp->t_inpcb->inp_socket;
NET_EPOCH_ASSERT();
- INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
if (TCPS_HAVERCVDSYN(tp->t_state)) {
@@ -2502,7 +2501,6 @@
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
- INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
#ifdef TCP_OFFLOAD
@@ -2561,7 +2559,7 @@
* useful.
*/
INP_INFO_WLOCK(&V_tcbinfo);
- CK_LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
+ CK_LIST_FOREACH(inpb, &V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inpb);
if (inpb->inp_flags & INP_TIMEWAIT) {
INP_WUNLOCK(inpb);
@@ -2602,7 +2600,6 @@
{
struct tcpcb *tp;
- INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if ((inp->inp_flags & INP_TIMEWAIT) ||
@@ -2648,9 +2645,9 @@
static int
tcp_pcblist(SYSCTL_HANDLER_ARGS)
{
- struct epoch_tracker et;
- struct inpcb *inp;
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_RLOCKPCB);
struct xinpgen xig;
+ struct inpcb *inp;
int error;
if (req->newptr != NULL)
@@ -2683,11 +2680,7 @@
if (error)
return (error);
- NET_EPOCH_ENTER(et);
- for (inp = CK_LIST_FIRST(V_tcbinfo.ipi_listhead);
- inp != NULL;
- inp = CK_LIST_NEXT(inp, inp_list)) {
- INP_RLOCK(inp);
+ while ((inp = inp_next(&inpi)) != NULL) {
if (inp->inp_gencnt <= xig.xig_gen) {
int crerr;
@@ -2708,17 +2701,15 @@
struct xtcpcb xt;
tcp_inptoxtp(inp, &xt);
- INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xt, sizeof xt);
- if (error)
+ if (error) {
+ INP_RUNLOCK(inp);
break;
- else
+ } else
continue;
}
}
- INP_RUNLOCK(inp);
}
- NET_EPOCH_EXIT(et);
if (!error) {
/*
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -908,7 +908,6 @@
VNET_DECLARE(int, tcp_sendspace);
VNET_DECLARE(int, tcp_udp_tunneling_overhead);
VNET_DECLARE(int, tcp_udp_tunneling_port);
-VNET_DECLARE(struct inpcbhead, tcb);
VNET_DECLARE(struct inpcbinfo, tcbinfo);
#define V_tcp_do_lrd VNET(tcp_do_lrd)
@@ -917,7 +916,6 @@
#define V_tcp_do_newcwv VNET(tcp_do_newcwv)
#define V_drop_synfin VNET(drop_synfin)
#define V_path_mtu_discovery VNET(path_mtu_discovery)
-#define V_tcb VNET(tcb)
#define V_tcbinfo VNET(tcbinfo)
#define V_tcp_abc_l_var VNET(tcp_abc_l_var)
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
Index: sys/netinet/udp_usrreq.c
===================================================================
--- sys/netinet/udp_usrreq.c
+++ sys/netinet/udp_usrreq.c
@@ -143,9 +143,7 @@
SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
&udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
-VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */
VNET_DEFINE(struct inpcbinfo, udbinfo);
-VNET_DEFINE(struct inpcbhead, ulitecb);
VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
VNET_DEFINE_STATIC(uma_zone_t, udpcb_zone);
#define V_udpcb_zone VNET(udpcb_zone)
@@ -207,8 +205,8 @@
* Once we can calculate the flowid that way and re-establish
* a 4-tuple, flip this to 4-tuple.
*/
- in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
- "udp_inpcb", udp_inpcb_init, IPI_HASHFIELDS_2TUPLE);
+ in_pcbinfo_init(&V_udbinfo, "udp", UDBHASHSIZE, UDBHASHSIZE,
+ "udp_inpcb", udp_inpcb_init);
V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_zone_set_max(V_udpcb_zone, maxsockets);
@@ -221,9 +219,8 @@
udplite_init(void)
{
- in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE,
- UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init,
- IPI_HASHFIELDS_2TUPLE);
+ in_pcbinfo_init(&V_ulitecbinfo, "udplite", UDBHASHSIZE,
+ UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init);
}
/*
@@ -389,6 +386,123 @@
return (0);
}
+static bool
+udp_multi_match(const struct inpcb *inp, void *v)
+{
+ struct ip *ip = v;
+ struct udphdr *uh = (struct udphdr *)(ip + 1);
+
+ if (inp->inp_lport != uh->uh_dport)
+ return (false);
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ return (false);
+#endif
+ if (inp->inp_laddr.s_addr != INADDR_ANY &&
+ inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
+ return (false);
+ if (inp->inp_faddr.s_addr != INADDR_ANY &&
+ inp->inp_faddr.s_addr != ip->ip_src.s_addr)
+ return (false);
+ if (inp->inp_fport != 0 &&
+ inp->inp_fport != uh->uh_sport)
+ return (false);
+
+ return (true);
+}
+
+static int
+udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto),
+ INPLOOKUP_RLOCKPCB, udp_multi_match, ip);
+ struct udphdr *uh = (struct udphdr *)(ip + 1);
+ struct inpcb *inp;
+ struct mbuf *n;
+ int appends = 0;
+
+ MPASS(ip->ip_hl == sizeof(struct ip) >> 2);
+
+ while ((inp = inp_next(&inpi)) != NULL) {
+ /*
+ * XXXRW: Because we weren't holding either the inpcb
+ * or the hash lock when we checked for a match
+ * before, we should probably recheck now that the
+ * inpcb lock is held.
+ */
+ /*
+ * Handle socket delivery policy for any-source
+ * and source-specific multicast. [RFC3678]
+ */
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
+ struct ip_moptions *imo;
+ struct sockaddr_in group;
+ int blocked;
+
+ imo = inp->inp_moptions;
+ if (imo == NULL)
+ continue;
+ bzero(&group, sizeof(struct sockaddr_in));
+ group.sin_len = sizeof(struct sockaddr_in);
+ group.sin_family = AF_INET;
+ group.sin_addr = ip->ip_dst;
+
+ blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif,
+ (struct sockaddr *)&group,
+ (struct sockaddr *)&udp_in[0]);
+ if (blocked != MCAST_PASS) {
+ if (blocked == MCAST_NOTGMEMBER)
+ IPSTAT_INC(ips_notmember);
+ if (blocked == MCAST_NOTSMEMBER ||
+ blocked == MCAST_MUTED)
+ UDPSTAT_INC(udps_filtermcast);
+ continue;
+ }
+ }
+ if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
+ if (proto == IPPROTO_UDPLITE)
+ UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
+ else
+ UDP_PROBE(receive, NULL, inp, ip, inp, uh);
+ if (udp_append(inp, ip, n, sizeof(struct ip), udp_in)) {
+ INP_RUNLOCK(inp);
+ break;
+ } else
+ appends++;
+ }
+ /*
+ * Don't look for additional matches if this one does
+ * not have either the SO_REUSEPORT or SO_REUSEADDR
+ * socket options set. This heuristic avoids
+ * searching through all pcbs in the common case of a
+ * non-shared port. It assumes that an application
+ * will never clear these options after setting them.
+ */
+ if ((inp->inp_socket->so_options &
+ (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) {
+ INP_RUNLOCK(inp);
+ break;
+ }
+ }
+ m_freem(m);
+
+	if (appends == 0) {
+		/*
+		 * No matching pcb found; discard datagram. (No need
+		 * to send an ICMP Port Unreachable for a broadcast
+		 * or multicast datagram.)
+		 */
+ UDPSTAT_INC(udps_noport);
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
+ UDPSTAT_INC(udps_noportmcast);
+ else
+ UDPSTAT_INC(udps_noportbcast);
+ }
+
+ return (IPPROTO_DONE);
+}
+
int
udp_input(struct mbuf **mp, int *offp, int proto)
{
@@ -524,140 +638,15 @@
}
}
- pcbinfo = udp_get_inpcbinfo(proto);
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
- in_broadcast(ip->ip_dst, ifp)) {
- struct inpcb *last;
- struct inpcbhead *pcblist;
-
- NET_EPOCH_ASSERT();
-
- pcblist = udp_get_pcblist(proto);
- last = NULL;
- CK_LIST_FOREACH(inp, pcblist, inp_list) {
- if (inp->inp_lport != uh->uh_dport)
- continue;
-#ifdef INET6
- if ((inp->inp_vflag & INP_IPV4) == 0)
- continue;
-#endif
- if (inp->inp_laddr.s_addr != INADDR_ANY &&
- inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
- continue;
- if (inp->inp_faddr.s_addr != INADDR_ANY &&
- inp->inp_faddr.s_addr != ip->ip_src.s_addr)
- continue;
- if (inp->inp_fport != 0 &&
- inp->inp_fport != uh->uh_sport)
- continue;
-
- INP_RLOCK(inp);
-
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_RUNLOCK(inp);
- continue;
- }
-
- /*
- * XXXRW: Because we weren't holding either the inpcb
- * or the hash lock when we checked for a match
- * before, we should probably recheck now that the
- * inpcb lock is held.
- */
-
- /*
- * Handle socket delivery policy for any-source
- * and source-specific multicast. [RFC3678]
- */
- if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
- struct ip_moptions *imo;
- struct sockaddr_in group;
- int blocked;
-
- imo = inp->inp_moptions;
- if (imo == NULL) {
- INP_RUNLOCK(inp);
- continue;
- }
- bzero(&group, sizeof(struct sockaddr_in));
- group.sin_len = sizeof(struct sockaddr_in);
- group.sin_family = AF_INET;
- group.sin_addr = ip->ip_dst;
-
- blocked = imo_multi_filter(imo, ifp,
- (struct sockaddr *)&group,
- (struct sockaddr *)&udp_in[0]);
- if (blocked != MCAST_PASS) {
- if (blocked == MCAST_NOTGMEMBER)
- IPSTAT_INC(ips_notmember);
- if (blocked == MCAST_NOTSMEMBER ||
- blocked == MCAST_MUTED)
- UDPSTAT_INC(udps_filtermcast);
- INP_RUNLOCK(inp);
- continue;
- }
- }
- if (last != NULL) {
- struct mbuf *n;
-
- if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) !=
- NULL) {
- if (proto == IPPROTO_UDPLITE)
- UDPLITE_PROBE(receive, NULL, last, ip,
- last, uh);
- else
- UDP_PROBE(receive, NULL, last, ip, last,
- uh);
- if (udp_append(last, ip, n, iphlen,
- udp_in)) {
- INP_RUNLOCK(inp);
- goto badunlocked;
- }
- }
- /* Release PCB lock taken on previous pass. */
- INP_RUNLOCK(last);
- }
- last = inp;
- /*
- * Don't look for additional matches if this one does
- * not have either the SO_REUSEPORT or SO_REUSEADDR
- * socket options set. This heuristic avoids
- * searching through all pcbs in the common case of a
- * non-shared port. It assumes that an application
- * will never clear these options after setting them.
- */
- if ((last->inp_socket->so_options &
- (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0)
- break;
- }
+ in_broadcast(ip->ip_dst, ifp))
+ return (udp_multi_input(m, proto, udp_in));
- if (last == NULL) {
- /*
- * No matching pcb found; discard datagram. (No need
- * to send an ICMP Port Unreachable for a broadcast
- * or multicast datgram.)
- */
- UDPSTAT_INC(udps_noport);
- if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
- UDPSTAT_INC(udps_noportmcast);
- else
- UDPSTAT_INC(udps_noportbcast);
- goto badunlocked;
- }
- if (proto == IPPROTO_UDPLITE)
- UDPLITE_PROBE(receive, NULL, last, ip, last, uh);
- else
- UDP_PROBE(receive, NULL, last, ip, last, uh);
- if (udp_append(last, ip, m, iphlen, udp_in) == 0)
- INP_RUNLOCK(last);
- return (IPPROTO_DONE);
- }
+ pcbinfo = udp_get_inpcbinfo(proto);
/*
* Locate pcb for datagram.
- */
-
- /*
+ *
* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
*/
if ((m->m_flags & M_IP_NEXTHOP) &&
@@ -857,8 +846,9 @@
static int
udp_pcblist(SYSCTL_HANDLER_ARGS)
{
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_udbinfo,
+ INPLOOKUP_RLOCKPCB);
struct xinpgen xig;
- struct epoch_tracker et;
struct inpcb *inp;
int error;
@@ -886,24 +876,19 @@
if (error)
return (error);
- NET_EPOCH_ENTER(et);
- for (inp = CK_LIST_FIRST(V_udbinfo.ipi_listhead);
- inp != NULL;
- inp = CK_LIST_NEXT(inp, inp_list)) {
- INP_RLOCK(inp);
+ while ((inp = inp_next(&inpi)) != NULL) {
if (inp->inp_gencnt <= xig.xig_gen &&
cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
struct xinpcb xi;
in_pcbtoxinpcb(inp, &xi);
- INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xi, sizeof xi);
- if (error)
+ if (error) {
+ INP_RUNLOCK(inp);
break;
- } else
- INP_RUNLOCK(inp);
+ }
+ }
}
- NET_EPOCH_EXIT(et);
if (!error) {
/*
@@ -1289,15 +1274,16 @@
laddr = inp->inp_laddr;
lport = inp->inp_lport;
if (src.sin_family == AF_INET) {
- INP_HASH_LOCK_ASSERT(pcbinfo);
if ((lport == 0) ||
(laddr.s_addr == INADDR_ANY &&
src.sin_addr.s_addr == INADDR_ANY)) {
error = EINVAL;
goto release;
}
+ INP_HASH_WLOCK(pcbinfo);
error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
&laddr.s_addr, &lport, td->td_ucred);
+ INP_HASH_WUNLOCK(pcbinfo);
if (error)
goto release;
}
@@ -1340,12 +1326,14 @@
inp->inp_lport == 0 ||
sin->sin_addr.s_addr == INADDR_ANY ||
sin->sin_addr.s_addr == INADDR_BROADCAST) {
- INP_HASH_LOCK_ASSERT(pcbinfo);
+ INP_HASH_WLOCK(pcbinfo);
error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
&lport, &faddr.s_addr, &fport, NULL,
td->td_ucred);
- if (error)
+ if (error) {
+ INP_HASH_WUNLOCK(pcbinfo);
goto release;
+ }
/*
* XXXRW: Why not commit the port if the address is
@@ -1362,7 +1350,6 @@
if (prison_flag(td->td_ucred, PR_IP4))
inp->inp_laddr = laddr;
inp->inp_lport = lport;
- INP_HASH_WLOCK(pcbinfo);
error = in_pcbinshash(inp);
INP_HASH_WUNLOCK(pcbinfo);
if (error != 0) {
@@ -1371,7 +1358,8 @@
goto release;
}
inp->inp_flags |= INP_ANONPORT;
- }
+ } else
+ INP_HASH_WUNLOCK(pcbinfo);
} else {
faddr = sin->sin_addr;
fport = sin->sin_port;
@@ -1565,12 +1553,9 @@
error = soreserve(so, udp_sendspace, udp_recvspace);
if (error)
return (error);
- INP_INFO_WLOCK(pcbinfo);
error = in_pcballoc(so, pcbinfo);
- if (error) {
- INP_INFO_WUNLOCK(pcbinfo);
+ if (error)
return (error);
- }
inp = sotoinpcb(so);
inp->inp_vflag |= INP_IPV4;
@@ -1582,12 +1567,10 @@
if (error) {
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(pcbinfo);
return (error);
}
-
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(pcbinfo);
+
return (0);
}
#endif /* INET */
@@ -1723,14 +1706,12 @@
KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
("udp_detach: not disconnected"));
- INP_INFO_WLOCK(pcbinfo);
INP_WLOCK(inp);
up = intoudpcb(inp);
KASSERT(up != NULL, ("%s: up == NULL", __func__));
inp->inp_ppcb = NULL;
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(pcbinfo);
udp_discardcb(up);
}
Index: sys/netinet/udp_var.h
===================================================================
--- sys/netinet/udp_var.h
+++ sys/netinet/udp_var.h
@@ -136,13 +136,9 @@
SYSCTL_DECL(_net_inet_udp);
extern struct pr_usrreqs udp_usrreqs;
-VNET_DECLARE(struct inpcbhead, udb);
VNET_DECLARE(struct inpcbinfo, udbinfo);
-VNET_DECLARE(struct inpcbhead, ulitecb);
VNET_DECLARE(struct inpcbinfo, ulitecbinfo);
-#define V_udb VNET(udb)
#define V_udbinfo VNET(udbinfo)
-#define V_ulitecb VNET(ulitecb)
#define V_ulitecbinfo VNET(ulitecbinfo)
extern u_long udp_sendspace;
@@ -163,12 +159,6 @@
return (protocol == IPPROTO_UDP) ? &V_udbinfo : &V_ulitecbinfo;
}
-static __inline struct inpcbhead *
-udp_get_pcblist(int protocol)
-{
- return (protocol == IPPROTO_UDP) ? &V_udb : &V_ulitecb;
-}
-
int udp_newudpcb(struct inpcb *);
void udp_discardcb(struct udpcb *);
Index: sys/netinet6/icmp6.c
===================================================================
--- sys/netinet6/icmp6.c
+++ sys/netinet6/icmp6.c
@@ -124,14 +124,12 @@
#endif /* VIMAGE */
VNET_DECLARE(struct inpcbinfo, ripcbinfo);
-VNET_DECLARE(struct inpcbhead, ripcb);
VNET_DECLARE(int, icmp6errppslim);
VNET_DEFINE_STATIC(int, icmp6errpps_count) = 0;
VNET_DEFINE_STATIC(struct timeval, icmp6errppslim_last);
VNET_DECLARE(int, icmp6_nodeinfo);
#define V_ripcbinfo VNET(ripcbinfo)
-#define V_ripcb VNET(ripcb)
#define V_icmp6errppslim VNET(icmp6errppslim)
#define V_icmp6errpps_count VNET(icmp6errpps_count)
#define V_icmp6errppslim_last VNET(icmp6errppslim_last)
@@ -1875,21 +1873,39 @@
return (copied);
}
+static bool
+icmp6_rip6_match(const struct inpcb *inp, void *v)
+{
+ struct ip6_hdr *ip6 = v;
+
+ if ((inp->inp_vflag & INP_IPV6) == 0)
+ return (false);
+ if (inp->inp_ip_p != IPPROTO_ICMPV6)
+ return (false);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
+ !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst))
+ return (false);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
+ !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src))
+ return (false);
+ return (true);
+}
+
/*
* XXX almost dup'ed code with rip6_input.
*/
static int
icmp6_rip6_input(struct mbuf **mp, int off)
{
- struct mbuf *m = *mp;
+ struct mbuf *n, *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
+ struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
+ INPLOOKUP_RLOCKPCB, icmp6_rip6_match, ip6);
struct inpcb *inp;
- struct inpcb *last = NULL;
struct sockaddr_in6 fromsa;
struct icmp6_hdr *icmp6;
struct mbuf *opts = NULL;
-
- NET_EPOCH_ASSERT();
+ int delivered = 0;
/* This is assumed to be safe; icmp6_input() does a pullup. */
icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
@@ -1908,125 +1924,64 @@
return (IPPROTO_DONE);
}
- CK_LIST_FOREACH(inp, &V_ripcb, inp_list) {
- if ((inp->inp_vflag & INP_IPV6) == 0)
- continue;
- if (inp->inp_ip_p != IPPROTO_ICMPV6)
- continue;
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
- !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst))
- continue;
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
- !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src))
- continue;
- INP_RLOCK(inp);
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_RUNLOCK(inp);
- continue;
- }
+ while ((inp = inp_next(&inpi)) != NULL) {
if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
- inp->in6p_icmp6filt)) {
- INP_RUNLOCK(inp);
+ inp->in6p_icmp6filt))
continue;
- }
- if (last != NULL) {
- struct mbuf *n = NULL;
-
- /*
- * Recent network drivers tend to allocate a single
- * mbuf cluster, rather than to make a couple of
- * mbufs without clusters. Also, since the IPv6 code
- * path tries to avoid m_pullup(), it is highly
- * probable that we still have an mbuf cluster here
- * even though the necessary length can be stored in an
- * mbuf's internal buffer.
- * Meanwhile, the default size of the receive socket
- * buffer for raw sockets is not so large. This means
- * the possibility of packet loss is relatively higher
- * than before. To avoid this scenario, we copy the
- * received data to a separate mbuf that does not use
- * a cluster, if possible.
- * XXX: it is better to copy the data after stripping
- * intermediate headers.
- */
- if ((m->m_flags & M_EXT) && m->m_next == NULL &&
- m->m_len <= MHLEN) {
- n = m_get(M_NOWAIT, m->m_type);
- if (n != NULL) {
- if (m_dup_pkthdr(n, m, M_NOWAIT)) {
- bcopy(m->m_data, n->m_data,
- m->m_len);
- n->m_len = m->m_len;
- } else {
- m_free(n);
- n = NULL;
- }
- }
- }
- if (n != NULL ||
- (n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
- if (last->inp_flags & INP_CONTROLOPTS)
- ip6_savecontrol(last, n, &opts);
- /* strip intermediate headers */
- m_adj(n, off);
- SOCKBUF_LOCK(&last->inp_socket->so_rcv);
- if (sbappendaddr_locked(
- &last->inp_socket->so_rcv,
- (struct sockaddr *)&fromsa, n, opts)
- == 0) {
- soroverflow_locked(last->inp_socket);
- m_freem(n);
- if (opts) {
- m_freem(opts);
- }
- } else
- sorwakeup_locked(last->inp_socket);
- opts = NULL;
- }
- INP_RUNLOCK(last);
- }
- last = inp;
- }
- if (last != NULL) {
- if (last->inp_flags & INP_CONTROLOPTS)
- ip6_savecontrol(last, m, &opts);
- /* strip intermediate headers */
- m_adj(m, off);
-
- /* avoid using mbuf clusters if possible (see above) */
+ /*
+ * Recent network drivers tend to allocate a single
+ * mbuf cluster, rather than to make a couple of
+ * mbufs without clusters. Also, since the IPv6 code
+ * path tries to avoid m_pullup(), it is highly
+ * probable that we still have an mbuf cluster here
+ * even though the necessary length can be stored in an
+ * mbuf's internal buffer.
+ * Meanwhile, the default size of the receive socket
+ * buffer for raw sockets is not so large. This means
+ * the possibility of packet loss is relatively higher
+ * than before. To avoid this scenario, we copy the
+ * received data to a separate mbuf that does not use
+ * a cluster, if possible.
+ * XXX: it is better to copy the data after stripping
+ * intermediate headers.
+ */
if ((m->m_flags & M_EXT) && m->m_next == NULL &&
m->m_len <= MHLEN) {
- struct mbuf *n;
-
n = m_get(M_NOWAIT, m->m_type);
if (n != NULL) {
if (m_dup_pkthdr(n, m, M_NOWAIT)) {
bcopy(m->m_data, n->m_data, m->m_len);
n->m_len = m->m_len;
-
- m_freem(m);
- m = n;
} else {
- m_freem(n);
+ m_free(n);
n = NULL;
}
}
- }
- SOCKBUF_LOCK(&last->inp_socket->so_rcv);
- if (sbappendaddr_locked(&last->inp_socket->so_rcv,
- (struct sockaddr *)&fromsa, m, opts) == 0) {
- m_freem(m);
+ } else
+ n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
+ if (n == NULL)
+ continue;
+ if (inp->inp_flags & INP_CONTROLOPTS)
+ ip6_savecontrol(inp, n, &opts);
+ /* strip intermediate headers */
+ m_adj(n, off);
+ SOCKBUF_LOCK(&inp->inp_socket->so_rcv);
+ if (sbappendaddr_locked(&inp->inp_socket->so_rcv,
+ (struct sockaddr *)&fromsa, n, opts) == 0) {
+ soroverflow_locked(inp->inp_socket);
+ m_freem(n);
if (opts)
m_freem(opts);
- soroverflow_locked(last->inp_socket);
- } else
- sorwakeup_locked(last->inp_socket);
- INP_RUNLOCK(last);
- } else {
- m_freem(m);
- IP6STAT_DEC(ip6s_delivered);
+ } else {
+ sorwakeup_locked(inp->inp_socket);
+ delivered++;
+ }
+ opts = NULL;
}
+ m_freem(m);
*mp = NULL;
+ if (delivered == 0)
+ IP6STAT_DEC(ip6s_delivered);
return (IPPROTO_DONE);
}
Index: sys/netinet6/in6_pcb.c
===================================================================
--- sys/netinet6/in6_pcb.c
+++ sys/netinet6/in6_pcb.c
@@ -718,7 +718,7 @@
}
errno = inet6ctlerrmap[cmd];
INP_INFO_WLOCK(pcbinfo);
- CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
+ CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) {
INP_WLOCK(inp);
if ((inp->inp_vflag & INP_IPV6) == 0) {
INP_WUNLOCK(inp);
@@ -868,49 +868,54 @@
}
}
+static bool
+in6_multi_match(const struct inpcb *inp, void *v __unused)
+{
+
+ if ((inp->inp_vflag & INP_IPV6) && inp->in6p_moptions != NULL)
+ return (true);
+ else
+ return (false);
+}
+
void
in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
+ struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_RLOCKPCB,
+ in6_multi_match, NULL);
struct inpcb *inp;
struct in6_multi *inm;
struct in6_mfilter *imf;
struct ip6_moptions *im6o;
- INP_INFO_WLOCK(pcbinfo);
- CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
- INP_WLOCK(inp);
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_WUNLOCK(inp);
- continue;
- }
+ IN6_MULTI_LOCK_ASSERT();
+
+ while ((inp = inp_next(&inpi)) != NULL) {
+ INP_RLOCK_ASSERT(inp);
+
im6o = inp->in6p_moptions;
- if ((inp->inp_vflag & INP_IPV6) && im6o != NULL) {
- /*
- * Unselect the outgoing ifp for multicast if it
- * is being detached.
- */
- if (im6o->im6o_multicast_ifp == ifp)
- im6o->im6o_multicast_ifp = NULL;
- /*
- * Drop multicast group membership if we joined
- * through the interface being detached.
- */
+ /*
+ * Unselect the outgoing ifp for multicast if it
+ * is being detached.
+ */
+ if (im6o->im6o_multicast_ifp == ifp)
+ im6o->im6o_multicast_ifp = NULL;
+ /*
+ * Drop multicast group membership if we joined
+ * through the interface being detached.
+ */
restart:
- IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) {
- if ((inm = imf->im6f_in6m) == NULL)
- continue;
- if (inm->in6m_ifp != ifp)
- continue;
- ip6_mfilter_remove(&im6o->im6o_head, imf);
- IN6_MULTI_LOCK_ASSERT();
- in6_leavegroup_locked(inm, NULL);
- ip6_mfilter_free(imf);
- goto restart;
- }
+ IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) {
+ if ((inm = imf->im6f_in6m) == NULL)
+ continue;
+ if (inm->in6m_ifp != ifp)
+ continue;
+ ip6_mfilter_remove(&im6o->im6o_head, imf);
+ in6_leavegroup_locked(inm, NULL);
+ ip6_mfilter_free(imf);
+ goto restart;
}
- INP_WUNLOCK(inp);
}
- INP_INFO_WUNLOCK(pcbinfo);
}
/*
@@ -1126,20 +1131,16 @@
{
struct inpcb *inp;
+ smr_enter(pcbinfo->ipi_smr);
inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
if (inp != NULL) {
- if (lookupflags & INPLOOKUP_WLOCKPCB) {
- INP_WLOCK(inp);
- } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
- INP_RLOCK(inp);
- } else
- panic("%s: locking bug", __func__);
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_UNLOCK(inp);
+ if (__predict_false(inp_smr_lock(inp,
+ (lookupflags & INPLOOKUP_LOCKMASK)) == false))
inp = NULL;
- }
- }
+ } else
+ smr_exit(pcbinfo->ipi_smr);
+
return (inp);
}
Index: sys/netinet6/ip6_gre.c
===================================================================
--- sys/netinet6/ip6_gre.c
+++ sys/netinet6/ip6_gre.c
@@ -216,30 +216,15 @@
in6_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp,
const struct sockaddr *sa, void *ctx)
{
- struct epoch_tracker et;
struct gre_socket *gs;
struct gre_softc *sc;
struct sockaddr_in6 dst;
- NET_EPOCH_ENTER(et);
- /*
- * udp_append() holds reference to inp, it is safe to check
- * inp_flags2 without INP_RLOCK().
- * If socket was closed before we have entered NET_EPOCH section,
- * INP_FREED flag should be set. Otherwise it should be safe to
- * make access to ctx data, because gre_so will be freed by
- * gre_sofree() via NET_EPOCH_CALL().
- */
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- NET_EPOCH_EXIT(et);
- m_freem(m);
- return;
- }
+ NET_EPOCH_ASSERT();
gs = (struct gre_socket *)ctx;
dst = *(const struct sockaddr_in6 *)sa;
if (sa6_embedscope(&dst, 0)) {
- NET_EPOCH_EXIT(et);
m_freem(m);
return;
}
@@ -249,11 +234,9 @@
}
if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){
gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc);
- NET_EPOCH_EXIT(et);
return;
}
m_freem(m);
- NET_EPOCH_EXIT(et);
}
static int
Index: sys/netinet6/raw_ip6.c
===================================================================
--- sys/netinet6/raw_ip6.c
+++ sys/netinet6/raw_ip6.c
@@ -119,9 +119,7 @@
* Raw interface to IP6 protocol.
*/
-VNET_DECLARE(struct inpcbhead, ripcb);
VNET_DECLARE(struct inpcbinfo, ripcbinfo);
-#define V_ripcb VNET(ripcb)
#define V_ripcbinfo VNET(ripcbinfo)
extern u_long rip_sendspace;
@@ -153,6 +151,33 @@
int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *);
int (*mrt6_ioctl)(u_long, caddr_t);
+struct rip6_inp_match_ctx {
+ struct ip6_hdr *ip6;
+ int proto;
+};
+
+static bool
+rip6_inp_match(const struct inpcb *inp, void *v)
+{
+ struct rip6_inp_match_ctx *c = v;
+ struct ip6_hdr *ip6 = c->ip6;
+ int proto = c->proto;
+
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV6) == 0)
+ return (false);
+ if (inp->inp_ip_p && inp->inp_ip_p != proto)
+ return (false);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
+ !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst))
+ return (false);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
+ !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src))
+ return (false);
+
+ return (true);
+}
+
/*
* Setup generic address and protocol structures for raw_input routine, then
* pass them along with mbuf chain.
@@ -161,12 +186,15 @@
rip6_input(struct mbuf **mp, int *offp, int proto)
{
struct ifnet *ifp;
- struct mbuf *m = *mp;
+ struct mbuf *n, *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct inpcb *inp;
- struct inpcb *last = NULL;
struct mbuf *opts = NULL;
struct sockaddr_in6 fromsa;
+ struct rip6_inp_match_ctx ctx = { .ip6 = ip6, .proto = proto };
+ struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
+ INPLOOKUP_RLOCKPCB, rip6_inp_match, &ctx);
+ int delivered = 0;
NET_EPOCH_ASSERT();
@@ -176,70 +204,27 @@
ifp = m->m_pkthdr.rcvif;
- CK_LIST_FOREACH(inp, &V_ripcb, inp_list) {
- /* XXX inp locking */
- if ((inp->inp_vflag & INP_IPV6) == 0)
- continue;
- if (inp->inp_ip_p &&
- inp->inp_ip_p != proto)
- continue;
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
- !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst))
- continue;
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
- !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src))
- continue;
- if (last != NULL) {
- struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
-
+ while ((inp = inp_next(&inpi)) != NULL) {
+ INP_RLOCK_ASSERT(inp);
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
- /*
- * Check AH/ESP integrity.
- */
- if (IPSEC_ENABLED(ipv6)) {
- if (n != NULL &&
- IPSEC_CHECK_POLICY(ipv6, n, last) != 0) {
- m_freem(n);
- /* Do not inject data into pcb. */
- n = NULL;
- }
- }
-#endif /* IPSEC */
- if (n) {
- if (last->inp_flags & INP_CONTROLOPTS ||
- last->inp_socket->so_options & SO_TIMESTAMP)
- ip6_savecontrol(last, n, &opts);
- /* strip intermediate headers */
- m_adj(n, *offp);
- if (sbappendaddr(&last->inp_socket->so_rcv,
- (struct sockaddr *)&fromsa,
- n, opts) == 0) {
- soroverflow(last->inp_socket);
- m_freem(n);
- if (opts)
- m_freem(opts);
- RIP6STAT_INC(rip6s_fullsock);
- } else
- sorwakeup(last->inp_socket);
- opts = NULL;
- }
- INP_RUNLOCK(last);
- last = NULL;
+ /*
+ * Check AH/ESP integrity.
+ */
+ if (IPSEC_ENABLED(ipv6) &&
+ IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) {
+ /* Do not inject data into pcb. */
+ continue;
}
- INP_RLOCK(inp);
- if (__predict_false(inp->inp_flags2 & INP_FREED))
- goto skip_2;
- if (jailed_without_vnet(inp->inp_cred)) {
+#endif /* IPSEC */
+ if (jailed_without_vnet(inp->inp_cred) &&
+ !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
+ prison_check_ip6(inp->inp_cred, &ip6->ip6_dst) != 0)
/*
* Allow raw socket in jail to receive multicast;
* assume process had PRIV_NETINET_RAW at attach,
* and fall through into normal filter path if so.
*/
- if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
- prison_check_ip6(inp->inp_cred,
- &ip6->ip6_dst) != 0)
- goto skip_2;
- }
+ continue;
if (inp->in6p_cksum != -1) {
RIP6STAT_INC(rip6s_isum);
if (m->m_pkthdr.len - (*offp + inp->in6p_cksum) < 2 ||
@@ -251,8 +236,9 @@
* ICMP6 message. Set proto to IPPROTO_NONE
* to achieve that.
*/
+ INP_RUNLOCK(inp);
proto = IPPROTO_NONE;
- goto skip_2;
+ break;
}
}
/*
@@ -298,43 +284,30 @@
}
if (blocked != MCAST_PASS) {
IP6STAT_INC(ip6s_notmember);
- goto skip_2;
+ continue;
}
}
- last = inp;
- continue;
-skip_2:
- INP_RUNLOCK(inp);
- }
-#if defined(IPSEC) || defined(IPSEC_SUPPORT)
- /*
- * Check AH/ESP integrity.
- */
- if (IPSEC_ENABLED(ipv6) && last != NULL &&
- IPSEC_CHECK_POLICY(ipv6, m, last) != 0) {
- m_freem(m);
- IP6STAT_DEC(ip6s_delivered);
- /* Do not inject data into pcb. */
- INP_RUNLOCK(last);
- } else
-#endif /* IPSEC */
- if (last != NULL) {
- if (last->inp_flags & INP_CONTROLOPTS ||
- last->inp_socket->so_options & SO_TIMESTAMP)
- ip6_savecontrol(last, m, &opts);
- /* Strip intermediate headers. */
- m_adj(m, *offp);
- if (sbappendaddr(&last->inp_socket->so_rcv,
- (struct sockaddr *)&fromsa, m, opts) == 0) {
- soroverflow(last->inp_socket);
- m_freem(m);
+ if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL)
+ continue;
+ if (inp->inp_flags & INP_CONTROLOPTS ||
+ inp->inp_socket->so_options & SO_TIMESTAMP)
+ ip6_savecontrol(inp, n, &opts);
+ /* strip intermediate headers */
+ m_adj(n, *offp);
+ if (sbappendaddr(&inp->inp_socket->so_rcv,
+ (struct sockaddr *)&fromsa, n, opts) == 0) {
+ soroverflow(inp->inp_socket);
+ m_freem(n);
if (opts)
m_freem(opts);
RIP6STAT_INC(rip6s_fullsock);
- } else
- sorwakeup(last->inp_socket);
- INP_RUNLOCK(last);
- } else {
+ } else {
+ sorwakeup(inp->inp_socket);
+ delivered++;
+ }
+ opts = NULL;
+ }
+ if (delivered == 0) {
RIP6STAT_INC(rip6s_nosock);
if (m->m_flags & M_MCAST)
RIP6STAT_INC(rip6s_nosockmcast);
@@ -345,7 +318,8 @@
ICMP6_PARAMPROB_NEXTHEADER,
ip6_get_prevhdr(m, *offp));
IP6STAT_DEC(ip6s_delivered);
- }
+ } else
+ m_freem(m);
return (IPPROTO_DONE);
}
@@ -678,15 +652,12 @@
filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT);
if (filter == NULL)
return (ENOMEM);
- INP_INFO_WLOCK(&V_ripcbinfo);
error = in_pcballoc(so, &V_ripcbinfo);
if (error) {
- INP_INFO_WUNLOCK(&V_ripcbinfo);
free(filter, M_PCB);
return (error);
}
inp = (struct inpcb *)so->so_pcb;
- INP_INFO_WUNLOCK(&V_ripcbinfo);
inp->inp_vflag |= INP_IPV6;
inp->inp_ip_p = (long)proto;
inp->in6p_hops = -1; /* use kernel default */
@@ -708,12 +679,10 @@
if (so == V_ip6_mrouter && ip6_mrouter_done)
ip6_mrouter_done();
/* xxx: RSVP */
- INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
free(inp->in6p_icmp6filt, M_PCB);
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
}
/* XXXRW: This can't ever be called. */
Index: sys/netinet6/udp6_usrreq.c
===================================================================
--- sys/netinet6/udp6_usrreq.c
+++ sys/netinet6/udp6_usrreq.c
@@ -207,6 +207,137 @@
return (0);
}
+struct udp6_multi_match_ctx {
+ struct ip6_hdr *ip6;
+ struct udphdr *uh;
+};
+
+static bool
+udp6_multi_match(const struct inpcb *inp, void *v)
+{
+ struct udp6_multi_match_ctx *ctx = v;
+
+ if ((inp->inp_vflag & INP_IPV6) == 0)
+		return (false);
+	if (inp->inp_lport != ctx->uh->uh_dport)
+		return (false);
+	if (inp->inp_fport != 0 && inp->inp_fport != ctx->uh->uh_sport)
+		return (false);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
+ !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ctx->ip6->ip6_dst))
+ return (false);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
+ (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ctx->ip6->ip6_src) ||
+ inp->inp_fport != ctx->uh->uh_sport))
+ return (false);
+
+ return (true);
+}
+
+static int
+udp6_multi_input(struct mbuf *m, int off, int proto,
+ struct sockaddr_in6 *fromsa)
+{
+ struct udp6_multi_match_ctx ctx;
+ struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto),
+ INPLOOKUP_RLOCKPCB, udp6_multi_match, &ctx);
+ struct inpcb *inp;
+ struct ip6_moptions *imo;
+ struct mbuf *n;
+ int appends = 0;
+
+ /*
+ * In the event that laddr should be set to the link-local
+ * address (this happens in RIPng), the multicast address
+ * specified in the received packet will not match laddr. To
+ * handle this situation, matching is relaxed if the
+ * receiving interface is the same as one specified in the
+ * socket and if the destination multicast address matches
+ * one of the multicast groups specified in the socket.
+ */
+
+ /*
+ * KAME note: traditionally we dropped udpiphdr from mbuf
+ * here. We need udphdr for IPsec processing so we do that
+ * later.
+ */
+ ctx.ip6 = mtod(m, struct ip6_hdr *);
+ ctx.uh = (struct udphdr *)((char *)ctx.ip6 + off);
+ while ((inp = inp_next(&inpi)) != NULL) {
+ INP_RLOCK_ASSERT(inp);
+ /*
+ * XXXRW: Because we weren't holding either the inpcb
+ * or the hash lock when we checked for a match
+ * before, we should probably recheck now that the
+ * inpcb lock is (supposed to be) held.
+ */
+ /*
+ * Handle socket delivery policy for any-source
+ * and source-specific multicast. [RFC3678]
+ */
+ if ((imo = inp->in6p_moptions) != NULL) {
+ struct sockaddr_in6 mcaddr;
+ int blocked;
+
+ bzero(&mcaddr, sizeof(struct sockaddr_in6));
+ mcaddr.sin6_len = sizeof(struct sockaddr_in6);
+ mcaddr.sin6_family = AF_INET6;
+ mcaddr.sin6_addr = ctx.ip6->ip6_dst;
+
+ blocked = im6o_mc_filter(imo, m->m_pkthdr.rcvif,
+ (struct sockaddr *)&mcaddr,
+ (struct sockaddr *)&fromsa[0]);
+ if (blocked != MCAST_PASS) {
+ if (blocked == MCAST_NOTGMEMBER)
+ IP6STAT_INC(ip6s_notmember);
+ if (blocked == MCAST_NOTSMEMBER ||
+ blocked == MCAST_MUTED)
+ UDPSTAT_INC(udps_filtermcast);
+ continue;
+ }
+ }
+ if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
+ if (proto == IPPROTO_UDPLITE)
+ UDPLITE_PROBE(receive, NULL, inp, ctx.ip6,
+ inp, ctx.uh);
+ else
+ UDP_PROBE(receive, NULL, inp, ctx.ip6, inp,
+ ctx.uh);
+ if (udp6_append(inp, n, off, fromsa)) {
+ INP_RUNLOCK(inp);
+ break;
+ } else
+ appends++;
+ }
+ /*
+ * Don't look for additional matches if this one does
+ * not have either the SO_REUSEPORT or SO_REUSEADDR
+ * socket options set. This heuristic avoids
+ * searching through all pcbs in the common case of a
+ * non-shared port. It assumes that an application
+ * will never clear these options after setting them.
+ */
+ if ((inp->inp_socket->so_options &
+ (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) {
+ INP_RUNLOCK(inp);
+ break;
+ }
+ }
+ m_freem(m);
+
+	if (appends == 0) {
+		/*
+		 * No matching pcb found; discard datagram. (No need
+		 * to send an ICMP Port Unreachable for a broadcast
+		 * or multicast datagram.)
+		 */
+ UDPSTAT_INC(udps_noport);
+ UDPSTAT_INC(udps_noportmcast);
+ }
+
+ return (IPPROTO_DONE);
+}
+
int
udp6_input(struct mbuf **mp, int *offp, int proto)
{
@@ -311,144 +442,11 @@
fromsa[1].sin6_port = uh->uh_dport;
pcbinfo = udp_get_inpcbinfo(nxt);
- if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
- struct inpcb *last;
- struct inpcbhead *pcblist;
- struct ip6_moptions *imo;
-
- /*
- * In the event that laddr should be set to the link-local
- * address (this happens in RIPng), the multicast address
- * specified in the received packet will not match laddr. To
- * handle this situation, matching is relaxed if the
- * receiving interface is the same as one specified in the
- * socket and if the destination multicast address matches
- * one of the multicast groups specified in the socket.
- */
-
- /*
- * KAME note: traditionally we dropped udpiphdr from mbuf
- * here. We need udphdr for IPsec processing so we do that
- * later.
- */
- pcblist = udp_get_pcblist(nxt);
- last = NULL;
- CK_LIST_FOREACH(inp, pcblist, inp_list) {
- if ((inp->inp_vflag & INP_IPV6) == 0)
- continue;
- if (inp->inp_lport != uh->uh_dport)
- continue;
- if (inp->inp_fport != 0 &&
- inp->inp_fport != uh->uh_sport)
- continue;
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
- if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
- &ip6->ip6_dst))
- continue;
- }
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
- if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr,
- &ip6->ip6_src) ||
- inp->inp_fport != uh->uh_sport)
- continue;
- }
-
- INP_RLOCK(inp);
-
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_RUNLOCK(inp);
- continue;
- }
-
- /*
- * XXXRW: Because we weren't holding either the inpcb
- * or the hash lock when we checked for a match
- * before, we should probably recheck now that the
- * inpcb lock is (supposed to be) held.
- */
-
- /*
- * Handle socket delivery policy for any-source
- * and source-specific multicast. [RFC3678]
- */
- imo = inp->in6p_moptions;
- if (imo != NULL) {
- struct sockaddr_in6 mcaddr;
- int blocked;
-
- bzero(&mcaddr, sizeof(struct sockaddr_in6));
- mcaddr.sin6_len = sizeof(struct sockaddr_in6);
- mcaddr.sin6_family = AF_INET6;
- mcaddr.sin6_addr = ip6->ip6_dst;
-
- blocked = im6o_mc_filter(imo, ifp,
- (struct sockaddr *)&mcaddr,
- (struct sockaddr *)&fromsa[0]);
- if (blocked != MCAST_PASS) {
- if (blocked == MCAST_NOTGMEMBER)
- IP6STAT_INC(ip6s_notmember);
- if (blocked == MCAST_NOTSMEMBER ||
- blocked == MCAST_MUTED)
- UDPSTAT_INC(udps_filtermcast);
- INP_RUNLOCK(inp);
- continue;
- }
- }
-
- if (last != NULL) {
- struct mbuf *n;
-
- if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) !=
- NULL) {
- if (nxt == IPPROTO_UDPLITE)
- UDPLITE_PROBE(receive, NULL,
- last, ip6, last, uh);
- else
- UDP_PROBE(receive, NULL, last,
- ip6, last, uh);
- if (udp6_append(last, n, off,
- fromsa)) {
- INP_RUNLOCK(inp);
- goto badunlocked;
- }
- }
- /* Release PCB lock taken on previous pass. */
- INP_RUNLOCK(last);
- }
- last = inp;
- /*
- * Don't look for additional matches if this one does
- * not have either the SO_REUSEPORT or SO_REUSEADDR
- * socket options set. This heuristic avoids
- * searching through all pcbs in the common case of a
- * non-shared port. It assumes that an application
- * will never clear these options after setting them.
- */
- if ((last->inp_socket->so_options &
- (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0)
- break;
- }
-
- if (last == NULL) {
- /*
- * No matching pcb found; discard datagram. (No need
- * to send an ICMP Port Unreachable for a broadcast
- * or multicast datgram.)
- */
- UDPSTAT_INC(udps_noport);
- UDPSTAT_INC(udps_noportmcast);
- goto badunlocked;
- }
-
- if (nxt == IPPROTO_UDPLITE)
- UDPLITE_PROBE(receive, NULL, last, ip6, last, uh);
- else
- UDP_PROBE(receive, NULL, last, ip6, last, uh);
- if (udp6_append(last, m, off, fromsa) == 0)
- INP_RUNLOCK(last);
+ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
*mp = NULL;
- return (IPPROTO_DONE);
+ return (udp6_multi_input(m, off, proto, fromsa));
}
+
/*
* Locate pcb for datagram.
*/
@@ -1042,12 +1040,9 @@
if (error)
return (error);
}
- INP_INFO_WLOCK(pcbinfo);
error = in_pcballoc(so, pcbinfo);
- if (error) {
- INP_INFO_WUNLOCK(pcbinfo);
+ if (error)
return (error);
- }
inp = (struct inpcb *)so->so_pcb;
inp->inp_vflag |= INP_IPV6;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
@@ -1066,11 +1061,9 @@
if (error) {
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(pcbinfo);
return (error);
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(pcbinfo);
return (0);
}
@@ -1274,13 +1267,11 @@
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_detach: inp == NULL"));
- INP_INFO_WLOCK(pcbinfo);
INP_WLOCK(inp);
up = intoudpcb(inp);
KASSERT(up != NULL, ("%s: up == NULL", __func__));
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(pcbinfo);
udp_discardcb(up);
}

File Metadata

Mime Type
text/plain
Expires
Tue, Apr 22, 5:09 PM (2 h, 8 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
17711583
Default Alt Text
D32585.id97510.diff (106 KB)

Event Timeline