Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F115257148
D32585.id97510.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
106 KB
Referenced Files
None
Subscribers
None
D32585.id97510.diff
View Options
Index: sys/kern/subr_witness.c
===================================================================
--- sys/kern/subr_witness.c
+++ sys/kern/subr_witness.c
@@ -564,15 +564,15 @@
/*
* UDP/IP
*/
- { "udp", &lock_class_mtx_sleep },
{ "udpinp", &lock_class_rw },
+ { "udp", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* TCP/IP
*/
- { "tcp", &lock_class_mtx_sleep },
{ "tcpinp", &lock_class_rw },
+ { "tcp", &lock_class_mtx_sleep },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
Index: sys/kern/uipc_ktls.c
===================================================================
--- sys/kern/uipc_ktls.c
+++ sys/kern/uipc_ktls.c
@@ -810,10 +810,6 @@
inp = so->so_pcb;
INP_WLOCK(inp);
- if (inp->inp_flags2 & INP_FREED) {
- INP_WUNLOCK(inp);
- return (ECONNRESET);
- }
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_WUNLOCK(inp);
return (ECONNRESET);
@@ -865,10 +861,6 @@
int error;
INP_RLOCK(inp);
- if (inp->inp_flags2 & INP_FREED) {
- INP_RUNLOCK(inp);
- return (ECONNRESET);
- }
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_RUNLOCK(inp);
return (ECONNRESET);
@@ -2476,8 +2468,7 @@
INP_WLOCK(inp);
so = inp->inp_socket;
MPASS(so != NULL);
- if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
- (inp->inp_flags2 & INP_FREED)) {
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
goto out;
}
@@ -2489,7 +2480,6 @@
counter_u64_add(ktls_ifnet_disable_ok, 1);
/* ktls_set_tx_mode() drops inp wlock, so recheck flags */
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
- (inp->inp_flags2 & INP_FREED) == 0 &&
(tp = intotcpcb(inp)) != NULL &&
tp->t_fb->tfb_hwtls_change != NULL)
(*tp->t_fb->tfb_hwtls_change)(tp, 0);
Index: sys/netinet/in_pcb.h
===================================================================
--- sys/netinet/in_pcb.h
+++ sys/netinet/in_pcb.h
@@ -49,7 +49,9 @@
#ifdef _KERNEL
#include <sys/lock.h>
+#include <sys/proc.h>
#include <sys/rwlock.h>
+#include <sys/smr.h>
#include <net/vnet.h>
#include <vm/uma.h>
#endif
@@ -133,32 +135,19 @@
* struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
* IPv6 sockets. In the case of TCP and UDP, further per-connection state is
* hung off of inp_ppcb most of the time. Almost all fields of struct inpcb
- * are static after creation or protected by a per-inpcb rwlock, inp_lock. A
- * few fields are protected by multiple locks as indicated in the locking notes
- * below. For these fields, all of the listed locks must be write-locked for
- * any modifications. However, these fields can be safely read while any one of
- * the listed locks are read-locked. This model can permit greater concurrency
- * for read operations. For example, connections can be looked up while only
- * holding a read lock on the global pcblist lock. This is important for
- * performance when attempting to find the connection for a packet given its IP
- * and port tuple.
+ * are static after creation or protected by a per-inpcb rwlock, inp_lock.
*
- * One noteworthy exception is that the global pcbinfo lock follows a different
- * set of rules in relation to the inp_list field. Rather than being
- * write-locked for modifications and read-locked for list iterations, it must
- * be read-locked during modifications and write-locked during list iterations.
- * This ensures that the relatively rare global list iterations safely walk a
- * stable snapshot of connections while allowing more common list modifications
- * to safely grab the pcblist lock just while adding or removing a connection
- * from the global list.
+ * An inpcb database is indexed by addresses/ports hash as well as a list of
+ * all pcbs that belong to a certain proto. Database lookups or list traversals
+ * are performed inside an SMR section. Once the desired PCB is found, its own
+ * lock is to be obtained and the SMR section exited.
*
* Key:
* (b) - Protected by the hpts lock.
* (c) - Constant after initialization
- * (e) - Protected by the net_epoch_prempt epoch
+ * (e) - Protected by the SMR section
* (i) - Protected by the inpcb lock
* (p) - Protected by the pcbinfo lock for the inpcb
- * (l) - Protected by the pcblist lock for the inpcb
* (h) - Protected by the pcbhash lock for the inpcb
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
@@ -219,17 +208,13 @@
* socket has been freed), or there may be close(2)-related races.
*
* The inp_vflag field is overloaded, and would otherwise ideally be (c).
- *
- * TODO: Currently only the TCP stack is leveraging the global pcbinfo lock
- * read-lock usage during modification, this model can be applied to other
- * protocols (especially SCTP).
*/
struct icmp6_filter;
struct inpcbpolicy;
struct m_snd_tag;
struct inpcb {
/* Cache line #1 (amd64) */
- CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */
+ CK_LIST_ENTRY(inpcb) inp_hash; /* (w:h/r:e) hash list */
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
#define inp_start_zero inp_hpts
@@ -311,8 +296,8 @@
int in6p_cksum;
short in6p_hops;
};
- CK_LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */
- struct inpcbport *inp_phd; /* (i/h) head of this list */
+ CK_LIST_ENTRY(inpcb) inp_portlist; /* (r:e/w:h) port list */
+ struct inpcbport *inp_phd; /* (r:e/w:h) head of this list */
inp_gen_t inp_gencnt; /* (c) generation count */
void *spare_ptr; /* Spare pointer. */
rt_gen_t inp_rt_cookie; /* generation for route entry */
@@ -320,10 +305,7 @@
struct route inp_route;
struct route_in6 inp_route6;
};
- CK_LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */
- /* (e[r]) for list iteration */
- /* (p[w]/l) for addition/removal */
- struct epoch_context inp_epoch_ctx;
+ CK_LIST_ENTRY(inpcb) inp_list; /* (r:e/w:p) all PCBs for proto */
};
#endif /* _KERNEL */
@@ -396,80 +378,58 @@
#endif
#endif /* _SYS_SOCKETVAR_H_ */
-struct inpcbport {
- struct epoch_context phd_epoch_ctx;
- CK_LIST_ENTRY(inpcbport) phd_hash;
- struct inpcbhead phd_pcblist;
- u_short phd_port;
-};
-
-/*-
+#ifdef _KERNEL
+/*
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
*
- * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and
- * ipi_list_lock:
- * - ipi_lock covering the global pcb list stability during loop iteration,
- * - ipi_hash_lock covering the hashed lookup tables,
- * - ipi_list_lock covering mutable global fields (such as the global
- * pcb list)
- *
- * The lock order is:
- *
- * ipi_lock (before)
- * inpcb locks (before)
- * ipi_list locks (before)
+ * The pcbs are protected with an SMR section and thus all lists in inpcbinfo
+ * are CK-lists. Locking is required to insert a pcb into the database. Two
+ * locks are provided: one for the hash and one for the global list of pcbs,
+ * as well as the overall count and generation count.
*
* Locking key:
*
* (c) Constant or nearly constant after initialisation
- * (e) - Protected by the net_epoch_prempt epoch
+ * (e) Protected by SMR section
* (g) Locked by ipi_lock
- * (l) Locked by ipi_list_lock
- * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock
- * (x) Synchronisation properties poorly defined
+ * (h) Locked by ipi_hash_lock
*/
struct inpcbinfo {
/*
* Global lock protecting inpcb list modification
*/
struct mtx ipi_lock;
-
- /*
- * Global list of inpcbs on the protocol.
- */
- struct inpcbhead *ipi_listhead; /* [r](e) [w](g/l) */
- u_int ipi_count; /* (l) */
+ struct inpcbhead ipi_listhead; /* (r:e/w:g) */
+ u_int ipi_count; /* (g) */
/*
* Generation count -- incremented each time a connection is allocated
* or freed.
*/
- u_quad_t ipi_gencnt; /* (l) */
+ u_quad_t ipi_gencnt; /* (g) */
/*
* Fields associated with port lookup and allocation.
*/
- u_short ipi_lastport; /* (x) */
- u_short ipi_lastlow; /* (x) */
- u_short ipi_lasthi; /* (x) */
+ u_short ipi_lastport; /* (h) */
+ u_short ipi_lastlow; /* (h) */
+ u_short ipi_lasthi; /* (h) */
/*
* UMA zone from which inpcbs are allocated for this protocol.
*/
- struct uma_zone *ipi_zone; /* (c) */
-
- /*
- * Global lock protecting modification hash lookup tables.
- */
- struct mtx ipi_hash_lock;
+ uma_zone_t ipi_zone; /* (c) */
+ uma_zone_t ipi_portzone; /* (c) */
+ smr_t ipi_smr; /* (c) */
/*
* Global hash of inpcbs, hashed by local and foreign addresses and
* port numbers.
*/
- struct inpcbhead *ipi_hashbase; /* (h) */
- u_long ipi_hashmask; /* (h) */
+ struct mtx ipi_hash_lock;
+ struct inpcbhead *ipi_hashbase; /* (r:e/w:h) */
+ u_long ipi_hashmask; /* (c) */
/*
* Global hash of inpcbs, hashed by only local port number.
@@ -481,26 +441,15 @@
* Load balance groups used for the SO_REUSEPORT_LB option,
* hashed by local port.
*/
- struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (h) */
+ struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (r:e/w:h) */
u_long ipi_lbgrouphashmask; /* (h) */
/*
* Pointer to network stack instance
*/
struct vnet *ipi_vnet; /* (c) */
-
- /*
- * general use 2
- */
- void *ipi_pspare[2];
-
- /*
- * Global lock protecting global inpcb list, inpcb count, etc.
- */
- struct rwlock ipi_list_lock;
};
-#ifdef _KERNEL
/*
* Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
* (or unique address:port combination) can be re-used at most
@@ -571,51 +520,22 @@
#endif /* _KERNEL */
-#define INP_INFO_LOCK_INIT(ipi, d) \
- mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE)
-#define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
+#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
#define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock)
#define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock)
#define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock)
-#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock))
+#define INP_INFO_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \
+ mtx_owned(&(ipi)->ipi_lock))
#define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
#define INP_INFO_WUNLOCK_ASSERT(ipi) \
- mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED)
-
-#define INP_LIST_LOCK_INIT(ipi, d) \
- rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
-#define INP_LIST_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_list_lock)
-#define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock)
-#define INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock)
-#define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock)
-#define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock)
-#define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock)
-#define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock)
-#define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock)
-#define INP_LIST_LOCK_ASSERT(ipi) \
- rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED)
-#define INP_LIST_RLOCK_ASSERT(ipi) \
- rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED)
-#define INP_LIST_WLOCK_ASSERT(ipi) \
- rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED)
-#define INP_LIST_UNLOCK_ASSERT(ipi) \
- rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
-
-#define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF)
-#define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock)
+ mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED)
+
#define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock)
#define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock))
-#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED);
-
-#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \
- MTX_DEF | MTX_DUPOK)
-#define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock)
-
-#define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock)
-#define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED)
-#define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock)
+#define INP_HASH_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \
+ mtx_owned(&(ipi)->ipi_hash_lock))
+#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, \
+ MA_OWNED)
#define INP_PCBHASH(faddr, lport, fport, mask) \
(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
@@ -644,7 +564,7 @@
#define INP_ANONPORT 0x00000040 /* port chosen for user */
#define INP_RECVIF 0x00000080 /* receive incoming interface */
#define INP_MTUDISC 0x00000100 /* user can do MTU discovery */
- /* 0x000200 unused: was INP_FAITH */
+/* INP_FREED 0x00000200 private to in_pcb.c */
#define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */
#define INP_DONTFRAG 0x00000800 /* don't fragment packet */
#define INP_BINDANY 0x00001000 /* allow bind to any address */
@@ -682,7 +602,7 @@
#define INP_MBUF_ACKCMP 0x00000002 /* TCP mbuf ack compression ok */
/* 0x00000004 */
#define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */
-#define INP_FREED 0x00000010 /* inp itself is not valid */
+/* 0x00000010 */
#define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */
#define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */
#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
@@ -702,15 +622,18 @@
#define INP_2PCP_BASE INP_2PCP_BIT0
#define INP_2PCP_MASK (INP_2PCP_BIT0 | INP_2PCP_BIT1 | INP_2PCP_BIT2)
#define INP_2PCP_SHIFT 18 /* shift PCP field in/out of inp_flags2 */
+
/*
- * Flags passed to in_pcblookup*() functions.
+ * Flags passed to in_pcblookup*(), inp_smr_lock() and inp_next().
*/
-#define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */
-#define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */
-#define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */
+typedef enum {
+ INPLOOKUP_WILDCARD = 0x00000001, /* Allow wildcard sockets. */
+ INPLOOKUP_RLOCKPCB = 0x00000002, /* Return inpcb read-locked. */
+ INPLOOKUP_WLOCKPCB = 0x00000004, /* Return inpcb write-locked. */
+} inp_lookup_t;
#define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
INPLOOKUP_WLOCKPCB)
+#define INPLOOKUP_LOCKMASK (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)
#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb)
@@ -718,13 +641,6 @@
#define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af)
-/*
- * Constants for pcbinfo.ipi_hashfields.
- */
-#define IPI_HASHFIELDS_NONE 0
-#define IPI_HASHFIELDS_2TUPLE 1
-#define IPI_HASHFIELDS_4TUPLE 2
-
#ifdef _KERNEL
VNET_DECLARE(int, ipport_reservedhigh);
VNET_DECLARE(int, ipport_reservedlow);
@@ -755,8 +671,8 @@
#define V_ipport_tcpallocs VNET(ipport_tcpallocs)
void in_pcbinfo_destroy(struct inpcbinfo *);
-void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *,
- int, int, char *, uma_init, u_int);
+void in_pcbinfo_init(struct inpcbinfo *, const char *, u_int, int, char *,
+ uma_init);
int in_pcbbind_check_bindmulti(const struct inpcb *ni,
const struct inpcb *oi);
@@ -788,8 +704,37 @@
int, struct inpcb *(*)(struct inpcb *, int));
void in_pcbref(struct inpcb *);
void in_pcbrehash(struct inpcb *);
-int in_pcbrele_rlocked(struct inpcb *);
-int in_pcbrele_wlocked(struct inpcb *);
+bool in_pcbrele_rlocked(struct inpcb *);
+bool in_pcbrele_wlocked(struct inpcb *);
+
+typedef bool inp_match_t(const struct inpcb *, void *);
+struct inpcb_iterator {
+ const struct inpcbinfo *ipi;
+ struct inpcb *inp;
+ inp_match_t *match;
+ void *ctx;
+ int hash;
+#define INP_ALL_LIST -1
+ const inp_lookup_t lock;
+};
+
+/* Note: sparse initializers guarantee .inp = NULL. */
+#define INP_ITERATOR(_ipi, _lock, _match, _ctx) \
+ { \
+ .ipi = (_ipi), \
+ .lock = (_lock), \
+ .hash = INP_ALL_LIST, \
+ .match = (_match), \
+ .ctx = (_ctx), \
+ }
+#define INP_ALL_ITERATOR(_ipi, _lock) \
+ { \
+ .ipi = (_ipi), \
+ .lock = (_lock), \
+ .hash = INP_ALL_LIST, \
+ }
+
+struct inpcb *inp_next(struct inpcb_iterator *);
void in_losing(struct inpcb *);
void in_pcbsetsolabel(struct socket *so);
int in_getpeeraddr(struct socket *so, struct sockaddr **nam);
Index: sys/netinet/in_pcb.c
===================================================================
--- sys/netinet/in_pcb.c
+++ sys/netinet/in_pcb.c
@@ -114,6 +114,7 @@
#define INPCBLBGROUP_SIZMIN 8
#define INPCBLBGROUP_SIZMAX 256
+#define INP_FREED 0x00000200 /* See in_pcb.h. */
static struct callout ipport_tick_callout;
@@ -146,7 +147,6 @@
#define V_ipport_tcplastcount VNET(ipport_tcplastcount)
-static void in_pcbremlists(struct inpcb *inp);
#ifdef INET
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
struct in_addr faddr, u_int fport_arg,
@@ -515,38 +515,43 @@
INP_LOCK_DESTROY(inp);
}
+/* Make sure it is safe to use hashinit(9) on CK_LIST. */
+CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
+
/*
* Initialize an inpcbinfo -- we should be able to reduce the number of
* arguments in time.
*/
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
- struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
- char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
+ u_int hash_nelements, int porthash_nelements, char *inpcbzone_name,
+ uma_init inpcbzone_init)
{
- porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
-
- INP_INFO_LOCK_INIT(pcbinfo, name);
- INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */
- INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
+ mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF);
+ mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF);
#ifdef VIMAGE
pcbinfo->ipi_vnet = curvnet;
#endif
- pcbinfo->ipi_listhead = listhead;
- CK_LIST_INIT(pcbinfo->ipi_listhead);
+ CK_LIST_INIT(&pcbinfo->ipi_listhead);
pcbinfo->ipi_count = 0;
pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
&pcbinfo->ipi_hashmask);
+ porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_porthashmask);
pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_lbgrouphashmask);
pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
- NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
+ NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR,
+ UMA_ZONE_SMR);
uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
uma_zone_set_warning(pcbinfo->ipi_zone,
"kern.ipc.maxsockets limit reached");
+ pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
+ pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name,
+ sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr);
}
/*
@@ -565,9 +570,8 @@
hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
pcbinfo->ipi_lbgrouphashmask);
uma_zdestroy(pcbinfo->ipi_zone);
- INP_LIST_LOCK_DESTROY(pcbinfo);
- INP_HASH_LOCK_DESTROY(pcbinfo);
- INP_INFO_LOCK_DESTROY(pcbinfo);
+ mtx_destroy(&pcbinfo->ipi_hash_lock);
+ mtx_destroy(&pcbinfo->ipi_lock);
}
/*
@@ -581,7 +585,7 @@
int error;
error = 0;
- inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
+ inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
if (inp == NULL)
return (ENOBUFS);
bzero(&inp->inp_start_zero, inp_zero_size);
@@ -613,33 +617,38 @@
if (V_ip6_v6only)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
}
-#endif
- INP_WLOCK(inp);
- INP_LIST_WLOCK(pcbinfo);
- CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
- pcbinfo->ipi_count++;
- so->so_pcb = (caddr_t)inp;
-#ifdef INET6
if (V_ip6_auto_flowlabel)
inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
- inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
- refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */
-
/*
* Routes in inpcb's can cache L2 as well; they are guaranteed
* to be cleaned up.
*/
inp->inp_route.ro_flags = RT_LLE_CACHE;
- INP_LIST_WUNLOCK(pcbinfo);
+#ifdef TCPHPTS
+ /*
+ * If using hpts lets drop a random number in so
+ * not all new connections fall on the same CPU.
+ */
+ inp->inp_hpts_cpu = inp->inp_input_cpu = hpts_random_cpu(inp);
+#endif
+ refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */
+ INP_WLOCK(inp);
+ INP_INFO_WLOCK(pcbinfo);
+ pcbinfo->ipi_count++;
+ inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+ CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
+ INP_INFO_WUNLOCK(pcbinfo);
+ so->so_pcb = inp;
+
+ return (0);
+
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
- if (error != 0) {
- crfree(inp->inp_cred);
- uma_zfree(pcbinfo->ipi_zone, inp);
- }
-#endif
+ crfree(inp->inp_cred);
+ uma_zfree_smr(pcbinfo->ipi_zone, inp);
return (error);
+#endif
}
#ifdef INET
@@ -1350,7 +1359,6 @@
in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
struct inpcb **oinpp, struct ucred *cred)
{
- struct rm_priotracker in_ifa_tracker;
struct sockaddr_in *sin = (struct sockaddr_in *)nam;
struct in_ifaddr *ia;
struct inpcb *oinp;
@@ -1399,20 +1407,16 @@
* choose the broadcast address for that interface.
*/
if (faddr.s_addr == INADDR_ANY) {
- IN_IFADDR_RLOCK(&in_ifa_tracker);
faddr =
IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
- IN_IFADDR_RUNLOCK(&in_ifa_tracker);
if (cred != NULL &&
(error = prison_get_ip4(cred, &faddr)) != 0)
return (error);
} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
- IN_IFADDR_RLOCK(&in_ifa_tracker);
if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
IFF_BROADCAST)
faddr = satosin(&CK_STAILQ_FIRST(
&V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
- IN_IFADDR_RUNLOCK(&in_ifa_tracker);
}
}
if (laddr.s_addr == INADDR_ANY) {
@@ -1430,7 +1434,6 @@
imo = inp->inp_moptions;
if (imo->imo_multicast_ifp != NULL) {
ifp = imo->imo_multicast_ifp;
- IN_IFADDR_RLOCK(&in_ifa_tracker);
CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if ((ia->ia_ifp == ifp) &&
(cred == NULL ||
@@ -1444,7 +1447,6 @@
laddr = ia->ia_addr.sin_addr;
error = 0;
}
- IN_IFADDR_RUNLOCK(&in_ifa_tracker);
}
}
if (error)
@@ -1515,192 +1517,267 @@
}
/*
- * in_pcbref() bumps the reference count on an inpcb in order to maintain
- * stability of an inpcb pointer despite the inpcb lock being released. This
- * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
- * but where the inpcb lock may already held.
+ * inpcb hash lookups are protected by SMR section.
*
- * in_pcbref() should be used only to provide brief memory stability, and
- * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
- * garbage collect the inpcb if it has been in_pcbfree()'d from another
- * context. Until in_pcbrele() has returned that the inpcb is still valid,
- * lock and rele are the *only* safe operations that may be performed on the
- * inpcb.
- *
- * While the inpcb will not be freed, releasing the inpcb lock means that the
- * connection's state may change, so the caller should be careful to
- * revalidate any cached state on reacquiring the lock. Drop the reference
- * using in_pcbrele().
+ * Once the desired pcb has been found, switching from the SMR section to a
+ * pcb lock is performed with inp_smr_lock(). We cannot use INP_(W|R)LOCK
+ * here because SMR is a critical section.
+ * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
*/
-void
-in_pcbref(struct inpcb *inp)
+static inline void
+inp_lock(struct inpcb *inp, const inp_lookup_t lock)
+{
+
+ return (lock == INPLOOKUP_RLOCKPCB ? \
+ rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock));
+}
+
+static inline void
+inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
{
- KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+ return (lock == INPLOOKUP_RLOCKPCB ? \
+ rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock));
+}
- refcount_acquire(&inp->inp_refcount);
+static inline int
+inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
+{
+
+ return (lock == INPLOOKUP_RLOCKPCB ? \
+ rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
}
-/*
- * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
- * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
- * return a flag indicating whether or not the inpcb remains valid. If it is
- * valid, we return with the inpcb lock held.
- *
- * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
- * reference on an inpcb. Historically more work was done here (actually, in
- * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
- * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely
- * about memory stability (and continued use of the write lock).
- */
-int
-in_pcbrele_rlocked(struct inpcb *inp)
+static inline bool
+in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
{
- struct inpcbinfo *pcbinfo;
- KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+ return (lock == INPLOOKUP_RLOCKPCB ? \
+ in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
+}
- INP_RLOCK_ASSERT(inp);
+bool
+inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
+{
- if (refcount_release(&inp->inp_refcount) == 0) {
- /*
- * If the inpcb has been freed, let the caller know, even if
- * this isn't the last reference.
- */
- if (inp->inp_flags2 & INP_FREED) {
- INP_RUNLOCK(inp);
- return (1);
- }
- return (0);
+ MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
+ SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);
+
+ if (__predict_true(inp_trylock(inp, lock))) {
+ smr_exit(inp->inp_pcbinfo->ipi_smr);
+check_freed:
+ if (__predict_false(inp->inp_flags & INP_FREED)) {
+ inp_unlock(inp, lock);
+ return (false);
+ } else
+ return (true);
}
- KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-#ifdef TCPHPTS
- if (inp->inp_in_hpts || inp->inp_in_input) {
- struct tcp_hpts_entry *hpts;
- /*
- * We should not be on the hpts at
- * this point in any form. we must
- * get the lock to be sure.
- */
- hpts = tcp_hpts_lock(inp);
- if (inp->inp_in_hpts)
- panic("Hpts:%p inp:%p at free still on hpts",
- hpts, inp);
- mtx_unlock(&hpts->p_mtx);
- hpts = tcp_input_lock(inp);
- if (inp->inp_in_input)
- panic("Hpts:%p inp:%p at free still on input hpts",
- hpts, inp);
- mtx_unlock(&hpts->p_mtx);
+ if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
+ smr_exit(inp->inp_pcbinfo->ipi_smr);
+ inp_lock(inp, lock);
+ if (__predict_false(in_pcbrele(inp, lock)))
+ return (false);
+ else
+ goto check_freed;
+ } else {
+ smr_exit(inp->inp_pcbinfo->ipi_smr);
+ return (false);
}
-#endif
- INP_RUNLOCK(inp);
- pcbinfo = inp->inp_pcbinfo;
- uma_zfree(pcbinfo->ipi_zone, inp);
- return (1);
}
-int
-in_pcbrele_wlocked(struct inpcb *inp)
-{
- struct inpcbinfo *pcbinfo;
+/*
+ * inp_next() - inpcb hash/list traversal iterator
+ *
+ * Requires initialized struct inpcb_iterator for context.
+ * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
+ *
+ * - Iterator can have either write-lock or read-lock semantics, that can not
+ * be changed later.
+ * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
+ * a single hash slot. Note: only rip_input() does the latter.
+ * - Iterator may have optional bool matching function. The matching function
+ * will be executed for each inpcb in the SMR context, so it can not acquire
+ * locks and can safely access only immutable fields of inpcb.
+ *
+ * A fresh initialized iterator has NULL inpcb in its context and that
+ * means that inp_next() call would return the very first inpcb on the list
+ * locked with desired semantic. In all following calls the context pointer
+ * shall hold the current inpcb pointer. The KPI user is not supposed to
+ * unlock the current inpcb! Upon end of traversal inp_next() will return NULL
+ * and write NULL to its context. After end of traversal an iterator can be
+ * reused. Note: only rip_input() reuses iterator.
+ *
+ * List traversals have the following features/constraints:
+ * - New entries won't be seen, as they are always added to the head of a list.
+ * - Removed entries won't stop traversal as long as they are not added to
+ * a different list. This is violated by in_pcbrehash().
+ */
+#define II_LIST_FIRST(ipi, hash) \
+ (((hash) == INP_ALL_LIST) ? \
+ CK_LIST_FIRST(&(ipi)->ipi_listhead) : \
+ CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
+#define II_LIST_NEXT(inp, hash) \
+ (((hash) == INP_ALL_LIST) ? \
+ CK_LIST_NEXT((inp), inp_list) : \
+ CK_LIST_NEXT((inp), inp_hash))
+#define II_LOCK_ASSERT(inp, lock) \
+ rw_assert(&(inp)->inp_lock, \
+ (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
+struct inpcb *
+inp_next(struct inpcb_iterator *ii)
+{
+ struct inpcbhead freelist = CK_LIST_HEAD_INITIALIZER(freelist);
+ const struct inpcbinfo *ipi = ii->ipi;
+ inp_match_t *match = ii->match;
+ void *ctx = ii->ctx;
+ inp_lookup_t lock = ii->lock;
+ int hash = ii->hash;
+ struct inpcb *inp, *next, *tmp;
+
+ if (ii->inp == NULL) { /* First call. */
+ smr_enter(ipi->ipi_smr);
+ /* This is unrolled CK_LIST_FOREACH(). */
+ for (inp = II_LIST_FIRST(ipi, hash);
+ inp != NULL;
+ inp = II_LIST_NEXT(inp, hash)) {
+ if (match != NULL && (match)(inp, ctx) == false)
+ continue;
+ if (__predict_true(inp_smr_lock(inp, lock)))
+ break;
+ else {
+ smr_enter(ipi->ipi_smr);
+ MPASS(inp != II_LIST_FIRST(ipi, hash));
+ inp = II_LIST_FIRST(ipi, hash);
+ }
+ }
- KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+ if (inp == NULL)
+ smr_exit(ipi->ipi_smr);
+ else
+ ii->inp = inp;
- INP_WLOCK_ASSERT(inp);
+ return (inp);
+ }
- if (refcount_release(&inp->inp_refcount) == 0) {
- /*
- * If the inpcb has been freed, let the caller know, even if
- * this isn't the last reference.
- */
- if (inp->inp_flags2 & INP_FREED) {
- INP_WUNLOCK(inp);
- return (1);
+ inp = ii->inp;
+
+ II_LOCK_ASSERT(inp, lock);
+ smr_enter(ipi->ipi_smr);
+next:
+ next = II_LIST_NEXT(inp, hash);
+ inp_unlock(inp, lock);
+ inp = next;
+next1:
+ if (inp == NULL) {
+ smr_exit(ipi->ipi_smr);
+ goto found;
+ }
+
+ if (match != NULL && (match)(inp, ctx) == false) {
+ inp = II_LIST_NEXT(inp, hash);
+ goto next1;
+ }
+
+ if (__predict_true(inp_trylock(inp, lock))) {
+ if (__predict_false(inp->inp_flags & INP_FREED))
+ goto next;
+ else {
+ smr_exit(ipi->ipi_smr);
+ goto found;
}
- return (0);
}
- KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-#ifdef TCPHPTS
- if (inp->inp_in_hpts || inp->inp_in_input) {
- struct tcp_hpts_entry *hpts;
- /*
- * We should not be on the hpts at
- * this point in any form. we must
- * get the lock to be sure.
- */
- hpts = tcp_hpts_lock(inp);
- if (inp->inp_in_hpts)
- panic("Hpts:%p inp:%p at free still on hpts",
- hpts, inp);
- mtx_unlock(&hpts->p_mtx);
- hpts = tcp_input_lock(inp);
- if (inp->inp_in_input)
- panic("Hpts:%p inp:%p at free still on input hpts",
- hpts, inp);
- mtx_unlock(&hpts->p_mtx);
+ if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
+ smr_exit(ipi->ipi_smr);
+ inp_lock(inp, lock);
+ if (__predict_true(refcount_release(&inp->inp_refcount) == 0)) {
+ if (__predict_false(inp->inp_flags & INP_FREED)) {
+ smr_enter(ipi->ipi_smr);
+ goto next;
+ }
+ goto found;
+ }
+ MPASS(inp->inp_flags & INP_FREED);
+ smr_enter(ipi->ipi_smr);
+ next = II_LIST_NEXT(inp, hash);
+ inp_unlock(inp, lock);
+ if (hash == INP_ALL_LIST)
+ CK_LIST_INSERT_HEAD(&freelist, inp, inp_list);
+ else
+ CK_LIST_INSERT_HEAD(&freelist, inp, inp_hash);
+ inp = next;
+ goto next1;
+ } else
+ goto next;
+
+found:
+ if (__predict_false(CK_LIST_FIRST(&freelist) != NULL)) {
+ if (hash == INP_ALL_LIST)
+ CK_LIST_FOREACH_SAFE(next, &freelist, inp_list, tmp)
+ uma_zfree_smr(ipi->ipi_zone, next);
+ else
+ CK_LIST_FOREACH_SAFE(next, &freelist, inp_hash, tmp)
+ uma_zfree_smr(ipi->ipi_zone, next);
}
-#endif
- INP_WUNLOCK(inp);
- pcbinfo = inp->inp_pcbinfo;
- uma_zfree(pcbinfo->ipi_zone, inp);
- return (1);
+
+ return ((ii->inp = inp));
}
-static void
-inpcbport_free(epoch_context_t ctx)
+/*
+ * in_pcbref() bumps the reference count on an inpcb in order to maintain
+ * stability of an inpcb pointer despite the inpcb lock being released or
+ * SMR section exited.
+ *
+ * To free a reference later in_pcbrele_(r|w)locked() must be performed.
+ */
+void
+in_pcbref(struct inpcb *inp)
{
- struct inpcbport *phd;
+ u_int old __diagused;
- phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx);
- free(phd, M_PCB);
+ old = refcount_acquire(&inp->inp_refcount);
+ KASSERT(old > 0, ("%s: refcount 0", __func__));
}
-static void
-in_pcbfree_deferred(epoch_context_t ctx)
+/*
+ * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
+ * freeing the pcb, if the reference was very last.
+ */
+bool
+in_pcbrele_rlocked(struct inpcb *inp)
{
- struct inpcb *inp;
- int released __unused;
- inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);
+ INP_RLOCK_ASSERT(inp);
- INP_WLOCK(inp);
- CURVNET_SET(inp->inp_vnet);
-#ifdef INET
- struct ip_moptions *imo = inp->inp_moptions;
- inp->inp_moptions = NULL;
-#endif
- /* XXXRW: Do as much as possible here. */
-#if defined(IPSEC) || defined(IPSEC_SUPPORT)
- if (inp->inp_sp != NULL)
- ipsec_delete_pcbpolicy(inp);
-#endif
-#ifdef INET6
- struct ip6_moptions *im6o = NULL;
- if (inp->inp_vflag & INP_IPV6PROTO) {
- ip6_freepcbopts(inp->in6p_outputopts);
- im6o = inp->in6p_moptions;
- inp->in6p_moptions = NULL;
- }
-#endif
- if (inp->inp_options)
- (void)m_free(inp->inp_options);
- inp->inp_vflag = 0;
- crfree(inp->inp_cred);
-#ifdef MAC
- mac_inpcb_destroy(inp);
-#endif
- released = in_pcbrele_wlocked(inp);
- MPASS(released);
-#ifdef INET6
- ip6_freemoptions(im6o);
-#endif
-#ifdef INET
- inp_freemoptions(imo);
-#endif
- CURVNET_RESTORE();
+ if (refcount_release(&inp->inp_refcount) == 0)
+ return (false);
+
+ MPASS(inp->inp_flags & INP_FREED);
+ MPASS(inp->inp_socket == NULL);
+ MPASS(inp->inp_in_hpts == 0);
+ MPASS(inp->inp_in_input == 0);
+ INP_RUNLOCK(inp);
+ uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
+ return (true);
+}
+
+bool
+in_pcbrele_wlocked(struct inpcb *inp)
+{
+
+ INP_WLOCK_ASSERT(inp);
+
+ if (refcount_release(&inp->inp_refcount) == 0)
+ return (false);
+
+ MPASS(inp->inp_flags & INP_FREED);
+ MPASS(inp->inp_socket == NULL);
+ MPASS(inp->inp_in_hpts == 0);
+ MPASS(inp->inp_in_input == 0);
+ INP_WUNLOCK(inp);
+ uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
+ return (true);
}
/*
@@ -1708,32 +1785,81 @@
* reference count, which should occur only after the inpcb has been detached
* from its socket. If another thread holds a temporary reference (acquired
* using in_pcbref()) then the free is deferred until that reference is
- * released using in_pcbrele(), but the inpcb is still unlocked. Almost all
- * work, including removal from global lists, is done in this context, where
- * the pcbinfo lock is held.
+ * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
+ * Almost all work, including removal from global lists, is done in this
+ * context, where the pcbinfo lock is held.
*/
void
in_pcbfree(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+#ifdef INET
+ struct ip_moptions *imo;
+#endif
+#ifdef INET6
+ struct ip6_moptions *im6o;
+#endif
+ INP_WLOCK_ASSERT(inp);
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
- KASSERT((inp->inp_flags2 & INP_FREED) == 0,
+ KASSERT((inp->inp_flags & INP_FREED) == 0,
("%s: called twice for pcb %p", __func__, inp));
- if (inp->inp_flags2 & INP_FREED) {
- INP_WUNLOCK(inp);
- return;
+
+ inp->inp_flags |= INP_FREED;
+ INP_INFO_WLOCK(pcbinfo);
+ inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+ pcbinfo->ipi_count--;
+ CK_LIST_REMOVE(inp, inp_list);
+ INP_INFO_WUNLOCK(pcbinfo);
+
+ if (inp->inp_flags & INP_INHASHLIST) {
+ struct inpcbport *phd = inp->inp_phd;
+
+ INP_HASH_WLOCK(pcbinfo);
+ /* XXX: Only do if SO_REUSEPORT_LB set? */
+ in_pcbremlbgrouphash(inp);
+
+ CK_LIST_REMOVE(inp, inp_hash);
+ CK_LIST_REMOVE(inp, inp_portlist);
+ if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
+ CK_LIST_REMOVE(phd, phd_hash);
+ uma_zfree_smr(pcbinfo->ipi_portzone, phd);
+ }
+ INP_HASH_WUNLOCK(pcbinfo);
+ inp->inp_flags &= ~INP_INHASHLIST;
}
- INP_WLOCK_ASSERT(inp);
- INP_LIST_WLOCK(pcbinfo);
- in_pcbremlists(inp);
- INP_LIST_WUNLOCK(pcbinfo);
+ crfree(inp->inp_cred);
RO_INVALIDATE_CACHE(&inp->inp_route);
- /* mark as destruction in progress */
- inp->inp_flags2 |= INP_FREED;
- INP_WUNLOCK(inp);
- NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx);
+#ifdef MAC
+ mac_inpcb_destroy(inp);
+#endif
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ if (inp->inp_sp != NULL)
+ ipsec_delete_pcbpolicy(inp);
+#endif
+#ifdef INET
+ if (inp->inp_options)
+ (void)m_free(inp->inp_options);
+ imo = inp->inp_moptions;
+#endif
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO) {
+ ip6_freepcbopts(inp->in6p_outputopts);
+ im6o = inp->in6p_moptions;
+ } else
+ im6o = NULL;
+#endif
+
+ if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
+ INP_WUNLOCK(inp);
+ }
+#ifdef INET6
+ ip6_freemoptions(im6o);
+#endif
+#ifdef INET
+ inp_freemoptions(imo);
+#endif
}
/*
@@ -1774,7 +1900,7 @@
CK_LIST_REMOVE(inp, inp_portlist);
if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
CK_LIST_REMOVE(phd, phd_hash);
- NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
+ uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
}
INP_HASH_WUNLOCK(inp->inp_pcbinfo);
inp->inp_flags &= ~INP_INHASHLIST;
@@ -1845,7 +1971,7 @@
struct inpcb *inp, *inp_temp;
INP_INFO_WLOCK(pcbinfo);
- CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
+ CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) {
INP_WLOCK(inp);
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0) {
@@ -1864,49 +1990,57 @@
INP_INFO_WUNLOCK(pcbinfo);
}
+static bool
+inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
+{
+
+ if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
+ return (true);
+ else
+ return (false);
+}
+
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
+ struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
+ inp_v4_multi_match, NULL);
struct inpcb *inp;
struct in_multi *inm;
struct in_mfilter *imf;
struct ip_moptions *imo;
- INP_INFO_WLOCK(pcbinfo);
- CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
- INP_WLOCK(inp);
+ IN_MULTI_LOCK_ASSERT();
+
+ while ((inp = inp_next(&inpi)) != NULL) {
+ INP_WLOCK_ASSERT(inp);
+
imo = inp->inp_moptions;
- if ((inp->inp_vflag & INP_IPV4) &&
- imo != NULL) {
- /*
- * Unselect the outgoing interface if it is being
- * detached.
- */
- if (imo->imo_multicast_ifp == ifp)
- imo->imo_multicast_ifp = NULL;
+ /*
+ * Unselect the outgoing interface if it is being
+ * detached.
+ */
+ if (imo->imo_multicast_ifp == ifp)
+ imo->imo_multicast_ifp = NULL;
- /*
- * Drop multicast group membership if we joined
- * through the interface being detached.
- *
- * XXX This can all be deferred to an epoch_call
- */
+ /*
+ * Drop multicast group membership if we joined
+ * through the interface being detached.
+ *
+ * XXX This can all be deferred to an epoch_call
+ */
restart:
- IP_MFILTER_FOREACH(imf, &imo->imo_head) {
- if ((inm = imf->imf_inm) == NULL)
- continue;
- if (inm->inm_ifp != ifp)
- continue;
- ip_mfilter_remove(&imo->imo_head, imf);
- IN_MULTI_LOCK_ASSERT();
- in_leavegroup_locked(inm, NULL);
- ip_mfilter_free(imf);
- goto restart;
- }
+ IP_MFILTER_FOREACH(imf, &imo->imo_head) {
+ if ((inm = imf->imf_inm) == NULL)
+ continue;
+ if (inm->inm_ifp != ifp)
+ continue;
+ ip_mfilter_remove(&imo->imo_head, imf);
+ in_leavegroup_locked(inm, NULL);
+ ip_mfilter_free(imf);
+ goto restart;
}
- INP_WUNLOCK(inp);
}
- INP_INFO_WUNLOCK(pcbinfo);
}
/*
@@ -1928,7 +2062,6 @@
KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
("%s: invalid lookup flags %d", __func__, lookupflags));
-
INP_HASH_LOCK_ASSERT(pcbinfo);
if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
@@ -2091,8 +2224,9 @@
/*
* Lookup PCB in hash list, using pcbinfo tables. This variation assumes
- * that the caller has locked the hash list, and will not perform any further
- * locking or reference operations on either the hash list or the connection.
+ * that the caller has either locked the hash list, which usually happens
+ * for bind(2) operations, or is in SMR section, which happens when sorting
+ * out incoming packets.
*/
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
@@ -2233,20 +2367,15 @@
{
struct inpcb *inp;
+ smr_enter(pcbinfo->ipi_smr);
inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
if (inp != NULL) {
- if (lookupflags & INPLOOKUP_WLOCKPCB) {
- INP_WLOCK(inp);
- } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
- INP_RLOCK(inp);
- } else
- panic("%s: locking bug", __func__);
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_UNLOCK(inp);
+ if (__predict_false(inp_smr_lock(inp,
+ (lookupflags & INPLOOKUP_LOCKMASK)) == false))
inp = NULL;
- }
- }
+ } else
+ smr_exit(pcbinfo->ipi_smr);
return (inp);
}
@@ -2341,11 +2470,10 @@
* If none exists, malloc one and tack it on.
*/
if (phd == NULL) {
- phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
+ phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
if (phd == NULL) {
return (ENOBUFS); /* XXX */
}
- bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context));
phd->phd_port = inp->inp_lport;
CK_LIST_INIT(&phd->phd_pcblist);
CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
@@ -2363,6 +2491,10 @@
* changed. NOTE: This does not handle the case of the lport changing (the
* hashed port list would have to be updated as well), so the lport must
* not change after in_pcbinshash() has been called.
+ *
+ * XXXGL: a race between this function and SMR-protected hash iterator
+ * will lead to iterator traversing a possibly wrong hash list. However,
+ * this race should have been here since change from rwlock to epoch.
*/
void
in_pcbrehash(struct inpcb *inp)
@@ -2391,39 +2523,6 @@
CK_LIST_INSERT_HEAD(head, inp, inp_hash);
}
-/*
- * Remove PCB from various lists.
- */
-static void
-in_pcbremlists(struct inpcb *inp)
-{
- struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
-
- INP_WLOCK_ASSERT(inp);
- INP_LIST_WLOCK_ASSERT(pcbinfo);
-
- inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
- if (inp->inp_flags & INP_INHASHLIST) {
- struct inpcbport *phd = inp->inp_phd;
-
- INP_HASH_WLOCK(pcbinfo);
-
- /* XXX: Only do if SO_REUSEPORT_LB set? */
- in_pcbremlbgrouphash(inp);
-
- CK_LIST_REMOVE(inp, inp_hash);
- CK_LIST_REMOVE(inp, inp_portlist);
- if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
- CK_LIST_REMOVE(phd, phd_hash);
- NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
- }
- INP_HASH_WUNLOCK(pcbinfo);
- inp->inp_flags &= ~INP_INHASHLIST;
- }
- CK_LIST_REMOVE(inp, inp_list);
- pcbinfo->ipi_count--;
-}
-
/*
* Check for alternatives when higher level complains
* about service problems. For now, invalidate cached
@@ -2558,15 +2657,12 @@
void
inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
{
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
+ INPLOOKUP_WLOCKPCB);
struct inpcb *inp;
- INP_INFO_WLOCK(&V_tcbinfo);
- CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
- INP_WLOCK(inp);
+ while ((inp = inp_next(&inpi)) != NULL)
func(inp, arg);
- INP_WUNLOCK(inp);
- }
- INP_INFO_WUNLOCK(&V_tcbinfo);
}
struct socket *
Index: sys/netinet/in_pcb_var.h
===================================================================
--- sys/netinet/in_pcb_var.h
+++ sys/netinet/in_pcb_var.h
@@ -44,6 +44,7 @@
* Definitions shared between netinet/in_pcb.c and netinet6/in6_pcb.c
*/
+bool inp_smr_lock(struct inpcb *, const inp_lookup_t);
int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *,
struct ucred *, int);
int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa,
@@ -52,4 +53,10 @@
struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short,
int, struct ucred *);
+struct inpcbport {
+ struct inpcbhead phd_pcblist;
+ CK_LIST_ENTRY(inpcbport) phd_hash;
+ u_short phd_port;
+};
+
#endif /* !_NETINET_IN_PCB_VAR_H_ */
Index: sys/netinet/ip_divert.c
===================================================================
--- sys/netinet/ip_divert.c
+++ sys/netinet/ip_divert.c
@@ -111,10 +111,7 @@
*/
/* Internal variables. */
-VNET_DEFINE_STATIC(struct inpcbhead, divcb);
VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo);
-
-#define V_divcb VNET(divcb)
#define V_divcbinfo VNET(divcbinfo)
static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */
@@ -154,8 +151,7 @@
* allocate one-entry hash lists than it is to check all over the
* place for hashbase == NULL.
*/
- in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb",
- div_inpcb_init, IPI_HASHFIELDS_NONE);
+ in_pcbinfo_init(&V_divcbinfo, "div", 1, 1, "divcb", div_inpcb_init);
}
static void
@@ -181,6 +177,14 @@
return (IPPROTO_DONE);
}
+static bool
+div_port_match(const struct inpcb *inp, void *v)
+{
+ uint16_t nport = *(uint16_t *)v;
+
+ return (inp->inp_lport == nport);
+}
+
/*
* Divert a packet by passing it up to the divert socket at port 'port'.
*
@@ -195,6 +199,8 @@
struct socket *sa;
u_int16_t nport;
struct sockaddr_in divsrc;
+ struct inpcb_iterator inpi = INP_ITERATOR(&V_divcbinfo,
+ INPLOOKUP_RLOCKPCB, div_port_match, &nport);
struct m_tag *mtag;
NET_EPOCH_ASSERT();
@@ -281,27 +287,20 @@
/* Put packet on socket queue, if any */
sa = NULL;
+ /* nport is inp_next's context. */
nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
- CK_LIST_FOREACH(inp, &V_divcb, inp_list) {
+ while ((inp = inp_next(&inpi)) != NULL) {
+ sa = inp->inp_socket;
+ SOCKBUF_LOCK(&sa->so_rcv);
+ if (sbappendaddr_locked(&sa->so_rcv,
+ (struct sockaddr *)&divsrc, m, NULL) == 0) {
+ soroverflow_locked(sa);
+ sa = NULL; /* force mbuf reclaim below */
+ } else
+ sorwakeup_locked(sa);
/* XXX why does only one socket match? */
- if (inp->inp_lport == nport) {
- INP_RLOCK(inp);
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_RUNLOCK(inp);
- continue;
- }
- sa = inp->inp_socket;
- SOCKBUF_LOCK(&sa->so_rcv);
- if (sbappendaddr_locked(&sa->so_rcv,
- (struct sockaddr *)&divsrc, m,
- (struct mbuf *)0) == 0) {
- soroverflow_locked(sa);
- sa = NULL; /* force mbuf reclaim below */
- } else
- sorwakeup_locked(sa);
- INP_RUNLOCK(inp);
- break;
- }
+ INP_RUNLOCK(inp);
+ break;
}
if (sa == NULL) {
m_freem(m);
@@ -596,14 +595,10 @@
error = soreserve(so, div_sendspace, div_recvspace);
if (error)
return error;
- INP_INFO_WLOCK(&V_divcbinfo);
error = in_pcballoc(so, &V_divcbinfo);
- if (error) {
- INP_INFO_WUNLOCK(&V_divcbinfo);
+ if (error)
return error;
- }
inp = (struct inpcb *)so->so_pcb;
- INP_INFO_WUNLOCK(&V_divcbinfo);
inp->inp_ip_p = proto;
inp->inp_vflag |= INP_IPV4;
inp->inp_flags |= INP_HDRINCL;
@@ -618,11 +613,9 @@
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("div_detach: inp == NULL"));
- INP_INFO_WLOCK(&V_divcbinfo);
INP_WLOCK(inp);
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(&V_divcbinfo);
}
static int
@@ -702,8 +695,9 @@
static int
div_pcblist(SYSCTL_HANDLER_ARGS)
{
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_divcbinfo,
+ INPLOOKUP_RLOCKPCB);
struct xinpgen xig;
- struct epoch_tracker et;
struct inpcb *inp;
int error;
@@ -731,21 +725,18 @@
if (error)
return error;
- NET_EPOCH_ENTER(et);
- for (inp = CK_LIST_FIRST(V_divcbinfo.ipi_listhead);
- inp != NULL;
- inp = CK_LIST_NEXT(inp, inp_list)) {
- INP_RLOCK(inp);
+ while ((inp = inp_next(&inpi)) != NULL) {
if (inp->inp_gencnt <= xig.xig_gen) {
struct xinpcb xi;
in_pcbtoxinpcb(inp, &xi);
- INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xi, sizeof xi);
- } else
- INP_RUNLOCK(inp);
+ if (error) {
+ INP_RUNLOCK(inp);
+ break;
+ }
+ }
}
- NET_EPOCH_EXIT(et);
if (!error) {
/*
Index: sys/netinet/ip_gre.c
===================================================================
--- sys/netinet/ip_gre.c
+++ sys/netinet/ip_gre.c
@@ -223,25 +223,11 @@
in_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp,
const struct sockaddr *sa, void *ctx)
{
- struct epoch_tracker et;
struct gre_socket *gs;
struct gre_softc *sc;
in_addr_t dst;
- NET_EPOCH_ENTER(et);
- /*
- * udp_append() holds reference to inp, it is safe to check
- * inp_flags2 without INP_RLOCK().
- * If socket was closed before we have entered NET_EPOCH section,
- * INP_FREED flag should be set. Otherwise it should be safe to
- * make access to ctx data, because gre_so will be freed by
- * gre_sofree() via NET_EPOCH_CALL().
- */
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- NET_EPOCH_EXIT(et);
- m_freem(m);
- return;
- }
+ NET_EPOCH_ASSERT();
gs = (struct gre_socket *)ctx;
dst = ((const struct sockaddr_in *)sa)->sin_addr.s_addr;
@@ -251,11 +237,9 @@
}
if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){
gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc);
- NET_EPOCH_EXIT(et);
return;
}
m_freem(m);
- NET_EPOCH_EXIT(et);
}
static int
Index: sys/netinet/raw_ip.c
===================================================================
--- sys/netinet/raw_ip.c
+++ sys/netinet/raw_ip.c
@@ -88,10 +88,7 @@
&VNET_NAME(ip_defttl), 0,
"Maximum TTL on IP packets");
-VNET_DEFINE(struct inpcbhead, ripcb);
VNET_DEFINE(struct inpcbinfo, ripcbinfo);
-
-#define V_ripcb VNET(ripcb)
#define V_ripcbinfo VNET(ripcbinfo)
/*
@@ -161,7 +158,7 @@
struct inpcbhead *pcbhash;
int hash;
- INP_INFO_WLOCK_ASSERT(pcbinfo);
+ INP_HASH_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
if (inp->inp_ip_p != 0 &&
@@ -179,7 +176,7 @@
rip_delhash(struct inpcb *inp)
{
- INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
+ INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
CK_LIST_REMOVE(inp, inp_hash);
@@ -213,8 +210,8 @@
rip_init(void)
{
- in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE,
- 1, "ripcb", rip_inpcb_init, IPI_HASHFIELDS_NONE);
+ in_pcbinfo_init(&V_ripcbinfo, "rip", INP_PCBHASH_RAW_SIZE, 1, "ripcb",
+ rip_inpcb_init);
EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
}
@@ -231,47 +228,90 @@
#ifdef INET
static int
-rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
+rip_append(struct inpcb *inp, struct ip *ip, struct mbuf *m,
struct sockaddr_in *ripsrc)
{
- int policyfail = 0;
+ struct socket *so = inp->inp_socket;
+ struct mbuf *n, *opts = NULL;
- INP_LOCK_ASSERT(last);
+ INP_LOCK_ASSERT(inp);
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/* check AH/ESP integrity. */
- if (IPSEC_ENABLED(ipv4)) {
- if (IPSEC_CHECK_POLICY(ipv4, n, last) != 0)
- policyfail = 1;
- }
+ if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0)
+ return (0);
#endif /* IPSEC */
#ifdef MAC
- if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
- policyfail = 1;
+ if (mac_inpcb_check_deliver(inp, m) != 0)
+ return (0);
#endif
/* Check the minimum TTL for socket. */
- if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
- policyfail = 1;
- if (!policyfail) {
- struct mbuf *opts = NULL;
- struct socket *so;
-
- so = last->inp_socket;
- if ((last->inp_flags & INP_CONTROLOPTS) ||
- (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
- ip_savecontrol(last, &opts, ip, n);
- SOCKBUF_LOCK(&so->so_rcv);
- if (sbappendaddr_locked(&so->so_rcv,
- (struct sockaddr *)ripsrc, n, opts) == 0) {
- soroverflow_locked(so);
- m_freem(n);
- if (opts)
- m_freem(opts);
- } else
- sorwakeup_locked(so);
- } else
+ if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
+ return (0);
+
+ if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL)
+ return (0);
+
+ if ((inp->inp_flags & INP_CONTROLOPTS) ||
+ (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
+ ip_savecontrol(inp, &opts, ip, n);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (sbappendaddr_locked(&so->so_rcv,
+ (struct sockaddr *)ripsrc, n, opts) == 0) {
+ soroverflow_locked(so);
m_freem(n);
- return (policyfail);
+ if (opts)
+ m_freem(opts);
+ return (0);
+ }
+ sorwakeup_locked(so);
+
+ return (1);
+}
+
+struct rip_inp_match_ctx {
+ struct ip *ip;
+ int proto;
+};
+
+static bool
+rip_inp_match1(const struct inpcb *inp, void *v)
+{
+ struct rip_inp_match_ctx *ctx = v;
+
+ if (inp->inp_ip_p != ctx->proto)
+ return (false);
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ return (false);
+#endif
+ if (inp->inp_laddr.s_addr != ctx->ip->ip_dst.s_addr)
+ return (false);
+ if (inp->inp_faddr.s_addr != ctx->ip->ip_src.s_addr)
+ return (false);
+ return (true);
+}
+
+static bool
+rip_inp_match2(const struct inpcb *inp, void *v)
+{
+ struct rip_inp_match_ctx *ctx = v;
+
+ if (inp->inp_ip_p && inp->inp_ip_p != ctx->proto)
+ return (false);
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ return (false);
+#endif
+ if (!in_nullhost(inp->inp_laddr) &&
+ !in_hosteq(inp->inp_laddr, ctx->ip->ip_dst))
+ return (false);
+ if (!in_nullhost(inp->inp_faddr) &&
+ !in_hosteq(inp->inp_faddr, ctx->ip->ip_src))
+ return (false);
+ return (true);
}
/*
@@ -281,102 +321,57 @@
int
rip_input(struct mbuf **mp, int *offp, int proto)
{
+ struct rip_inp_match_ctx ctx = {
+ .ip = mtod(*mp, struct ip *),
+ .proto = proto,
+ };
+ struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
+ INPLOOKUP_RLOCKPCB, rip_inp_match1, &ctx);
struct ifnet *ifp;
struct mbuf *m = *mp;
- struct ip *ip = mtod(m, struct ip *);
- struct inpcb *inp, *last;
+ struct inpcb *inp;
struct sockaddr_in ripsrc;
- int hash;
-
- NET_EPOCH_ASSERT();
+ int appended;
*mp = NULL;
+ appended = 0;
bzero(&ripsrc, sizeof(ripsrc));
ripsrc.sin_len = sizeof(ripsrc);
ripsrc.sin_family = AF_INET;
- ripsrc.sin_addr = ip->ip_src;
- last = NULL;
+ ripsrc.sin_addr = ctx.ip->ip_src;
ifp = m->m_pkthdr.rcvif;
- hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
- ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
- CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
- if (inp->inp_ip_p != proto)
- continue;
-#ifdef INET6
- /* XXX inp locking */
- if ((inp->inp_vflag & INP_IPV4) == 0)
- continue;
-#endif
- if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
- continue;
- if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
- continue;
- if (last != NULL) {
- struct mbuf *n;
-
- n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
- if (n != NULL)
- (void) rip_append(last, ip, n, &ripsrc);
- /* XXX count dropped packet */
- INP_RUNLOCK(last);
- last = NULL;
- }
- INP_RLOCK(inp);
- if (__predict_false(inp->inp_flags2 & INP_FREED))
- goto skip_1;
- if (jailed_without_vnet(inp->inp_cred)) {
+ inpi.hash = INP_PCBHASH_RAW(proto, ctx.ip->ip_src.s_addr,
+ ctx.ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
+ while ((inp = inp_next(&inpi)) != NULL) {
+ INP_RLOCK_ASSERT(inp);
+ if (jailed_without_vnet(inp->inp_cred) &&
+ prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0) {
/*
* XXX: If faddr was bound to multicast group,
* jailed raw socket will drop datagram.
*/
- if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
- goto skip_1;
- }
- last = inp;
- continue;
- skip_1:
- INP_RUNLOCK(inp);
- }
- CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
- if (inp->inp_ip_p && inp->inp_ip_p != proto)
- continue;
-#ifdef INET6
- /* XXX inp locking */
- if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
-#endif
- if (!in_nullhost(inp->inp_laddr) &&
- !in_hosteq(inp->inp_laddr, ip->ip_dst))
- continue;
- if (!in_nullhost(inp->inp_faddr) &&
- !in_hosteq(inp->inp_faddr, ip->ip_src))
- continue;
- if (last != NULL) {
- struct mbuf *n;
-
- n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
- if (n != NULL)
- (void) rip_append(last, ip, n, &ripsrc);
- /* XXX count dropped packet */
- INP_RUNLOCK(last);
- last = NULL;
}
- INP_RLOCK(inp);
- if (__predict_false(inp->inp_flags2 & INP_FREED))
- goto skip_2;
- if (jailed_without_vnet(inp->inp_cred)) {
+ appended += rip_append(inp, ctx.ip, m, &ripsrc);
+ }
+
+ inpi.hash = 0;
+ inpi.match = rip_inp_match2;
+ MPASS(inpi.inp == NULL);
+ while ((inp = inp_next(&inpi)) != NULL) {
+ INP_RLOCK_ASSERT(inp);
+ if (jailed_without_vnet(inp->inp_cred) &&
+ !IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr)) &&
+ prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0)
/*
* Allow raw socket in jail to receive multicast;
* assume process had PRIV_NETINET_RAW at attach,
* and fall through into normal filter path if so.
*/
- if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
- prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
- goto skip_2;
- }
+ continue;
/*
* If this raw socket has multicast state, and we
* have received a multicast, check if this socket
@@ -384,7 +379,7 @@
* the responsibility of the transport layer.
*/
if (inp->inp_moptions != NULL &&
- IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
+ IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr))) {
/*
* If the incoming datagram is for IGMP, allow it
* through unconditionally to the raw socket.
@@ -406,7 +401,7 @@
bzero(&group, sizeof(struct sockaddr_in));
group.sin_len = sizeof(struct sockaddr_in);
group.sin_family = AF_INET;
- group.sin_addr = ip->ip_dst;
+ group.sin_addr = ctx.ip->ip_dst;
blocked = imo_multi_filter(inp->inp_moptions,
ifp,
@@ -416,27 +411,18 @@
if (blocked != MCAST_PASS) {
IPSTAT_INC(ips_notmember);
- goto skip_2;
+ continue;
}
}
- last = inp;
- continue;
- skip_2:
- INP_RUNLOCK(inp);
- }
- if (last != NULL) {
- if (rip_append(last, ip, m, &ripsrc) != 0)
- IPSTAT_INC(ips_delivered);
- INP_RUNLOCK(last);
- } else {
- if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) {
- IPSTAT_INC(ips_noproto);
- IPSTAT_DEC(ips_delivered);
- icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0);
- } else {
- m_freem(m);
- }
+ appended += rip_append(inp, ctx.ip, m, &ripsrc);
}
+ if (appended == 0 &&
+ inetsw[ip_protox[ctx.ip->ip_p]].pr_input == rip_input) {
+ IPSTAT_INC(ips_noproto);
+ IPSTAT_DEC(ips_delivered);
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0);
+ } else
+ m_freem(m);
return (IPPROTO_DONE);
}
@@ -906,18 +892,16 @@
error = soreserve(so, rip_sendspace, rip_recvspace);
if (error)
return (error);
- INP_INFO_WLOCK(&V_ripcbinfo);
error = in_pcballoc(so, &V_ripcbinfo);
- if (error) {
- INP_INFO_WUNLOCK(&V_ripcbinfo);
+ if (error)
return (error);
- }
inp = (struct inpcb *)so->so_pcb;
inp->inp_vflag |= INP_IPV4;
inp->inp_ip_p = proto;
inp->inp_ip_ttl = V_ip_defttl;
+ INP_HASH_WLOCK(&V_ripcbinfo);
rip_inshash(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
+ INP_HASH_WUNLOCK(&V_ripcbinfo);
INP_WUNLOCK(inp);
return (0);
}
@@ -932,9 +916,10 @@
KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
("rip_detach: not closed"));
- INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
+ INP_HASH_WLOCK(&V_ripcbinfo);
rip_delhash(inp);
+ INP_HASH_WUNLOCK(&V_ripcbinfo);
if (so == V_ip_mrouter && ip_mrouter_done)
ip_mrouter_done();
if (ip_rsvp_force_done)
@@ -943,7 +928,6 @@
ip_rsvp_done();
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
}
static void
@@ -952,16 +936,16 @@
struct inpcbinfo *pcbinfo;
pcbinfo = inp->inp_pcbinfo;
- INP_INFO_WLOCK(pcbinfo);
INP_WLOCK(inp);
+ INP_HASH_WLOCK(pcbinfo);
rip_delhash(inp);
inp->inp_faddr.s_addr = INADDR_ANY;
rip_inshash(inp);
+ INP_HASH_WUNLOCK(pcbinfo);
SOCK_LOCK(so);
so->so_state &= ~SS_ISCONNECTED;
SOCK_UNLOCK(so);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(pcbinfo);
}
static void
@@ -1027,13 +1011,13 @@
ifa_ifwithaddr_check((struct sockaddr *)addr) == 0))
return (EADDRNOTAVAIL);
- INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
+ INP_HASH_WLOCK(&V_ripcbinfo);
rip_delhash(inp);
inp->inp_laddr = addr->sin_addr;
rip_inshash(inp);
+ INP_HASH_WUNLOCK(&V_ripcbinfo);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
@@ -1053,14 +1037,14 @@
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
- INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
+ INP_HASH_WLOCK(&V_ripcbinfo);
rip_delhash(inp);
inp->inp_faddr = addr->sin_addr;
rip_inshash(inp);
+ INP_HASH_WUNLOCK(&V_ripcbinfo);
soisconnected(so);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
@@ -1126,8 +1110,9 @@
static int
rip_pcblist(SYSCTL_HANDLER_ARGS)
{
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_ripcbinfo,
+ INPLOOKUP_RLOCKPCB);
struct xinpgen xig;
- struct epoch_tracker et;
struct inpcb *inp;
int error;
@@ -1155,24 +1140,19 @@
if (error)
return (error);
- NET_EPOCH_ENTER(et);
- for (inp = CK_LIST_FIRST(V_ripcbinfo.ipi_listhead);
- inp != NULL;
- inp = CK_LIST_NEXT(inp, inp_list)) {
- INP_RLOCK(inp);
+ while ((inp = inp_next(&inpi)) != NULL) {
if (inp->inp_gencnt <= xig.xig_gen &&
cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
struct xinpcb xi;
in_pcbtoxinpcb(inp, &xi);
- INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xi, sizeof xi);
- if (error)
+ if (error) {
+ INP_RUNLOCK(inp);
break;
- } else
- INP_RUNLOCK(inp);
+ }
+ }
}
- NET_EPOCH_EXIT(et);
if (!error) {
/*
Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c
+++ sys/netinet/tcp_input.c
@@ -238,8 +238,6 @@
&VNET_NAME(tcp_autorcvbuf_max), 0,
"Max size of automatic receive buffer");
-VNET_DEFINE(struct inpcbhead, tcb);
-#define tcb6 tcb /* for KAME src sync over BSD*'s */
VNET_DEFINE(struct inpcbinfo, tcbinfo);
/*
Index: sys/netinet/tcp_lro.c
===================================================================
--- sys/netinet/tcp_lro.c
+++ sys/netinet/tcp_lro.c
@@ -1301,8 +1301,7 @@
/* Check if the inp is dead, Jim. */
if (tp == NULL ||
- (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) ||
- (inp->inp_flags2 & INP_FREED)) {
+ (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
INP_WUNLOCK(inp);
return (TCP_LRO_CANNOT);
}
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -1352,7 +1352,7 @@
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
INP_INFO_WLOCK(&V_tcbinfo);
- CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
+ CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inp);
if (inp->inp_flags & INP_TIMEWAIT) {
INP_WUNLOCK(inp);
@@ -1454,8 +1454,8 @@
"clipped from %d to %d.\n", __func__, oldhashsize,
hashsize);
}
- in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize,
- "tcp_inpcb", tcp_inpcb_init, IPI_HASHFIELDS_4TUPLE);
+ in_pcbinfo_init(&V_tcbinfo, "tcp", hashsize, hashsize,
+ "tcp_inpcb", tcp_inpcb_init);
/*
* These have to be type stable for the benefit of the timers.
@@ -1565,9 +1565,9 @@
* Sleep to let all tcpcb timers really disappear and cleanup.
*/
for (;;) {
- INP_LIST_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
n = V_tcbinfo.ipi_count;
- INP_LIST_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
if (n == 0)
break;
pause("tcpdes", hz / 10);
@@ -2241,7 +2241,7 @@
* therefore don't enter the loop below until the connection
* list has stabilised.
*/
- CK_LIST_FOREACH(inp, &V_tcb, inp_list) {
+ CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inp);
/* Important to skip tcptw structs. */
if (!(inp->inp_flags & INP_TIMEWAIT) &&
@@ -2290,7 +2290,6 @@
struct socket *so = tp->t_inpcb->inp_socket;
NET_EPOCH_ASSERT();
- INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
if (TCPS_HAVERCVDSYN(tp->t_state)) {
@@ -2502,7 +2501,6 @@
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
- INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
#ifdef TCP_OFFLOAD
@@ -2561,7 +2559,7 @@
* useful.
*/
INP_INFO_WLOCK(&V_tcbinfo);
- CK_LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
+ CK_LIST_FOREACH(inpb, &V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inpb);
if (inpb->inp_flags & INP_TIMEWAIT) {
INP_WUNLOCK(inpb);
@@ -2602,7 +2600,6 @@
{
struct tcpcb *tp;
- INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if ((inp->inp_flags & INP_TIMEWAIT) ||
@@ -2648,9 +2645,9 @@
static int
tcp_pcblist(SYSCTL_HANDLER_ARGS)
{
- struct epoch_tracker et;
- struct inpcb *inp;
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_RLOCKPCB);
struct xinpgen xig;
+ struct inpcb *inp;
int error;
if (req->newptr != NULL)
@@ -2683,11 +2680,7 @@
if (error)
return (error);
- NET_EPOCH_ENTER(et);
- for (inp = CK_LIST_FIRST(V_tcbinfo.ipi_listhead);
- inp != NULL;
- inp = CK_LIST_NEXT(inp, inp_list)) {
- INP_RLOCK(inp);
+ while ((inp = inp_next(&inpi)) != NULL) {
if (inp->inp_gencnt <= xig.xig_gen) {
int crerr;
@@ -2708,17 +2701,15 @@
struct xtcpcb xt;
tcp_inptoxtp(inp, &xt);
- INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xt, sizeof xt);
- if (error)
+ if (error) {
+ INP_RUNLOCK(inp);
break;
- else
+ } else
continue;
}
}
- INP_RUNLOCK(inp);
}
- NET_EPOCH_EXIT(et);
if (!error) {
/*
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -908,7 +908,6 @@
VNET_DECLARE(int, tcp_sendspace);
VNET_DECLARE(int, tcp_udp_tunneling_overhead);
VNET_DECLARE(int, tcp_udp_tunneling_port);
-VNET_DECLARE(struct inpcbhead, tcb);
VNET_DECLARE(struct inpcbinfo, tcbinfo);
#define V_tcp_do_lrd VNET(tcp_do_lrd)
@@ -917,7 +916,6 @@
#define V_tcp_do_newcwv VNET(tcp_do_newcwv)
#define V_drop_synfin VNET(drop_synfin)
#define V_path_mtu_discovery VNET(path_mtu_discovery)
-#define V_tcb VNET(tcb)
#define V_tcbinfo VNET(tcbinfo)
#define V_tcp_abc_l_var VNET(tcp_abc_l_var)
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
Index: sys/netinet/udp_usrreq.c
===================================================================
--- sys/netinet/udp_usrreq.c
+++ sys/netinet/udp_usrreq.c
@@ -143,9 +143,7 @@
SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
&udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
-VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */
VNET_DEFINE(struct inpcbinfo, udbinfo);
-VNET_DEFINE(struct inpcbhead, ulitecb);
VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
VNET_DEFINE_STATIC(uma_zone_t, udpcb_zone);
#define V_udpcb_zone VNET(udpcb_zone)
@@ -207,8 +205,8 @@
* Once we can calculate the flowid that way and re-establish
* a 4-tuple, flip this to 4-tuple.
*/
- in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
- "udp_inpcb", udp_inpcb_init, IPI_HASHFIELDS_2TUPLE);
+ in_pcbinfo_init(&V_udbinfo, "udp", UDBHASHSIZE, UDBHASHSIZE,
+ "udp_inpcb", udp_inpcb_init);
V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_zone_set_max(V_udpcb_zone, maxsockets);
@@ -221,9 +219,8 @@
udplite_init(void)
{
- in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE,
- UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init,
- IPI_HASHFIELDS_2TUPLE);
+ in_pcbinfo_init(&V_ulitecbinfo, "udplite", UDBHASHSIZE,
+ UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init);
}
/*
@@ -389,6 +386,123 @@
return (0);
}
+static bool
+udp_multi_match(const struct inpcb *inp, void *v)
+{
+ struct ip *ip = v;
+ struct udphdr *uh = (struct udphdr *)(ip + 1);
+
+ if (inp->inp_lport != uh->uh_dport)
+ return (false);
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ return (false);
+#endif
+ if (inp->inp_laddr.s_addr != INADDR_ANY &&
+ inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
+ return (false);
+ if (inp->inp_faddr.s_addr != INADDR_ANY &&
+ inp->inp_faddr.s_addr != ip->ip_src.s_addr)
+ return (false);
+ if (inp->inp_fport != 0 &&
+ inp->inp_fport != uh->uh_sport)
+ return (false);
+
+ return (true);
+}
+
+static int
+udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto),
+ INPLOOKUP_RLOCKPCB, udp_multi_match, ip);
+ struct udphdr *uh = (struct udphdr *)(ip + 1);
+ struct inpcb *inp;
+ struct mbuf *n;
+ int appends = 0;
+
+ MPASS(ip->ip_hl == sizeof(struct ip) >> 2);
+
+ while ((inp = inp_next(&inpi)) != NULL) {
+ /*
+ * XXXRW: Because we weren't holding either the inpcb
+ * or the hash lock when we checked for a match
+ * before, we should probably recheck now that the
+ * inpcb lock is held.
+ */
+ /*
+ * Handle socket delivery policy for any-source
+ * and source-specific multicast. [RFC3678]
+ */
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
+ struct ip_moptions *imo;
+ struct sockaddr_in group;
+ int blocked;
+
+ imo = inp->inp_moptions;
+ if (imo == NULL)
+ continue;
+ bzero(&group, sizeof(struct sockaddr_in));
+ group.sin_len = sizeof(struct sockaddr_in);
+ group.sin_family = AF_INET;
+ group.sin_addr = ip->ip_dst;
+
+ blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif,
+ (struct sockaddr *)&group,
+ (struct sockaddr *)&udp_in[0]);
+ if (blocked != MCAST_PASS) {
+ if (blocked == MCAST_NOTGMEMBER)
+ IPSTAT_INC(ips_notmember);
+ if (blocked == MCAST_NOTSMEMBER ||
+ blocked == MCAST_MUTED)
+ UDPSTAT_INC(udps_filtermcast);
+ continue;
+ }
+ }
+ if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
+ if (proto == IPPROTO_UDPLITE)
+ UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
+ else
+ UDP_PROBE(receive, NULL, inp, ip, inp, uh);
+ if (udp_append(inp, ip, n, sizeof(struct ip), udp_in)) {
+ INP_RUNLOCK(inp);
+ break;
+ } else
+ appends++;
+ }
+ /*
+ * Don't look for additional matches if this one does
+ * not have either the SO_REUSEPORT or SO_REUSEADDR
+ * socket options set. This heuristic avoids
+ * searching through all pcbs in the common case of a
+ * non-shared port. It assumes that an application
+ * will never clear these options after setting them.
+ */
+ if ((inp->inp_socket->so_options &
+ (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) {
+ INP_RUNLOCK(inp);
+ break;
+ }
+ }
+ m_freem(m);
+
+ if (appends == 0) {
+ /*
+ * No matching pcb found; discard datagram. (No need
+ * to send an ICMP Port Unreachable for a broadcast
+ * or multicast datagram.)
+ */
+ UDPSTAT_INC(udps_noport);
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
+ UDPSTAT_INC(udps_noportmcast);
+ else
+ UDPSTAT_INC(udps_noportbcast);
+ }
+
+ return (IPPROTO_DONE);
+}
+
int
udp_input(struct mbuf **mp, int *offp, int proto)
{
@@ -524,140 +638,15 @@
}
}
- pcbinfo = udp_get_inpcbinfo(proto);
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
- in_broadcast(ip->ip_dst, ifp)) {
- struct inpcb *last;
- struct inpcbhead *pcblist;
-
- NET_EPOCH_ASSERT();
-
- pcblist = udp_get_pcblist(proto);
- last = NULL;
- CK_LIST_FOREACH(inp, pcblist, inp_list) {
- if (inp->inp_lport != uh->uh_dport)
- continue;
-#ifdef INET6
- if ((inp->inp_vflag & INP_IPV4) == 0)
- continue;
-#endif
- if (inp->inp_laddr.s_addr != INADDR_ANY &&
- inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
- continue;
- if (inp->inp_faddr.s_addr != INADDR_ANY &&
- inp->inp_faddr.s_addr != ip->ip_src.s_addr)
- continue;
- if (inp->inp_fport != 0 &&
- inp->inp_fport != uh->uh_sport)
- continue;
-
- INP_RLOCK(inp);
-
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_RUNLOCK(inp);
- continue;
- }
-
- /*
- * XXXRW: Because we weren't holding either the inpcb
- * or the hash lock when we checked for a match
- * before, we should probably recheck now that the
- * inpcb lock is held.
- */
-
- /*
- * Handle socket delivery policy for any-source
- * and source-specific multicast. [RFC3678]
- */
- if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
- struct ip_moptions *imo;
- struct sockaddr_in group;
- int blocked;
-
- imo = inp->inp_moptions;
- if (imo == NULL) {
- INP_RUNLOCK(inp);
- continue;
- }
- bzero(&group, sizeof(struct sockaddr_in));
- group.sin_len = sizeof(struct sockaddr_in);
- group.sin_family = AF_INET;
- group.sin_addr = ip->ip_dst;
-
- blocked = imo_multi_filter(imo, ifp,
- (struct sockaddr *)&group,
- (struct sockaddr *)&udp_in[0]);
- if (blocked != MCAST_PASS) {
- if (blocked == MCAST_NOTGMEMBER)
- IPSTAT_INC(ips_notmember);
- if (blocked == MCAST_NOTSMEMBER ||
- blocked == MCAST_MUTED)
- UDPSTAT_INC(udps_filtermcast);
- INP_RUNLOCK(inp);
- continue;
- }
- }
- if (last != NULL) {
- struct mbuf *n;
-
- if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) !=
- NULL) {
- if (proto == IPPROTO_UDPLITE)
- UDPLITE_PROBE(receive, NULL, last, ip,
- last, uh);
- else
- UDP_PROBE(receive, NULL, last, ip, last,
- uh);
- if (udp_append(last, ip, n, iphlen,
- udp_in)) {
- INP_RUNLOCK(inp);
- goto badunlocked;
- }
- }
- /* Release PCB lock taken on previous pass. */
- INP_RUNLOCK(last);
- }
- last = inp;
- /*
- * Don't look for additional matches if this one does
- * not have either the SO_REUSEPORT or SO_REUSEADDR
- * socket options set. This heuristic avoids
- * searching through all pcbs in the common case of a
- * non-shared port. It assumes that an application
- * will never clear these options after setting them.
- */
- if ((last->inp_socket->so_options &
- (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0)
- break;
- }
+ in_broadcast(ip->ip_dst, ifp))
+ return (udp_multi_input(m, proto, udp_in));
- if (last == NULL) {
- /*
- * No matching pcb found; discard datagram. (No need
- * to send an ICMP Port Unreachable for a broadcast
- * or multicast datgram.)
- */
- UDPSTAT_INC(udps_noport);
- if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
- UDPSTAT_INC(udps_noportmcast);
- else
- UDPSTAT_INC(udps_noportbcast);
- goto badunlocked;
- }
- if (proto == IPPROTO_UDPLITE)
- UDPLITE_PROBE(receive, NULL, last, ip, last, uh);
- else
- UDP_PROBE(receive, NULL, last, ip, last, uh);
- if (udp_append(last, ip, m, iphlen, udp_in) == 0)
- INP_RUNLOCK(last);
- return (IPPROTO_DONE);
- }
+ pcbinfo = udp_get_inpcbinfo(proto);
/*
* Locate pcb for datagram.
- */
-
- /*
+ *
* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
*/
if ((m->m_flags & M_IP_NEXTHOP) &&
@@ -857,8 +846,9 @@
static int
udp_pcblist(SYSCTL_HANDLER_ARGS)
{
+ struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_udbinfo,
+ INPLOOKUP_RLOCKPCB);
struct xinpgen xig;
- struct epoch_tracker et;
struct inpcb *inp;
int error;
@@ -886,24 +876,19 @@
if (error)
return (error);
- NET_EPOCH_ENTER(et);
- for (inp = CK_LIST_FIRST(V_udbinfo.ipi_listhead);
- inp != NULL;
- inp = CK_LIST_NEXT(inp, inp_list)) {
- INP_RLOCK(inp);
+ while ((inp = inp_next(&inpi)) != NULL) {
if (inp->inp_gencnt <= xig.xig_gen &&
cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
struct xinpcb xi;
in_pcbtoxinpcb(inp, &xi);
- INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xi, sizeof xi);
- if (error)
+ if (error) {
+ INP_RUNLOCK(inp);
break;
- } else
- INP_RUNLOCK(inp);
+ }
+ }
}
- NET_EPOCH_EXIT(et);
if (!error) {
/*
@@ -1289,15 +1274,16 @@
laddr = inp->inp_laddr;
lport = inp->inp_lport;
if (src.sin_family == AF_INET) {
- INP_HASH_LOCK_ASSERT(pcbinfo);
if ((lport == 0) ||
(laddr.s_addr == INADDR_ANY &&
src.sin_addr.s_addr == INADDR_ANY)) {
error = EINVAL;
goto release;
}
+ INP_HASH_WLOCK(pcbinfo);
error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
&laddr.s_addr, &lport, td->td_ucred);
+ INP_HASH_WUNLOCK(pcbinfo);
if (error)
goto release;
}
@@ -1340,12 +1326,14 @@
inp->inp_lport == 0 ||
sin->sin_addr.s_addr == INADDR_ANY ||
sin->sin_addr.s_addr == INADDR_BROADCAST) {
- INP_HASH_LOCK_ASSERT(pcbinfo);
+ INP_HASH_WLOCK(pcbinfo);
error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
&lport, &faddr.s_addr, &fport, NULL,
td->td_ucred);
- if (error)
+ if (error) {
+ INP_HASH_WUNLOCK(pcbinfo);
goto release;
+ }
/*
* XXXRW: Why not commit the port if the address is
@@ -1362,7 +1350,6 @@
if (prison_flag(td->td_ucred, PR_IP4))
inp->inp_laddr = laddr;
inp->inp_lport = lport;
- INP_HASH_WLOCK(pcbinfo);
error = in_pcbinshash(inp);
INP_HASH_WUNLOCK(pcbinfo);
if (error != 0) {
@@ -1371,7 +1358,8 @@
goto release;
}
inp->inp_flags |= INP_ANONPORT;
- }
+ } else
+ INP_HASH_WUNLOCK(pcbinfo);
} else {
faddr = sin->sin_addr;
fport = sin->sin_port;
@@ -1565,12 +1553,9 @@
error = soreserve(so, udp_sendspace, udp_recvspace);
if (error)
return (error);
- INP_INFO_WLOCK(pcbinfo);
error = in_pcballoc(so, pcbinfo);
- if (error) {
- INP_INFO_WUNLOCK(pcbinfo);
+ if (error)
return (error);
- }
inp = sotoinpcb(so);
inp->inp_vflag |= INP_IPV4;
@@ -1582,12 +1567,10 @@
if (error) {
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(pcbinfo);
return (error);
}
-
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(pcbinfo);
+
return (0);
}
#endif /* INET */
@@ -1723,14 +1706,12 @@
KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
("udp_detach: not disconnected"));
- INP_INFO_WLOCK(pcbinfo);
INP_WLOCK(inp);
up = intoudpcb(inp);
KASSERT(up != NULL, ("%s: up == NULL", __func__));
inp->inp_ppcb = NULL;
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(pcbinfo);
udp_discardcb(up);
}
Index: sys/netinet/udp_var.h
===================================================================
--- sys/netinet/udp_var.h
+++ sys/netinet/udp_var.h
@@ -136,13 +136,9 @@
SYSCTL_DECL(_net_inet_udp);
extern struct pr_usrreqs udp_usrreqs;
-VNET_DECLARE(struct inpcbhead, udb);
VNET_DECLARE(struct inpcbinfo, udbinfo);
-VNET_DECLARE(struct inpcbhead, ulitecb);
VNET_DECLARE(struct inpcbinfo, ulitecbinfo);
-#define V_udb VNET(udb)
#define V_udbinfo VNET(udbinfo)
-#define V_ulitecb VNET(ulitecb)
#define V_ulitecbinfo VNET(ulitecbinfo)
extern u_long udp_sendspace;
@@ -163,12 +159,6 @@
return (protocol == IPPROTO_UDP) ? &V_udbinfo : &V_ulitecbinfo;
}
-static __inline struct inpcbhead *
-udp_get_pcblist(int protocol)
-{
- return (protocol == IPPROTO_UDP) ? &V_udb : &V_ulitecb;
-}
-
int udp_newudpcb(struct inpcb *);
void udp_discardcb(struct udpcb *);
Index: sys/netinet6/icmp6.c
===================================================================
--- sys/netinet6/icmp6.c
+++ sys/netinet6/icmp6.c
@@ -124,14 +124,12 @@
#endif /* VIMAGE */
VNET_DECLARE(struct inpcbinfo, ripcbinfo);
-VNET_DECLARE(struct inpcbhead, ripcb);
VNET_DECLARE(int, icmp6errppslim);
VNET_DEFINE_STATIC(int, icmp6errpps_count) = 0;
VNET_DEFINE_STATIC(struct timeval, icmp6errppslim_last);
VNET_DECLARE(int, icmp6_nodeinfo);
#define V_ripcbinfo VNET(ripcbinfo)
-#define V_ripcb VNET(ripcb)
#define V_icmp6errppslim VNET(icmp6errppslim)
#define V_icmp6errpps_count VNET(icmp6errpps_count)
#define V_icmp6errppslim_last VNET(icmp6errppslim_last)
@@ -1875,21 +1873,39 @@
return (copied);
}
+static bool
+icmp6_rip6_match(const struct inpcb *inp, void *v)
+{
+ struct ip6_hdr *ip6 = v;
+
+ if ((inp->inp_vflag & INP_IPV6) == 0)
+ return (false);
+ if (inp->inp_ip_p != IPPROTO_ICMPV6)
+ return (false);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
+ !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst))
+ return (false);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
+ !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src))
+ return (false);
+ return (true);
+}
+
/*
* XXX almost dup'ed code with rip6_input.
*/
static int
icmp6_rip6_input(struct mbuf **mp, int off)
{
- struct mbuf *m = *mp;
+ struct mbuf *n, *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
+ struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
+ INPLOOKUP_RLOCKPCB, icmp6_rip6_match, ip6);
struct inpcb *inp;
- struct inpcb *last = NULL;
struct sockaddr_in6 fromsa;
struct icmp6_hdr *icmp6;
struct mbuf *opts = NULL;
-
- NET_EPOCH_ASSERT();
+ int delivered = 0;
/* This is assumed to be safe; icmp6_input() does a pullup. */
icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
@@ -1908,125 +1924,64 @@
return (IPPROTO_DONE);
}
- CK_LIST_FOREACH(inp, &V_ripcb, inp_list) {
- if ((inp->inp_vflag & INP_IPV6) == 0)
- continue;
- if (inp->inp_ip_p != IPPROTO_ICMPV6)
- continue;
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
- !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst))
- continue;
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
- !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src))
- continue;
- INP_RLOCK(inp);
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_RUNLOCK(inp);
- continue;
- }
+ while ((inp = inp_next(&inpi)) != NULL) {
if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
- inp->in6p_icmp6filt)) {
- INP_RUNLOCK(inp);
+ inp->in6p_icmp6filt))
continue;
- }
- if (last != NULL) {
- struct mbuf *n = NULL;
-
- /*
- * Recent network drivers tend to allocate a single
- * mbuf cluster, rather than to make a couple of
- * mbufs without clusters. Also, since the IPv6 code
- * path tries to avoid m_pullup(), it is highly
- * probable that we still have an mbuf cluster here
- * even though the necessary length can be stored in an
- * mbuf's internal buffer.
- * Meanwhile, the default size of the receive socket
- * buffer for raw sockets is not so large. This means
- * the possibility of packet loss is relatively higher
- * than before. To avoid this scenario, we copy the
- * received data to a separate mbuf that does not use
- * a cluster, if possible.
- * XXX: it is better to copy the data after stripping
- * intermediate headers.
- */
- if ((m->m_flags & M_EXT) && m->m_next == NULL &&
- m->m_len <= MHLEN) {
- n = m_get(M_NOWAIT, m->m_type);
- if (n != NULL) {
- if (m_dup_pkthdr(n, m, M_NOWAIT)) {
- bcopy(m->m_data, n->m_data,
- m->m_len);
- n->m_len = m->m_len;
- } else {
- m_free(n);
- n = NULL;
- }
- }
- }
- if (n != NULL ||
- (n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
- if (last->inp_flags & INP_CONTROLOPTS)
- ip6_savecontrol(last, n, &opts);
- /* strip intermediate headers */
- m_adj(n, off);
- SOCKBUF_LOCK(&last->inp_socket->so_rcv);
- if (sbappendaddr_locked(
- &last->inp_socket->so_rcv,
- (struct sockaddr *)&fromsa, n, opts)
- == 0) {
- soroverflow_locked(last->inp_socket);
- m_freem(n);
- if (opts) {
- m_freem(opts);
- }
- } else
- sorwakeup_locked(last->inp_socket);
- opts = NULL;
- }
- INP_RUNLOCK(last);
- }
- last = inp;
- }
- if (last != NULL) {
- if (last->inp_flags & INP_CONTROLOPTS)
- ip6_savecontrol(last, m, &opts);
- /* strip intermediate headers */
- m_adj(m, off);
-
- /* avoid using mbuf clusters if possible (see above) */
+ /*
+ * Recent network drivers tend to allocate a single
+ * mbuf cluster, rather than to make a couple of
+ * mbufs without clusters. Also, since the IPv6 code
+ * path tries to avoid m_pullup(), it is highly
+ * probable that we still have an mbuf cluster here
+ * even though the necessary length can be stored in an
+ * mbuf's internal buffer.
+ * Meanwhile, the default size of the receive socket
+ * buffer for raw sockets is not so large. This means
+ * the possibility of packet loss is relatively higher
+ * than before. To avoid this scenario, we copy the
+ * received data to a separate mbuf that does not use
+ * a cluster, if possible.
+ * XXX: it is better to copy the data after stripping
+ * intermediate headers.
+ */
if ((m->m_flags & M_EXT) && m->m_next == NULL &&
m->m_len <= MHLEN) {
- struct mbuf *n;
-
n = m_get(M_NOWAIT, m->m_type);
if (n != NULL) {
if (m_dup_pkthdr(n, m, M_NOWAIT)) {
bcopy(m->m_data, n->m_data, m->m_len);
n->m_len = m->m_len;
-
- m_freem(m);
- m = n;
} else {
- m_freem(n);
+ m_free(n);
n = NULL;
}
}
- }
- SOCKBUF_LOCK(&last->inp_socket->so_rcv);
- if (sbappendaddr_locked(&last->inp_socket->so_rcv,
- (struct sockaddr *)&fromsa, m, opts) == 0) {
- m_freem(m);
+ } else
+ n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
+ if (n == NULL)
+ continue;
+ if (inp->inp_flags & INP_CONTROLOPTS)
+ ip6_savecontrol(inp, n, &opts);
+ /* strip intermediate headers */
+ m_adj(n, off);
+ SOCKBUF_LOCK(&inp->inp_socket->so_rcv);
+ if (sbappendaddr_locked(&inp->inp_socket->so_rcv,
+ (struct sockaddr *)&fromsa, n, opts) == 0) {
+ soroverflow_locked(inp->inp_socket);
+ m_freem(n);
if (opts)
m_freem(opts);
- soroverflow_locked(last->inp_socket);
- } else
- sorwakeup_locked(last->inp_socket);
- INP_RUNLOCK(last);
- } else {
- m_freem(m);
- IP6STAT_DEC(ip6s_delivered);
+ } else {
+ sorwakeup_locked(inp->inp_socket);
+ delivered++;
+ }
+ opts = NULL;
}
+ m_freem(m);
*mp = NULL;
+ if (delivered == 0)
+ IP6STAT_DEC(ip6s_delivered);
return (IPPROTO_DONE);
}
Index: sys/netinet6/in6_pcb.c
===================================================================
--- sys/netinet6/in6_pcb.c
+++ sys/netinet6/in6_pcb.c
@@ -718,7 +718,7 @@
}
errno = inet6ctlerrmap[cmd];
INP_INFO_WLOCK(pcbinfo);
- CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
+ CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) {
INP_WLOCK(inp);
if ((inp->inp_vflag & INP_IPV6) == 0) {
INP_WUNLOCK(inp);
@@ -868,49 +868,54 @@
}
}
+static bool
+in6_multi_match(const struct inpcb *inp, void *v __unused)
+{
+
+ if ((inp->inp_vflag & INP_IPV6) && inp->in6p_moptions != NULL)
+ return (true);
+ else
+ return (false);
+}
+
void
in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
+ struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_RLOCKPCB,
+ in6_multi_match, NULL);
struct inpcb *inp;
struct in6_multi *inm;
struct in6_mfilter *imf;
struct ip6_moptions *im6o;
- INP_INFO_WLOCK(pcbinfo);
- CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
- INP_WLOCK(inp);
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_WUNLOCK(inp);
- continue;
- }
+ IN6_MULTI_LOCK_ASSERT();
+
+ while ((inp = inp_next(&inpi)) != NULL) {
+ INP_RLOCK_ASSERT(inp);
+
im6o = inp->in6p_moptions;
- if ((inp->inp_vflag & INP_IPV6) && im6o != NULL) {
- /*
- * Unselect the outgoing ifp for multicast if it
- * is being detached.
- */
- if (im6o->im6o_multicast_ifp == ifp)
- im6o->im6o_multicast_ifp = NULL;
- /*
- * Drop multicast group membership if we joined
- * through the interface being detached.
- */
+ /*
+ * Unselect the outgoing ifp for multicast if it
+ * is being detached.
+ */
+ if (im6o->im6o_multicast_ifp == ifp)
+ im6o->im6o_multicast_ifp = NULL;
+ /*
+ * Drop multicast group membership if we joined
+ * through the interface being detached.
+ */
restart:
- IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) {
- if ((inm = imf->im6f_in6m) == NULL)
- continue;
- if (inm->in6m_ifp != ifp)
- continue;
- ip6_mfilter_remove(&im6o->im6o_head, imf);
- IN6_MULTI_LOCK_ASSERT();
- in6_leavegroup_locked(inm, NULL);
- ip6_mfilter_free(imf);
- goto restart;
- }
+ IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) {
+ if ((inm = imf->im6f_in6m) == NULL)
+ continue;
+ if (inm->in6m_ifp != ifp)
+ continue;
+ ip6_mfilter_remove(&im6o->im6o_head, imf);
+ in6_leavegroup_locked(inm, NULL);
+ ip6_mfilter_free(imf);
+ goto restart;
}
- INP_WUNLOCK(inp);
}
- INP_INFO_WUNLOCK(pcbinfo);
}
/*
@@ -1126,20 +1131,16 @@
{
struct inpcb *inp;
+ smr_enter(pcbinfo->ipi_smr);
inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
if (inp != NULL) {
- if (lookupflags & INPLOOKUP_WLOCKPCB) {
- INP_WLOCK(inp);
- } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
- INP_RLOCK(inp);
- } else
- panic("%s: locking bug", __func__);
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_UNLOCK(inp);
+ if (__predict_false(inp_smr_lock(inp,
+ (lookupflags & INPLOOKUP_LOCKMASK)) == false))
inp = NULL;
- }
- }
+ } else
+ smr_exit(pcbinfo->ipi_smr);
+
return (inp);
}
Index: sys/netinet6/ip6_gre.c
===================================================================
--- sys/netinet6/ip6_gre.c
+++ sys/netinet6/ip6_gre.c
@@ -216,30 +216,15 @@
in6_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp,
const struct sockaddr *sa, void *ctx)
{
- struct epoch_tracker et;
struct gre_socket *gs;
struct gre_softc *sc;
struct sockaddr_in6 dst;
- NET_EPOCH_ENTER(et);
- /*
- * udp_append() holds reference to inp, it is safe to check
- * inp_flags2 without INP_RLOCK().
- * If socket was closed before we have entered NET_EPOCH section,
- * INP_FREED flag should be set. Otherwise it should be safe to
- * make access to ctx data, because gre_so will be freed by
- * gre_sofree() via NET_EPOCH_CALL().
- */
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- NET_EPOCH_EXIT(et);
- m_freem(m);
- return;
- }
+ NET_EPOCH_ASSERT();
gs = (struct gre_socket *)ctx;
dst = *(const struct sockaddr_in6 *)sa;
if (sa6_embedscope(&dst, 0)) {
- NET_EPOCH_EXIT(et);
m_freem(m);
return;
}
@@ -249,11 +234,9 @@
}
if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){
gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc);
- NET_EPOCH_EXIT(et);
return;
}
m_freem(m);
- NET_EPOCH_EXIT(et);
}
static int
Index: sys/netinet6/raw_ip6.c
===================================================================
--- sys/netinet6/raw_ip6.c
+++ sys/netinet6/raw_ip6.c
@@ -119,9 +119,7 @@
* Raw interface to IP6 protocol.
*/
-VNET_DECLARE(struct inpcbhead, ripcb);
VNET_DECLARE(struct inpcbinfo, ripcbinfo);
-#define V_ripcb VNET(ripcb)
#define V_ripcbinfo VNET(ripcbinfo)
extern u_long rip_sendspace;
@@ -153,6 +151,33 @@
int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *);
int (*mrt6_ioctl)(u_long, caddr_t);
+struct rip6_inp_match_ctx {
+ struct ip6_hdr *ip6;
+ int proto;
+};
+
+static bool
+rip6_inp_match(const struct inpcb *inp, void *v)
+{
+ struct rip6_inp_match_ctx *c = v;
+ struct ip6_hdr *ip6 = c->ip6;
+ int proto = c->proto;
+
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV6) == 0)
+ return (false);
+ if (inp->inp_ip_p && inp->inp_ip_p != proto)
+ return (false);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
+ !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst))
+ return (false);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
+ !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src))
+ return (false);
+
+ return (true);
+}
+
/*
* Setup generic address and protocol structures for raw_input routine, then
* pass them along with mbuf chain.
@@ -161,12 +186,15 @@
rip6_input(struct mbuf **mp, int *offp, int proto)
{
struct ifnet *ifp;
- struct mbuf *m = *mp;
+ struct mbuf *n, *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct inpcb *inp;
- struct inpcb *last = NULL;
struct mbuf *opts = NULL;
struct sockaddr_in6 fromsa;
+ struct rip6_inp_match_ctx ctx = { .ip6 = ip6, .proto = proto };
+ struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
+ INPLOOKUP_RLOCKPCB, rip6_inp_match, &ctx);
+ int delivered = 0;
NET_EPOCH_ASSERT();
@@ -176,70 +204,27 @@
ifp = m->m_pkthdr.rcvif;
- CK_LIST_FOREACH(inp, &V_ripcb, inp_list) {
- /* XXX inp locking */
- if ((inp->inp_vflag & INP_IPV6) == 0)
- continue;
- if (inp->inp_ip_p &&
- inp->inp_ip_p != proto)
- continue;
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
- !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst))
- continue;
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
- !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src))
- continue;
- if (last != NULL) {
- struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
-
+ while ((inp = inp_next(&inpi)) != NULL) {
+ INP_RLOCK_ASSERT(inp);
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
- /*
- * Check AH/ESP integrity.
- */
- if (IPSEC_ENABLED(ipv6)) {
- if (n != NULL &&
- IPSEC_CHECK_POLICY(ipv6, n, last) != 0) {
- m_freem(n);
- /* Do not inject data into pcb. */
- n = NULL;
- }
- }
-#endif /* IPSEC */
- if (n) {
- if (last->inp_flags & INP_CONTROLOPTS ||
- last->inp_socket->so_options & SO_TIMESTAMP)
- ip6_savecontrol(last, n, &opts);
- /* strip intermediate headers */
- m_adj(n, *offp);
- if (sbappendaddr(&last->inp_socket->so_rcv,
- (struct sockaddr *)&fromsa,
- n, opts) == 0) {
- soroverflow(last->inp_socket);
- m_freem(n);
- if (opts)
- m_freem(opts);
- RIP6STAT_INC(rip6s_fullsock);
- } else
- sorwakeup(last->inp_socket);
- opts = NULL;
- }
- INP_RUNLOCK(last);
- last = NULL;
+ /*
+ * Check AH/ESP integrity.
+ */
+ if (IPSEC_ENABLED(ipv6) &&
+ IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) {
+ /* Do not inject data into pcb. */
+ continue;
}
- INP_RLOCK(inp);
- if (__predict_false(inp->inp_flags2 & INP_FREED))
- goto skip_2;
- if (jailed_without_vnet(inp->inp_cred)) {
+#endif /* IPSEC */
+ if (jailed_without_vnet(inp->inp_cred) &&
+ !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
+ prison_check_ip6(inp->inp_cred, &ip6->ip6_dst) != 0)
/*
* Allow raw socket in jail to receive multicast;
* assume process had PRIV_NETINET_RAW at attach,
* and fall through into normal filter path if so.
*/
- if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
- prison_check_ip6(inp->inp_cred,
- &ip6->ip6_dst) != 0)
- goto skip_2;
- }
+ continue;
if (inp->in6p_cksum != -1) {
RIP6STAT_INC(rip6s_isum);
if (m->m_pkthdr.len - (*offp + inp->in6p_cksum) < 2 ||
@@ -251,8 +236,9 @@
* ICMP6 message. Set proto to IPPROTO_NONE
* to achieve that.
*/
+ INP_RUNLOCK(inp);
proto = IPPROTO_NONE;
- goto skip_2;
+ break;
}
}
/*
@@ -298,43 +284,30 @@
}
if (blocked != MCAST_PASS) {
IP6STAT_INC(ip6s_notmember);
- goto skip_2;
+ continue;
}
}
- last = inp;
- continue;
-skip_2:
- INP_RUNLOCK(inp);
- }
-#if defined(IPSEC) || defined(IPSEC_SUPPORT)
- /*
- * Check AH/ESP integrity.
- */
- if (IPSEC_ENABLED(ipv6) && last != NULL &&
- IPSEC_CHECK_POLICY(ipv6, m, last) != 0) {
- m_freem(m);
- IP6STAT_DEC(ip6s_delivered);
- /* Do not inject data into pcb. */
- INP_RUNLOCK(last);
- } else
-#endif /* IPSEC */
- if (last != NULL) {
- if (last->inp_flags & INP_CONTROLOPTS ||
- last->inp_socket->so_options & SO_TIMESTAMP)
- ip6_savecontrol(last, m, &opts);
- /* Strip intermediate headers. */
- m_adj(m, *offp);
- if (sbappendaddr(&last->inp_socket->so_rcv,
- (struct sockaddr *)&fromsa, m, opts) == 0) {
- soroverflow(last->inp_socket);
- m_freem(m);
+ if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL)
+ continue;
+ if (inp->inp_flags & INP_CONTROLOPTS ||
+ inp->inp_socket->so_options & SO_TIMESTAMP)
+ ip6_savecontrol(inp, n, &opts);
+ /* strip intermediate headers */
+ m_adj(n, *offp);
+ if (sbappendaddr(&inp->inp_socket->so_rcv,
+ (struct sockaddr *)&fromsa, n, opts) == 0) {
+ soroverflow(inp->inp_socket);
+ m_freem(n);
if (opts)
m_freem(opts);
RIP6STAT_INC(rip6s_fullsock);
- } else
- sorwakeup(last->inp_socket);
- INP_RUNLOCK(last);
- } else {
+ } else {
+ sorwakeup(inp->inp_socket);
+ delivered++;
+ }
+ opts = NULL;
+ }
+ if (delivered == 0) {
RIP6STAT_INC(rip6s_nosock);
if (m->m_flags & M_MCAST)
RIP6STAT_INC(rip6s_nosockmcast);
@@ -345,7 +318,8 @@
ICMP6_PARAMPROB_NEXTHEADER,
ip6_get_prevhdr(m, *offp));
IP6STAT_DEC(ip6s_delivered);
- }
+ } else
+ m_freem(m);
return (IPPROTO_DONE);
}
@@ -678,15 +652,12 @@
filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT);
if (filter == NULL)
return (ENOMEM);
- INP_INFO_WLOCK(&V_ripcbinfo);
error = in_pcballoc(so, &V_ripcbinfo);
if (error) {
- INP_INFO_WUNLOCK(&V_ripcbinfo);
free(filter, M_PCB);
return (error);
}
inp = (struct inpcb *)so->so_pcb;
- INP_INFO_WUNLOCK(&V_ripcbinfo);
inp->inp_vflag |= INP_IPV6;
inp->inp_ip_p = (long)proto;
inp->in6p_hops = -1; /* use kernel default */
@@ -708,12 +679,10 @@
if (so == V_ip6_mrouter && ip6_mrouter_done)
ip6_mrouter_done();
/* xxx: RSVP */
- INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
free(inp->in6p_icmp6filt, M_PCB);
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
}
/* XXXRW: This can't ever be called. */
Index: sys/netinet6/udp6_usrreq.c
===================================================================
--- sys/netinet6/udp6_usrreq.c
+++ sys/netinet6/udp6_usrreq.c
@@ -207,6 +207,137 @@
return (0);
}
+struct udp6_multi_match_ctx {
+ struct ip6_hdr *ip6;
+ struct udphdr *uh;
+};
+
+static bool
+udp6_multi_match(const struct inpcb *inp, void *v)
+{
+ struct udp6_multi_match_ctx *ctx = v;
+
+ if ((inp->inp_vflag & INP_IPV6) == 0)
+ return(false);
+ if (inp->inp_lport != ctx->uh->uh_dport)
+ return(false);
+ if (inp->inp_fport != 0 && inp->inp_fport != ctx->uh->uh_sport)
+ return(false);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
+ !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ctx->ip6->ip6_dst))
+ return (false);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
+ (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ctx->ip6->ip6_src) ||
+ inp->inp_fport != ctx->uh->uh_sport))
+ return (false);
+
+ return (true);
+}
+
+static int
+udp6_multi_input(struct mbuf *m, int off, int proto,
+ struct sockaddr_in6 *fromsa)
+{
+ struct udp6_multi_match_ctx ctx;
+ struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto),
+ INPLOOKUP_RLOCKPCB, udp6_multi_match, &ctx);
+ struct inpcb *inp;
+ struct ip6_moptions *imo;
+ struct mbuf *n;
+ int appends = 0;
+
+ /*
+ * In the event that laddr should be set to the link-local
+ * address (this happens in RIPng), the multicast address
+ * specified in the received packet will not match laddr. To
+ * handle this situation, matching is relaxed if the
+ * receiving interface is the same as one specified in the
+ * socket and if the destination multicast address matches
+ * one of the multicast groups specified in the socket.
+ */
+
+ /*
+ * KAME note: traditionally we dropped udpiphdr from mbuf
+ * here. We need udphdr for IPsec processing so we do that
+ * later.
+ */
+ ctx.ip6 = mtod(m, struct ip6_hdr *);
+ ctx.uh = (struct udphdr *)((char *)ctx.ip6 + off);
+ while ((inp = inp_next(&inpi)) != NULL) {
+ INP_RLOCK_ASSERT(inp);
+ /*
+ * XXXRW: Because we weren't holding either the inpcb
+ * or the hash lock when we checked for a match
+ * before, we should probably recheck now that the
+ * inpcb lock is (supposed to be) held.
+ */
+ /*
+ * Handle socket delivery policy for any-source
+ * and source-specific multicast. [RFC3678]
+ */
+ if ((imo = inp->in6p_moptions) != NULL) {
+ struct sockaddr_in6 mcaddr;
+ int blocked;
+
+ bzero(&mcaddr, sizeof(struct sockaddr_in6));
+ mcaddr.sin6_len = sizeof(struct sockaddr_in6);
+ mcaddr.sin6_family = AF_INET6;
+ mcaddr.sin6_addr = ctx.ip6->ip6_dst;
+
+ blocked = im6o_mc_filter(imo, m->m_pkthdr.rcvif,
+ (struct sockaddr *)&mcaddr,
+ (struct sockaddr *)&fromsa[0]);
+ if (blocked != MCAST_PASS) {
+ if (blocked == MCAST_NOTGMEMBER)
+ IP6STAT_INC(ip6s_notmember);
+ if (blocked == MCAST_NOTSMEMBER ||
+ blocked == MCAST_MUTED)
+ UDPSTAT_INC(udps_filtermcast);
+ continue;
+ }
+ }
+ if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
+ if (proto == IPPROTO_UDPLITE)
+ UDPLITE_PROBE(receive, NULL, inp, ctx.ip6,
+ inp, ctx.uh);
+ else
+ UDP_PROBE(receive, NULL, inp, ctx.ip6, inp,
+ ctx.uh);
+ if (udp6_append(inp, n, off, fromsa)) {
+ INP_RUNLOCK(inp);
+ break;
+ } else
+ appends++;
+ }
+ /*
+ * Don't look for additional matches if this one does
+ * not have either the SO_REUSEPORT or SO_REUSEADDR
+ * socket options set. This heuristic avoids
+ * searching through all pcbs in the common case of a
+ * non-shared port. It assumes that an application
+ * will never clear these options after setting them.
+ */
+ if ((inp->inp_socket->so_options &
+ (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) {
+ INP_RUNLOCK(inp);
+ break;
+ }
+ }
+ m_freem(m);
+
+ if (appends == 0) {
+ /*
+ * No matching pcb found; discard datagram. (No need
+ * to send an ICMP Port Unreachable for a broadcast
+ * or multicast datagram.)
+ */
+ UDPSTAT_INC(udps_noport);
+ UDPSTAT_INC(udps_noportmcast);
+ }
+
+ return (IPPROTO_DONE);
+}
+
int
udp6_input(struct mbuf **mp, int *offp, int proto)
{
@@ -311,144 +442,11 @@
fromsa[1].sin6_port = uh->uh_dport;
pcbinfo = udp_get_inpcbinfo(nxt);
- if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
- struct inpcb *last;
- struct inpcbhead *pcblist;
- struct ip6_moptions *imo;
-
- /*
- * In the event that laddr should be set to the link-local
- * address (this happens in RIPng), the multicast address
- * specified in the received packet will not match laddr. To
- * handle this situation, matching is relaxed if the
- * receiving interface is the same as one specified in the
- * socket and if the destination multicast address matches
- * one of the multicast groups specified in the socket.
- */
-
- /*
- * KAME note: traditionally we dropped udpiphdr from mbuf
- * here. We need udphdr for IPsec processing so we do that
- * later.
- */
- pcblist = udp_get_pcblist(nxt);
- last = NULL;
- CK_LIST_FOREACH(inp, pcblist, inp_list) {
- if ((inp->inp_vflag & INP_IPV6) == 0)
- continue;
- if (inp->inp_lport != uh->uh_dport)
- continue;
- if (inp->inp_fport != 0 &&
- inp->inp_fport != uh->uh_sport)
- continue;
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
- if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
- &ip6->ip6_dst))
- continue;
- }
- if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
- if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr,
- &ip6->ip6_src) ||
- inp->inp_fport != uh->uh_sport)
- continue;
- }
-
- INP_RLOCK(inp);
-
- if (__predict_false(inp->inp_flags2 & INP_FREED)) {
- INP_RUNLOCK(inp);
- continue;
- }
-
- /*
- * XXXRW: Because we weren't holding either the inpcb
- * or the hash lock when we checked for a match
- * before, we should probably recheck now that the
- * inpcb lock is (supposed to be) held.
- */
-
- /*
- * Handle socket delivery policy for any-source
- * and source-specific multicast. [RFC3678]
- */
- imo = inp->in6p_moptions;
- if (imo != NULL) {
- struct sockaddr_in6 mcaddr;
- int blocked;
-
- bzero(&mcaddr, sizeof(struct sockaddr_in6));
- mcaddr.sin6_len = sizeof(struct sockaddr_in6);
- mcaddr.sin6_family = AF_INET6;
- mcaddr.sin6_addr = ip6->ip6_dst;
-
- blocked = im6o_mc_filter(imo, ifp,
- (struct sockaddr *)&mcaddr,
- (struct sockaddr *)&fromsa[0]);
- if (blocked != MCAST_PASS) {
- if (blocked == MCAST_NOTGMEMBER)
- IP6STAT_INC(ip6s_notmember);
- if (blocked == MCAST_NOTSMEMBER ||
- blocked == MCAST_MUTED)
- UDPSTAT_INC(udps_filtermcast);
- INP_RUNLOCK(inp);
- continue;
- }
- }
-
- if (last != NULL) {
- struct mbuf *n;
-
- if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) !=
- NULL) {
- if (nxt == IPPROTO_UDPLITE)
- UDPLITE_PROBE(receive, NULL,
- last, ip6, last, uh);
- else
- UDP_PROBE(receive, NULL, last,
- ip6, last, uh);
- if (udp6_append(last, n, off,
- fromsa)) {
- INP_RUNLOCK(inp);
- goto badunlocked;
- }
- }
- /* Release PCB lock taken on previous pass. */
- INP_RUNLOCK(last);
- }
- last = inp;
- /*
- * Don't look for additional matches if this one does
- * not have either the SO_REUSEPORT or SO_REUSEADDR
- * socket options set. This heuristic avoids
- * searching through all pcbs in the common case of a
- * non-shared port. It assumes that an application
- * will never clear these options after setting them.
- */
- if ((last->inp_socket->so_options &
- (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0)
- break;
- }
-
- if (last == NULL) {
- /*
- * No matching pcb found; discard datagram. (No need
- * to send an ICMP Port Unreachable for a broadcast
- * or multicast datgram.)
- */
- UDPSTAT_INC(udps_noport);
- UDPSTAT_INC(udps_noportmcast);
- goto badunlocked;
- }
-
- if (nxt == IPPROTO_UDPLITE)
- UDPLITE_PROBE(receive, NULL, last, ip6, last, uh);
- else
- UDP_PROBE(receive, NULL, last, ip6, last, uh);
- if (udp6_append(last, m, off, fromsa) == 0)
- INP_RUNLOCK(last);
+ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
*mp = NULL;
- return (IPPROTO_DONE);
+ return (udp6_multi_input(m, off, proto, fromsa));
}
+
/*
* Locate pcb for datagram.
*/
@@ -1042,12 +1040,9 @@
if (error)
return (error);
}
- INP_INFO_WLOCK(pcbinfo);
error = in_pcballoc(so, pcbinfo);
- if (error) {
- INP_INFO_WUNLOCK(pcbinfo);
+ if (error)
return (error);
- }
inp = (struct inpcb *)so->so_pcb;
inp->inp_vflag |= INP_IPV6;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
@@ -1066,11 +1061,9 @@
if (error) {
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(pcbinfo);
return (error);
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(pcbinfo);
return (0);
}
@@ -1274,13 +1267,11 @@
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_detach: inp == NULL"));
- INP_INFO_WLOCK(pcbinfo);
INP_WLOCK(inp);
up = intoudpcb(inp);
KASSERT(up != NULL, ("%s: up == NULL", __func__));
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(pcbinfo);
udp_discardcb(up);
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Apr 22, 10:53 PM (7 h, 51 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
17711583
Default Alt Text
D32585.id97510.diff (106 KB)
Attached To
Mode
D32585: SMR protection for inpcbs
Attached
Detach File
Event Timeline
Log In to Comment