Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F107048744
D44151.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
39 KB
Referenced Files
None
Subscribers
None
D44151.diff
View Options
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -5,7 +5,7 @@
* The Regents of the University of California. All Rights Reserved.
* Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved.
* Copyright (c) 2018 Matthew Macy
- * Copyright (c) 2022 Gleb Smirnoff <glebius@FreeBSD.org>
+ * Copyright (c) 2022-2024 Gleb Smirnoff <glebius@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -141,11 +141,14 @@
static struct task unp_defer_task;
/*
- * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
- * stream sockets, although the total for sender and receiver is actually
- * only PIPSIZ.
+ * SOCK_STREAM and SOCK_SEQPACKET unix(4) sockets fully bypass the send buffer,
+ * however the notion of a send buffer still makes sense for them. Its size is
+ * the amount of data that a send(2) syscall may copyin(9) before checking
+ * with the receive buffer of the peer. Although the copied-in data is not yet
+ * linked anywhere and is only pointed to by a stack variable, effectively it
+ * is a buffer that needs to be sized.
*
- * Datagram sockets really use the sendspace as the maximum datagram size,
+ * SOCK_DGRAM sockets really use the sendspace as the maximum datagram size,
* and don't really want to reserve the sendspace. Their recvspace should be
* large enough for at least one max-size datagram plus address.
*/
@@ -156,7 +159,7 @@
static u_long unpst_recvspace = PIPSIZ;
static u_long unpdg_maxdgram = 8*1024; /* support 8KB syslog msgs */
static u_long unpdg_recvspace = 16*1024;
-static u_long unpsp_sendspace = PIPSIZ; /* really max datagram size */
+static u_long unpsp_sendspace = PIPSIZ;
static u_long unpsp_recvspace = PIPSIZ;
static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
@@ -300,13 +303,12 @@
static void unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
static void unp_discard(struct file *);
static void unp_freerights(struct filedescent **, int);
-static int unp_internalize(struct mbuf **, struct thread *,
- struct mbuf **, u_int *, u_int *);
+static int unp_internalize(struct mbuf *, struct mchain *,
+ struct thread *);
static void unp_internalize_fp(struct file *);
static int unp_externalize(struct mbuf *, struct mbuf **, int);
static int unp_externalize_fp(struct file *);
-static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *,
- int, struct mbuf **, u_int *, u_int *);
+static void unp_addsockcred(struct thread *, struct mchain *, int);
static void unp_process_defers(void * __unused, int);
static void
@@ -449,6 +451,7 @@
case SOCK_STREAM:
sendspace = unpst_sendspace;
recvspace = unpst_recvspace;
+ STAILQ_INIT(&so->so_rcv.sb_mbq);
break;
case SOCK_DGRAM:
@@ -466,6 +469,7 @@
case SOCK_SEQPACKET:
sendspace = unpsp_sendspace;
recvspace = unpsp_recvspace;
+ STAILQ_INIT(&so->so_rcv.sb_mbq);
break;
default:
@@ -797,6 +801,10 @@
taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
switch (so->so_type) {
+ case SOCK_STREAM:
+ case SOCK_SEQPACKET:
+ MPASS(STAILQ_EMPTY(&so->so_rcv.sb_mbq));
+ break;
case SOCK_DGRAM:
/*
* Everything should have been unlinked/freed by unp_dispose()
@@ -852,6 +860,10 @@
error = solisten_proto_check(so);
if (error == 0) {
cru2xt(td, &unp->unp_peercred);
+ (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
+ 0, RLIM_INFINITY);
+ (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat,
+ 0, RLIM_INFINITY);
solisten_proto(so, backlog);
}
SOCK_UNLOCK(so);
@@ -885,187 +897,563 @@
return (0);
}
-static int
-uipc_rcvd(struct socket *so, int flags)
+/*
+ * A pr_sosend() call with an mbuf instead of a uio comes from a kernel
+ * thread: NFS, netgraph(4) and other subsystems can call into socket code.
+ * This function conditions the mbuf so that it can be safely put onto a
+ * socket buffer and calculates its char count and mbuf count.
+ *
+ * Note: we don't support receiving control data from a kernel thread. Our
+ * pr_sosend methods have MPASS() to check that. This may change.
+ */
+static void
+uipc_reset_kernel_mbuf(struct mbuf *m, struct mchain *mc)
{
- struct unpcb *unp, *unp2;
- struct socket *so2;
- u_int mbcnt, sbcc;
- unp = sotounpcb(so);
- KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
- KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
- ("%s: socktype %d", __func__, so->so_type));
+ M_ASSERTPKTHDR(m);
- /*
- * Adjust backpressure on sender and wakeup any waiting to write.
- *
- * The unp lock is acquired to maintain the validity of the unp_conn
- * pointer; no lock on unp2 is required as unp2->unp_socket will be
- * static as long as we don't permit unp2 to disconnect from unp,
- * which is prevented by the lock on unp. We cache values from
- * so_rcv to avoid holding the so_rcv lock over the entire
- * transaction on the remote so_snd.
- */
- SOCKBUF_LOCK(&so->so_rcv);
- mbcnt = so->so_rcv.sb_mbcnt;
- sbcc = sbavail(&so->so_rcv);
- SOCKBUF_UNLOCK(&so->so_rcv);
- /*
- * There is a benign race condition at this point. If we're planning to
- * clear SB_STOP, but uipc_send is called on the connected socket at
- * this instant, it might add data to the sockbuf and set SB_STOP. Then
- * we would erroneously clear SB_STOP below, even though the sockbuf is
- * full. The race is benign because the only ill effect is to allow the
- * sockbuf to exceed its size limit, and the size limits are not
- * strictly guaranteed anyway.
- */
- UNP_PCB_LOCK(unp);
- unp2 = unp->unp_conn;
- if (unp2 == NULL) {
- UNP_PCB_UNLOCK(unp);
- return (0);
+ m_clrprotoflags(m);
+ m_tag_delete_chain(m, NULL);
+ m->m_pkthdr.rcvif = NULL;
+ m->m_pkthdr.flowid = 0;
+ m->m_pkthdr.csum_flags = 0;
+ m->m_pkthdr.fibnum = 0;
+ m->m_pkthdr.rsstype = 0;
+
+ mc_init_m(mc, m);
+ MPASS(m->m_pkthdr.len == mc->mc_len);
+}
+
+#ifdef SOCKBUF_DEBUG
+static inline void
+uipc_stream_sbcheck(struct sockbuf *sb)
+{
+ struct mbuf *d;
+ u_int dcc, dctl, dmbcnt;
+
+ dcc = dctl = dmbcnt = 0;
+ STAILQ_FOREACH(d, &sb->sb_mbq, m_stailq) {
+ if (d->m_type == MT_CONTROL)
+ dctl += d->m_len;
+ else if (d->m_type == MT_DATA)
+ dcc += d->m_len;
+ else
+ MPASS(0);
+ dmbcnt += MSIZE;
+ if (d->m_flags & M_EXT)
+ dmbcnt += d->m_ext.ext_size;
+ if (d->m_stailq.stqe_next == NULL)
+ MPASS(sb->sb_mbq.stqh_last == &d->m_stailq.stqe_next);
}
- so2 = unp2->unp_socket;
- SOCKBUF_LOCK(&so2->so_snd);
- if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax)
- so2->so_snd.sb_flags &= ~SB_STOP;
- sowwakeup_locked(so2);
- UNP_PCB_UNLOCK(unp);
- return (0);
+ MPASS(dcc == sb->sb_acc);
+ MPASS(dcc == sb->sb_ccc);
+ MPASS(dctl == sb->sb_ctl);
+ MPASS(dmbcnt == sb->sb_mbcnt);
+}
+#define UIPC_STREAM_SBCHECK(sb) uipc_stream_sbcheck(sb)
+#else
+#define UIPC_STREAM_SBCHECK(sb) do {} while (0)
+#endif
+
+/*
+ * uipc_stream_sbspace() returns how much a writer can send, limited by char
+ * count or mbuf memory use, whichever runs out first.
+ *
+ * XXXGL: sb_mbcnt may overcommit sb_mbmax in case a previous write observed
+ * 'space < mbspace', but the mchain allocated to hold 'space' bytes of data
+ * ended up with 'mc_mlen > mbspace'. A typical scenario would be a full
+ * buffer with a writer trying to push in a large write, and a slow reader
+ * that reads just a few bytes at a time. In that case the writer will keep
+ * creating new mbufs with mc_split(). These mbufs will carry few chars, but
+ * will all point at the same cluster, thus each adding the cluster size to
+ * sb_mbcnt. This means we will count the same cluster many times, potentially
+ * underutilizing the socket buffer. We aren't optimizing towards inefficient
+ * readers. The classic socket buffer had the same "feature".
+ */
+static inline u_int
+uipc_stream_sbspace(struct sockbuf *sb)
+{
+ u_int space, mbspace;
+
+ MPASS(sb->sb_hiwat >= sb->sb_ccc + sb->sb_ctl);
+ space = sb->sb_hiwat - sb->sb_ccc - sb->sb_ctl;
+ if (__predict_true(sb->sb_mbmax >= sb->sb_mbcnt))
+ mbspace = sb->sb_mbmax - sb->sb_mbcnt;
+ else
+ return (0);
+
+ return (min(space, mbspace));
}
static int
-uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
- struct mbuf *control, struct thread *td)
+uipc_sosend_stream_or_seqpacket(struct socket *so, struct sockaddr *addr,
+ struct uio *uio, struct mbuf *m, struct mbuf *c, int flags,
+ struct thread *td)
{
struct unpcb *unp, *unp2;
struct socket *so2;
- u_int mbcnt, sbcc;
+ struct sockbuf *sb;
+ struct mchain mc, cmc;
+ ssize_t resid, sent;
+ bool nonblock, eor;
int error;
- unp = sotounpcb(so);
- KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
- KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
- ("%s: socktype %d", __func__, so->so_type));
+ MPASS((uio != NULL && m == NULL) || (m != NULL && uio == NULL));
+ MPASS(m == NULL || c == NULL);
- error = 0;
- if (flags & PRUS_OOB) {
+ if (__predict_false(flags & MSG_OOB)) {
error = EOPNOTSUPP;
- goto release;
+ goto out;
}
- if (control != NULL &&
- (error = unp_internalize(&control, td, NULL, NULL, NULL)))
- goto release;
- unp2 = NULL;
- if ((so->so_state & SS_ISCONNECTED) == 0) {
- if (nam != NULL) {
- if ((error = unp_connect(so, nam, td)) != 0)
- goto out;
- } else {
- error = ENOTCONN;
+ nonblock = (so->so_state & SS_NBIO) ||
+ (flags & (MSG_DONTWAIT | MSG_NBIO));
+ eor = flags & MSG_EOR;
+
+ mc = MCHAIN_INITIALIZER(&mc);
+ cmc = MCHAIN_INITIALIZER(&cmc);
+ sent = 0;
+
+ if (m == NULL) {
+ if (c != NULL && (error = unp_internalize(c, &cmc, td)))
goto out;
- }
- }
+ /*
+ * Optimization for a case when our send fits into the receive
+ * buffer - do the copyin before taking any locks, sized to our
+ * send buffer. Later copyins will also take into account
+ * space in the peer's receive buffer.
+ */
+ resid = uio->uio_resid;
+ error = mc_uiotomc(&mc, uio, so->so_snd.sb_hiwat, 0, M_WAITOK,
+ eor ? M_EOR : 0);
+ if (__predict_false(error))
+ goto out2;
+ } else
+ uipc_reset_kernel_mbuf(m, &mc);
+
+ error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
+ if (error)
+ goto out2;
+ unp = sotounpcb(so);
UNP_PCB_LOCK(unp);
- if ((unp2 = unp_pcb_lock_peer(unp)) == NULL) {
+ unp2 = unp_pcb_lock_peer(unp);
+ if (__predict_false(so->so_error != 0)) {
+ error = so->so_error;
+ so->so_error = 0;
UNP_PCB_UNLOCK(unp);
- error = ENOTCONN;
- goto out;
- } else if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
- unp_pcb_unlock_pair(unp, unp2);
- error = EPIPE;
- goto out;
+ if (unp2 != NULL)
+ UNP_PCB_UNLOCK(unp2);
+ goto out3;
}
- UNP_PCB_UNLOCK(unp);
- if ((so2 = unp2->unp_socket) == NULL) {
- UNP_PCB_UNLOCK(unp2);
- error = ENOTCONN;
- goto out;
+ if (__predict_false(unp2 == NULL)) {
+ /*
+ * Different error code for a previously connected socket and
+ * a never connected one. The SS_ISDISCONNECTED is set in the
+ * unp_soisdisconnected() and is synchronized by the pcb lock.
+ */
+ error = so->so_state & SS_ISDISCONNECTED ? EPIPE : ENOTCONN;
+ UNP_PCB_UNLOCK(unp);
+ goto out3;
}
- SOCKBUF_LOCK(&so2->so_rcv);
+ UNP_PCB_UNLOCK(unp);
+
if (unp2->unp_flags & UNP_WANTCRED_MASK) {
/*
* Credentials are passed only once on SOCK_STREAM and
* SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or
* forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS).
*/
- control = unp_addsockcred(td, control, unp2->unp_flags, NULL,
- NULL, NULL);
+ unp_addsockcred(td, &cmc, unp2->unp_flags);
unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT;
}
/*
- * Send to paired receive port and wake up readers. Don't
- * check for space available in the receive buffer if we're
- * attaching ancillary data; Unix domain sockets only check
- * for space in the sending sockbuf, and that check is
- * performed one level up the stack. At that level we cannot
- * precisely account for the amount of buffer space used
- * (e.g., because control messages are not yet internalized).
+ * Cycle through the data to send and available space in the peer's
+ * receive buffer. Put a reference on the peer socket, so that it
+ * doesn't get freed while we sbwait(). If peer goes away, we will
+ * observe the SBS_CANTRCVMORE and our sorele() will finalize peer's
+ * socket destruction.
*/
- switch (so->so_type) {
- case SOCK_STREAM:
- if (control != NULL) {
- sbappendcontrol_locked(&so2->so_rcv,
- m->m_len > 0 ? m : NULL, control, flags);
- control = NULL;
- } else
- sbappend_locked(&so2->so_rcv, m, flags);
- break;
+ so2 = unp2->unp_socket;
+ soref(so2);
+ UNP_PCB_UNLOCK(unp2);
+ sb = &so2->so_rcv;
+ while (mc.mc_len + cmc.mc_len > 0) {
+ struct mchain mcnext = MCHAIN_INITIALIZER(&mcnext);
+ u_int space;
- case SOCK_SEQPACKET:
- if (sbappendaddr_nospacecheck_locked(&so2->so_rcv,
- &sun_noname, m, control))
- control = NULL;
- break;
+ SOCK_RECVBUF_LOCK(so2);
+restart:
+ UIPC_STREAM_SBCHECK(sb);
+ if (__predict_false(cmc.mc_len > sb->sb_hiwat)) {
+ SOCK_RECVBUF_UNLOCK(so2);
+ error = EMSGSIZE;
+ goto out4;
+ }
+ if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
+ SOCK_RECVBUF_UNLOCK(so2);
+ error = EPIPE;
+ goto out4;
+ }
+ /*
+ * Wait on the peer socket receive buffer until we have enough
+ * space to put at least control. The data is a stream and can
+ * be put partially, but control is really a datagram.
+ */
+ space = uipc_stream_sbspace(sb);
+ if (space < sb->sb_lowat || space < cmc.mc_len) {
+ if (nonblock) {
+ SOCK_RECVBUF_UNLOCK(so2);
+ error = EWOULDBLOCK;
+ goto out4;
+ }
+ if ((error = sbwait(so2, SO_RCV)) != 0) {
+ SOCK_RECVBUF_UNLOCK(so2);
+ goto out4;
+ } else
+ goto restart;
+ }
+ MPASS(space >= cmc.mc_len);
+ space -= cmc.mc_len;
+ if (space == 0) {
+ /* There is space only to send control. */
+ MPASS(!STAILQ_EMPTY(&cmc.mc_q));
+ mcnext = mc;
+ mc = MCHAIN_INITIALIZER(&mc);
+ } else if (space < mc.mc_len) {
+ /* Not enough space. */
+ if (__predict_false(mc_split(&mc, &mcnext, space,
+ M_NOWAIT) == ENOMEM)) {
+ /*
+ * If allocation failed use M_WAITOK and merge
+ * the chain back. Next time mc_split() will
+ * easily split at the same place. Only if we
+ * race with setsockopt(SO_RCVBUF) shrinking
+ * sb_hiwat can this happen more than once.
+ */
+ SOCK_RECVBUF_UNLOCK(so2);
+ (void)mc_split(&mc, &mcnext, space, M_WAITOK);
+ mc_concat(&mc, &mcnext);
+ SOCK_RECVBUF_LOCK(so2);
+ goto restart;
+ }
+ MPASS(mc.mc_len == space);
+ }
+ if (!STAILQ_EMPTY(&cmc.mc_q)) {
+ STAILQ_CONCAT(&sb->sb_mbq, &cmc.mc_q);
+ sb->sb_ctl += cmc.mc_len;
+ sb->sb_mbcnt += cmc.mc_mlen;
+ cmc.mc_len = 0;
+ }
+ sent += mc.mc_len;
+ sb->sb_acc += mc.mc_len;
+ sb->sb_ccc += mc.mc_len;
+ sb->sb_mbcnt += mc.mc_mlen;
+ STAILQ_CONCAT(&sb->sb_mbq, &mc.mc_q);
+ UIPC_STREAM_SBCHECK(sb);
+ space = uipc_stream_sbspace(sb);
+ sorwakeup_locked(so2);
+ mc = mcnext;
+ if (STAILQ_EMPTY(&mc.mc_q) &&
+ uio != NULL && uio->uio_resid > 0) {
+ /*
+ * Copyin the sum of peer's receive buffer space and
+ * our sb_hiwat, which is our virtual send buffer size.
+ * See comment above unpst_sendspace declaration.
+ * We are reading sb_hiwat locklessly, because a) we
+ * don't care about an application that does send(2)
+ * and setsockopt(2) racing internally, and b) for an
+ * application that does this in sequence we will see
+ * the correct value, because sbsetopt() uses buffer lock
+ * and we also have already acquired it at least once.
+ */
+ error = mc_uiotomc(&mc, uio, space +
+ atomic_load_int(&so->so_snd.sb_hiwat), 0, M_WAITOK,
+ eor ? M_EOR : 0);
+ if (__predict_false(error))
+ goto out4;
+ }
}
- mbcnt = so2->so_rcv.sb_mbcnt;
- sbcc = sbavail(&so2->so_rcv);
- if (sbcc)
- sorwakeup_locked(so2);
- else
- SOCKBUF_UNLOCK(&so2->so_rcv);
+ MPASS(STAILQ_EMPTY(&mc.mc_q));
- /*
- * The PCB lock on unp2 protects the SB_STOP flag. Without it,
- * it would be possible for uipc_rcvd to be called at this
- * point, drain the receiving sockbuf, clear SB_STOP, and then
- * we would set SB_STOP below. That could lead to an empty
- * sockbuf having SB_STOP set
- */
- SOCKBUF_LOCK(&so->so_snd);
- if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax)
- so->so_snd.sb_flags |= SB_STOP;
- SOCKBUF_UNLOCK(&so->so_snd);
- UNP_PCB_UNLOCK(unp2);
- m = NULL;
+ td->td_ru.ru_msgsnd++;
+out4:
+ sorele(so2);
+out3:
+ SOCK_IO_SEND_UNLOCK(so);
+out2:
+ if (!mc_empty(&cmc))
+ unp_scan(mc_first(&cmc), unp_freerights);
out:
+ mc_freem(&mc);
+ mc_freem(&cmc);
+
+ if (uio != NULL)
+ uio->uio_resid = resid - sent;
+
+ return (error);
+}
+
+static int
+uipc_soreceive_stream_or_seqpacket(struct socket *so, struct sockaddr **psa,
+ struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ struct sockbuf *sb = &so->so_rcv;
+ struct mbuf *control, *m, *first, *last, *next;
+ u_int ctl, space, datalen, mbcnt, lastlen;
+ int error, flags;
+ bool nonblock, waitall, peek;
+
+ MPASS(mp0 == NULL);
+
+ if (psa != NULL)
+ *psa = NULL;
+ if (controlp != NULL)
+ *controlp = NULL;
+
+ flags = flagsp != NULL ? *flagsp : 0;
+ nonblock = (so->so_state & SS_NBIO) ||
+ (flags & (MSG_DONTWAIT | MSG_NBIO));
+ peek = flags & MSG_PEEK;
+ waitall = (flags & MSG_WAITALL) && !peek;
+
/*
- * PRUS_EOF is equivalent to pr_send followed by pr_shutdown.
+ * This check may fail only on a socket that never went through
+ * connect(2). We can check this locklessly, because: a) for a newborn
+ * socket we don't care about applications that may race internally
+ * between connect(2) and recv(2), and b) for a dying socket if we
+ * miss update by unp_soisdisconnected(), we would still get the check
+ * correct. For a dying socket we would observe SBS_CANTRCVMORE later.
*/
- if (flags & PRUS_EOF) {
- UNP_PCB_LOCK(unp);
- socantsendmore(so);
- unp_shutdown(unp);
- UNP_PCB_UNLOCK(unp);
+ if (__predict_false((atomic_load_short(&so->so_state) &
+ (SS_ISCONNECTED|SS_ISDISCONNECTED)) == 0))
+ return (ENOTCONN);
+
+ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
+ if (__predict_false(error))
+ return (error);
+
+restart:
+ SOCK_RECVBUF_LOCK(so);
+ UIPC_STREAM_SBCHECK(sb);
+ while (sb->sb_acc < sb->sb_lowat &&
+ (sb->sb_ctl == 0 || controlp == NULL)) {
+ if (so->so_error) {
+ error = so->so_error;
+ if (!peek)
+ so->so_error = 0;
+ SOCK_RECVBUF_UNLOCK(so);
+ SOCK_IO_RECV_UNLOCK(so);
+ return (error);
+ }
+ if (sb->sb_state & SBS_CANTRCVMORE) {
+ SOCK_RECVBUF_UNLOCK(so);
+ SOCK_IO_RECV_UNLOCK(so);
+ return (0);
+ }
+ if (nonblock) {
+ SOCK_RECVBUF_UNLOCK(so);
+ SOCK_IO_RECV_UNLOCK(so);
+ return (EWOULDBLOCK);
+ }
+ error = sbwait(so, SO_RCV);
+ if (error) {
+ SOCK_RECVBUF_UNLOCK(so);
+ SOCK_IO_RECV_UNLOCK(so);
+ return (error);
+ }
}
- if (control != NULL && error != 0)
- unp_scan(control, unp_freerights);
-release:
- if (control != NULL)
- m_freem(control);
+ MPASS(STAILQ_FIRST(&sb->sb_mbq));
+ MPASS(sb->sb_acc > 0 || sb->sb_ctl > 0);
+
+ mbcnt = 0;
+ ctl = 0;
+ first = STAILQ_FIRST(&sb->sb_mbq);
+ if (first->m_type == MT_CONTROL) {
+ control = first;
+ STAILQ_FOREACH_FROM(first, &sb->sb_mbq, m_stailq) {
+ if (first->m_type != MT_CONTROL)
+ break;
+ ctl += first->m_len;
+ mbcnt += MSIZE;
+ if (first->m_flags & M_EXT)
+ mbcnt += first->m_ext.ext_size;
+ }
+ } else
+ control = NULL;
+
/*
- * In case of PRUS_NOTREADY, uipc_ready() is responsible
- * for freeing memory.
- */
- if (m != NULL && (flags & PRUS_NOTREADY) == 0)
- m_freem(m);
- return (error);
+ * Find split point for the next copyout. On exit from the loop:
+ * last == NULL - socket to be flushed
+ * last != NULL
+ * 0 < lastlen < last->m_len - uio to be filled, last to be adjusted
+ * lastlen == 0 - MT_CONTROL or M_EOR encountered, or uio already full
+ */
+ space = uio->uio_resid;
+ datalen = 0;
+ for (m = first, last = NULL; m != NULL; m = STAILQ_NEXT(m, m_stailq)) {
+ if (m->m_type != MT_DATA) {
+ last = m;
+ lastlen = 0;
+ break;
+ }
+ if (space >= m->m_len) {
+ space -= m->m_len;
+ datalen += m->m_len;
+ mbcnt += MSIZE;
+ if (m->m_flags & M_EXT)
+ mbcnt += m->m_ext.ext_size;
+ if (m->m_flags & M_EOR) {
+ last = STAILQ_NEXT(m, m_stailq);
+ lastlen = 0;
+ flags |= MSG_EOR;
+ break;
+ }
+ } else {
+ datalen += space;
+ last = m;
+ lastlen = space;
+ break;
+ }
+ }
+
+ UIPC_STREAM_SBCHECK(sb);
+ if (!peek) {
+ if (last == NULL)
+ STAILQ_INIT(&sb->sb_mbq);
+ else {
+ STAILQ_FIRST(&sb->sb_mbq) = last;
+ MPASS(last->m_len > lastlen);
+ last->m_len -= lastlen;
+ last->m_data += lastlen;
+ }
+ MPASS(sb->sb_acc >= datalen);
+ sb->sb_acc -= datalen;
+ sb->sb_ccc -= datalen;
+ MPASS(sb->sb_ctl >= ctl);
+ sb->sb_ctl -= ctl;
+ MPASS(sb->sb_mbcnt >= mbcnt);
+ sb->sb_mbcnt -= mbcnt;
+ UIPC_STREAM_SBCHECK(sb);
+ /* Mind the name. We are waking writer here, not reader. */
+ sorwakeup_locked(so);
+ } else
+ SOCK_RECVBUF_UNLOCK(so);
+
+ while (control != NULL && control->m_type == MT_CONTROL) {
+ if (!peek) {
+ struct mbuf *c;
+
+ /*
+ * unp_externalize() failure must abort entire read(2).
+ * Such failure should also free the problematic
+ * control, so that socket is not left in a state
+ * where it can't progress forward with reading.
+ * Probability of such a failure is really low, so it
+ * is fine that we need to perform pretty complex
+ * operation here to reconstruct the buffer.
+ * XXXGL: unp_externalize() used to be
+ * dom_externalize() KBI and it frees whole chain, so
+ * we need to feed it with mbufs one by one.
+ */
+ c = control;
+ control = STAILQ_NEXT(c, m_stailq);
+ STAILQ_NEXT(c, m_stailq) = NULL;
+ error = unp_externalize(c, controlp, flags);
+ if (__predict_false(error)) {
+ SOCK_RECVBUF_LOCK(so);
+ UIPC_STREAM_SBCHECK(sb);
+ MPASS(!(sb->sb_state & SBS_CANTRCVMORE));
+ /* XXXGL: STAILQ_PREPEND */
+ if (STAILQ_EMPTY(&sb->sb_mbq))
+ STAILQ_INSERT_HEAD(&sb->sb_mbq,
+ control, m_stailq);
+ else
+ STAILQ_FIRST(&sb->sb_mbq) = control;
+ sb->sb_ctl = sb->sb_acc = sb->sb_ccc =
+ sb->sb_mbcnt = 0;
+ STAILQ_FOREACH(m, &sb->sb_mbq, m_stailq) {
+ if (m->m_type == MT_DATA) {
+ sb->sb_acc += m->m_len;
+ sb->sb_ccc += m->m_len;
+ } else {
+ sb->sb_ctl += m->m_len;
+ }
+ sb->sb_mbcnt += MSIZE;
+ if (m->m_flags & M_EXT)
+ sb->sb_mbcnt +=
+ m->m_ext.ext_size;
+ }
+ UIPC_STREAM_SBCHECK(sb);
+ SOCK_RECVBUF_UNLOCK(so);
+ SOCK_IO_RECV_UNLOCK(so);
+ return (error);
+ }
+ if (controlp != NULL) {
+ while (*controlp != NULL)
+ controlp = &(*controlp)->m_next;
+ }
+ } else {
+ /*
+ * XXXGL
+ *
+ * In the MSG_PEEK case control is not externalized.
+ * This means we are leaking some kernel pointers to
+ * userland. They are useless to a law-abiding
+ * application, but may be useful to malware. This
+ * is what the historical implementation in
+ * soreceive_generic() did. To be improved?
+ */
+ if (controlp != NULL) {
+ *controlp = m_copym(control, 0, control->m_len,
+ M_WAITOK);
+ controlp = &(*controlp)->m_next;
+ }
+ control = STAILQ_NEXT(control, m_stailq);
+ }
+ }
+
+ for (m = first; m != last; m = next) {
+ next = STAILQ_NEXT(m, m_stailq);
+ error = uiomove(mtod(m, char *), m->m_len, uio);
+ if (__predict_false(error)) {
+ SOCK_IO_RECV_UNLOCK(so);
+ if (!peek)
+ for (; m != last; m = next) {
+ next = STAILQ_NEXT(m, m_stailq);
+ m_free(m);
+ }
+ return (error);
+ }
+ if (!peek)
+ m_free(m);
+ }
+ if (last != NULL && lastlen > 0) {
+ if (!peek) {
+ MPASS(!(m->m_flags & M_PKTHDR));
+ MPASS(last->m_data - M_START(last) >= lastlen);
+ error = uiomove(mtod(last, char *) - lastlen,
+ lastlen, uio);
+ } else
+ error = uiomove(mtod(last, char *), lastlen, uio);
+ if (__predict_false(error)) {
+ SOCK_IO_RECV_UNLOCK(so);
+ return (error);
+ }
+ }
+ if (waitall && !(flags & MSG_EOR) && uio->uio_resid > 0)
+ goto restart;
+ SOCK_IO_RECV_UNLOCK(so);
+
+ if (flagsp != NULL)
+ *flagsp |= flags;
+
+ uio->uio_td->td_ru.ru_msgrcv++;
+
+ return (0);
}
/* PF_UNIX/SOCK_DGRAM version of sbspace() */
@@ -1111,7 +1499,8 @@
const struct sockaddr *from;
struct socket *so2;
struct sockbuf *sb;
- struct mbuf *f, *clast;
+ struct mchain cmc = MCHAIN_INITIALIZER(&cmc);
+ struct mbuf *f;
u_int cc, ctl, mbcnt;
u_int dcc __diagused, dctl __diagused, dmbcnt __diagused;
int error;
@@ -1120,7 +1509,6 @@
error = 0;
f = NULL;
- ctl = 0;
if (__predict_false(flags & MSG_OOB)) {
error = EOPNOTSUPP;
@@ -1139,16 +1527,14 @@
f = m_gethdr(M_WAITOK, MT_SONAME);
cc = m->m_pkthdr.len;
mbcnt = MSIZE + m->m_pkthdr.memlen;
- if (c != NULL &&
- (error = unp_internalize(&c, td, &clast, &ctl, &mbcnt)))
+ if (c != NULL && (error = unp_internalize(c, &cmc, td)))
goto out;
} else {
- /* pr_sosend() with mbuf usually is a kernel thread. */
-
- M_ASSERTPKTHDR(m);
- if (__predict_false(c != NULL))
- panic("%s: control from a kernel thread", __func__);
+ struct mchain mc;
+ uipc_reset_kernel_mbuf(m, &mc);
+ cc = mc.mc_len;
+ mbcnt = mc.mc_mlen;
if (__predict_false(m->m_pkthdr.len > unpdg_maxdgram)) {
error = EMSGSIZE;
goto out;
@@ -1157,22 +1543,6 @@
error = ENOBUFS;
goto out;
}
- /* Condition the foreign mbuf to our standards. */
- m_clrprotoflags(m);
- m_tag_delete_chain(m, NULL);
- m->m_pkthdr.rcvif = NULL;
- m->m_pkthdr.flowid = 0;
- m->m_pkthdr.csum_flags = 0;
- m->m_pkthdr.fibnum = 0;
- m->m_pkthdr.rsstype = 0;
-
- cc = m->m_pkthdr.len;
- mbcnt = MSIZE;
- for (struct mbuf *mb = m; mb != NULL; mb = mb->m_next) {
- mbcnt += MSIZE;
- if (mb->m_flags & M_EXT)
- mbcnt += mb->m_ext.ext_size;
- }
}
unp = sotounpcb(so);
@@ -1224,8 +1594,7 @@
}
if (unp2->unp_flags & UNP_WANTCRED_MASK)
- c = unp_addsockcred(td, c, unp2->unp_flags, &clast, &ctl,
- &mbcnt);
+ unp_addsockcred(td, &cmc, unp2->unp_flags);
if (unp->unp_addr != NULL)
from = (struct sockaddr *)unp->unp_addr;
else
@@ -1233,25 +1602,21 @@
f->m_len = from->sa_len;
MPASS(from->sa_len <= MLEN);
bcopy(from, mtod(f, void *), from->sa_len);
- ctl += f->m_len;
/*
* Concatenate mbufs: from -> control -> data.
* Save overall cc and mbcnt in "from" mbuf.
*/
- if (c != NULL) {
-#ifdef INVARIANTS
- struct mbuf *mc;
-
- for (mc = c; mc->m_next != NULL; mc = mc->m_next);
- MPASS(mc == clast);
-#endif
- f->m_next = c;
- clast->m_next = m;
- c = NULL;
+ if (!STAILQ_EMPTY(&cmc.mc_q)) {
+ f->m_next = mc_first(&cmc);
+ mc_last(&cmc)->m_next = m;
+ /* XXXGL: This is dirty as well as rollback after ENOBUFS. */
+ STAILQ_INIT(&cmc.mc_q);
} else
f->m_next = m;
m = NULL;
+ ctl = f->m_len + cmc.mc_len;
+ mbcnt += cmc.mc_mlen;
#ifdef INVARIANTS
dcc = dctl = dmbcnt = 0;
for (struct mbuf *mb = f; mb != NULL; mb = mb->m_next) {
@@ -1317,7 +1682,7 @@
soroverflow_locked(so2);
error = ENOBUFS;
if (f->m_next->m_type == MT_CONTROL) {
- c = f->m_next;
+ STAILQ_FIRST(&cmc.mc_q) = f->m_next;
f->m_next = NULL;
}
}
@@ -1332,13 +1697,12 @@
out3:
SOCK_IO_SEND_UNLOCK(so);
out2:
- if (c)
- unp_scan(c, unp_freerights);
+ if (!mc_empty(&cmc))
+ unp_scan(mc_first(&cmc), unp_freerights);
out:
if (f)
m_freem(f);
- if (c)
- m_freem(c);
+ mc_freem(&cmc);
if (m)
m_freem(m);
@@ -1579,6 +1943,7 @@
return (0);
}
+#if 0 /* No sendfile support. */
static bool
uipc_ready_scan(struct socket *so, struct mbuf *m, int count, int *errorp)
{
@@ -1658,6 +2023,7 @@
}
return (error);
}
+#endif
static int
uipc_sense(struct socket *so, struct stat *sb)
@@ -2096,6 +2462,19 @@
}
}
+static void
+unp_soisdisconnected(struct socket *so)
+{
+ SOCK_LOCK(so);
+ MPASS(!SOLISTENING(so));
+ so->so_state |= SS_ISDISCONNECTED;
+ so->so_state &= ~SS_ISCONNECTED;
+ SOCK_RECVBUF_LOCK(so);
+ socantrcvmore_locked(so);
+ SOCK_UNLOCK(so);
+ wakeup(&so->so_timeo); /* XXXGL: is this needed? */
+}
+
static void
unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
{
@@ -2168,12 +2547,10 @@
case SOCK_STREAM:
case SOCK_SEQPACKET:
- if (so)
- soisdisconnected(so);
+ unp_soisdisconnected(so);
MPASS(unp2->unp_conn == unp);
unp2->unp_conn = NULL;
- if (so2)
- soisdisconnected(so2);
+ unp_soisdisconnected(so2);
break;
}
@@ -2378,13 +2755,12 @@
/*
* Regardless of whether the socket's peer dropped the connection
* with this socket by aborting or disconnecting, POSIX requires
- * that ECONNRESET is returned.
+ * that ECONNRESET is returned on next connected send(2) in case of
+ * a SOCK_DGRAM socket and EPIPE for SOCK_STREAM.
*/
-
UNP_PCB_LOCK(unp);
so = unp->unp_socket;
- if (so)
- so->so_error = ECONNRESET;
+ so->so_error = so->so_proto->pr_type == SOCK_DGRAM ? ECONNRESET : EPIPE;
if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) {
/* Last reference dropped in unp_disconnect(). */
unp_pcb_rele_notlast(unp);
@@ -2584,15 +2960,14 @@
}
static int
-unp_internalize(struct mbuf **controlp, struct thread *td,
- struct mbuf **clast, u_int *space, u_int *mbcnt)
+unp_internalize(struct mbuf *control, struct mchain *mc, struct thread *td)
{
- struct mbuf *control, **initial_controlp;
struct proc *p;
struct filedesc *fdesc;
struct bintime *bt;
struct cmsghdr *cm;
struct cmsgcred *cmcred;
+ struct mbuf *m;
struct filedescent *fde, **fdep, *fdev;
struct file *fp;
struct timeval *tv;
@@ -2602,15 +2977,13 @@
int i, j, error, *fdp, oldfds;
u_int newlen;
- MPASS((*controlp)->m_next == NULL); /* COMPAT_OLDSOCK may violate */
+ MPASS(control->m_next == NULL); /* COMPAT_OLDSOCK may violate */
UNP_LINK_UNLOCK_ASSERT();
p = td->td_proc;
fdesc = p->p_fd;
error = 0;
- control = *controlp;
- *controlp = NULL;
- initial_controlp = controlp;
+ *mc = MCHAIN_INITIALIZER(mc);
for (clen = control->m_len, cm = mtod(control, struct cmsghdr *),
data = CMSG_DATA(cm);
@@ -2624,10 +2997,10 @@
datalen = (char *)cm + cm->cmsg_len - (char *)data;
switch (cm->cmsg_type) {
case SCM_CREDS:
- *controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
- SCM_CREDS, SOL_SOCKET, M_WAITOK);
+ m = sbcreatecontrol(NULL, sizeof(*cmcred), SCM_CREDS,
+ SOL_SOCKET, M_WAITOK);
cmcred = (struct cmsgcred *)
- CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ CMSG_DATA(mtod(m, struct cmsghdr *));
cmcred->cmcred_pid = p->p_pid;
cmcred->cmcred_uid = td->td_ucred->cr_ruid;
cmcred->cmcred_gid = td->td_ucred->cr_rgid;
@@ -2680,8 +3053,8 @@
* Now replace the integer FDs with pointers to the
* file structure and capability rights.
*/
- *controlp = sbcreatecontrol(NULL, newlen,
- SCM_RIGHTS, SOL_SOCKET, M_WAITOK);
+ m = sbcreatecontrol(NULL, newlen, SCM_RIGHTS,
+ SOL_SOCKET, M_WAITOK);
fdp = data;
for (i = 0; i < oldfds; i++, fdp++) {
if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) {
@@ -2697,7 +3070,7 @@
}
fdp = data;
fdep = (struct filedescent **)
- CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ CMSG_DATA(mtod(m, struct cmsghdr *));
fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS,
M_WAITOK);
for (i = 0; i < oldfds; i++, fdev++, fdp++) {
@@ -2712,34 +3085,34 @@
break;
case SCM_TIMESTAMP:
- *controlp = sbcreatecontrol(NULL, sizeof(*tv),
- SCM_TIMESTAMP, SOL_SOCKET, M_WAITOK);
+ m = sbcreatecontrol(NULL, sizeof(*tv), SCM_TIMESTAMP,
+ SOL_SOCKET, M_WAITOK);
tv = (struct timeval *)
- CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ CMSG_DATA(mtod(m, struct cmsghdr *));
microtime(tv);
break;
case SCM_BINTIME:
- *controlp = sbcreatecontrol(NULL, sizeof(*bt),
- SCM_BINTIME, SOL_SOCKET, M_WAITOK);
+ m = sbcreatecontrol(NULL, sizeof(*bt), SCM_BINTIME,
+ SOL_SOCKET, M_WAITOK);
bt = (struct bintime *)
- CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ CMSG_DATA(mtod(m, struct cmsghdr *));
bintime(bt);
break;
case SCM_REALTIME:
- *controlp = sbcreatecontrol(NULL, sizeof(*ts),
- SCM_REALTIME, SOL_SOCKET, M_WAITOK);
+ m = sbcreatecontrol(NULL, sizeof(*ts), SCM_REALTIME,
+ SOL_SOCKET, M_WAITOK);
ts = (struct timespec *)
- CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ CMSG_DATA(mtod(m, struct cmsghdr *));
nanotime(ts);
break;
case SCM_MONOTONIC:
- *controlp = sbcreatecontrol(NULL, sizeof(*ts),
- SCM_MONOTONIC, SOL_SOCKET, M_WAITOK);
+ m = sbcreatecontrol(NULL, sizeof(*ts), SCM_MONOTONIC,
+ SOL_SOCKET, M_WAITOK);
ts = (struct timespec *)
- CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ CMSG_DATA(mtod(m, struct cmsghdr *));
nanouptime(ts);
break;
@@ -2748,28 +3121,20 @@
goto out;
}
- if (space != NULL) {
- *space += (*controlp)->m_len;
- *mbcnt += MSIZE;
- if ((*controlp)->m_flags & M_EXT)
- *mbcnt += (*controlp)->m_ext.ext_size;
- *clast = *controlp;
- }
- controlp = &(*controlp)->m_next;
+ mc_append(mc, m);
}
if (clen > 0)
error = EINVAL;
out:
- if (error != 0 && initial_controlp != NULL)
- unp_internalize_cleanup_rights(*initial_controlp);
+ if (error != 0)
+ unp_internalize_cleanup_rights(mc_first(mc));
m_freem(control);
return (error);
}
-static struct mbuf *
-unp_addsockcred(struct thread *td, struct mbuf *control, int mode,
- struct mbuf **clast, u_int *space, u_int *mbcnt)
+static void
+unp_addsockcred(struct thread *td, struct mchain *mc, int mode)
{
struct mbuf *m, *n, *n_prev;
const struct cmsghdr *cm;
@@ -2785,9 +3150,10 @@
cmsgtype = SCM_CREDS;
}
+ /* XXXGL: uipc_sosend_*() should be improved to allow M_WAITOK here. */
m = sbcreatecontrol(NULL, ctrlsz, cmsgtype, SOL_SOCKET, M_NOWAIT);
if (m == NULL)
- return (control);
+ return;
MPASS((m->m_flags & M_EXT) == 0 && m->m_next == NULL);
if (mode & UNP_WANTCRED_ALWAYS) {
@@ -2821,50 +3187,18 @@
* created SCM_CREDS control message (struct sockcred) has another
* format.
*/
- if (control != NULL && cmsgtype == SCM_CREDS)
- for (n = control, n_prev = NULL; n != NULL;) {
+ if (!STAILQ_EMPTY(&mc->mc_q) && cmsgtype == SCM_CREDS)
+ STAILQ_FOREACH_SAFE(n, &mc->mc_q, m_stailq, n_prev) {
cm = mtod(n, struct cmsghdr *);
if (cm->cmsg_level == SOL_SOCKET &&
cm->cmsg_type == SCM_CREDS) {
- if (n_prev == NULL)
- control = n->m_next;
- else
- n_prev->m_next = n->m_next;
- if (space != NULL) {
- MPASS(*space >= n->m_len);
- *space -= n->m_len;
- MPASS(*mbcnt >= MSIZE);
- *mbcnt -= MSIZE;
- if (n->m_flags & M_EXT) {
- MPASS(*mbcnt >=
- n->m_ext.ext_size);
- *mbcnt -= n->m_ext.ext_size;
- }
- MPASS(clast);
- if (*clast == n) {
- MPASS(n->m_next == NULL);
- if (n_prev == NULL)
- *clast = m;
- else
- *clast = n_prev;
- }
- }
- n = m_free(n);
- } else {
- n_prev = n;
- n = n->m_next;
+ mc_remove(mc, n);
+ m_free(n);
}
}
/* Prepend it to the head. */
- m->m_next = control;
- if (space != NULL) {
- *space += m->m_len;
- *mbcnt += MSIZE;
- if (control == NULL)
- *clast = m;
- }
- return (m);
+ mc_prepend(mc, m);
}
static struct unpcb *
@@ -3032,7 +3366,7 @@
break;
case SOCK_STREAM:
case SOCK_SEQPACKET:
- unp_scan(so->so_rcv.sb_mb, op);
+ unp_scan(STAILQ_FIRST(&so->so_rcv.sb_mbq), op);
break;
}
SOCK_RECVBUF_UNLOCK(so);
@@ -3242,39 +3576,34 @@
}
m = STAILQ_FIRST(&sb->uxdg_mb);
STAILQ_INIT(&sb->uxdg_mb);
- /* XXX: our shortened sbrelease() */
- (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
- RLIM_INFINITY);
- /*
- * XXXGL Mark sb with SBS_CANTRCVMORE. This is needed to
- * prevent uipc_sosend_dgram() or unp_disconnect() adding more
- * data to the socket.
- * We came here either through shutdown(2) or from the final
- * sofree(). The sofree() case is simple as it guarantees
- * that no more sends will happen, however we can race with
- * unp_disconnect() from our peer. The shutdown(2) case is
- * more exotic. It would call into unp_dispose() only if
- * socket is SS_ISCONNECTED. This is possible if we did
- * connect(2) on this socket and we also had it bound with
- * bind(2) and receive connections from other sockets.
- * Because uipc_shutdown() violates POSIX (see comment
- * there) we will end up here shutting down our receive side.
- * Of course this will have affect not only on the peer we
- * connect(2)ed to, but also on all of the peers who had
- * connect(2)ed to us. Their sends would end up with ENOBUFS.
- */
- sb->sb_state |= SBS_CANTRCVMORE;
break;
case SOCK_STREAM:
case SOCK_SEQPACKET:
sb = &so->so_rcv;
- m = sbcut_locked(sb, sb->sb_ccc);
- KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0,
- ("%s: ccc %u mb %p mbcnt %u", __func__,
- sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt));
- sbrelease_locked(so, SO_RCV);
+ m = STAILQ_FIRST(&sb->sb_mbq);
+ STAILQ_FIRST(&sb->sb_mbq) = NULL;
+ sb->sb_acc = sb->sb_ccc = sb->sb_ctl = sb->sb_mbcnt = 0;
break;
}
+ /*
+ * Mark sb with SBS_CANTRCVMORE. This is needed to prevent
+ * uipc_sosend_*() or unp_disconnect() adding more data to the socket.
+ * We came here either through shutdown(2) or from the final sofree().
+ * The sofree() case is simple as it guarantees that no more sends will
+ * happen, however we can race with unp_disconnect() from our peer.
+ * The shutdown(2) case is more exotic. It would call into
+ * unp_dispose() only if socket is SS_ISCONNECTED. This is possible if
+ * we did connect(2) on this socket and we also had it bound with
+ * bind(2) and receive connections from other sockets. Because
+ * uipc_shutdown() violates POSIX (see comment there) this applies to
+ * SOCK_DGRAM as well. For SOCK_DGRAM this SBS_CANTRCVMORE will have
+ * an effect not only on the peer we connect(2)ed to, but also on all
+ * of the peers who had connect(2)ed to us. Their sends would end up
+ * with ENOBUFS.
+ */
+ sb->sb_state |= SBS_CANTRCVMORE;
+ (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
+ RLIM_INFINITY);
SOCK_RECVBUF_UNLOCK(so);
SOCK_IO_RECV_UNLOCK(so);
@@ -3333,7 +3662,7 @@
*/
static struct protosw streamproto = {
.pr_type = SOCK_STREAM,
- .pr_flags = PR_CONNREQUIRED | PR_WANTRCVD | PR_CAPATTACH,
+ .pr_flags = PR_CONNREQUIRED | PR_CAPATTACH | PR_SOCKBUF,
.pr_ctloutput = &uipc_ctloutput,
.pr_abort = uipc_abort,
.pr_accept = uipc_peeraddr,
@@ -3347,13 +3676,11 @@
.pr_disconnect = uipc_disconnect,
.pr_listen = uipc_listen,
.pr_peeraddr = uipc_peeraddr,
- .pr_rcvd = uipc_rcvd,
- .pr_send = uipc_send,
- .pr_ready = uipc_ready,
.pr_sense = uipc_sense,
.pr_shutdown = uipc_shutdown,
.pr_sockaddr = uipc_sockaddr,
- .pr_soreceive = soreceive_generic,
+ .pr_sosend = uipc_sosend_stream_or_seqpacket,
+ .pr_soreceive = uipc_soreceive_stream_or_seqpacket,
.pr_close = uipc_close,
};
@@ -3382,13 +3709,7 @@
static struct protosw seqpacketproto = {
.pr_type = SOCK_SEQPACKET,
- /*
- * XXXRW: For now, PR_ADDR because soreceive will bump into them
- * due to our use of sbappendaddr. A new sbappend variants is needed
- * that supports both atomic record writes and control data.
- */
- .pr_flags = PR_ADDR | PR_ATOMIC | PR_CONNREQUIRED |
- PR_WANTRCVD | PR_CAPATTACH,
+ .pr_flags = PR_CONNREQUIRED | PR_CAPATTACH | PR_SOCKBUF,
.pr_ctloutput = &uipc_ctloutput,
.pr_abort = uipc_abort,
.pr_accept = uipc_peeraddr,
@@ -3402,12 +3723,11 @@
.pr_disconnect = uipc_disconnect,
.pr_listen = uipc_listen,
.pr_peeraddr = uipc_peeraddr,
- .pr_rcvd = uipc_rcvd,
- .pr_send = uipc_send,
.pr_sense = uipc_sense,
.pr_shutdown = uipc_shutdown,
.pr_sockaddr = uipc_sockaddr,
- .pr_soreceive = soreceive_generic, /* XXX: or...? */
+ .pr_sosend = uipc_sosend_stream_or_seqpacket,
+ .pr_soreceive = uipc_soreceive_stream_or_seqpacket,
.pr_close = uipc_close,
};
diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h
--- a/sys/sys/sockbuf.h
+++ b/sys/sys/sockbuf.h
@@ -130,6 +130,13 @@
uint64_t sb_tls_seqno; /* TLS seqno */
struct ktls_session *sb_tls_info; /* TLS state */
};
+ /*
+ * PF_UNIX/SOCK_STREAM and PF_UNIX/SOCK_SEQPACKET
+ * The simplest possible stream buffer: a tail queue of mbufs.
+ */
+ struct {
+ STAILQ_HEAD(, mbuf) sb_mbq;
+ };
/*
* PF_UNIX/SOCK_DGRAM
*
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Jan 10, 10:28 AM (13 h, 40 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
15732664
Default Alt Text
D44151.diff (39 KB)
Attached To
Mode
D44151: unix: new implementation of unix/stream & unix/seqpacket
Attached
Detach File
Event Timeline
Log In to Comment