Page MenuHomeFreeBSD

D32505.id96899.diff
No OneTemporary

D32505.id96899.diff

diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -421,6 +421,7 @@
_Fork;
fspacectl;
membarrier;
+ rseq;
};
FBSDprivate_1.0 {
diff --git a/sys/compat/freebsd32/capabilities.conf b/sys/compat/freebsd32/capabilities.conf
--- a/sys/compat/freebsd32/capabilities.conf
+++ b/sys/compat/freebsd32/capabilities.conf
@@ -552,6 +552,10 @@
recvfrom
recvmsg
+##
+## Allow restartable sequences (rseq) to be registered.
+##
+rseq
+
##
## Allow real-time scheduling primitives to be used.
##
diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master
--- a/sys/compat/freebsd32/syscalls.master
+++ b/sys/compat/freebsd32/syscalls.master
@@ -1183,5 +1183,7 @@
struct spacectl_range32 *rmsr); }
581 AUE_NULL STD|NOPROTO { int membarrier(int cmd, unsigned flags, \
int cpu_id); }
+582 AUE_NULL STD|NOPROTO {int rseq(void *rseq, uint32_t rseqlen, \
+ int flags, uint32_t sig); }
; vim: syntax=off
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3837,6 +3837,7 @@
kern/kern_rctl.c standard
kern/kern_resource.c standard
kern/kern_rmlock.c standard
+kern/kern_rseq.c standard
kern/kern_rwlock.c standard
kern/kern_sdt.c optional kdtrace_hooks
kern/kern_sema.c standard
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -804,6 +804,7 @@
/* STOPs are no longer ignored, arrange for AST */
signotify(td);
}
+ td->td_rseq_abi = NULL;
if ((imgp->sysent->sv_setid_allowed != NULL &&
!(*imgp->sysent->sv_setid_allowed)(td, imgp)) ||
diff --git a/sys/kern/kern_rseq.c b/sys/kern/kern_rseq.c
new file mode 100644
--- /dev/null
+++ b/sys/kern/kern_rseq.c
@@ -0,0 +1,226 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software were developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+#include <sys/rseq.h>
+
+/*
+ * Drop the rseq registration of the thread by forgetting the usermode
+ * address of its struct rseq.  The other rseq entry points treat a NULL
+ * td_rseq_abi as "not registered".
+ */
+static void
+rseq_inactivate(struct thread *td)
+{
+
+	td->td_rseq_abi = NULL;
+}
+
+/*
+ * Deactivate rseq for the thread and raise SIGSEGV on it.
+ *
+ * addr is reported as the faulting address (ksi_addr) and si_code as the
+ * signal code; the callers in this file pass SEGV_RSEQ_R or SEGV_RSEQ_W.
+ */
+static void
+rseq_inactivate_sig(struct thread *td, void *addr, int si_code)
+{
+	ksiginfo_t ksi;
+
+	/* Describe the fault first; the stores below are independent. */
+	ksiginfo_init_trap(&ksi);
+	ksi.ksi_addr = addr;
+	ksi.ksi_code = si_code;
+	ksi.ksi_signo = SIGSEGV;
+	ksi.ksi_trapno = 0;
+
+	/* The registration must be gone before the signal is posted. */
+	rseq_inactivate(td);
+	trapsignal(td, &ksi);
+}
+
+/*
+ * Report whether the RSEQ_CS_FLAG_NO_RESTART_* bits in flags suppress the
+ * restart of the current sequence: NO_RESTART_ON_PREEMPT must be set, and
+ * either NO_RESTART_ON_MIGRATE is set as well or the thread still runs on
+ * the CPU it ran on last (td_oncpu == td_lastcpu).
+ *
+ * The CPU fields are compared inside a critical section, which is always
+ * left again before returning.
+ */
+static bool
+rseq_no_restart(struct thread *td, uint32_t flags)
+{
+	bool res;
+
+	critical_enter();
+	res = (flags & RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) != 0 &&
+	    ((flags & RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) != 0 ||
+	    td->td_oncpu == td->td_lastcpu);
+	critical_exit();
+	return (res);
+}
+
+/*
+ * AST handler for a thread with a registered rseq area.
+ *
+ * Stores the current CPU number into cpu_id_start and cpu_id of the
+ * usermode struct rseq.  Then, if the area points to a struct rseq_cs and
+ * the restart is not suppressed by the flags of either structure, moves
+ * the usermode pc to abort_ip when it lies inside the described sequence.
+ *
+ * A failed usermode access or a non-zero descriptor version deactivates
+ * rseq for the thread and raises SIGSEGV with SEGV_RSEQ_W or SEGV_RSEQ_R.
+ */
+void
+rseq_ast(struct thread *td)
+{
+	struct rseq rs;
+	struct rseq_cs rc;
+	void *ucs;
+	uint64_t pc;
+	int cpu, error;
+
+	if (td->td_rseq_abi == NULL)
+		return;
+
+	/*
+	 * We cannot enter a critical section here to keep td_oncpu
+	 * valid, because of the userspace access.  We do not even want
+	 * to sched_pin() for the same reason.
+	 *
+	 * It is fine to get a context switch after reading td_oncpu,
+	 * since this would cause a new AST to become pending and we
+	 * re-enter this function to update the rseq cpu number.
+	 *
+	 * Microoptimize 64bit architectures by doing a single 64bit
+	 * write for the cpu ids.  For instance, on SMAP-enabled amd64
+	 * this saves two serialization instructions STAC/CLAC.
+	 */
+	cpu = td->td_oncpu;
+#ifdef __LP64__
+	rs.cpu_id_start = cpu;
+	rs.cpu_id = cpu;
+	error = suword64((char *)td->td_rseq_abi + offsetof(struct rseq,
+	    cpu_id_start), *(uint64_t *)(char *)&rs.cpu_id_start);
+#else
+	error = suword((char *)td->td_rseq_abi + offsetof(struct rseq,
+	    cpu_id_start), cpu);
+	if (error == 0) {
+		error = suword((char *)td->td_rseq_abi +
+		    offsetof(struct rseq, cpu_id), cpu);
+	}
+#endif
+	if (error != 0) {
+		rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_W);
+		return;
+	}
+
+	error = copyin(td->td_rseq_abi, &rs, sizeof(rs));
+	if (error != 0) {
+		rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_R);
+		return;
+	}
+
+	/* No sequence descriptor installed, nothing to restart. */
+	if (rs.rseq_cs.ptr64 == 0)
+		return;
+
+	/*
+	 * The flag checks must not return with the critical section
+	 * still entered, so they are done by a helper which always
+	 * pairs critical_enter() with critical_exit().
+	 */
+	if (rseq_no_restart(td, rs.flags))
+		return;
+
+	/* Go through uintptr_t: ptr64 is wider than a pointer on ILP32. */
+	ucs = (void *)(uintptr_t)rs.rseq_cs.ptr64;
+	error = copyin(ucs, &rc, sizeof(rc));
+	if (error != 0) {
+		rseq_inactivate_sig(td, ucs, SEGV_RSEQ_R);
+		return;
+	}
+	if (rc.version != 0) {
+		rseq_inactivate_sig(td, ucs, SEGV_RSEQ_R);
+		return;
+	}
+
+	if (rseq_no_restart(td, rc.flags))
+		return;
+
+	/*
+	 * Compare as unsigned offsets from start_ip.  This cannot wrap
+	 * the way start_ip + post_commit_offset can for a bogus
+	 * descriptor, and it avoids sign-extending a 32bit register_t.
+	 */
+	pc = (uintptr_t)TRAPF_PC(td->td_frame);
+	if (pc - rc.start_ip < rc.post_commit_offset) {
+		/* XXXKIB check signature */
+		/* Resume at abort_ip itself, exactly as rseq_on_sig() does. */
+		TRAPF_PC(td->td_frame) = rc.abort_ip;
+	}
+}
+
+/*
+ * Prepare for signal delivery to a thread which may be executing a
+ * restartable sequence.
+ *
+ * Reads the usermode struct rseq and, if present, the struct rseq_cs it
+ * points to.  Unless RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL is set in either
+ * of them, the bounds of the sequence and its abort address are cached
+ * in the thread and TDP2_RSEQ_SIG is set, so that rseq_on_sig() can
+ * redirect the pc later without touching usermode memory.
+ *
+ * A failed copyin or a non-zero descriptor version deactivates rseq for
+ * the thread and raises SIGSEGV with SEGV_RSEQ_R.
+ */
+void
+rseq_before_sig(struct thread *td)
+{
+	struct rseq rs;
+	struct rseq_cs rc;
+	void *ucs;
+	int error;
+
+	/* TDP2_* flags live in td_pflags2, not in td_pflags. */
+	td->td_pflags2 &= ~TDP2_RSEQ_SIG;
+	if (td->td_rseq_abi == NULL)
+		return;
+
+	error = copyin(td->td_rseq_abi, &rs, sizeof(rs));
+	if (error != 0) {
+		rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_R);
+		return;
+	}
+
+	if (rs.rseq_cs.ptr64 == 0 ||
+	    (rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) != 0)
+		return;
+
+	/* Go through uintptr_t: ptr64 is wider than a pointer on ILP32. */
+	ucs = (void *)(uintptr_t)rs.rseq_cs.ptr64;
+	error = copyin(ucs, &rc, sizeof(rc));
+	if (error != 0) {
+		rseq_inactivate_sig(td, ucs, SEGV_RSEQ_R);
+		return;
+	}
+	/* Same descriptor version check as in rseq_ast(). */
+	if (rc.version != 0) {
+		rseq_inactivate_sig(td, ucs, SEGV_RSEQ_R);
+		return;
+	}
+
+	if ((rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) != 0)
+		return;
+	/* XXXKIB check signature */
+
+	td->td_pflags2 |= TDP2_RSEQ_SIG;
+	td->td_rseq_start_ip = rc.start_ip;
+	td->td_rseq_end_ip = rc.start_ip + rc.post_commit_offset;
+	td->td_rseq_abort_ip = rc.abort_ip;
+}
+
+/*
+ * Called right before a signal frame is built.  If rseq_before_sig()
+ * armed TDP2_RSEQ_SIG, consume the flag and, when the usermode pc is
+ * inside the cached [start_ip, end_ip) range, move it to the cached
+ * abort address.
+ */
+void
+rseq_on_sig(struct thread *td)
+{
+	register_t pc;
+
+	if ((td->td_pflags2 & TDP2_RSEQ_SIG) == 0)
+		return;
+	td->td_pflags2 &= ~TDP2_RSEQ_SIG;
+	pc = TRAPF_PC(td->td_frame);
+	if (pc >= td->td_rseq_start_ip && pc < td->td_rseq_end_ip)
+		TRAPF_PC(td->td_frame) = td->td_rseq_abort_ip;
+}
+
+/*
+ * Register or unregister the rseq area of the thread.
+ *
+ * rseq is the usermode address of a struct rseq, rseqlen must be equal
+ * to sizeof(struct rseq), flags is either 0 (register) or
+ * RSEQ_FLAG_UNREGISTER, and sig is the abort handler signature, which
+ * is recorded on registration and must match on unregistration.
+ *
+ * Returns 0 on success, or
+ *   EINVAL  wrong rseqlen; unknown flags; NULL rseq or an area crossing
+ *           a page boundary on registration; non-NULL rseq or no active
+ *           registration on unregistration,
+ *   EPERM   sig differs from the registered signature on unregistration,
+ *   EBUSY   the thread already has a registered area.
+ */
+static int
+kern_rseq(struct thread *td, uintptr_t rseq, uint32_t rseqlen, int flags,
+    uint32_t sig)
+{
+	if (rseqlen != sizeof(struct rseq))
+		return (EINVAL);
+
+	if (flags == RSEQ_FLAG_UNREGISTER) {
+		/*
+		 * NOTE(review): unregistration demands a NULL rseq
+		 * pointer; confirm this is the intended interface.
+		 */
+		if (rseq != 0 || td->td_rseq_abi == NULL)
+			return (EINVAL);
+		if (sig != td->td_rseq_sig)
+			return (EPERM);
+		rseq_inactivate(td);
+		return (0);
+	}
+
+	if (td->td_rseq_abi != NULL)
+		return (EBUSY);
+	/*
+	 * The area must not cross a page boundary.  Compare the pages of
+	 * the first and of the last byte; rseq + rseqlen is one past the
+	 * end and would reject an area ending exactly at a page end.
+	 */
+	if (flags != 0 || rseq == 0 ||
+	    trunc_page(rseq) != trunc_page(rseq + rseqlen - 1))
+		return (EINVAL);
+
+	td->td_rseq_abi = (void *)rseq;
+	td->td_rseq_sig = sig;
+	/* Request an AST flagged TDF_RSEQ, which makes ast() run rseq_ast(). */
+	thread_lock(td);
+	td->td_flags |= TDF_ASTPENDING | TDF_RSEQ;
+	thread_unlock(td);
+	return (0);
+}
+
+/*
+ * rseq(2) system call entry point: unpack the syscall arguments and
+ * hand them to kern_rseq(), returning its error code.
+ */
+int
+sys_rseq(struct thread *td, struct rseq_args *uap)
+{
+	uintptr_t uaddr;
+
+	uaddr = (uintptr_t)uap->rseq;
+	return (kern_rseq(td, uaddr, uap->rseqlen, uap->flags, uap->sig));
+}
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -69,6 +69,7 @@
#include <sys/posix4.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
+#include <sys/rseq.h>
#include <sys/sdt.h>
#include <sys/sbuf.h>
#include <sys/sleepqueue.h>
@@ -2017,6 +2018,7 @@
KASSERT(_SIG_VALID(sig), ("invalid signal"));
sigfastblock_fetch(td);
+ rseq_before_sig(td);
PROC_LOCK(p);
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
@@ -2030,6 +2032,7 @@
ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
&td->td_sigmask, code);
#endif
+ rseq_on_sig(td);
(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)],
ksi, &td->td_sigmask);
postsig_done(sig, td, ps);
@@ -3204,6 +3207,7 @@
if (p->p_sig == sig) {
p->p_sig = 0;
}
+ rseq_on_sig(td);
(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
postsig_done(sig, td, ps);
}
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -53,6 +53,7 @@
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
+#include <sys/rseq.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
@@ -491,7 +492,7 @@
mi_switch(int flags)
{
uint64_t runtime, new_switchtime;
- struct thread *td;
+ struct thread *td, *td1;
td = curthread; /* XXX */
THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
@@ -549,9 +550,14 @@
/*
* If the last thread was exiting, finish cleaning it up.
*/
- if ((td = PCPU_GET(deadthread))) {
+ if ((td1 = PCPU_GET(deadthread))) {
PCPU_SET(deadthread, NULL);
- thread_stash(td);
+ thread_stash(td1);
+ }
+ if (td->td_rseq_abi != NULL) {
+ thread_lock(td);
+ td->td_flags |= TDF_ASTPENDING | TDF_RSEQ;
+ thread_unlock(td);
}
spinlock_exit();
}
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -89,9 +89,9 @@
"struct thread KBI td_flags");
_Static_assert(offsetof(struct thread, td_pflags) == 0x110,
"struct thread KBI td_pflags");
-_Static_assert(offsetof(struct thread, td_frame) == 0x4a8,
+_Static_assert(offsetof(struct thread, td_frame) == 0x4c8,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x6d0,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0xb8,
"struct proc KBI p_flag");
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -64,6 +64,7 @@
#include <sys/ptrace.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
+#include <sys/rseq.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
@@ -243,7 +244,7 @@
flags = td->td_flags;
td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK |
TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND |
- TDF_KQTICKLED);
+ TDF_KQTICKLED | TDF_RSEQ);
thread_unlock(td);
VM_CNT_INC(v_trap);
@@ -332,6 +333,7 @@
if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 ||
!SIGISEMPTY(p->p_siglist)) {
sigfastblock_fetch(td);
+ rseq_before_sig(td);
PROC_LOCK(p);
mtx_lock(&p->p_sigacts->ps_mtx);
while ((sig = cursig(td)) != 0) {
@@ -354,6 +356,9 @@
*/
sigfastblock_setpend(td, resched_sigs);
+ if ((flags & TDF_RSEQ) != 0)
+ rseq_ast(td);
+
#ifdef KTRACE
KTRUSERRET(td);
#endif
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3268,6 +3268,14 @@
int cpu_id
);
}
+582 AUE_NULL STD|CAPENABLED {
+ int rseq(
+ void *rseq,
+ uint32_t rseqlen,
+ int flags,
+ uint32_t sig
+ );
+ }
; Please copy any additions and changes to the following compatability tables:
; sys/compat/freebsd32/syscalls.master
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -316,6 +316,11 @@
size_t td_vslock_sz; /* (k) amount of vslock-ed space */
struct kcov_info *td_kcov_info; /* (*) Kernel code coverage data */
u_int td_ucredref; /* (k) references on td_realucred */
+ uint32_t td_rseq_sig; /* (k) abort handler signature */
+ void *td_rseq_abi; /* (k) usermode rseq */
+ register_t td_rseq_start_ip;/* (k) */
+ register_t td_rseq_end_ip; /* (k) */
+ register_t td_rseq_abort_ip;/* (k) */
#define td_endzero td_sigmask
/* Copied during fork1() or create_thread(). */
@@ -462,7 +467,7 @@
#define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */
#define TDF_SEINTR 0x00200000 /* EINTR on stop attempts. */
#define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */
-#define TDF_UNUSED23 0x00800000 /* --available-- */
+#define TDF_RSEQ 0x00800000 /* rseq active */
#define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */
#define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */
#define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */
@@ -531,6 +536,7 @@
#define TDP2_SBPAGES 0x00000001 /* Owns sbusy on some pages */
#define TDP2_COMPAT32RB 0x00000002 /* compat32 ABI for robust lists */
#define TDP2_ACCT 0x00000004 /* Doing accounting */
+#define TDP2_RSEQ_SIG 0x00000008
/*
* Reasons that the current thread can not be run yet.
diff --git a/sys/sys/rseq.h b/sys/sys/rseq.h
new file mode 100644
--- /dev/null
+++ b/sys/sys/rseq.h
@@ -0,0 +1,96 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software were developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __SYS_RSEQ_H__
+#define __SYS_RSEQ_H__
+
+#include <sys/cdefs.h>
+#include <sys/endian.h>
+
+/*
+ * Negative marker values, presumably meant for the cpu_id field of
+ * struct rseq while it does not hold a valid CPU number (TODO confirm;
+ * the kernel code visible in this change does not reference them).
+ */
+enum rseq_cpu_id_state {
+	RSEQ_CPU_ID_UNINITIALIZED		= -1,
+	RSEQ_CPU_ID_REGISTRATION_FAILED		= -2,
+};
+
+/* Values for the flags argument of rseq(2). */
+enum rseq_flags {
+	/* Remove the registration of the calling thread. */
+	RSEQ_FLAG_UNREGISTER = 1 << 0,
+};
+
+/*
+ * Bits tested by the kernel in the flags members of struct rseq and
+ * struct rseq_cs; each one suppresses the restart of a sequence for the
+ * named kind of event.
+ */
+enum rseq_cs_flags {
+	RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT	= 1 << 0,
+	RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL	= 1 << 1,
+	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE	= 1 << 2,
+};
+
+/*
+ * Usermode descriptor of one restartable sequence, referenced from
+ * struct rseq.  The sequence covers the addresses
+ * [start_ip, start_ip + post_commit_offset).
+ */
+struct rseq_cs {
+	uint32_t version;		/* only version 0 is accepted */
+	uint32_t flags;			/* RSEQ_CS_FLAG_* bits */
+	uint64_t start_ip;		/* first address of the sequence */
+	uint64_t post_commit_offset;	/* length of the sequence */
+	uint64_t abort_ip;		/* where the pc is moved on abort */
+};
+
+/*
+ * Per-thread usermode area registered with rseq(2).
+ *
+ * The kernel stores the current CPU number into cpu_id_start and cpu_id,
+ * and reads rseq_cs and flags.  rseq_cs holds the usermode address of
+ * the active struct rseq_cs, or 0 when there is none; the kernel always
+ * accesses it through the 64bit ptr64 view.  On ILP32 the ptr view
+ * splits the same 64 bits into a 32bit pointer and padding, laid out
+ * according to the byte order.
+ */
+struct rseq {
+	uint32_t cpu_id_start;
+	uint32_t cpu_id;
+	union {
+		uint64_t ptr64;
+#ifdef __LP64__
+		uint64_t ptr;
+#else
+		struct {
+#if _BYTE_ORDER == _BIG_ENDIAN
+			uint32_t pad;
+			uint32_t ptr32;
+#else /* BYTE_ORDER */
+			uint32_t ptr32;
+			uint32_t pad;
+#endif /* BYTE_ORDER */
+		} ptr;
+#endif /* LP64 */
+	} rseq_cs;
+	uint32_t flags;		/* RSEQ_CS_FLAG_* bits */
+};
+
+#ifdef _KERNEL
+
+/* NOTE(review): not referenced by the code visible in this change. */
+#define	TD_RSEQ_ACTIVE	0x00000001
+
+/* Run from ast() when TDF_RSEQ was set for the thread. */
+void	rseq_ast(struct thread *td);
+/* Called before signal delivery, while usermode memory can be read. */
+void	rseq_before_sig(struct thread *td);
+/* Called right before the sv_sendsig method builds the signal frame. */
+void	rseq_on_sig(struct thread *td);
+
+#else /* _KERNEL */
+
+__BEGIN_DECLS
+/* Usermode prototype of the rseq(2) system call. */
+int	rseq(volatile struct rseq *rseq, uint32_t rseqlen, int flags,
+	    uint32_t sig);
+__END_DECLS
+
+#endif /* _KERNEL */
+
+#endif /* __SYS_RSEQ_H__ */
diff --git a/sys/sys/signal.h b/sys/sys/signal.h
--- a/sys/sys/signal.h
+++ b/sys/sys/signal.h
@@ -329,6 +329,8 @@
#define SEGV_ACCERR 2 /* Invalid permissions for mapped */
/* object. */
#define SEGV_PKUERR 100 /* x86: PKU violation */
+#define SEGV_RSEQ_R 101 /* rseq access read fault */
+#define SEGV_RSEQ_W 102 /* rseq access write fault */
/* codes for SIGFPE */
#define FPE_INTOVF 1 /* Integer overflow. */

File Metadata

Mime Type
text/plain
Expires
Mon, Sep 23, 3:47 AM (8 h, 35 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
12504647
Default Alt Text
D32505.id96899.diff (16 KB)

Event Timeline