D32505: Add rseq(2), raw diff D32505.id100265.diff (42 KB)
diff --git a/lib/libc/gen/Makefile.inc b/lib/libc/gen/Makefile.inc
--- a/lib/libc/gen/Makefile.inc
+++ b/lib/libc/gen/Makefile.inc
@@ -120,6 +120,7 @@
readpassphrase.c \
recvmmsg.c \
rewinddir.c \
+ rseq_abi.c \
scandir.c \
scandir_b.c \
scandir-compat11.c \
diff --git a/lib/libc/gen/Symbol.map b/lib/libc/gen/Symbol.map
--- a/lib/libc/gen/Symbol.map
+++ b/lib/libc/gen/Symbol.map
@@ -436,6 +436,7 @@
};
FBSD_1.7 {
+ __rseq_abi;
posix_spawn_file_actions_addchdir_np;
posix_spawn_file_actions_addclosefrom_np;
posix_spawn_file_actions_addfchdir_np;
@@ -569,4 +570,6 @@
__fillcontextx;
__fillcontextx2;
__getcontextx_size;
+
+ __rseq_abi_init;
};
diff --git a/lib/libc/gen/rseq_abi.c b/lib/libc/gen/rseq_abi.c
new file mode 100644
--- /dev/null
+++ b/lib/libc/gen/rseq_abi.c
@@ -0,0 +1,55 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/auxv.h>
+#include <sys/rseq.h>
+#include <sched.h>
+#include "libc_private.h"
+
+_Thread_local volatile struct rseq __rseq_abi __weak_symbol;
+
+static void __main_rseq_abi_init(void) __attribute__((__constructor__,
+ __used__));
+static void
+__main_rseq_abi_init(void)
+{
+ __rseq_abi_init();
+}
+
+void
+__rseq_abi_init(void)
+{
+ int bsdflags;
+
+ if (_elf_aux_info(AT_BSDFLAGS, &bsdflags, sizeof(bsdflags)) != 0 ||
+ (bsdflags & ELF_BSDF_RSEQ1) == 0)
+ return;
+ __rseq_abi.cpu_id_start = sched_getcpu();
+ rseq(&__rseq_abi, sizeof(__rseq_abi), 0, 0/* XXXKIB */);
+}
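
For illustration, a minimal userspace sketch (not part of this patch) of what the exported __rseq_abi gives a consumer once the constructor above has run: a CPU id hint readable without a syscall. my_getcpu_hint() is a made-up name; note that the TLS area is zero-initialized, so the field reads 0 until libc's init or the first kernel update stores a real value.

#include <sys/rseq.h>

static inline int
my_getcpu_hint(void)
{
        /*
         * cpu_id_start is a best-effort hint: a valid CPU number,
         * possibly stale after a migration.  The kernel refreshes
         * both cpu ids from the rseq AST on each context switch.
         */
        return ((int)__rseq_abi.cpu_id_start);
}
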
diff --git a/lib/libc/include/libc_private.h b/lib/libc/include/libc_private.h
--- a/lib/libc/include/libc_private.h
+++ b/lib/libc/include/libc_private.h
@@ -435,4 +435,6 @@
struct __nl_cat_d *__catopen_l(const char *name, int type,
struct _xlocale *locale);
+void __rseq_abi_init(void);
+
#endif /* _LIBC_PRIVATE_H_ */
diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -419,6 +419,8 @@
FBSD_1.7 {
_Fork;
fspacectl;
+ membarrier;
+ rseq;
swapoff;
};
diff --git a/lib/libthr/thread/thr_create.c b/lib/libthr/thread/thr_create.c
--- a/lib/libthr/thread/thr_create.c
+++ b/lib/libthr/thread/thr_create.c
@@ -288,6 +288,8 @@
curthread->attr.stacksize_attr;
#endif
+ __rseq_abi_init();
+
/* Run the current thread's start routine with argument: */
_pthread_exit(curthread->start_routine(curthread->arg));
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -10051,6 +10051,12 @@
PCPU_SET(ucr3, PMAP_NO_CR3);
}
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ *res = pmap->pm_active;
+}
+
void
pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
{
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -1977,3 +1977,24 @@
popq %rax
retq
END(mds_handler_silvermont)
+
+/*
+ * Do the same as Linux and execute IRET explicitly, even though the
+ * return from the IPI handler performs an IRET as well.
+ */
+ENTRY(cpu_sync_core)
+/*
+ * SERIALIZE could be used here instead, once the instruction moves
+ * from the 'future extensions' document into the SDM proper.
+ */
+ movq (%rsp), %rdx
+ movl %ss, %eax
+ pushq %rax
+ pushq %rsp
+ addq $16, (%rsp)
+ pushfq
+ movl %cs, %eax
+ pushq %rax
+ pushq %rdx
+ iretq
+END(cpu_sync_core)
diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c
--- a/sys/arm/arm/pmap-v6.c
+++ b/sys/arm/arm/pmap-v6.c
@@ -6214,6 +6214,12 @@
critical_exit();
}
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ *res = pmap->pm_active;
+}
+
/*
* Perform the pmap work for mincore(2). If the page is not both referenced and
* modified by this pmap, returns its physical address so that the caller can
diff --git a/sys/arm/arm/vm_machdep.c b/sys/arm/arm/vm_machdep.c
--- a/sys/arm/arm/vm_machdep.c
+++ b/sys/arm/arm/vm_machdep.c
@@ -320,3 +320,8 @@
return (EINVAL);
}
+
+void
+cpu_sync_core(void)
+{
+}
diff --git a/sys/arm64/arm64/vm_machdep.c b/sys/arm64/arm64/vm_machdep.c
--- a/sys/arm64/arm64/vm_machdep.c
+++ b/sys/arm64/arm64/vm_machdep.c
@@ -312,3 +312,14 @@
if (busdma_swi_pending != 0)
busdma_swi();
}
+
+void
+cpu_sync_core(void)
+{
+ /*
+  * Do nothing.  According to the Arm ARMv8 manual, D1.11 "Exception
+  * return": if FEAT_ExS is not implemented, or if FEAT_ExS is
+  * implemented and the SCTLR_ELx.EOS field is set, exception return
+  * from ELx is a context synchronization event.
+  */
+}
diff --git a/sys/arm64/include/pmap.h b/sys/arm64/include/pmap.h
--- a/sys/arm64/include/pmap.h
+++ b/sys/arm64/include/pmap.h
@@ -152,6 +152,8 @@
(uint64_t)(asid) << ASID_TO_OPERAND_SHIFT; \
})
+#define PMAP_WANT_ACTIVE_CPUS_NAIVE
+
extern vm_offset_t virtual_avail;
extern vm_offset_t virtual_end;
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3795,6 +3795,7 @@
kern/kern_loginclass.c standard
kern/kern_malloc.c standard
kern/kern_mbuf.c standard
+kern/kern_membarrier.c standard
kern/kern_mib.c standard
kern/kern_module.c standard
kern/kern_mtxpool.c standard
@@ -3813,6 +3814,7 @@
kern/kern_rctl.c standard
kern/kern_resource.c standard
kern/kern_rmlock.c standard
+kern/kern_rseq.c standard
kern/kern_rwlock.c standard
kern/kern_sdt.c optional kdtrace_hooks
kern/kern_sema.c standard
diff --git a/sys/i386/i386/pmap_base.c b/sys/i386/i386/pmap_base.c
--- a/sys/i386/i386/pmap_base.c
+++ b/sys/i386/i386/pmap_base.c
@@ -946,6 +946,12 @@
pmap_methods_ptr->pm_kremove(va);
}
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ *res = pmap->pm_active;
+}
+
extern struct pmap_methods pmap_pae_methods, pmap_nopae_methods;
int pae_mode;
SYSCTL_INT(_vm_pmap, OID_AUTO, pae_mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s
--- a/sys/i386/i386/support.s
+++ b/sys/i386/i386/support.s
@@ -580,3 +580,11 @@
movl %eax, %cr0
3: ret
END(mds_handler_silvermont)
+
+ENTRY(cpu_sync_core)
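+ /*
+  * Pop the return address, then rebuild an EIP/CS/EFLAGS frame and
+  * return through IRET, which is a serializing instruction.
+  */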
+ popl %eax
+ pushfl
+ pushl %cs
+ pushl %eax
+ iretl
+END(cpu_sync_core)
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -212,6 +212,11 @@
CTLFLAG_RWTUN, &__elfN(sigfastblock), 0,
"enable sigfastblock for new processes");
+static int __elfN(rseq1) = 1;
+SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, rseq1,
+ CTLFLAG_RWTUN, &__elfN(rseq1), 0,
+ "enable rseq v1 ABI for new processes");
+
static bool __elfN(allow_wx) = true;
SYSCTL_BOOL(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, allow_wx,
CTLFLAG_RWTUN, &__elfN(allow_wx), 0,
@@ -1463,8 +1468,9 @@
AUXARGS_ENTRY(pos, AT_HWCAP, *imgp->sysent->sv_hwcap);
if (imgp->sysent->sv_hwcap2 != NULL)
AUXARGS_ENTRY(pos, AT_HWCAP2, *imgp->sysent->sv_hwcap2);
- AUXARGS_ENTRY(pos, AT_BSDFLAGS, __elfN(sigfastblock) ?
- ELF_BSDF_SIGFASTBLK : 0);
+ AUXARGS_ENTRY(pos, AT_BSDFLAGS,
+ (__elfN(sigfastblock) ? ELF_BSDF_SIGFASTBLK : 0) |
+     (__elfN(rseq1) ? ELF_BSDF_RSEQ1 : 0));
AUXARGS_ENTRY(pos, AT_ARGC, imgp->args->argc);
AUXARGS_ENTRY_PTR(pos, AT_ARGV, imgp->argv);
AUXARGS_ENTRY(pos, AT_ENVC, imgp->args->envc);
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -826,12 +826,15 @@
p->p_flag2 &= ~P2_NOTRACE;
if ((p->p_flag2 & P2_STKGAP_DISABLE_EXEC) == 0)
p->p_flag2 &= ~P2_STKGAP_DISABLE;
+ p->p_flag2 &= ~(P2_MEMBAR_PRIVE | P2_MEMBAR_PRIVE_SYNCORE |
+     P2_MEMBAR_GLOBE | P2_MEMBAR_PRIVE_RSEQ);
if (p->p_flag & P_PPWAIT) {
p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
cv_broadcast(&p->p_pwait);
/* STOPs are no longer ignored, arrange for AST */
signotify(td);
}
+ td->td_rseq_abi = NULL;
if ((imgp->sysent->sv_setid_allowed != NULL &&
!(*imgp->sysent->sv_setid_allowed)(td, imgp)) ||
diff --git a/sys/kern/kern_membarrier.c b/sys/kern/kern_membarrier.c
new file mode 100644
--- /dev/null
+++ b/sys/kern/kern_membarrier.c
@@ -0,0 +1,276 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+#include <sys/lock.h>
+#include <sys/membarrier.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+
+#include <vm/vm_param.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#define MEMBARRIER_SUPPORTED_CMDS ( \
+ MEMBARRIER_CMD_GLOBAL | \
+ MEMBARRIER_CMD_GLOBAL_EXPEDITED | \
+ MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED | \
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED | \
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED | \
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE | \
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE | \
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ | \
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
+
+static void
+membarrier_action_rseq(void *arg __unused)
+{
+ struct thread *td;
+
+ td = curthread;
+ thread_lock(td);
+ /* TDF_RSEQ makes the userret AST path invoke rseq_ast(). */
+ td->td_flags |= TDF_ASTPENDING | TDF_RSEQ;
+ td->td_flags2 |= TDF2_RSEQ_MB;
+ thread_unlock(td);
+}
+
+static void
+membarrier_action_seqcst(void *arg __unused)
+{
+ atomic_thread_fence_seq_cst();
+}
+
+static void
+membarrier_action_seqcst_sync_core(void *arg __unused)
+{
+ atomic_thread_fence_seq_cst();
+ cpu_sync_core();
+}
+
+static void
+do_membarrier_ipi(cpuset_t *csp, void (*func)(void *))
+{
+ atomic_thread_fence_seq_cst();
+ smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func,
+ smp_no_rendezvous_barrier, NULL);
+ atomic_thread_fence_seq_cst();
+}
+
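+/*
+ * Helper for MEMBARRIER_CMD_GLOBAL: consider CPU c done once it is
+ * idle (no user code is running there), or once pc_switchtime shows
+ * that it context-switched after the initial scan.
+ */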
+static void
+check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init)
+{
+ struct pcpu *pc;
+ uint64_t sw;
+
+ if (CPU_ISSET(c, csp))
+ return;
+
+ pc = cpuid_to_pcpu[c];
+ if (pc->pc_curthread == pc->pc_idlethread) {
+ CPU_SET(c, csp);
+ return;
+ }
+
+ /*
+ * Sync with context switch to ensure that override of
+ * pc_curthread with non-idle thread pointer is visible before
+ * reading of pc_switchtime.
+ */
+ atomic_thread_fence_acq();
+
+ sw = pc->pc_switchtime;
+ if (init)
+ swt[c] = sw;
+ else if (sw != swt[c])
+ CPU_SET(c, csp);
+}
+
+/*
+ * XXXKIB: We execute the requested action (seq_cst and possibly
+ * sync_core) on the current CPU as well.  There is no guarantee that
+ * the current thread executes anything with full fence semantics
+ * during syscall entry.  Similarly, cpu_sync_core() semantics might
+ * not be provided by the syscall return; e.g. on amd64 we typically
+ * return without IRET.
+ */
+int
+kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id)
+{
+ struct proc *p, *p1;
+ struct thread *td1;
+ cpuset_t cs;
+ uint64_t *swt;
+ int c, error;
+ bool first;
+
+ /* MEMBARRIER_CMD_FLAG_CPU is only meaningful for the rseq command. */
+ if ((flags & ~MEMBARRIER_CMD_FLAG_CPU) != 0 ||
+     (flags != 0 && cmd != MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ) ||
+     (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0)
+     return (EINVAL);
+
+ if (cmd == MEMBARRIER_CMD_QUERY) {
+ td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS;
+ return (0);
+ }
+
+ p = td->td_proc;
+ error = 0;
+
+ switch (cmd) {
+ case MEMBARRIER_CMD_GLOBAL:
+ swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK);
+ CPU_ZERO(&cs);
+ sched_pin();
+ CPU_SET(PCPU_GET(cpuid), &cs);
+ for (first = true; error == 0; first = false) {
+ CPU_FOREACH(c)
+ check_cpu_switched(c, &cs, swt, first);
+ if (CPU_CMP(&cs, &all_cpus) == 0)
+ break;
+ error = pause_sig("mmbr", 1);
+ if (error == EWOULDBLOCK)
+ error = 0;
+ }
+ sched_unpin();
+ free(swt, M_TEMP);
+ atomic_thread_fence_seq_cst();
+ break;
+
+ case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+ if ((td->td_proc->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
+ error = EPERM;
+ } else {
+ CPU_ZERO(&cs);
+ CPU_FOREACH(c) {
+ td1 = cpuid_to_pcpu[c]->pc_curthread;
+ p1 = td1->td_proc;
+ if (p1 != NULL &&
+ (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0)
+ CPU_SET(c, &cs);
+ }
+ do_membarrier_ipi(&cs, membarrier_action_seqcst);
+ }
+ break;
+
+ case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+ if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
+ PROC_LOCK(p);
+ p->p_flag2 |= P2_MEMBAR_GLOBE;
+ PROC_UNLOCK(p);
+ }
+ break;
+
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
+ error = EPERM;
+ } else {
+ pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
+ do_membarrier_ipi(&cs, membarrier_action_seqcst);
+ }
+ break;
+
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+ if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
+ PROC_LOCK(p);
+ p->p_flag2 |= P2_MEMBAR_PRIVE;
+ PROC_UNLOCK(p);
+ }
+ break;
+
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
+ error = EPERM;
+ } else {
+ /*
+ * Calculating the IPI multicast mask from
+ * pmap active mask means that we do not call
+ * cpu_sync_core() on CPUs that were missed
+ * from pmap active mask but could be switched
+ * from or to meantime. This is fine at least
+ * on amd64 because threads always use slow
+ * (IRETQ) path to return from syscall after
+ * context switch.
+ */
+ pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
+
+ do_membarrier_ipi(&cs,
+ membarrier_action_seqcst_sync_core);
+ }
+ break;
+
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
+ PROC_LOCK(p);
+ p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE;
+ PROC_UNLOCK(p);
+ }
+ break;
+
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_RSEQ) == 0) {
+ error = EPERM;
+ break;
+ }
+ pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
+ if ((flags & MEMBARRIER_CMD_FLAG_CPU) != 0) {
+ if (!CPU_ISSET(cpu_id, &cs))
+ break;
+ CPU_ZERO(&cs);
+ CPU_SET(cpu_id, &cs);
+ }
+ do_membarrier_ipi(&cs, membarrier_action_rseq);
+ break;
+
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ if ((p->p_flag2 & P2_MEMBAR_PRIVE_RSEQ) == 0) {
+ PROC_LOCK(p);
+ p->p_flag2 |= P2_MEMBAR_PRIVE_RSEQ;
+ PROC_UNLOCK(p);
+ }
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+int
+sys_membarrier(struct thread *td, struct membarrier_args *uap)
+{
+ return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id));
+}
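
A usage sketch (not part of this patch) of how the registration and barrier commands pair up in userspace, assuming the membarrier(2) wrapper exported from libc above; writer_init() and writer_sync() are hypothetical names:

#include <sys/membarrier.h>
#include <err.h>

static void
writer_init(void)
{
        /* One-time registration before the first expedited barrier. */
        if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0) != 0)
                err(1, "membarrier register");
}

static void
writer_sync(void)
{
        /*
         * Executes a seq_cst fence on every CPU currently running a
         * thread of this process, via IPIs to the pmap-active set.
         */
        if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0) != 0)
                err(1, "membarrier");
}

This mirrors the Linux contract: readers on the fast path need only a compiler barrier, and the writer pays for the fence on demand.
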
diff --git a/sys/kern/kern_rseq.c b/sys/kern/kern_rseq.c
new file mode 100644
--- /dev/null
+++ b/sys/kern/kern_rseq.c
@@ -0,0 +1,280 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+#include <sys/rseq.h>
+
+static void
+rseq_inactivate(struct thread *td)
+{
+ td->td_rseq_abi = NULL;
+}
+
+static void
+rseq_inactivate_sig(struct thread *td, void *addr, int si_code)
+{
+ ksiginfo_t ksi;
+
+ rseq_inactivate(td);
+ ksiginfo_init_trap(&ksi);
+ ksi.ksi_signo = SIGSEGV;
+ ksi.ksi_code = si_code;
+ ksi.ksi_trapno = 0;
+ ksi.ksi_addr = addr;
+ trapsignal(td, &ksi);
+}
+
+void
+rseq_ast(struct thread *td)
+{
+ struct rseq rs;
+ struct rseq_cs rc;
+ void *sig_addr;
+ register_t pc;
+ uint32_t usig;
+ int cpu, error;
+ bool clear_cs;
+
+ if (td->td_rseq_abi == NULL)
+ return;
+
+ /*
+  * We cannot enter a critical section here to keep td_oncpu valid,
+  * because of the userspace access.  We do not even want to
+  * sched_pin(), for the same reason.
+  *
+  * It is fine to get a context switch after reading td_oncpu:
+  * that would leave a new AST pending, and we would re-enter this
+  * function to update the rseq cpu number.
+  *
+  * Micro-optimize 64-bit architectures by doing a single 64-bit
+  * write for both cpu ids.  For instance, on SMAP-enabled amd64
+  * this saves the two serializing STAC/CLAC instructions.
+  */
+ cpu = td->td_oncpu;
+#ifdef __LP64__
+ rs.cpu_id_start = cpu;
+ rs.cpu_id = cpu;
+ error = suword64((char *)td->td_rseq_abi + offsetof(struct rseq,
+ cpu_id_start), *(uint64_t *)(char *)&rs.cpu_id_start);
+#else
+ error = suword((char *)td->td_rseq_abi + offsetof(struct rseq,
+ cpu_id_start), cpu);
+ if (error == 0) {
+ error = suword((char *)td->td_rseq_abi +
+ offsetof(struct rseq, cpu_id), cpu);
+ }
+#endif
+ if (error != 0) {
+ rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_W);
+ return;
+ }
+
+ error = copyin(td->td_rseq_abi, &rs, sizeof(rs));
+ if (error != 0) {
+ rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_R);
+ return;
+ }
+
+ if (rs.rseq_cs.ptr64 == 0)
+ return;
+ clear_cs = false;
+
+ critical_enter();
+ if ((td->td_flags2 & (TDF2_RSEQ_CLRCS | TDF2_RSEQ_MB)) == 0 &&
+     (rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) != 0 &&
+     ((rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) != 0 ||
+     td->td_oncpu == td->td_lastcpu)) {
+     /* Do not return to usermode with the critical section held. */
+     critical_exit();
+     return;
+ }
+ critical_exit();
+
+ error = copyin((void *)rs.rseq_cs.ptr64, &rc, sizeof(rc));
+ if (error != 0) {
+ rseq_inactivate_sig(td, (void *)rs.rseq_cs.ptr64, SEGV_RSEQ_R);
+ return;
+ }
+ if (rc.version != 0) {
+ rseq_inactivate_sig(td, (void *)rs.rseq_cs.ptr64, SEGV_RSEQ_R);
+ return;
+ }
+
+ critical_enter();
+ if ((td->td_flags2 & (TDF2_RSEQ_CLRCS | TDF2_RSEQ_MB)) == 0 &&
+     (rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) != 0 &&
+     ((rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) != 0 ||
+     td->td_oncpu == td->td_lastcpu)) {
+     /* Same as above: release the critical section first. */
+     critical_exit();
+     return;
+ }
+ critical_exit();
+
+ if ((td->td_flags2 & (TDF2_RSEQ_CLRCS | TDF2_RSEQ_MB)) != 0) {
+ if ((td->td_flags2 & TDF2_RSEQ_CLRCS) != 0)
+ clear_cs = true;
+ thread_lock(td);
+ td->td_flags2 &= ~(TDF2_RSEQ_CLRCS | TDF2_RSEQ_MB);
+ thread_unlock(td);
+ }
+ pc = TRAPF_PC(td->td_frame);
+ if (!clear_cs &&
+ pc >= rc.start_ip && pc < rc.start_ip + rc.post_commit_offset) {
+ /* check signature */
+ sig_addr = (void *)(rc.abort_ip - sizeof(usig));
+ error = copyin(sig_addr, &usig, sizeof(usig));
+ if (error != 0) {
+ rseq_inactivate_sig(td, sig_addr, SEGV_RSEQ_R);
+ return;
+ }
+ if (usig != td->td_rseq_sig) {
+ rseq_inactivate_sig(td, sig_addr, SEGV_RSEQ_SIG);
+ return;
+ }
+
+ TRAPF_PC(td->td_frame) = rc.abort_ip;
+ clear_cs = true;
+ }
+ if (clear_cs) {
+ if (suword64((char *)td->td_rseq_abi + offsetof(struct rseq,
+ rseq_cs.ptr), 0) == -1) {
+ rseq_inactivate_sig(td, (char *)td->td_rseq_abi +
+ offsetof(struct rseq, rseq_cs.ptr),
+ SEGV_RSEQ_W);
+ return;
+ }
+ }
+}
+
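+/*
+ * Called before signal delivery: record the active rseq critical
+ * section, if any, so that rseq_on_sig() can redirect the userspace
+ * PC to the abort handler when the signal frame is built.
+ */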
+void
+rseq_before_sig(struct thread *td)
+{
+ struct rseq rs;
+ struct rseq_cs rc;
+ uint32_t usig;
+ int error;
+
+ td->td_pflags2 &= ~TDP2_RSEQ_SIG;
+ if (td->td_rseq_abi == NULL)
+ return;
+
+ error = copyin(td->td_rseq_abi, &rs, sizeof(rs));
+ if (error != 0) {
+ rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_R);
+ return;
+ }
+
+ if (rs.rseq_cs.ptr64 == 0 ||
+ (rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) != 0)
+ return;
+
+ error = copyin((void *)rs.rseq_cs.ptr64, &rc, sizeof(rc));
+ if (error != 0) {
+ rseq_inactivate_sig(td, (void *)rs.rseq_cs.ptr64, SEGV_RSEQ_R);
+ return;
+ }
+
+ if ((rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) != 0)
+ return;
+
+ /* Check the signature that precedes the abort handler. */
+ error = copyin((void *)(rc.abort_ip - sizeof(usig)), &usig,
+     sizeof(usig));
+ if (error != 0) {
+     rseq_inactivate_sig(td, (void *)(rc.abort_ip - sizeof(usig)),
+         SEGV_RSEQ_R);
+     return;
+ }
+ if (usig != td->td_rseq_sig) {
+     rseq_inactivate_sig(td, (void *)(rc.abort_ip - sizeof(usig)),
+         SEGV_RSEQ_SIG);
+     return;
+ }
+
+ td->td_pflags2 |= TDP2_RSEQ_SIG;
+ td->td_rseq_start_ip = rc.start_ip;
+ td->td_rseq_end_ip = rc.start_ip + rc.post_commit_offset;
+ td->td_rseq_abort_ip = rc.abort_ip;
+}
+
+void
+rseq_on_sig(struct thread *td)
+{
+ register_t pc;
+
+ if ((td->td_pflags2 & TDP2_RSEQ_SIG) == 0)
+ return;
+ td->td_pflags2 &= ~TDP2_RSEQ_SIG;
+ pc = TRAPF_PC(td->td_frame);
+ if (pc >= td->td_rseq_start_ip && pc < td->td_rseq_end_ip) {
+ TRAPF_PC(td->td_frame) = td->td_rseq_abort_ip;
+ thread_lock(td);
+ td->td_flags |= TDF_ASTPENDING | TDF_RSEQ;
+ td->td_flags2 |= TDF2_RSEQ_CLRCS;
+ thread_unlock(td);
+ }
+}
+
+static int
+kern_rseq(struct thread *td, uintptr_t rseq, uint32_t rseqlen, int flags,
+ uint32_t sig)
+{
+ if (rseqlen != sizeof(struct rseq))
+ return (EINVAL);
+
+ if (flags == RSEQ_FLAG_UNREGISTER) {
+ if (rseq != 0 || td->td_rseq_abi == NULL)
+ return (EINVAL);
+ if (sig != td->td_rseq_sig)
+ return (EPERM);
+ rseq_inactivate(td);
+ return (0);
+ }
+
+ if (td->td_rseq_abi != NULL)
+ return (EBUSY);
+ if (flags != 0 || rseq == 0 ||
+     trunc_page(rseq) != trunc_page(rseq + rseqlen - 1))
+ return (EINVAL);
+
+ td->td_rseq_abi = (void *)rseq;
+ td->td_rseq_sig = sig;
+ thread_lock(td);
+ td->td_flags |= TDF_ASTPENDING | TDF_RSEQ;
+ thread_unlock(td);
+ return (0);
+}
+
+int
+sys_rseq(struct thread *td, struct rseq_args *uap)
+{
+ return (kern_rseq(td, (uintptr_t)uap->rseq, uap->rseqlen,
+ uap->flags, uap->sig));
+}
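
A per-thread registration sketch (not part of this patch), assuming the rseq(2) wrapper above; MY_RSEQ_SIG is an arbitrary example value. The same 32-bit word must also be placed immediately before the abort handler in the text section (that part needs assembler and is omitted), since rseq_ast() compares the word at abort_ip - 4 against the registered signature before redirecting control:

#include <sys/rseq.h>
#include <err.h>

#define MY_RSEQ_SIG     0x53053053      /* example signature value */

static __thread volatile struct rseq my_rs;

static void
thread_register_rseq(void)
{
        /* One registration per thread; undo with RSEQ_FLAG_UNREGISTER. */
        if (rseq(&my_rs, sizeof(my_rs), 0, MY_RSEQ_SIG) != 0)
                err(1, "rseq");
}

Note that with the libc changes in this patch, __rseq_abi is registered automatically at startup, so a second, manual registration like this one would fail with EBUSY.
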
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -69,6 +69,7 @@
#include <sys/posix4.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
+#include <sys/rseq.h>
#include <sys/sdt.h>
#include <sys/sbuf.h>
#include <sys/sleepqueue.h>
@@ -2029,6 +2030,7 @@
KASSERT(_SIG_VALID(sig), ("invalid signal"));
sigfastblock_fetch(td);
+ rseq_before_sig(td);
PROC_LOCK(p);
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
@@ -2042,6 +2044,7 @@
ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
&td->td_sigmask, code);
#endif
+ rseq_on_sig(td);
(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)],
ksi, &td->td_sigmask);
postsig_done(sig, td, ps);
@@ -3253,6 +3256,7 @@
if (p->p_sig == sig) {
p->p_sig = 0;
}
+ rseq_on_sig(td);
(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
postsig_done(sig, td, ps);
}
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -53,6 +53,7 @@
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
+#include <sys/rseq.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
@@ -491,7 +492,7 @@
mi_switch(int flags)
{
uint64_t runtime, new_switchtime;
- struct thread *td;
+ struct thread *td, *td1;
td = curthread; /* XXX */
THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
@@ -549,9 +550,14 @@
/*
* If the last thread was exiting, finish cleaning it up.
*/
- if ((td = PCPU_GET(deadthread))) {
+ if ((td1 = PCPU_GET(deadthread))) {
PCPU_SET(deadthread, NULL);
- thread_stash(td);
+ thread_stash(td1);
+ }
+ if (td->td_rseq_abi != NULL) {
+ thread_lock(td);
+ td->td_flags |= TDF_ASTPENDING | TDF_RSEQ;
+ thread_unlock(td);
}
spinlock_exit();
}
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -87,11 +87,11 @@
#ifdef __amd64__
_Static_assert(offsetof(struct thread, td_flags) == 0x108,
"struct thread KBI td_flags");
-_Static_assert(offsetof(struct thread, td_pflags) == 0x110,
+_Static_assert(offsetof(struct thread, td_pflags) == 0x114,
"struct thread KBI td_pflags");
-_Static_assert(offsetof(struct thread, td_frame) == 0x4a8,
+_Static_assert(offsetof(struct thread, td_frame) == 0x4d0,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x6e0,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0xb8,
"struct proc KBI p_flag");
@@ -109,9 +109,9 @@
"struct thread KBI td_flags");
_Static_assert(offsetof(struct thread, td_pflags) == 0xa4,
"struct thread KBI td_pflags");
-_Static_assert(offsetof(struct thread, td_frame) == 0x308,
+_Static_assert(offsetof(struct thread, td_frame) == 0x31c,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x34c,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x360,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0x6c,
"struct proc KBI p_flag");
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -64,6 +64,7 @@
#include <sys/ptrace.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
+#include <sys/rseq.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
@@ -243,7 +244,7 @@
flags = td->td_flags;
td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK |
TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND |
- TDF_KQTICKLED);
+ TDF_KQTICKLED | TDF_RSEQ);
thread_unlock(td);
VM_CNT_INC(v_trap);
@@ -332,6 +333,7 @@
if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 ||
!SIGISEMPTY(p->p_siglist)) {
sigfastblock_fetch(td);
+ rseq_before_sig(td);
PROC_LOCK(p);
mtx_lock(&p->p_sigacts->ps_mtx);
while ((sig = cursig(td)) != 0) {
@@ -354,6 +356,9 @@
*/
sigfastblock_setpend(td, resched_sigs);
+ if ((flags & TDF_RSEQ) != 0)
+ rseq_ast(td);
+
#ifdef KTRACE
KTRUSERRET(td);
#endif
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3299,13 +3299,28 @@
581 AUE_NULL STD|CAPENABLED {
int sched_getcpu(void);
}
-
582 AUE_SWAPOFF STD {
int swapoff(
_In_z_ const char *name,
u_int flags,
);
}
+583 AUE_NULL STD|CAPENABLED {
+ int membarrier(
+ int cmd,
+ unsigned flags,
+ int cpu_id
+ );
+ }
+584 AUE_NULL STD|CAPENABLED {
+ int rseq(
+ _Inout_updates_bytes_(rseqlen) void *rseq,
+ uint32_t rseqlen,
+ int flags,
+ uint32_t sig
+ );
+ }
+
; Please copy any additions and changes to the following compatibility tables:
; sys/compat/freebsd32/syscalls.master
diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c
--- a/sys/mips/mips/pmap.c
+++ b/sys/mips/mips/pmap.c
@@ -3763,3 +3763,9 @@
return (FALSE);
}
}
+
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ *res = pmap->pm_active;
+}
diff --git a/sys/mips/mips/vm_machdep.c b/sys/mips/mips/vm_machdep.c
--- a/sys/mips/mips/vm_machdep.c
+++ b/sys/mips/mips/vm_machdep.c
@@ -459,6 +459,11 @@
return (EINVAL);
}
+void
+cpu_sync_core(void)
+{
+}
+
/*
* Software interrupt handler for queued VM system processing.
*/
diff --git a/sys/powerpc/powerpc/pmap_dispatch.c b/sys/powerpc/powerpc/pmap_dispatch.c
--- a/sys/powerpc/powerpc/pmap_dispatch.c
+++ b/sys/powerpc/powerpc/pmap_dispatch.c
@@ -255,3 +255,9 @@
return (FALSE);
}
}
+
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ *res = pmap->pm_active;
+}
diff --git a/sys/powerpc/powerpc/vm_machdep.c b/sys/powerpc/powerpc/vm_machdep.c
--- a/sys/powerpc/powerpc/vm_machdep.c
+++ b/sys/powerpc/powerpc/vm_machdep.c
@@ -268,3 +268,9 @@
return (EINVAL);
}
+
+void
+cpu_sync_core(void)
+{
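+ /*
+  * isync waits for all preceding instructions to complete and
+  * discards any prefetched instructions, acting as a context
+  * synchronizing operation.
+  */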
+ isync();
+}
diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c
--- a/sys/riscv/riscv/pmap.c
+++ b/sys/riscv/riscv/pmap.c
@@ -4433,6 +4433,12 @@
PCPU_SET(curpmap, pmap);
}
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ *res = pmap->pm_active;
+}
+
void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
{
diff --git a/sys/riscv/riscv/vm_machdep.c b/sys/riscv/riscv/vm_machdep.c
--- a/sys/riscv/riscv/vm_machdep.c
+++ b/sys/riscv/riscv/vm_machdep.c
@@ -51,6 +51,7 @@
#include <machine/riscvreg.h>
#include <machine/cpu.h>
+#include <machine/cpufunc.h>
#include <machine/pcb.h>
#include <machine/frame.h>
#include <machine/sbi.h>
@@ -276,3 +277,9 @@
/* Nothing to do here - busdma bounce buffers are not implemented. */
}
+
+void
+cpu_sync_core(void)
+{
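+ /*
+  * fence.i synchronizes the instruction and data streams on the
+  * local hart, discarding stale prefetched instructions.
+  */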
+ fence_i();
+}
diff --git a/sys/sys/elf_common.h b/sys/sys/elf_common.h
--- a/sys/sys/elf_common.h
+++ b/sys/sys/elf_common.h
@@ -1497,5 +1497,6 @@
#define R_X86_64_REX_GOTPCRELX 42
#define ELF_BSDF_SIGFASTBLK 0x0001 /* Kernel supports fast sigblock */
+#define ELF_BSDF_RSEQ1 0x0002 /* Kernel support for rseq v1 */
#endif /* !_SYS_ELF_COMMON_H_ */
diff --git a/sys/sys/membarrier.h b/sys/sys/membarrier.h
new file mode 100644
--- /dev/null
+++ b/sys/sys/membarrier.h
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __SYS_MEMBARRIER_H__
+#define __SYS_MEMBARRIER_H__
+
+#include <sys/cdefs.h>
+
+/*
+ * The enum membarrier_cmd values are bits.  The MEMBARRIER_CMD_QUERY
+ * command returns the bitset of supported commands.  Since the value
+ * of MEMBARRIER_CMD_QUERY itself is zero, it is never reported by
+ * the query.
+ */
+enum membarrier_cmd {
+ MEMBARRIER_CMD_QUERY = 0x00000000,
+ MEMBARRIER_CMD_GLOBAL = 0x00000001,
+ MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL,
+ MEMBARRIER_CMD_GLOBAL_EXPEDITED = 0x00000002,
+ MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED = 0x00000004,
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED = 0x00000008,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = 0x00000010,
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = 0x00000020,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = 0x00000040,
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = 0x00000080,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = 0x00000100,
+};
+
+enum membarrier_cmd_flag {
+ MEMBARRIER_CMD_FLAG_CPU = 0x00000001,
+};
+
+#ifndef _KERNEL
+__BEGIN_DECLS
+int membarrier(int, unsigned, int);
+__END_DECLS
+#endif /* !_KERNEL */
+
+#endif /* __SYS_MEMBARRIER_H__ */
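
A feature-detection sketch (not part of this patch): MEMBARRIER_CMD_QUERY is zero and the kernel returns the supported-command bitset as the syscall's return value, so a probe can look like this (have_sync_core() is a made-up name):

#include <sys/membarrier.h>

static int
have_sync_core(void)
{
        int cmds;

        cmds = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
        if (cmds < 0)
                return (0);     /* syscall unavailable */
        return ((cmds & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) != 0);
}
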
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -262,6 +262,7 @@
/* Cleared during fork1() */
#define td_startzero td_flags
int td_flags; /* (t) TDF_* flags. */
+ int td_flags2; /* (t) TDF2_* flags. */
int td_inhibitors; /* (t) Why can not run. */
int td_pflags; /* (k) Private thread (TDP_*) flags. */
int td_pflags2; /* (k) Private thread (TDP2_*) flags. */
@@ -322,6 +323,11 @@
size_t td_vslock_sz; /* (k) amount of vslock-ed space */
struct kcov_info *td_kcov_info; /* (*) Kernel code coverage data */
u_int td_ucredref; /* (k) references on td_realucred */
+ uint32_t td_rseq_sig; /* (k) abort handler signature */
+ void *td_rseq_abi; /* (k) usermode rseq */
+ register_t td_rseq_start_ip;/* (k) */
+ register_t td_rseq_end_ip; /* (k) */
+ register_t td_rseq_abort_ip;/* (k) */
#define td_endzero td_sigmask
/* Copied during fork1() or create_thread(). */
@@ -468,7 +474,7 @@
#define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */
#define TDF_SEINTR 0x00200000 /* EINTR on stop attempts. */
#define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */
-#define TDF_UNUSED23 0x00800000 /* --available-- */
+#define TDF_RSEQ 0x00800000 /* rseq active */
#define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */
#define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */
#define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */
@@ -477,6 +483,9 @@
#define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */
#define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. */
+#define TDF2_RSEQ_CLRCS 0x00000001 /* rseq clear rc */
+#define TDF2_RSEQ_MB 0x00000002 /* MEMBARRIER_RSEQ requested */
+
/* Userland debug flags */
#define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */
#define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */
@@ -537,6 +546,7 @@
#define TDP2_SBPAGES 0x00000001 /* Owns sbusy on some pages */
#define TDP2_COMPAT32RB 0x00000002 /* compat32 ABI for robust lists */
#define TDP2_ACCT 0x00000004 /* Doing accounting */
+#define TDP2_RSEQ_SIG 0x00000008
/*
* Reasons that the current thread can not be run yet.
@@ -847,6 +857,14 @@
#define P2_NO_NEW_PRIVS 0x00008000 /* Ignore setuid */
#define P2_WXORX_DISABLE 0x00010000 /* WX mappings enabled */
#define P2_WXORX_ENABLE_EXEC 0x00020000 /* WXORX enabled after exec */
+#define P2_MEMBAR_PRIVE 0x00040000 /* membar private expedited
+ registered */
+#define P2_MEMBAR_PRIVE_SYNCORE 0x00080000 /* membar private expedited
+ sync core registered */
+#define P2_MEMBAR_GLOBE 0x00100000 /* membar global expedited
+ registered */
+#define P2_MEMBAR_PRIVE_RSEQ 0x00200000 /* membar private expedited
+ rseq registered */
/* Flags protected by proctree_lock, kept in p_treeflags. */
#define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */
@@ -1170,6 +1188,7 @@
int cpu_idle_wakeup(int);
extern void (*cpu_idle_hook)(sbintime_t); /* Hook to machdep CPU idler. */
void cpu_switch(struct thread *, struct thread *, struct mtx *);
+void cpu_sync_core(void);
void cpu_throw(struct thread *, struct thread *) __dead2;
bool curproc_sigkilled(void);
void userret(struct thread *, struct trapframe *);
diff --git a/sys/sys/rseq.h b/sys/sys/rseq.h
new file mode 100644
--- /dev/null
+++ b/sys/sys/rseq.h
@@ -0,0 +1,99 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __SYS_RSEQ_H__
+#define __SYS_RSEQ_H__
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <sys/endian.h>
+
+enum rseq_cpu_id_state {
+ RSEQ_CPU_ID_UNINITIALIZED = -1,
+ RSEQ_CPU_ID_REGISTRATION_FAILED = -2,
+};
+
+enum rseq_flags {
+ RSEQ_FLAG_UNREGISTER = 1,
+};
+
+enum rseq_cs_flags {
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = 0x00000001,
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = 0x00000002,
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = 0x00000004,
+};
+
+struct rseq_cs {
+ uint32_t version;
+ uint32_t flags;
+ uint64_t start_ip;
+ uint64_t post_commit_offset;
+ uint64_t abort_ip;
+};
+
+struct rseq {
+ uint32_t cpu_id_start;
+ uint32_t cpu_id;
+ union {
+ uint64_t ptr64;
+#ifdef __LP64__
+ uint64_t ptr;
+#else
+ struct {
+#if _BYTE_ORDER == _BIG_ENDIAN
+ uint32_t pad;
+ uint32_t ptr32;
+#else /* BYTE_ORDER */
+ uint32_t ptr32;
+ uint32_t pad;
+#endif /* BYTE_ORDER */
+ } ptr;
+#endif /* LP64 */
+ } rseq_cs;
+ uint32_t flags;
+};
+
+#ifdef _KERNEL
+
+#define TD_RSEQ_ACTIVE 0x00000001
+
+void rseq_ast(struct thread *td);
+void rseq_before_sig(struct thread *td);
+void rseq_on_sig(struct thread *td);
+
+#else /* _KERNEL */
+
+__BEGIN_DECLS
+extern __thread volatile struct rseq __rseq_abi __weak_symbol;
+
+int rseq(volatile struct rseq *rseq, uint32_t rseqlen, int flags, uint32_t sig);
+__END_DECLS
+
+#endif /* _KERNEL */
+
+#endif /* __SYS_RSEQ_H__ */
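
A small sketch (not part of this patch) of publishing a critical-section descriptor through the rseq_cs union; the kernel reads the field through its 64-bit ptr64 view, so storing that view keeps the upper half zeroed on 32-bit ABIs (rseq_set_cs() is a hypothetical helper):

#include <sys/rseq.h>
#include <stdint.h>

static inline void
rseq_set_cs(volatile struct rseq *r, const struct rseq_cs *cs)
{
        /* The store is observed by rseq_ast() at the next AST. */
        r->rseq_cs.ptr64 = (uint64_t)(uintptr_t)cs;
}
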
diff --git a/sys/sys/signal.h b/sys/sys/signal.h
--- a/sys/sys/signal.h
+++ b/sys/sys/signal.h
@@ -329,6 +329,9 @@
#define SEGV_ACCERR 2 /* Invalid permissions for mapped */
/* object. */
#define SEGV_PKUERR 100 /* x86: PKU violation */
+#define SEGV_RSEQ_R 101 /* rseq access read fault */
+#define SEGV_RSEQ_W 102 /* rseq access write fault */
+#define SEGV_RSEQ_SIG 103 /* rseq signature check fault */
/* codes for SIGFPE */
#define FPE_INTOVF 1 /* Integer overflow. */
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -200,6 +200,8 @@
int inherit);
int kern_mkdirat(struct thread *td, int fd, const char *path,
enum uio_seg segflg, int mode);
+int kern_membarrier(struct thread *td, int cmd, unsigned flags,
+ int cpu_id);
int kern_mkfifoat(struct thread *td, int fd, const char *path,
enum uio_seg pathseg, int mode);
int kern_mknodat(struct thread *td, int fd, const char *path,
diff --git a/sys/vm/pmap.h b/sys/vm/pmap.h
--- a/sys/vm/pmap.h
+++ b/sys/vm/pmap.h
@@ -92,6 +92,7 @@
#include <machine/pmap.h>
#ifdef _KERNEL
+#include <sys/_cpuset.h>
struct thread;
/*
@@ -120,6 +121,7 @@
#define PMAP_TS_REFERENCED_MAX 5
void pmap_activate(struct thread *td);
+void pmap_active_cpus(pmap_t pmap, cpuset_t *res);
void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
int advice);
void pmap_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -79,6 +79,7 @@
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
+#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
@@ -881,6 +882,31 @@
#endif
}
+#ifdef PMAP_WANT_ACTIVE_CPUS_NAIVE
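+/*
+ * Generic fallback for pmaps that do not maintain a pm_active mask:
+ * scan all CPUs and report those whose current thread is running on
+ * the given pmap's vmspace.
+ */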
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ struct thread *td;
+ struct proc *p;
+ struct vmspace *vm;
+ int c;
+
+ CPU_ZERO(res);
+ CPU_FOREACH(c) {
+ td = cpuid_to_pcpu[c]->pc_curthread;
+ p = td->td_proc;
+ if (p == NULL)
+ continue;
+ vm = vmspace_acquire_ref(p);
+ if (vm == NULL)
+ continue;
+ if (pmap == vmspace_pmap(vm))
+ CPU_SET(c, res);
+ vmspace_free(vm);
+ }
+}
+#endif
+
/*
* Allow userspace to directly trigger the VM drain routine for testing
* purposes.