D32505.id100265.diff

diff --git a/lib/libc/gen/Makefile.inc b/lib/libc/gen/Makefile.inc
--- a/lib/libc/gen/Makefile.inc
+++ b/lib/libc/gen/Makefile.inc
@@ -120,6 +120,7 @@
readpassphrase.c \
recvmmsg.c \
rewinddir.c \
+ rseq_abi.c \
scandir.c \
scandir_b.c \
scandir-compat11.c \
diff --git a/lib/libc/gen/Symbol.map b/lib/libc/gen/Symbol.map
--- a/lib/libc/gen/Symbol.map
+++ b/lib/libc/gen/Symbol.map
@@ -436,6 +436,7 @@
};
FBSD_1.7 {
+ __rseq_abi;
posix_spawn_file_actions_addchdir_np;
posix_spawn_file_actions_addclosefrom_np;
posix_spawn_file_actions_addfchdir_np;
@@ -569,4 +570,6 @@
__fillcontextx;
__fillcontextx2;
__getcontextx_size;
+
+ __rseq_abi_init;
};
diff --git a/lib/libc/gen/rseq_abi.c b/lib/libc/gen/rseq_abi.c
new file mode 100644
--- /dev/null
+++ b/lib/libc/gen/rseq_abi.c
@@ -0,0 +1,55 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/auxv.h>
+#include <sys/rseq.h>
+#include <sched.h>
+#include "libc_private.h"
+
+_Thread_local volatile struct rseq __rseq_abi __weak_symbol;
+
+static void __main_rseq_abi_init(void) __attribute__((__constructor__,
+ __used__));
+static void
+__main_rseq_abi_init(void)
+{
+ __rseq_abi_init();
+}
+
+void
+__rseq_abi_init(void)
+{
+ int bsdflags;
+
+ if (_elf_aux_info(AT_BSDFLAGS, &bsdflags, sizeof(bsdflags)) != 0 ||
+ (bsdflags & ELF_BSDF_RSEQ1) == 0)
+ return;
+ __rseq_abi.cpu_id_start = sched_getcpu();
+ rseq(&__rseq_abi, sizeof(__rseq_abi), 0, 0/* XXXKIB */);
+}
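
A minimal sketch of how userspace might consume the __rseq_abi TLS area exported above (illustrative, not part of the patch); outside an rseq critical section the value is only a hint, since the thread can migrate immediately after the read:

#include <sys/rseq.h>

static inline uint32_t
my_cpu_hint(void)
{
	/* Updated by the kernel on each rseq AST for this thread. */
	return (__rseq_abi.cpu_id_start);
}
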
diff --git a/lib/libc/include/libc_private.h b/lib/libc/include/libc_private.h
--- a/lib/libc/include/libc_private.h
+++ b/lib/libc/include/libc_private.h
@@ -435,4 +435,6 @@
struct __nl_cat_d *__catopen_l(const char *name, int type,
struct _xlocale *locale);
+void __rseq_abi_init(void);
+
#endif /* _LIBC_PRIVATE_H_ */
diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -419,6 +419,8 @@
FBSD_1.7 {
_Fork;
fspacectl;
+ membarrier;
+ rseq;
swapoff;
};
diff --git a/lib/libthr/thread/thr_create.c b/lib/libthr/thread/thr_create.c
--- a/lib/libthr/thread/thr_create.c
+++ b/lib/libthr/thread/thr_create.c
@@ -288,6 +288,8 @@
curthread->attr.stacksize_attr;
#endif
+ __rseq_abi_init();
+
/* Run the current thread's start routine with argument: */
_pthread_exit(curthread->start_routine(curthread->arg));
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -10051,6 +10051,12 @@
PCPU_SET(ucr3, PMAP_NO_CR3);
}
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ *res = pmap->pm_active;
+}
+
void
pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
{
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -1977,3 +1977,24 @@
popq %rax
retq
END(mds_handler_silvermont)
+
+/*
+ * Do the same as Linux and execute IRET explicitly, even though the
+ * IPI return path does it as well.
+ */
+ENTRY(cpu_sync_core)
+/*
+ * We can use SERIALIZE once the instruction moves from the 'future
+ * extensions' documents into the SDM.
+ */
+	movq	(%rsp), %rdx	/* return address */
+	movl	%ss, %eax
+	pushq	%rax		/* SS */
+	pushq	%rsp		/* RSP as it will be after iretq */
+	addq	$16, (%rsp)
+	pushfq			/* RFLAGS */
+	movl	%cs, %eax
+	pushq	%rax		/* CS */
+	pushq	%rdx		/* RIP */
+	iretq
+END(cpu_sync_core)
diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c
--- a/sys/arm/arm/pmap-v6.c
+++ b/sys/arm/arm/pmap-v6.c
@@ -6214,6 +6214,12 @@
critical_exit();
}
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ *res = pmap->pm_active;
+}
+
/*
* Perform the pmap work for mincore(2). If the page is not both referenced and
* modified by this pmap, returns its physical address so that the caller can
diff --git a/sys/arm/arm/vm_machdep.c b/sys/arm/arm/vm_machdep.c
--- a/sys/arm/arm/vm_machdep.c
+++ b/sys/arm/arm/vm_machdep.c
@@ -320,3 +320,8 @@
return (EINVAL);
}
+
+void
+cpu_sync_core(void)
+{
+}
diff --git a/sys/arm64/arm64/vm_machdep.c b/sys/arm64/arm64/vm_machdep.c
--- a/sys/arm64/arm64/vm_machdep.c
+++ b/sys/arm64/arm64/vm_machdep.c
@@ -312,3 +312,14 @@
if (busdma_swi_pending != 0)
busdma_swi();
}
+
+void
+cpu_sync_core(void)
+{
+ /*
+	 * Do nothing.  According to the ARMv8 ARM, D1.11 "Exception
+	 * return": if FEAT_ExS is not implemented, or if FEAT_ExS is
+	 * implemented and the SCTLR_ELx.EOS field is set, an exception
+	 * return from ELx is a context synchronization event.
+ */
+}
diff --git a/sys/arm64/include/pmap.h b/sys/arm64/include/pmap.h
--- a/sys/arm64/include/pmap.h
+++ b/sys/arm64/include/pmap.h
@@ -152,6 +152,8 @@
(uint64_t)(asid) << ASID_TO_OPERAND_SHIFT; \
})
+#define PMAP_WANT_ACTIVE_CPUS_NAIVE
+
extern vm_offset_t virtual_avail;
extern vm_offset_t virtual_end;
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3795,6 +3795,7 @@
kern/kern_loginclass.c standard
kern/kern_malloc.c standard
kern/kern_mbuf.c standard
+kern/kern_membarrier.c standard
kern/kern_mib.c standard
kern/kern_module.c standard
kern/kern_mtxpool.c standard
@@ -3813,6 +3814,7 @@
kern/kern_rctl.c standard
kern/kern_resource.c standard
kern/kern_rmlock.c standard
+kern/kern_rseq.c standard
kern/kern_rwlock.c standard
kern/kern_sdt.c optional kdtrace_hooks
kern/kern_sema.c standard
diff --git a/sys/i386/i386/pmap_base.c b/sys/i386/i386/pmap_base.c
--- a/sys/i386/i386/pmap_base.c
+++ b/sys/i386/i386/pmap_base.c
@@ -946,6 +946,12 @@
pmap_methods_ptr->pm_kremove(va);
}
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ *res = pmap->pm_active;
+}
+
extern struct pmap_methods pmap_pae_methods, pmap_nopae_methods;
int pae_mode;
SYSCTL_INT(_vm_pmap, OID_AUTO, pae_mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s
--- a/sys/i386/i386/support.s
+++ b/sys/i386/i386/support.s
@@ -580,3 +580,11 @@
movl %eax, %cr0
3: ret
END(mds_handler_silvermont)
+
+ENTRY(cpu_sync_core)
+	popl	%eax		/* return address */
+	pushfl			/* EFLAGS */
+	pushl	%cs		/* CS */
+	pushl	%eax		/* EIP */
+	iretl
+END(cpu_sync_core)
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -212,6 +212,11 @@
CTLFLAG_RWTUN, &__elfN(sigfastblock), 0,
"enable sigfastblock for new processes");
+static int __elfN(rseq1) = 1;
+SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, rseq1,
+ CTLFLAG_RWTUN, &__elfN(rseq1), 0,
+ "enable rseq v1 ABI for new processes");
+
static bool __elfN(allow_wx) = true;
SYSCTL_BOOL(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, allow_wx,
CTLFLAG_RWTUN, &__elfN(allow_wx), 0,
@@ -1463,8 +1468,9 @@
AUXARGS_ENTRY(pos, AT_HWCAP, *imgp->sysent->sv_hwcap);
if (imgp->sysent->sv_hwcap2 != NULL)
AUXARGS_ENTRY(pos, AT_HWCAP2, *imgp->sysent->sv_hwcap2);
- AUXARGS_ENTRY(pos, AT_BSDFLAGS, __elfN(sigfastblock) ?
- ELF_BSDF_SIGFASTBLK : 0);
+ AUXARGS_ENTRY(pos, AT_BSDFLAGS,
+ (__elfN(sigfastblock) ? ELF_BSDF_SIGFASTBLK : 0) |
+	    (__elfN(rseq1) ? ELF_BSDF_RSEQ1 : 0));
AUXARGS_ENTRY(pos, AT_ARGC, imgp->args->argc);
AUXARGS_ENTRY_PTR(pos, AT_ARGV, imgp->argv);
AUXARGS_ENTRY(pos, AT_ENVC, imgp->args->envc);
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -826,12 +826,15 @@
p->p_flag2 &= ~P2_NOTRACE;
if ((p->p_flag2 & P2_STKGAP_DISABLE_EXEC) == 0)
p->p_flag2 &= ~P2_STKGAP_DISABLE;
+	p->p_flag2 &= ~(P2_MEMBAR_PRIVE | P2_MEMBAR_PRIVE_SYNCORE |
+	    P2_MEMBAR_GLOBE | P2_MEMBAR_PRIVE_RSEQ);
if (p->p_flag & P_PPWAIT) {
p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
cv_broadcast(&p->p_pwait);
/* STOPs are no longer ignored, arrange for AST */
signotify(td);
}
+ td->td_rseq_abi = NULL;
if ((imgp->sysent->sv_setid_allowed != NULL &&
!(*imgp->sysent->sv_setid_allowed)(td, imgp)) ||
diff --git a/sys/kern/kern_membarrier.c b/sys/kern/kern_membarrier.c
new file mode 100644
--- /dev/null
+++ b/sys/kern/kern_membarrier.c
@@ -0,0 +1,276 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+#include <sys/lock.h>
+#include <sys/membarrier.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+
+#include <vm/vm_param.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#define MEMBARRIER_SUPPORTED_CMDS ( \
+ MEMBARRIER_CMD_GLOBAL | \
+ MEMBARRIER_CMD_GLOBAL_EXPEDITED | \
+ MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED | \
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED | \
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED | \
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE | \
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE | \
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ | \
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
+
+static void
+membarrier_action_rseq(void *arg __unused)
+{
+ struct thread *td;
+
+ td = curthread;
+ thread_lock(td);
+	td->td_flags |= TDF_ASTPENDING | TDF_RSEQ;
+ td->td_flags2 |= TDF2_RSEQ_MB;
+ thread_unlock(td);
+}
+
+static void
+membarrier_action_seqcst(void *arg __unused)
+{
+ atomic_thread_fence_seq_cst();
+}
+
+static void
+membarrier_action_seqcst_sync_core(void *arg __unused)
+{
+ atomic_thread_fence_seq_cst();
+ cpu_sync_core();
+}
+
+static void
+do_membarrier_ipi(cpuset_t *csp, void (*func)(void *))
+{
+ atomic_thread_fence_seq_cst();
+ smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func,
+ smp_no_rendezvous_barrier, NULL);
+ atomic_thread_fence_seq_cst();
+}
+
+static void
+check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init)
+{
+ struct pcpu *pc;
+ uint64_t sw;
+
+ if (CPU_ISSET(c, csp))
+ return;
+
+ pc = cpuid_to_pcpu[c];
+ if (pc->pc_curthread == pc->pc_idlethread) {
+ CPU_SET(c, csp);
+ return;
+ }
+
+ /*
+	 * Sync with context switch to ensure that the update of
+	 * pc_curthread with a non-idle thread pointer is visible before
+	 * we read pc_switchtime.
+ */
+ atomic_thread_fence_acq();
+
+ sw = pc->pc_switchtime;
+ if (init)
+ swt[c] = sw;
+ else if (sw != swt[c])
+ CPU_SET(c, csp);
+}
+
+/*
+ * XXXKIB: We execute the requested action (seq_cst and possibly
+ * sync_core) on the current CPU as well.  There is no guarantee that
+ * the current thread executes anything with full fence semantics
+ * during syscall execution.  Similarly, cpu_sync_core() semantics
+ * might not be provided by the syscall return path, e.g. on amd64 we
+ * typically return without IRET.
+ */
+int
+kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id)
+{
+ struct proc *p, *p1;
+ struct thread *td1;
+ cpuset_t cs;
+ uint64_t *swt;
+ int c, error;
+ bool first;
+
+	if ((flags & ~MEMBARRIER_CMD_FLAG_CPU) != 0 ||
+	    (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0)
+		return (EINVAL);
+
+ if (cmd == MEMBARRIER_CMD_QUERY) {
+ td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS;
+ return (0);
+ }
+
+ p = td->td_proc;
+ error = 0;
+
+ switch (cmd) {
+ case MEMBARRIER_CMD_GLOBAL:
+ swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK);
+ CPU_ZERO(&cs);
+ sched_pin();
+ CPU_SET(PCPU_GET(cpuid), &cs);
+ for (first = true; error == 0; first = false) {
+ CPU_FOREACH(c)
+ check_cpu_switched(c, &cs, swt, first);
+ if (CPU_CMP(&cs, &all_cpus) == 0)
+ break;
+ error = pause_sig("mmbr", 1);
+ if (error == EWOULDBLOCK)
+ error = 0;
+ }
+ sched_unpin();
+ free(swt, M_TEMP);
+ atomic_thread_fence_seq_cst();
+ break;
+
+ case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+ if ((td->td_proc->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
+ error = EPERM;
+ } else {
+ CPU_ZERO(&cs);
+ CPU_FOREACH(c) {
+ td1 = cpuid_to_pcpu[c]->pc_curthread;
+ p1 = td1->td_proc;
+ if (p1 != NULL &&
+ (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0)
+ CPU_SET(c, &cs);
+ }
+ do_membarrier_ipi(&cs, membarrier_action_seqcst);
+ }
+ break;
+
+ case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+ if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
+ PROC_LOCK(p);
+ p->p_flag2 |= P2_MEMBAR_GLOBE;
+ PROC_UNLOCK(p);
+ }
+ break;
+
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
+ error = EPERM;
+ } else {
+ pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
+ do_membarrier_ipi(&cs, membarrier_action_seqcst);
+ }
+ break;
+
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+ if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
+ PROC_LOCK(p);
+ p->p_flag2 |= P2_MEMBAR_PRIVE;
+ PROC_UNLOCK(p);
+ }
+ break;
+
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
+ error = EPERM;
+ } else {
+ /*
+ * Calculating the IPI multicast mask from
+ * pmap active mask means that we do not call
+ * cpu_sync_core() on CPUs that were missed
+ * from pmap active mask but could be switched
+ * from or to meantime. This is fine at least
+ * on amd64 because threads always use slow
+ * (IRETQ) path to return from syscall after
+ * context switch.
+ */
+ pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
+
+ do_membarrier_ipi(&cs,
+ membarrier_action_seqcst_sync_core);
+ }
+ break;
+
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
+ PROC_LOCK(p);
+ p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE;
+ PROC_UNLOCK(p);
+ }
+ break;
+
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_RSEQ) == 0) {
+ error = EPERM;
+ break;
+ }
+ pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
+ if ((flags & MEMBARRIER_CMD_FLAG_CPU) != 0) {
+ if (!CPU_ISSET(cpu_id, &cs))
+ break;
+ CPU_ZERO(&cs);
+ CPU_SET(cpu_id, &cs);
+ }
+ do_membarrier_ipi(&cs, membarrier_action_rseq);
+ break;
+
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ if ((p->p_flag2 & P2_MEMBAR_PRIVE_RSEQ) == 0) {
+ PROC_LOCK(p);
+ p->p_flag2 |= P2_MEMBAR_PRIVE_RSEQ;
+ PROC_UNLOCK(p);
+ }
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+int
+sys_membarrier(struct thread *td, struct membarrier_args *uap)
+{
+ return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id));
+}
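
A hedged sketch of the userspace call sequence that the registration checks above imply; membarrier() is the syscall added here, while the helper and its error handling are illustrative:

#include <sys/membarrier.h>
#include <err.h>

static void
private_expedited_barrier(void)
{
	int cmds;

	/* MEMBARRIER_CMD_QUERY returns the supported-commands bitset. */
	cmds = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
	if (cmds == -1)
		err(1, "membarrier query");
	if ((cmds & MEMBARRIER_CMD_PRIVATE_EXPEDITED) == 0)
		return;
	/* Registration must precede the expedited command (else EPERM). */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0) == -1)
		err(1, "membarrier register");
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0) == -1)
		err(1, "membarrier");
}
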
diff --git a/sys/kern/kern_rseq.c b/sys/kern/kern_rseq.c
new file mode 100644
--- /dev/null
+++ b/sys/kern/kern_rseq.c
@@ -0,0 +1,280 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+#include <sys/rseq.h>
+
+static void
+rseq_inactivate(struct thread *td)
+{
+ td->td_rseq_abi = NULL;
+}
+
+static void
+rseq_inactivate_sig(struct thread *td, void *addr, int si_code)
+{
+ ksiginfo_t ksi;
+
+ rseq_inactivate(td);
+ ksiginfo_init_trap(&ksi);
+ ksi.ksi_signo = SIGSEGV;
+ ksi.ksi_code = si_code;
+ ksi.ksi_trapno = 0;
+ ksi.ksi_addr = addr;
+ trapsignal(td, &ksi);
+}
+
+void
+rseq_ast(struct thread *td)
+{
+ struct rseq rs;
+ struct rseq_cs rc;
+ void *sig_addr;
+ register_t pc;
+ uint32_t usig;
+ int cpu, error;
+ bool clear_cs;
+
+ if (td->td_rseq_abi == NULL)
+ return;
+
+	/*
+	 * We cannot enter a critical section here to keep td_oncpu
+	 * valid, because of the userspace access.  We do not even want
+	 * to sched_pin(), for the same reason.
+	 *
+	 * It is fine to get a context switch after reading td_oncpu,
+	 * since that would cause a new AST to be pending and we would
+	 * re-enter this function to update the rseq cpu number.
+	 *
+	 * Micro-optimize 64-bit architectures by doing a single 64-bit
+	 * write for the cpu ids.  For instance, on SMAP-enabled amd64
+	 * this saves two serialization instructions (STAC/CLAC).
+	 */
+ cpu = td->td_oncpu;
+#ifdef __LP64__
+ rs.cpu_id_start = cpu;
+ rs.cpu_id = cpu;
+ error = suword64((char *)td->td_rseq_abi + offsetof(struct rseq,
+ cpu_id_start), *(uint64_t *)(char *)&rs.cpu_id_start);
+#else
+ error = suword((char *)td->td_rseq_abi + offsetof(struct rseq,
+ cpu_id_start), cpu);
+ if (error == 0) {
+ error = suword((char *)td->td_rseq_abi +
+ offsetof(struct rseq, cpu_id), cpu);
+ }
+#endif
+ if (error != 0) {
+ rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_W);
+ return;
+ }
+
+ error = copyin(td->td_rseq_abi, &rs, sizeof(rs));
+ if (error != 0) {
+ rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_R);
+ return;
+ }
+
+ if (rs.rseq_cs.ptr64 == 0)
+ return;
+ clear_cs = false;
+
+	critical_enter();
+	if ((td->td_flags2 & (TDF2_RSEQ_CLRCS | TDF2_RSEQ_MB)) == 0 &&
+	    (rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) != 0 &&
+	    ((rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) != 0 ||
+	    td->td_oncpu == td->td_lastcpu)) {
+		critical_exit();
+		return;
+	}
+	critical_exit();
+
+ error = copyin((void *)rs.rseq_cs.ptr64, &rc, sizeof(rc));
+ if (error != 0) {
+ rseq_inactivate_sig(td, (void *)rs.rseq_cs.ptr64, SEGV_RSEQ_R);
+ return;
+ }
+ if (rc.version != 0) {
+ rseq_inactivate_sig(td, (void *)rs.rseq_cs.ptr64, SEGV_RSEQ_R);
+ return;
+ }
+
+	critical_enter();
+	if ((td->td_flags2 & (TDF2_RSEQ_CLRCS | TDF2_RSEQ_MB)) == 0 &&
+	    (rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) != 0 &&
+	    ((rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) != 0 ||
+	    td->td_oncpu == td->td_lastcpu)) {
+		critical_exit();
+		return;
+	}
+	critical_exit();
+
+ if ((td->td_flags2 & (TDF2_RSEQ_CLRCS | TDF2_RSEQ_MB)) != 0) {
+ if ((td->td_flags2 & TDF2_RSEQ_CLRCS) != 0)
+ clear_cs = true;
+ thread_lock(td);
+ td->td_flags2 &= ~(TDF2_RSEQ_CLRCS | TDF2_RSEQ_MB);
+ thread_unlock(td);
+ }
+ pc = TRAPF_PC(td->td_frame);
+ if (!clear_cs &&
+ pc >= rc.start_ip && pc < rc.start_ip + rc.post_commit_offset) {
+ /* check signature */
+ sig_addr = (void *)(rc.abort_ip - sizeof(usig));
+ error = copyin(sig_addr, &usig, sizeof(usig));
+ if (error != 0) {
+ rseq_inactivate_sig(td, sig_addr, SEGV_RSEQ_R);
+ return;
+ }
+ if (usig != td->td_rseq_sig) {
+ rseq_inactivate_sig(td, sig_addr, SEGV_RSEQ_SIG);
+ return;
+ }
+
+ TRAPF_PC(td->td_frame) = rc.abort_ip;
+ clear_cs = true;
+ }
+ if (clear_cs) {
+ if (suword64((char *)td->td_rseq_abi + offsetof(struct rseq,
+ rseq_cs.ptr), 0) == -1) {
+ rseq_inactivate_sig(td, (char *)td->td_rseq_abi +
+ offsetof(struct rseq, rseq_cs.ptr),
+ SEGV_RSEQ_W);
+ return;
+ }
+ }
+}
+
+void
+rseq_before_sig(struct thread *td)
+{
+ struct rseq rs;
+ struct rseq_cs rc;
+ uint32_t usig;
+ int error;
+
+ td->td_pflags2 &= ~TDP2_RSEQ_SIG;
+ if (td->td_rseq_abi == NULL)
+ return;
+
+ error = copyin(td->td_rseq_abi, &rs, sizeof(rs));
+ if (error != 0) {
+ rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_R);
+ return;
+ }
+
+ if (rs.rseq_cs.ptr64 == 0 ||
+ (rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) != 0)
+ return;
+
+ error = copyin((void *)rs.rseq_cs.ptr64, &rc, sizeof(rc));
+ if (error != 0) {
+ rseq_inactivate_sig(td, (void *)rs.rseq_cs.ptr64, SEGV_RSEQ_R);
+ return;
+ }
+
+ if ((rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) != 0)
+ return;
+
+	/* Check the signature that precedes the abort handler. */
+	error = copyin((void *)(rc.abort_ip - sizeof(usig)), &usig,
+	    sizeof(usig));
+	if (error != 0) {
+		rseq_inactivate_sig(td, (void *)(rc.abort_ip - sizeof(usig)),
+		    SEGV_RSEQ_R);
+		return;
+	}
+	if (usig != td->td_rseq_sig) {
+		rseq_inactivate_sig(td, (void *)(rc.abort_ip - sizeof(usig)),
+		    SEGV_RSEQ_SIG);
+		return;
+	}
+
+ td->td_pflags2 |= TDP2_RSEQ_SIG;
+ td->td_rseq_start_ip = rc.start_ip;
+ td->td_rseq_end_ip = rc.start_ip + rc.post_commit_offset;
+ td->td_rseq_abort_ip = rc.abort_ip;
+}
+
+void
+rseq_on_sig(struct thread *td)
+{
+ register_t pc;
+
+ if ((td->td_pflags2 & TDP2_RSEQ_SIG) == 0)
+ return;
+ td->td_pflags2 &= ~TDP2_RSEQ_SIG;
+ pc = TRAPF_PC(td->td_frame);
+ if (pc >= td->td_rseq_start_ip && pc < td->td_rseq_end_ip) {
+ TRAPF_PC(td->td_frame) = td->td_rseq_abort_ip;
+ thread_lock(td);
+ td->td_flags |= TDF_ASTPENDING | TDF_RSEQ;
+ td->td_flags2 |= TDF2_RSEQ_CLRCS;
+ thread_unlock(td);
+ }
+}
+
+static int
+kern_rseq(struct thread *td, uintptr_t rseq, uint32_t rseqlen, int flags,
+ uint32_t sig)
+{
+ if (rseqlen != sizeof(struct rseq))
+ return (EINVAL);
+
+ if (flags == RSEQ_FLAG_UNREGISTER) {
+ if (rseq != 0 || td->td_rseq_abi == NULL)
+ return (EINVAL);
+ if (sig != td->td_rseq_sig)
+ return (EPERM);
+ rseq_inactivate(td);
+ return (0);
+ }
+
+ if (td->td_rseq_abi != NULL)
+ return (EBUSY);
+ if (flags != 0 || rseq == 0 ||
+ trunc_page(rseq) != trunc_page(rseq + rseqlen))
+ return (EINVAL);
+
+ td->td_rseq_abi = (void *)rseq;
+ td->td_rseq_sig = sig;
+ thread_lock(td);
+ td->td_flags |= TDF_ASTPENDING | TDF_RSEQ;
+ thread_unlock(td);
+ return (0);
+}
+
+int
+sys_rseq(struct thread *td, struct rseq_args *uap)
+{
+ return (kern_rseq(td, (uintptr_t)uap->rseq, uap->rseqlen,
+ uap->flags, uap->sig));
+}
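
To make the layout that rseq_ast() validates concrete, a hedged userspace sketch; cs_start, cs_post_commit, and cs_abort stand in for assembly labels of a real restartable sequence and are assumptions for illustration:

#include <sys/rseq.h>

extern const char cs_start[], cs_post_commit[], cs_abort[];

static struct rseq_cs my_cs;

static void
arm_critical_section(void)
{
	my_cs.version = 0;	/* rseq_ast() rejects any other version */
	my_cs.flags = 0;
	my_cs.start_ip = (uint64_t)(uintptr_t)cs_start;
	my_cs.post_commit_offset = (uint64_t)(cs_post_commit - cs_start);
	/* The 32-bit signature passed to rseq(2) sits at abort_ip - 4. */
	my_cs.abort_ip = (uint64_t)(uintptr_t)cs_abort;
	/* The AST inspects rseq_cs on the way back to userspace. */
	__rseq_abi.rseq_cs.ptr64 = (uint64_t)(uintptr_t)&my_cs;
}
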
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -69,6 +69,7 @@
#include <sys/posix4.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
+#include <sys/rseq.h>
#include <sys/sdt.h>
#include <sys/sbuf.h>
#include <sys/sleepqueue.h>
@@ -2029,6 +2030,7 @@
KASSERT(_SIG_VALID(sig), ("invalid signal"));
sigfastblock_fetch(td);
+ rseq_before_sig(td);
PROC_LOCK(p);
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
@@ -2042,6 +2044,7 @@
ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
&td->td_sigmask, code);
#endif
+ rseq_on_sig(td);
(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)],
ksi, &td->td_sigmask);
postsig_done(sig, td, ps);
@@ -3253,6 +3256,7 @@
if (p->p_sig == sig) {
p->p_sig = 0;
}
+ rseq_on_sig(td);
(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
postsig_done(sig, td, ps);
}
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -53,6 +53,7 @@
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
+#include <sys/rseq.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
@@ -491,7 +492,7 @@
mi_switch(int flags)
{
uint64_t runtime, new_switchtime;
- struct thread *td;
+ struct thread *td, *td1;
td = curthread; /* XXX */
THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
@@ -549,9 +550,14 @@
/*
* If the last thread was exiting, finish cleaning it up.
*/
- if ((td = PCPU_GET(deadthread))) {
+ if ((td1 = PCPU_GET(deadthread))) {
PCPU_SET(deadthread, NULL);
- thread_stash(td);
+ thread_stash(td1);
+ }
+ if (td->td_rseq_abi != NULL) {
+ thread_lock(td);
+ td->td_flags |= TDF_ASTPENDING | TDF_RSEQ;
+ thread_unlock(td);
}
spinlock_exit();
}
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -87,11 +87,11 @@
#ifdef __amd64__
_Static_assert(offsetof(struct thread, td_flags) == 0x108,
"struct thread KBI td_flags");
-_Static_assert(offsetof(struct thread, td_pflags) == 0x110,
+_Static_assert(offsetof(struct thread, td_pflags) == 0x114,
"struct thread KBI td_pflags");
-_Static_assert(offsetof(struct thread, td_frame) == 0x4a8,
+_Static_assert(offsetof(struct thread, td_frame) == 0x4d0,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x6e0,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0xb8,
"struct proc KBI p_flag");
@@ -109,9 +109,9 @@
"struct thread KBI td_flags");
_Static_assert(offsetof(struct thread, td_pflags) == 0xa4,
"struct thread KBI td_pflags");
-_Static_assert(offsetof(struct thread, td_frame) == 0x308,
+_Static_assert(offsetof(struct thread, td_frame) == 0x31c,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x34c,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x360,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0x6c,
"struct proc KBI p_flag");
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -64,6 +64,7 @@
#include <sys/ptrace.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
+#include <sys/rseq.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
@@ -243,7 +244,7 @@
flags = td->td_flags;
td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK |
TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND |
- TDF_KQTICKLED);
+ TDF_KQTICKLED | TDF_RSEQ);
thread_unlock(td);
VM_CNT_INC(v_trap);
@@ -332,6 +333,7 @@
if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 ||
!SIGISEMPTY(p->p_siglist)) {
sigfastblock_fetch(td);
+ rseq_before_sig(td);
PROC_LOCK(p);
mtx_lock(&p->p_sigacts->ps_mtx);
while ((sig = cursig(td)) != 0) {
@@ -354,6 +356,9 @@
*/
sigfastblock_setpend(td, resched_sigs);
+ if ((flags & TDF_RSEQ) != 0)
+ rseq_ast(td);
+
#ifdef KTRACE
KTRUSERRET(td);
#endif
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3299,13 +3299,28 @@
581 AUE_NULL STD|CAPENABLED {
int sched_getcpu(void);
}
-
582 AUE_SWAPOFF STD {
int swapoff(
_In_z_ const char *name,
u_int flags,
);
}
+583 AUE_NULL STD|CAPENABLED {
+ int membarrier(
+ int cmd,
+ unsigned flags,
+ int cpu_id
+ );
+ }
+584 AUE_NULL STD|CAPENABLED {
+ int rseq(
+ _Inout_updates_bytes_(rseqlen) void *rseq,
+ uint32_t rseqlen,
+ int flags,
+ uint32_t sig
+ );
+ }
+
; Please copy any additions and changes to the following compatibility tables:
; sys/compat/freebsd32/syscalls.master
diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c
--- a/sys/mips/mips/pmap.c
+++ b/sys/mips/mips/pmap.c
@@ -3763,3 +3763,9 @@
return (FALSE);
}
}
+
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ *res = pmap->pm_active;
+}
diff --git a/sys/mips/mips/vm_machdep.c b/sys/mips/mips/vm_machdep.c
--- a/sys/mips/mips/vm_machdep.c
+++ b/sys/mips/mips/vm_machdep.c
@@ -459,6 +459,11 @@
return (EINVAL);
}
+void
+cpu_sync_core(void)
+{
+}
+
/*
* Software interrupt handler for queued VM system processing.
*/
diff --git a/sys/powerpc/powerpc/pmap_dispatch.c b/sys/powerpc/powerpc/pmap_dispatch.c
--- a/sys/powerpc/powerpc/pmap_dispatch.c
+++ b/sys/powerpc/powerpc/pmap_dispatch.c
@@ -255,3 +255,9 @@
return (FALSE);
}
}
+
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ *res = pmap->pm_active;
+}
diff --git a/sys/powerpc/powerpc/vm_machdep.c b/sys/powerpc/powerpc/vm_machdep.c
--- a/sys/powerpc/powerpc/vm_machdep.c
+++ b/sys/powerpc/powerpc/vm_machdep.c
@@ -268,3 +268,9 @@
return (EINVAL);
}
+
+void
+cpu_sync_core(void)
+{
+ isync();
+}
diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c
--- a/sys/riscv/riscv/pmap.c
+++ b/sys/riscv/riscv/pmap.c
@@ -4433,6 +4433,12 @@
PCPU_SET(curpmap, pmap);
}
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ *res = pmap->pm_active;
+}
+
void
pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
{
diff --git a/sys/riscv/riscv/vm_machdep.c b/sys/riscv/riscv/vm_machdep.c
--- a/sys/riscv/riscv/vm_machdep.c
+++ b/sys/riscv/riscv/vm_machdep.c
@@ -51,6 +51,7 @@
#include <machine/riscvreg.h>
#include <machine/cpu.h>
+#include <machine/cpufunc.h>
#include <machine/pcb.h>
#include <machine/frame.h>
#include <machine/sbi.h>
@@ -276,3 +277,9 @@
/* Nothing to do here - busdma bounce buffers are not implemented. */
}
+
+void
+cpu_sync_core(void)
+{
+ fence_i();
+}
diff --git a/sys/sys/elf_common.h b/sys/sys/elf_common.h
--- a/sys/sys/elf_common.h
+++ b/sys/sys/elf_common.h
@@ -1497,5 +1497,6 @@
#define R_X86_64_REX_GOTPCRELX 42
#define ELF_BSDF_SIGFASTBLK 0x0001 /* Kernel supports fast sigblock */
+#define ELF_BSDF_RSEQ1 0x0002 /* Kernel support for rseq v1 */
#endif /* !_SYS_ELF_COMMON_H_ */
diff --git a/sys/sys/membarrier.h b/sys/sys/membarrier.h
new file mode 100644
--- /dev/null
+++ b/sys/sys/membarrier.h
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __SYS_MEMBARRIER_H__
+#define __SYS_MEMBARRIER_H__
+
+#include <sys/cdefs.h>
+
+/*
+ * The enum membarrier_cmd values are bit flags.  The
+ * MEMBARRIER_CMD_QUERY command returns a bitset indicating which
+ * commands are supported.  The value of MEMBARRIER_CMD_QUERY itself
+ * is zero, so it is effectively never reported by the query.
+ */
+enum membarrier_cmd {
+ MEMBARRIER_CMD_QUERY = 0x00000000,
+ MEMBARRIER_CMD_GLOBAL = 0x00000001,
+ MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL,
+ MEMBARRIER_CMD_GLOBAL_EXPEDITED = 0x00000002,
+ MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED = 0x00000004,
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED = 0x00000008,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = 0x00000010,
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = 0x00000020,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = 0x00000040,
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = 0x00000080,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = 0x00000100,
+};
+
+enum membarrier_cmd_flag {
+ MEMBARRIER_CMD_FLAG_CPU = 0x00000001,
+};
+
+#ifndef _KERNEL
+__BEGIN_DECLS
+int membarrier(int, unsigned, int);
+__END_DECLS
+#endif /* !_KERNEL */
+
+#endif /* __SYS_MEMBARRIER_H__ */
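
A hedged usage sketch for MEMBARRIER_CMD_FLAG_CPU declared above (assumes a prior MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ registration; the helper name is illustrative):

#include <sys/membarrier.h>

/*
 * Restart rseq critical sections on a single CPU only; without
 * MEMBARRIER_CMD_FLAG_CPU, every CPU running this process is IPI'd.
 */
static int
rseq_fence_cpu(int cpu)
{
	return (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
	    MEMBARRIER_CMD_FLAG_CPU, cpu));
}
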
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -262,6 +262,7 @@
/* Cleared during fork1() */
#define td_startzero td_flags
int td_flags; /* (t) TDF_* flags. */
+ int td_flags2; /* (t) TDF2_* flags. */
int td_inhibitors; /* (t) Why can not run. */
int td_pflags; /* (k) Private thread (TDP_*) flags. */
int td_pflags2; /* (k) Private thread (TDP2_*) flags. */
@@ -322,6 +323,11 @@
size_t td_vslock_sz; /* (k) amount of vslock-ed space */
struct kcov_info *td_kcov_info; /* (*) Kernel code coverage data */
u_int td_ucredref; /* (k) references on td_realucred */
+ uint32_t td_rseq_sig; /* (k) abort handler signature */
+ void *td_rseq_abi; /* (k) usermode rseq */
+ register_t td_rseq_start_ip;/* (k) */
+ register_t td_rseq_end_ip; /* (k) */
+ register_t td_rseq_abort_ip;/* (k) */
#define td_endzero td_sigmask
/* Copied during fork1() or create_thread(). */
@@ -468,7 +474,7 @@
#define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */
#define TDF_SEINTR 0x00200000 /* EINTR on stop attempts. */
#define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */
-#define TDF_UNUSED23 0x00800000 /* --available-- */
+#define TDF_RSEQ 0x00800000 /* rseq active */
#define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */
#define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */
#define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */
@@ -477,6 +483,9 @@
#define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */
#define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. */
+#define TDF2_RSEQ_CLRCS 0x00000001 /* rseq: clear the rseq_cs pointer */
+#define TDF2_RSEQ_MB 0x00000002 /* MEMBARRIER_RSEQ requested */
+
/* Userland debug flags */
#define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */
#define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */
@@ -537,6 +546,7 @@
#define TDP2_SBPAGES 0x00000001 /* Owns sbusy on some pages */
#define TDP2_COMPAT32RB 0x00000002 /* compat32 ABI for robust lists */
#define TDP2_ACCT 0x00000004 /* Doing accounting */
+#define TDP2_RSEQ_SIG 0x00000008 /* rseq abort ip armed for signal */
/*
* Reasons that the current thread can not be run yet.
@@ -847,6 +857,14 @@
#define P2_NO_NEW_PRIVS 0x00008000 /* Ignore setuid */
#define P2_WXORX_DISABLE 0x00010000 /* WX mappings enabled */
#define P2_WXORX_ENABLE_EXEC 0x00020000 /* WXORX enabled after exec */
+#define P2_MEMBAR_PRIVE 0x00040000 /* membar private expedited
+ registered */
+#define P2_MEMBAR_PRIVE_SYNCORE 0x00080000 /* membar private expedited
+ sync core registered */
+#define P2_MEMBAR_GLOBE 0x00100000 /* membar global expedited
+ registered */
+#define P2_MEMBAR_PRIVE_RSEQ 0x00200000 /* membar private expedited
+ rseq registered */
/* Flags protected by proctree_lock, kept in p_treeflags. */
#define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */
@@ -1170,6 +1188,7 @@
int cpu_idle_wakeup(int);
extern void (*cpu_idle_hook)(sbintime_t); /* Hook to machdep CPU idler. */
void cpu_switch(struct thread *, struct thread *, struct mtx *);
+void cpu_sync_core(void);
void cpu_throw(struct thread *, struct thread *) __dead2;
bool curproc_sigkilled(void);
void userret(struct thread *, struct trapframe *);
diff --git a/sys/sys/rseq.h b/sys/sys/rseq.h
new file mode 100644
--- /dev/null
+++ b/sys/sys/rseq.h
@@ -0,0 +1,99 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __SYS_RSEQ_H__
+#define __SYS_RSEQ_H__
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <sys/endian.h>
+
+enum rseq_cpu_id_state {
+ RSEQ_CPU_ID_UNINITIALIZED = -1,
+ RSEQ_CPU_ID_REGISTRATION_FAILED = -2,
+};
+
+enum rseq_flags {
+ RSEQ_FLAG_UNREGISTER = 1,
+};
+
+enum rseq_cs_flags {
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = 0x00000001,
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = 0x00000002,
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = 0x00000004,
+};
+
+struct rseq_cs {
+ uint32_t version;
+ uint32_t flags;
+ uint64_t start_ip;
+ uint64_t post_commit_offset;
+ uint64_t abort_ip;
+};
+
+struct rseq {
+ uint32_t cpu_id_start;
+ uint32_t cpu_id;
+ union {
+ uint64_t ptr64;
+#ifdef __LP64__
+ uint64_t ptr;
+#else
+ struct {
+#if _BYTE_ORDER == _BIG_ENDIAN
+ uint32_t pad;
+ uint32_t ptr32;
+#else /* BYTE_ORDER */
+ uint32_t ptr32;
+ uint32_t pad;
+#endif /* BYTE_ORDER */
+ } ptr;
+#endif /* LP64 */
+ } rseq_cs;
+ uint32_t flags;
+};
+
+#ifdef _KERNEL
+
+#define TD_RSEQ_ACTIVE 0x00000001
+
+void rseq_ast(struct thread *td);
+void rseq_before_sig(struct thread *td);
+void rseq_on_sig(struct thread *td);
+
+#else /* _KERNEL */
+
+__BEGIN_DECLS
+extern __thread volatile struct rseq __rseq_abi __weak_symbol;
+
+int rseq(volatile struct rseq *rseq, uint32_t rseqlen, int flags, uint32_t sig);
+__END_DECLS
+
+#endif /* _KERNEL */
+
+#endif /* __SYS_RSEQ_H__ */
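
A hedged sketch of direct registration against the rseq() prototype declared above; RSEQ_SIG is an illustrative constant, and note that the libc constructor added in this patch may already have registered the thread, in which case registration returns EBUSY:

#include <sys/rseq.h>
#include <err.h>

#define	RSEQ_SIG	0x53053053	/* illustrative abort signature */

static volatile struct rseq rs;	/* must not straddle a page boundary */

static void
rseq_register_thread(void)
{
	if (rseq(&rs, sizeof(rs), 0, RSEQ_SIG) != 0)
		err(1, "rseq register");
}

static void
rseq_unregister_thread(void)
{
	/* Unregistration requires a NULL area and the matching signature. */
	if (rseq(NULL, sizeof(rs), RSEQ_FLAG_UNREGISTER, RSEQ_SIG) != 0)
		err(1, "rseq unregister");
}
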
diff --git a/sys/sys/signal.h b/sys/sys/signal.h
--- a/sys/sys/signal.h
+++ b/sys/sys/signal.h
@@ -329,6 +329,9 @@
#define SEGV_ACCERR 2 /* Invalid permissions for mapped */
/* object. */
#define SEGV_PKUERR 100 /* x86: PKU violation */
+#define SEGV_RSEQ_R 101 /* rseq access read fault */
+#define SEGV_RSEQ_W 102 /* rseq access write fault */
+#define SEGV_RSEQ_SIG 103 /* rseq signature check fault */
/* codes for SIGFPE */
#define FPE_INTOVF 1 /* Integer overflow. */
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -200,6 +200,8 @@
int inherit);
int kern_mkdirat(struct thread *td, int fd, const char *path,
enum uio_seg segflg, int mode);
+int kern_membarrier(struct thread *td, int cmd, unsigned flags,
+ int cpu_id);
int kern_mkfifoat(struct thread *td, int fd, const char *path,
enum uio_seg pathseg, int mode);
int kern_mknodat(struct thread *td, int fd, const char *path,
diff --git a/sys/vm/pmap.h b/sys/vm/pmap.h
--- a/sys/vm/pmap.h
+++ b/sys/vm/pmap.h
@@ -92,6 +92,7 @@
#include <machine/pmap.h>
#ifdef _KERNEL
+#include <sys/_cpuset.h>
struct thread;
/*
@@ -120,6 +121,7 @@
#define PMAP_TS_REFERENCED_MAX 5
void pmap_activate(struct thread *td);
+void pmap_active_cpus(pmap_t pmap, cpuset_t *res);
void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
int advice);
void pmap_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -79,6 +79,7 @@
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
+#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
@@ -881,6 +882,31 @@
#endif
}
+#ifdef PMAP_WANT_ACTIVE_CPUS_NAIVE
+void
+pmap_active_cpus(pmap_t pmap, cpuset_t *res)
+{
+ struct thread *td;
+ struct proc *p;
+ struct vmspace *vm;
+ int c;
+
+ CPU_ZERO(res);
+ CPU_FOREACH(c) {
+ td = cpuid_to_pcpu[c]->pc_curthread;
+ p = td->td_proc;
+ if (p == NULL)
+ continue;
+ vm = vmspace_acquire_ref(p);
+ if (vm == NULL)
+ continue;
+ if (pmap == vmspace_pmap(vm))
+ CPU_SET(c, res);
+ vmspace_free(vm);
+ }
+}
+#endif
+
/*
* Allow userspace to directly trigger the VM drain routine for testing
* purposes.
