Page MenuHomeFreeBSD

D45393.diff
No OneTemporary

D45393.diff

diff --git a/sys/amd64/include/runq.h b/sys/amd64/include/runq.h
deleted file mode 100644
--- a/sys/amd64/include/runq.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause
- *
- * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _MACHINE_RUNQ_H_
-#define _MACHINE_RUNQ_H_
-
-#define RQB_LEN (1) /* Number of priority status words. */
-#define RQB_L2BPW (6) /* Log2(sizeof(rqb_word_t) * NBBY)). */
-#define RQB_BPW (1<<RQB_L2BPW) /* Bits in an rqb_word_t. */
-
-#define RQB_BIT(pri) (1ul << ((pri) & (RQB_BPW - 1)))
-#define RQB_WORD(pri) ((pri) >> RQB_L2BPW)
-
-#define RQB_FFS(word) (bsfq(word))
-
-/*
- * Type of run queue status word.
- */
-typedef u_int64_t rqb_word_t;
-
-#endif
diff --git a/sys/arm/include/runq.h b/sys/arm/include/runq.h
deleted file mode 100644
--- a/sys/arm/include/runq.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause
- *
- * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _MACHINE_RUNQ_H_
-#define _MACHINE_RUNQ_H_
-
-#define RQB_LEN (2) /* Number of priority status words. */
-#define RQB_L2BPW (5) /* Log2(sizeof(rqb_word_t) * NBBY)). */
-#define RQB_BPW (1<<RQB_L2BPW) /* Bits in an rqb_word_t. */
-
-#define RQB_BIT(pri) (1 << ((pri) & (RQB_BPW - 1)))
-#define RQB_WORD(pri) ((pri) >> RQB_L2BPW)
-
-#define RQB_FFS(word) (ffs(word) - 1)
-
-/*
- * Type of run queue status word.
- */
-typedef u_int32_t rqb_word_t;
-
-#endif
diff --git a/sys/arm64/include/runq.h b/sys/arm64/include/runq.h
deleted file mode 100644
--- a/sys/arm64/include/runq.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*-
- * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifdef __arm__
-#include <arm/runq.h>
-#else /* !__arm__ */
-
-#ifndef _MACHINE_RUNQ_H_
-#define _MACHINE_RUNQ_H_
-
-#define RQB_LEN (1) /* Number of priority status words. */
-#define RQB_L2BPW (6) /* Log2(sizeof(rqb_word_t) * NBBY)). */
-#define RQB_BPW (1<<RQB_L2BPW) /* Bits in an rqb_word_t. */
-
-#define RQB_BIT(pri) (1ul << ((pri) & (RQB_BPW - 1)))
-#define RQB_WORD(pri) ((pri) >> RQB_L2BPW)
-
-#define RQB_FFS(word) (ffsl(word) - 1)
-
-/*
- * Type of run queue status word.
- */
-typedef unsigned long rqb_word_t;
-
-#endif
-
-#endif /* !__arm__ */
diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c
--- a/sys/cam/ctl/ctl.c
+++ b/sys/cam/ctl/ctl.c
@@ -14307,7 +14307,7 @@
CTL_DEBUG_PRINT(("ctl_work_thread starting\n"));
thread_lock(curthread);
- sched_prio(curthread, PUSER - 1);
+ sched_prio(curthread, PRI_MAX_KERN);
thread_unlock(curthread);
while (!softc->shutdown) {
@@ -14399,7 +14399,7 @@
CTL_DEBUG_PRINT(("ctl_thresh_thread starting\n"));
thread_lock(curthread);
- sched_prio(curthread, PUSER - 1);
+ sched_prio(curthread, PRI_MAX_KERN);
thread_unlock(curthread);
while (!softc->shutdown) {
diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h
--- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h
+++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h
@@ -44,7 +44,9 @@
#ifdef _KERNEL
#define CPU curcpu
#define minclsyspri PRIBIO
-#define defclsyspri minclsyspri
+#define defclsyspri minclsyspri
+/* Write issue taskq priority. */
+#define wtqclsyspri ((PVM + PRIBIO) / 2)
#define maxclsyspri PVM
#define max_ncpus (mp_maxid + 1)
#define boot_max_ncpus (mp_maxid + 1)
diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h b/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h
--- a/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h
+++ b/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h
@@ -91,8 +91,10 @@
* Treat shim tasks as SCHED_NORMAL tasks
*/
#define minclsyspri (MAX_PRIO-1)
-#define maxclsyspri (MAX_RT_PRIO)
#define defclsyspri (DEFAULT_PRIO)
+/* Write issue taskq priority. */
+#define wtqclsyspri (MAX_RT_PRIO + 1)
+#define maxclsyspri (MAX_RT_PRIO)
#ifndef NICE_TO_PRIO
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
diff --git a/sys/contrib/openzfs/include/sys/zfs_context.h b/sys/contrib/openzfs/include/sys/zfs_context.h
--- a/sys/contrib/openzfs/include/sys/zfs_context.h
+++ b/sys/contrib/openzfs/include/sys/zfs_context.h
@@ -622,8 +622,10 @@
* Process priorities as defined by setpriority(2) and getpriority(2).
*/
#define minclsyspri 19
-#define maxclsyspri -20
#define defclsyspri 0
+/* Write issue taskq priority. */
+#define wtqclsyspri -19
+#define maxclsyspri -20
#define CPU_SEQID ((uintptr_t)pthread_self() & (max_ncpus - 1))
#define CPU_SEQID_UNSTABLE CPU_SEQID
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -1158,29 +1158,14 @@
spa->spa_proc, zio_taskq_basedc, flags);
} else {
#endif
- pri_t pri = maxclsyspri;
/*
* The write issue taskq can be extremely CPU
* intensive. Run it at slightly less important
* priority than the other taskqs.
- *
- * Under Linux and FreeBSD this means incrementing
- * the priority value as opposed to platforms like
- * illumos where it should be decremented.
- *
- * On FreeBSD, if priorities divided by four (RQ_PPQ)
- * are equal then a difference between them is
- * insignificant.
*/
- if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
-#if defined(__linux__)
- pri++;
-#elif defined(__FreeBSD__)
- pri += 4;
-#else
-#error "unknown OS"
-#endif
- }
+ const pri_t pri = (t == ZIO_TYPE_WRITE &&
+ q == ZIO_TASKQ_ISSUE) ?
+ wtqclsyspri : maxclsyspri;
tq = taskq_create_proc(name, value, pri, 50,
INT_MAX, spa->spa_proc, flags);
#ifdef HAVE_SYSDC
diff --git a/sys/dev/beri/beri_ring.c b/sys/dev/beri/beri_ring.c
--- a/sys/dev/beri/beri_ring.c
+++ b/sys/dev/beri/beri_ring.c
@@ -170,7 +170,7 @@
}
mtx_lock(&sc->beri_mtx);
- selwakeuppri(&sc->beri_rsel, PZERO + 1);
+ selwakeuppri(&sc->beri_rsel, PZERO);
KNOTE_LOCKED(&sc->beri_rsel.si_note, 0);
mtx_unlock(&sc->beri_mtx);
}
@@ -190,7 +190,7 @@
}
mtx_lock(&sc->beri_mtx);
- selwakeuppri(&sc->beri_rsel, PZERO + 1);
+ selwakeuppri(&sc->beri_rsel, PZERO);
KNOTE_LOCKED(&sc->beri_rsel.si_note, 0);
mtx_unlock(&sc->beri_mtx);
}
diff --git a/sys/dev/firewire/firewirereg.h b/sys/dev/firewire/firewirereg.h
--- a/sys/dev/firewire/firewirereg.h
+++ b/sys/dev/firewire/firewirereg.h
@@ -293,7 +293,7 @@
extern devclass_t firewire_devclass;
extern int firewire_phydma_enable;
-#define FWPRI ((PZERO + 8) | PCATCH)
+#define FWPRI (PWAIT | PCATCH)
#define CALLOUT_INIT(x) callout_init(x, 1 /* mpsafe */)
diff --git a/sys/dev/syscons/syscons.c b/sys/dev/syscons/syscons.c
--- a/sys/dev/syscons/syscons.c
+++ b/sys/dev/syscons/syscons.c
@@ -1310,7 +1310,7 @@
if (i == sc->cur_scp->index)
return 0;
error =
- tsleep(VTY_WCHAN(sc, i), (PZERO + 1) | PCATCH, "waitvt", 0);
+ tsleep(VTY_WCHAN(sc, i), PZERO | PCATCH, "waitvt", 0);
return error;
case VT_GETACTIVE: /* get active vty # */
diff --git a/sys/dev/usb/usb_process.h b/sys/dev/usb/usb_process.h
--- a/sys/dev/usb/usb_process.h
+++ b/sys/dev/usb/usb_process.h
@@ -31,7 +31,6 @@
#ifndef USB_GLOBAL_INCLUDE_FILE
#include <sys/interrupt.h>
#include <sys/priority.h>
-#include <sys/runq.h>
#endif
/* defines */
diff --git a/sys/dev/vkbd/vkbd.c b/sys/dev/vkbd/vkbd.c
--- a/sys/dev/vkbd/vkbd.c
+++ b/sys/dev/vkbd/vkbd.c
@@ -82,7 +82,7 @@
#define VKBD_UNLOCK(s) mtx_unlock(&(s)->ks_lock)
#define VKBD_LOCK_ASSERT(s, w) mtx_assert(&(s)->ks_lock, w)
#define VKBD_SLEEP(s, f, d, t) \
- msleep(&(s)->f, &(s)->ks_lock, PCATCH | (PZERO + 1), d, t)
+ msleep(&(s)->f, &(s)->ks_lock, PCATCH | PZERO, d, t)
#else
#define VKBD_LOCK_DECL
#define VKBD_LOCK_INIT(s)
@@ -90,7 +90,7 @@
#define VKBD_LOCK(s)
#define VKBD_UNLOCK(s)
#define VKBD_LOCK_ASSERT(s, w)
-#define VKBD_SLEEP(s, f, d, t) tsleep(&(s)->f, PCATCH | (PZERO + 1), d, t)
+#define VKBD_SLEEP(s, f, d, t) tsleep(&(s)->f, PCATCH | PZERO, d, t)
#endif
#define VKBD_KEYBOARD(d) \
@@ -268,8 +268,8 @@
VKBD_SLEEP(state, ks_task, "vkbdc", 0);
/* wakeup poll()ers */
- selwakeuppri(&state->ks_rsel, PZERO + 1);
- selwakeuppri(&state->ks_wsel, PZERO + 1);
+ selwakeuppri(&state->ks_rsel, PZERO);
+ selwakeuppri(&state->ks_wsel, PZERO);
state->ks_flags &= ~OPEN;
state->ks_dev = NULL;
@@ -498,7 +498,7 @@
if (!(state->ks_flags & STATUS)) {
state->ks_flags |= STATUS;
- selwakeuppri(&state->ks_rsel, PZERO + 1);
+ selwakeuppri(&state->ks_rsel, PZERO);
wakeup(&state->ks_flags);
}
}
@@ -531,7 +531,7 @@
q->head = 0;
/* wakeup ks_inq writers/poll()ers */
- selwakeuppri(&state->ks_wsel, PZERO + 1);
+ selwakeuppri(&state->ks_wsel, PZERO);
wakeup(q);
return (c);
@@ -1246,7 +1246,7 @@
/* flush ks_inq and wakeup writers/poll()ers */
state->ks_inq.head = state->ks_inq.tail = state->ks_inq.cc = 0;
- selwakeuppri(&state->ks_wsel, PZERO + 1);
+ selwakeuppri(&state->ks_wsel, PZERO);
wakeup(&state->ks_inq);
}
diff --git a/sys/fs/fuse/fuse_device.c b/sys/fs/fuse/fuse_device.c
--- a/sys/fs/fuse/fuse_device.c
+++ b/sys/fs/fuse/fuse_device.c
@@ -152,7 +152,7 @@
FUSE_LOCK();
fuse_lck_mtx_lock(fdata->aw_mtx);
/* wakup poll()ers */
- selwakeuppri(&fdata->ks_rsel, PZERO + 1);
+ selwakeuppri(&fdata->ks_rsel, PZERO);
/* Don't let syscall handlers wait in vain */
while ((tick = fuse_aw_pop(fdata))) {
fuse_lck_mtx_lock(tick->tk_aw_mtx);
diff --git a/sys/fs/fuse/fuse_io.c b/sys/fs/fuse/fuse_io.c
--- a/sys/fs/fuse/fuse_io.c
+++ b/sys/fs/fuse/fuse_io.c
@@ -932,7 +932,7 @@
if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF)
return EIO;
fvdat->flag |= FN_FLUSHWANT;
- tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz);
+ tsleep(&fvdat->flag, PRIBIO, "fusevinv", 2 * hz);
error = 0;
if (p != NULL) {
PROC_LOCK(p);
diff --git a/sys/fs/fuse/fuse_ipc.c b/sys/fs/fuse/fuse_ipc.c
--- a/sys/fs/fuse/fuse_ipc.c
+++ b/sys/fs/fuse/fuse_ipc.c
@@ -593,7 +593,7 @@
fuse_lck_mtx_lock(data->ms_mtx);
data->dataflags |= FSESS_DEAD;
wakeup_one(data);
- selwakeuppri(&data->ks_rsel, PZERO + 1);
+ selwakeuppri(&data->ks_rsel, PZERO);
wakeup(&data->ticketer);
fuse_lck_mtx_unlock(data->ms_mtx);
FUSE_UNLOCK();
@@ -669,7 +669,7 @@
else
fuse_ms_push(ftick);
wakeup_one(ftick->tk_data);
- selwakeuppri(&ftick->tk_data->ks_rsel, PZERO + 1);
+ selwakeuppri(&ftick->tk_data->ks_rsel, PZERO);
KNOTE_LOCKED(&ftick->tk_data->ks_rsel.si_note, 0);
fuse_lck_mtx_unlock(ftick->tk_data->ms_mtx);
}
diff --git a/sys/fs/nfs/nfs_commonsubs.c b/sys/fs/nfs/nfs_commonsubs.c
--- a/sys/fs/nfs/nfs_commonsubs.c
+++ b/sys/fs/nfs/nfs_commonsubs.c
@@ -4644,7 +4644,7 @@
ts.tv_sec = 0;
ts.tv_nsec = 0;
(void) nfsmsleep((caddr_t)flagp, NFSSOCKMUTEXPTR,
- PZERO - 1, "nfsndlck", &ts);
+ PVFS, "nfsndlck", &ts);
}
*flagp |= NFSR_SNDLOCK;
NFSUNLOCKSOCK();
diff --git a/sys/fs/nfsserver/nfs_nfsdcache.c b/sys/fs/nfsserver/nfs_nfsdcache.c
--- a/sys/fs/nfsserver/nfs_nfsdcache.c
+++ b/sys/fs/nfsserver/nfs_nfsdcache.c
@@ -392,7 +392,7 @@
nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
if ((rp->rc_flag & RC_LOCKED) != 0) {
rp->rc_flag |= RC_WANTED;
- (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
+ (void)mtx_sleep(rp, mutex, PVFS | PDROP,
"nfsrc", 10 * hz);
goto loop;
}
@@ -678,7 +678,7 @@
rp = hitrp;
if ((rp->rc_flag & RC_LOCKED) != 0) {
rp->rc_flag |= RC_WANTED;
- (void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
+ (void)mtx_sleep(rp, mutex, PVFS | PDROP,
"nfsrc", 10 * hz);
goto tryagain;
}
@@ -750,7 +750,7 @@
mtx_assert(mutex, MA_OWNED);
while ((rp->rc_flag & RC_LOCKED) != 0) {
rp->rc_flag |= RC_WANTED;
- (void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
+ (void)mtx_sleep(rp, mutex, PVFS, "nfsrc", 0);
}
rp->rc_flag |= RC_LOCKED;
}
diff --git a/sys/fs/nfsserver/nfs_nfsdstate.c b/sys/fs/nfsserver/nfs_nfsdstate.c
--- a/sys/fs/nfsserver/nfs_nfsdstate.c
+++ b/sys/fs/nfsserver/nfs_nfsdstate.c
@@ -507,7 +507,7 @@
NFSLOCKSTATE();
while (clp->lc_cbref) {
clp->lc_flags |= LCL_WAKEUPWANTED;
- (void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1,
+ (void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PVFS,
"nfsd clp", 10 * hz);
}
NFSUNLOCKSTATE();
@@ -574,7 +574,7 @@
NFSLOCKSTATE();
while (clp->lc_cbref) {
clp->lc_flags |= LCL_WAKEUPWANTED;
- (void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1,
+ (void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PVFS,
"nfsdclp", 10 * hz);
}
NFSUNLOCKSTATE();
diff --git a/sys/fs/smbfs/smbfs_io.c b/sys/fs/smbfs/smbfs_io.c
--- a/sys/fs/smbfs/smbfs_io.c
+++ b/sys/fs/smbfs/smbfs_io.c
@@ -629,7 +629,7 @@
while (np->n_flag & NFLUSHINPROG) {
np->n_flag |= NFLUSHWANT;
- error = tsleep(&np->n_flag, PRIBIO + 2, "smfsvinv", 2 * hz);
+ error = tsleep(&np->n_flag, PRIBIO, "smfsvinv", 2 * hz);
error = smb_td_intr(td);
if (error == EINTR)
return EINTR;
diff --git a/sys/i386/include/runq.h b/sys/i386/include/runq.h
deleted file mode 100644
--- a/sys/i386/include/runq.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause
- *
- * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _MACHINE_RUNQ_H_
-#define _MACHINE_RUNQ_H_
-
-#define RQB_LEN (2) /* Number of priority status words. */
-#define RQB_L2BPW (5) /* Log2(sizeof(rqb_word_t) * NBBY)). */
-#define RQB_BPW (1<<RQB_L2BPW) /* Bits in an rqb_word_t. */
-
-#define RQB_BIT(pri) (1 << ((pri) & (RQB_BPW - 1)))
-#define RQB_WORD(pri) ((pri) >> RQB_L2BPW)
-
-#define RQB_FFS(word) (ffs(word) - 1)
-
-/*
- * Type of run queue status word.
- */
-typedef u_int32_t rqb_word_t;
-
-#endif
diff --git a/sys/kern/kern_rmlock.c b/sys/kern/kern_rmlock.c
--- a/sys/kern/kern_rmlock.c
+++ b/sys/kern/kern_rmlock.c
@@ -1010,7 +1010,8 @@
mtx_lock(&rms->mtx);
while (rms->writers > 0)
- msleep(&rms->readers, &rms->mtx, PUSER - 1, mtx_name(&rms->mtx), 0);
+ msleep(&rms->readers, &rms->mtx, PRI_MAX_KERN,
+ mtx_name(&rms->mtx), 0);
critical_enter();
rms_int_readers_inc(rms, rms_int_pcpu(rms));
mtx_unlock(&rms->mtx);
@@ -1197,7 +1198,7 @@
mtx_lock(&rms->mtx);
rms->writers++;
if (rms->writers > 1) {
- msleep(&rms->owner, &rms->mtx, (PUSER - 1),
+ msleep(&rms->owner, &rms->mtx, PRI_MAX_KERN,
mtx_name(&rms->mtx), 0);
MPASS(rms->readers == 0);
KASSERT(rms->owner == RMS_TRANSIENT,
@@ -1213,7 +1214,7 @@
rms_assert_no_pcpu_readers(rms);
if (rms->readers > 0) {
- msleep(&rms->writers, &rms->mtx, (PUSER - 1),
+ msleep(&rms->writers, &rms->mtx, PRI_MAX_KERN,
mtx_name(&rms->mtx), 0);
}
diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c
--- a/sys/kern/kern_switch.c
+++ b/sys/kern/kern_switch.c
@@ -38,6 +38,7 @@
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
+#include <sys/runq.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
@@ -57,8 +58,6 @@
#endif
#endif
-CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
-
/*
* kern.sched.preemption allows user space to determine if preemption support
* is compiled in or not. It is not currently a boot or runtime flag that
@@ -253,6 +252,35 @@
/************************************************************************
* SYSTEM RUN QUEUE manipulations and tests *
************************************************************************/
+_Static_assert(RQSW_BPW == (1 << RQSW_L2BPW),
+ "RQSW_L2BPW and RQSW_BPW / 'rqsw_t' mismatch");
+_Static_assert(RQ_NQS <= 256,
+ "'td_rqindex' must be turned into a bigger unsigned type");
+/* A macro instead of a function to get the proper calling function's name. */
+#define CHECK_IDX(idx) ({ \
+ __typeof(idx) _idx __unused = (idx); \
+ KASSERT(0 <= _idx && _idx < RQ_NQS, \
+ ("%s: %s out of range: %d", __func__, __STRING(idx), _idx)); \
+})
+
+/* Status words' individual bit manipulators' internals. */
+typedef uintptr_t runq_sw_op(int idx, int sw_idx, rqsw_t sw_bit,
+ rqsw_t *swp);
+static inline uintptr_t runq_sw_apply(struct runq *rq, int idx,
+ runq_sw_op *op);
+
+static inline uintptr_t runq_sw_set_not_empty_op(int idx, int sw_idx,
+ rqsw_t sw_bit, rqsw_t *swp);
+static inline uintptr_t runq_sw_set_empty_op(int idx, int sw_idx,
+ rqsw_t sw_bit, rqsw_t *swp);
+static inline uintptr_t runq_sw_is_empty_op(int idx, int sw_idx,
+ rqsw_t sw_bit, rqsw_t *swp);
+
+/* Status words' individual bit manipulators. */
+static inline void runq_sw_set_not_empty(struct runq *rq, int idx);
+static inline void runq_sw_set_empty(struct runq *rq, int idx);
+static inline bool runq_sw_is_empty(struct runq *rq, int idx);
+
/*
* Initialize a run structure.
*/
@@ -261,98 +289,96 @@
{
int i;
- bzero(rq, sizeof *rq);
+ bzero(rq, sizeof(*rq));
for (i = 0; i < RQ_NQS; i++)
TAILQ_INIT(&rq->rq_queues[i]);
}
/*
- * Clear the status bit of the queue corresponding to priority level pri,
- * indicating that it is empty.
+ * Helper to implement functions operating on a particular status word bit.
+ *
+ * The operator is passed the initial 'idx', the corresponding status word index
+ * in 'rq_status' in 'sw_idx', a status word with only that bit set in 'sw_bit'
+ * and a pointer to the corresponding status word in 'swp'.
*/
-static __inline void
-runq_clrbit(struct runq *rq, int pri)
+static inline uintptr_t
+runq_sw_apply(struct runq *rq, int idx, runq_sw_op *op)
{
- struct rqbits *rqb;
+ rqsw_t *swp;
+ rqsw_t sw_bit;
+ int sw_idx;
- rqb = &rq->rq_status;
- CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
- rqb->rqb_bits[RQB_WORD(pri)],
- rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
- RQB_BIT(pri), RQB_WORD(pri));
- rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
+ CHECK_IDX(idx);
+
+ sw_idx = RQSW_IDX(idx);
+ sw_bit = RQSW_BIT(idx);
+ swp = &rq->rq_status.rq_sw[sw_idx];
+
+ return (op(idx, sw_idx, sw_bit, swp));
}
/*
- * Find the index of the first non-empty run queue. This is done by
- * scanning the status bits, a set bit indicates a non-empty queue.
+ * Modify the status words to indicate that some queue is not empty.
+ *
+ * Sets the status bit corresponding to the queue at index 'idx'.
*/
-static __inline int
-runq_findbit(struct runq *rq)
+static inline uintptr_t
+runq_sw_set_not_empty_op(int idx, int sw_idx, rqsw_t sw_bit, rqsw_t *swp)
{
- struct rqbits *rqb;
- int pri;
- int i;
+ rqsw_t old_sw __unused = *swp;
- rqb = &rq->rq_status;
- for (i = 0; i < RQB_LEN; i++)
- if (rqb->rqb_bits[i]) {
- pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
- CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
- rqb->rqb_bits[i], i, pri);
- return (pri);
- }
-
- return (-1);
+ *swp |= sw_bit;
+ CTR4(KTR_RUNQ, "runq_sw_set_not_empty: idx=%d sw_idx=%d bits=%#x->%#x",
+ idx, sw_idx, old_sw, *swp);
+ return (0);
}
-
-static __inline int
-runq_findbit_from(struct runq *rq, u_char pri)
+static inline void
+runq_sw_set_not_empty(struct runq *rq, int idx)
{
- struct rqbits *rqb;
- rqb_word_t mask;
- int i;
-
- /*
- * Set the mask for the first word so we ignore priorities before 'pri'.
- */
- mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
- rqb = &rq->rq_status;
-again:
- for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
- mask = rqb->rqb_bits[i] & mask;
- if (mask == 0)
- continue;
- pri = RQB_FFS(mask) + (i << RQB_L2BPW);
- CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
- mask, i, pri);
- return (pri);
- }
- if (pri == 0)
- return (-1);
- /*
- * Wrap back around to the beginning of the list just once so we
- * scan the whole thing.
- */
- pri = 0;
- goto again;
+ (void)runq_sw_apply(rq, idx, &runq_sw_set_not_empty_op);
}
/*
- * Set the status bit of the queue corresponding to priority level pri,
- * indicating that it is non-empty.
+ * Modify the status words to indicate that some queue is empty.
+ *
+ * Clears the status bit corresponding to the queue at index 'idx'.
*/
-static __inline void
-runq_setbit(struct runq *rq, int pri)
+static inline uintptr_t
+runq_sw_set_empty_op(int idx, int sw_idx, rqsw_t sw_bit, rqsw_t *swp)
{
- struct rqbits *rqb;
+ rqsw_t old_sw __unused = *swp;
- rqb = &rq->rq_status;
- CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
- rqb->rqb_bits[RQB_WORD(pri)],
- rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
- RQB_BIT(pri), RQB_WORD(pri));
- rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
+ *swp &= ~sw_bit;
+ CTR4(KTR_RUNQ, "runq_sw_set_empty: idx=%d sw_idx=%d bits=%#x->%#x",
+ idx, sw_idx, old_sw, *swp);
+ return (0);
+}
+static inline void
+runq_sw_set_empty(struct runq *rq, int idx)
+{
+ (void)runq_sw_apply(rq, idx, &runq_sw_set_empty_op);
+}
+
+/*
+ * Returns whether the status words indicate that some queue is empty.
+ */
+static inline uintptr_t
+runq_sw_is_empty_op(int idx, int sw_idx, rqsw_t sw_bit, rqsw_t *swp)
+{
+ return ((*swp & sw_bit) == 0);
+}
+static inline bool
+runq_sw_is_empty(struct runq *rq, int idx)
+{
+ return (runq_sw_apply(rq, idx, &runq_sw_is_empty_op));
+}
+
+/*
+ * Returns whether a particular queue is empty.
+ */
+bool runq_is_queue_empty(struct runq *rq, int idx)
+{
+ return (runq_sw_is_empty(rq, idx));
}
/*
@@ -362,102 +388,183 @@
void
runq_add(struct runq *rq, struct thread *td, int flags)
{
- struct rqhead *rqh;
- int pri;
- pri = td->td_priority / RQ_PPQ;
- td->td_rqindex = pri;
- runq_setbit(rq, pri);
- rqh = &rq->rq_queues[pri];
- CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p",
- td, td->td_priority, pri, rqh);
- if (flags & SRQ_PREEMPTED) {
- TAILQ_INSERT_HEAD(rqh, td, td_runq);
- } else {
- TAILQ_INSERT_TAIL(rqh, td, td_runq);
- }
+ runq_add_idx(rq, td, RQ_PRI_TO_QUEUE_IDX(td->td_priority), flags);
}
void
-runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags)
+runq_add_idx(struct runq *rq, struct thread *td, int idx, int flags)
{
- struct rqhead *rqh;
+ struct rq_queue *rqq;
- KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
- td->td_rqindex = pri;
- runq_setbit(rq, pri);
- rqh = &rq->rq_queues[pri];
- CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p",
- td, td->td_priority, pri, rqh);
- if (flags & SRQ_PREEMPTED) {
- TAILQ_INSERT_HEAD(rqh, td, td_runq);
- } else {
- TAILQ_INSERT_TAIL(rqh, td, td_runq);
- }
+ /*
+ * runq_sw_*() functions assert that 'idx' is non-negative and below
+ * 'RQ_NQS', and a static assert upper in this file ensures that
+ * 'RQ_NQS' is no more than 256.
+ */
+ td->td_rqindex = idx;
+ runq_sw_set_not_empty(rq, idx);
+ rqq = &rq->rq_queues[idx];
+ CTR4(KTR_RUNQ, "runq_add_idx: td=%p pri=%d idx=%d rqq=%p",
+ td, td->td_priority, idx, rqq);
+ if (flags & SRQ_PREEMPTED)
+ TAILQ_INSERT_HEAD(rqq, td, td_runq);
+ else
+ TAILQ_INSERT_TAIL(rqq, td, td_runq);
}
+
/*
- * Return true if there are runnable processes of any priority on the run
- * queue, false otherwise. Has no side effects, does not modify the run
- * queue structure.
+ * Remove the thread from the queue specified by its priority, and clear the
+ * corresponding status bit if the queue becomes empty.
+ *
+ * Returns whether the corresponding queue is empty after removal.
+ */
+bool
+runq_remove(struct runq *rq, struct thread *td)
+{
+ struct rq_queue *rqq;
+ int idx;
+
+ KASSERT(td->td_flags & TDF_INMEM, ("runq_remove: Thread swapped out"));
+ idx = td->td_rqindex;
+ CHECK_IDX(idx);
+ rqq = &rq->rq_queues[idx];
+ CTR4(KTR_RUNQ, "runq_remove: td=%p pri=%d idx=%d rqq=%p",
+ td, td->td_priority, idx, rqq);
+ TAILQ_REMOVE(rqq, td, td_runq);
+ if (TAILQ_EMPTY(rqq)) {
+ runq_sw_set_empty(rq, idx);
+ CTR1(KTR_RUNQ, "runq_remove: queue at idx=%d now empty", idx);
+ return (true);
+ }
+ return (false);
+}
+
+static inline int
+runq_findq_status_word(struct runq *const rq, const int w_idx,
+ const rqsw_t w, runq_pred_t *const pred, void *const pred_data)
+{
+ struct rq_queue *q;
+ rqsw_t tw = w;
+ int idx, b_idx;
+
+ while (tw != 0) {
+ b_idx = RQSW_BSF(tw);
+ idx = RQSW_TO_QUEUE_IDX(w_idx, b_idx);
+ q = &rq->rq_queues[idx];
+ KASSERT(!TAILQ_EMPTY(q),
+ ("runq_findq(): No thread on non-empty queue with idx=%d",
+ idx));
+ if (pred(idx, q, pred_data))
+ return (idx);
+ tw &= ~RQSW_BIT(idx);
+ }
+
+ return (-1);
+}
+
+/*
+ * Find in the passed range (bounds included) the index of the first (i.e.,
+ * having lower index) non-empty queue that passes pred().
+ *
+ * Considered queues are those with index 'lvl_min' up to 'lvl_max' (bounds
+ * included). If no queue matches, returns -1.
+ *
+ * This is done by scanning the status words (a set bit indicates a non-empty
+ * queue) and calling pred() with corresponding queue indices. pred() must
+ * return whether the corresponding queue is accepted. It is passed private
+ * data through 'pred_data', which can be used both for extra input and output.
*/
int
-runq_check(struct runq *rq)
+runq_findq(struct runq *const rq, const int lvl_min, const int lvl_max,
+ runq_pred_t *const pred, void *const pred_data)
{
- struct rqbits *rqb;
- int i;
+ rqsw_t const (*const rqsw)[RQSW_NB] = &rq->rq_status.rq_sw;
+ rqsw_t w;
+ int i, last, idx;
- rqb = &rq->rq_status;
- for (i = 0; i < RQB_LEN; i++)
- if (rqb->rqb_bits[i]) {
- CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
- rqb->rqb_bits[i], i);
- return (1);
- }
- CTR0(KTR_RUNQ, "runq_check: empty");
+ CHECK_IDX(lvl_min);
+ CHECK_IDX(lvl_max);
+ KASSERT(lvl_min <= lvl_max,
+ ("lvl_min: %d > lvl_max: %d!", lvl_min, lvl_max));
- return (0);
+ i = RQSW_IDX(lvl_min);
+ last = RQSW_IDX(lvl_max);
+ /* Clear bits for runqueues below 'lvl_min'. */
+ w = (*rqsw)[i] & ~(RQSW_BIT(lvl_min) - 1);
+ if (i == last)
+ goto last_mask;
+ idx = runq_findq_status_word(rq, i, w, pred, pred_data);
+ if (idx != -1)
+ goto return_idx;
+
+ for (++i; i < last; ++i) {
+ w = (*rqsw)[i];
+ idx = runq_findq_status_word(rq, i, w, pred, pred_data);
+ if (idx != -1)
+ goto return_idx;
+ }
+
+ MPASS(i == last);
+ w = (*rqsw)[i];
+last_mask:
+ /* Clear bits for runqueues above 'lvl_max'. */
+ w &= (RQSW_BIT(lvl_max) - 1) | RQSW_BIT(lvl_max);
+ idx = runq_findq_status_word(rq, i, w, pred, pred_data);
+ if (idx != -1)
+ goto return_idx;
+ return (-1);
+return_idx:
+ CTR4(KTR_RUNQ, "runq_findq: bits=%#x->%#x i=%d idx=%d",
+ (*rqsw)[i], w, i, idx);
+ return (idx);
+}
+
+static bool
+runq_first_thread_pred(const int idx, struct rq_queue *const q, void *const data)
+{
+ struct thread **const tdp = data;
+ struct thread *const td = TAILQ_FIRST(q);
+
+ *tdp = td;
+ return (true);
+}
+
+/* Make sure it has an external definition. */
+extern inline struct thread *
+runq_first_thread_range(struct runq *const rq, const int lvl_min,
+ const int lvl_max)
+{
+ struct thread *td = NULL;
+
+ (void)runq_findq(rq, lvl_min, lvl_max, runq_first_thread_pred, &td);
+ return (td);
+}
+
+static inline struct thread *
+runq_first_thread(struct runq *const rq)
+{
+
+ return (runq_first_thread_range(rq, 0, RQ_NQS - 1));
}
/*
- * Find the highest priority process on the run queue.
+ * Return true if there are some processes of any priority on the run queue,
+ * false otherwise. Has no side effects.
*/
-struct thread *
-runq_choose_fuzz(struct runq *rq, int fuzz)
+bool
+runq_not_empty(struct runq *rq)
{
- struct rqhead *rqh;
- struct thread *td;
- int pri;
+ struct thread *const td = runq_first_thread(rq);
- while ((pri = runq_findbit(rq)) != -1) {
- rqh = &rq->rq_queues[pri];
- /* fuzz == 1 is normal.. 0 or less are ignored */
- if (fuzz > 1) {
- /*
- * In the first couple of entries, check if
- * there is one for our CPU as a preference.
- */
- int count = fuzz;
- int cpu = PCPU_GET(cpuid);
- struct thread *td2;
- td2 = td = TAILQ_FIRST(rqh);
-
- while (count-- && td2) {
- if (td2->td_lastcpu == cpu) {
- td = td2;
- break;
- }
- td2 = TAILQ_NEXT(td2, td_runq);
- }
- } else
- td = TAILQ_FIRST(rqh);
- KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue"));
- CTR3(KTR_RUNQ,
- "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh);
- return (td);
+ if (td != NULL) {
+ CTR2(KTR_RUNQ, "runq_not_empty: idx=%d, td=%p",
+ td->td_rqindex, td);
+ return (true);
}
- CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri);
- return (NULL);
+ CTR0(KTR_RUNQ, "runq_not_empty: empty");
+ return (false);
}
/*
@@ -466,73 +573,74 @@
struct thread *
runq_choose(struct runq *rq)
{
- struct rqhead *rqh;
struct thread *td;
- int pri;
- while ((pri = runq_findbit(rq)) != -1) {
- rqh = &rq->rq_queues[pri];
- td = TAILQ_FIRST(rqh);
- KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
- CTR3(KTR_RUNQ,
- "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh);
+ td = runq_first_thread(rq);
+ if (td != NULL) {
+ CTR2(KTR_RUNQ, "runq_choose: idx=%d td=%p", td->td_rqindex, td);
return (td);
}
- CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri);
+ CTR0(KTR_RUNQ, "runq_choose: idlethread");
return (NULL);
}
-struct thread *
-runq_choose_from(struct runq *rq, u_char idx)
+struct runq_fuzz_pred_data {
+ int fuzz;
+ struct thread *td;
+};
+
+static bool
+runq_fuzz_pred(const int idx, struct rq_queue *const q, void *const data)
{
- struct rqhead *rqh;
+ struct runq_fuzz_pred_data *const d = data;
+ const int fuzz = d->fuzz;
struct thread *td;
- int pri;
- if ((pri = runq_findbit_from(rq, idx)) != -1) {
- rqh = &rq->rq_queues[pri];
- td = TAILQ_FIRST(rqh);
- KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
- CTR4(KTR_RUNQ,
- "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p",
- pri, td, td->td_rqindex, rqh);
- return (td);
+ td = TAILQ_FIRST(q);
+
+ if (fuzz > 1) {
+ /*
+ * In the first couple of entries, check if
+ * there is one for our CPU as a preference.
+ */
+ struct thread *td2 = td;
+ int count = fuzz;
+ int cpu = PCPU_GET(cpuid);
+
+ while (count-- != 0 && td2 != NULL) {
+ if (td2->td_lastcpu == cpu) {
+ td = td2;
+ break;
+ }
+ td2 = TAILQ_NEXT(td2, td_runq);
+ }
}
- CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri);
- return (NULL);
+ d->td = td;
+ return (true);
}
+
/*
- * Remove the thread from the queue specified by its priority, and clear the
- * corresponding status bit if the queue becomes empty.
- * Caller must set state afterwards.
+ * Find the highest priority process on the run queue.
*/
-void
-runq_remove(struct runq *rq, struct thread *td)
+struct thread *
+runq_choose_fuzz(struct runq *rq, int fuzz)
{
+ struct runq_fuzz_pred_data data = {
+ .fuzz = fuzz,
+ .td = NULL
+ };
+ int idx;
- runq_remove_idx(rq, td, NULL);
-}
-
-void
-runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx)
-{
- struct rqhead *rqh;
- u_char pri;
-
- KASSERT(td->td_flags & TDF_INMEM,
- ("runq_remove_idx: thread swapped out"));
- pri = td->td_rqindex;
- KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
- rqh = &rq->rq_queues[pri];
- CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p",
- td, td->td_priority, pri, rqh);
- TAILQ_REMOVE(rqh, td, td_runq);
- if (TAILQ_EMPTY(rqh)) {
- CTR0(KTR_RUNQ, "runq_remove_idx: empty");
- runq_clrbit(rq, pri);
- if (idx != NULL && *idx == pri)
- *idx = (pri + 1) % RQ_NQS;
+ idx = runq_findq(rq, 0, RQ_NQS - 1, runq_fuzz_pred, &data);
+ if (idx != -1) {
+ MPASS(data.td != NULL);
+ CTR2(KTR_RUNQ, "runq_choose_fuzz: idx=%d td=%p", idx, data.td);
+ return (data.td);
}
+
+ MPASS(data.td == NULL);
+ CTR0(KTR_RUNQ, "runq_choose_fuzz: idlethread");
+ return (NULL);
}
diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
--- a/sys/kern/sched_4bsd.c
+++ b/sys/kern/sched_4bsd.c
@@ -48,6 +48,7 @@
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
+#include <sys/runq.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/smp.h>
@@ -72,15 +73,17 @@
* INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
* the range 100-256 Hz (approximately).
*/
-#define ESTCPULIM(e) \
- min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
- RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
#ifdef SMP
#define INVERSE_ESTCPU_WEIGHT (8 * smp_cpus)
#else
#define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */
#endif
#define NICE_WEIGHT 1 /* Priorities per nice level. */
+#define ESTCPULIM(e) \
+ min((e), INVERSE_ESTCPU_WEIGHT * \
+ (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) + \
+ PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) \
+ + INVERSE_ESTCPU_WEIGHT - 1)
#define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
@@ -683,13 +686,14 @@
/* Nothing needed. */
}
-int
+bool
sched_runnable(void)
{
#ifdef SMP
- return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
+ return (runq_not_empty(&runq) ||
+ runq_not_empty(&runq_pcpu[PCPU_GET(cpuid)]));
#else
- return runq_check(&runq);
+ return (runq_not_empty(&runq));
#endif
}
@@ -871,7 +875,7 @@
if (td->td_priority == prio)
return;
td->td_priority = prio;
- if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) {
+ if (TD_ON_RUNQ(td) && td->td_rqindex != RQ_PRI_TO_QUEUE_IDX(prio)) {
sched_rem(td);
sched_add(td, SRQ_BORING | SRQ_HOLDTD);
}
@@ -1682,7 +1686,7 @@
for (;;) {
mtx_assert(&Giant, MA_NOTOWNED);
- while (sched_runnable() == 0) {
+ while (!sched_runnable()) {
cpu_idle(stat->idlecalls + stat->oldidlecalls > 64);
stat->idlecalls++;
}
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -52,6 +52,7 @@
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
+#include <sys/runq.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/smp.h>
@@ -87,10 +88,9 @@
* Thread scheduler specific section. All fields are protected
* by the thread lock.
*/
-struct td_sched {
- struct runq *ts_runq; /* Run-queue we're queued on. */
+struct td_sched {
short ts_flags; /* TSF_* flags. */
- int ts_cpu; /* CPU that we have affinity for. */
+ int ts_cpu; /* CPU we are on, or were last on. */
int ts_rltick; /* Real last tick, for affinity. */
int ts_slice; /* Ticks of slice remaining. */
u_int ts_slptime; /* Number of ticks we vol. slept */
@@ -130,23 +130,6 @@
#define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE)
#define PRI_MAX_BATCH PRI_MAX_TIMESHARE
-/*
- * Cpu percentage computation macros and defines.
- *
- * SCHED_TICK_SECS: Number of seconds to average the cpu usage across.
- * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across.
- * SCHED_TICK_MAX: Maximum number of ticks before scaling back.
- * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results.
- * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count.
- * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks.
- */
-#define SCHED_TICK_SECS 10
-#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS)
-#define SCHED_TICK_MAX (SCHED_TICK_TARG + hz)
-#define SCHED_TICK_SHIFT 10
-#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT)
-#define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz))
-
/*
* These macros determine priorities for non-interactive threads. They are
* assigned a priority based on their recent cpu utilization as expressed
@@ -169,6 +152,48 @@
(roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
#define SCHED_PRI_NICE(nice) (nice)
+/*
+ * Runqueue indices for the implemented scheduling policies' priority bounds.
+ *
+ * In ULE's implementation, realtime policy covers the ITHD, REALTIME and
+ * INTERACT (see above) ranges, timesharing the BATCH range (see above), and
+ * idle policy the IDLE range.
+ *
+ * Priorities from these ranges must not be assigned to the same runqueue's
+ * queue.
+ */
+#define RQ_RT_POL_MIN (RQ_PRI_TO_QUEUE_IDX(PRI_MIN_ITHD))
+#define RQ_RT_POL_MAX (RQ_PRI_TO_QUEUE_IDX(PRI_MAX_INTERACT))
+#define RQ_TS_POL_MIN (RQ_PRI_TO_QUEUE_IDX(PRI_MIN_BATCH))
+#define RQ_TS_POL_MAX (RQ_PRI_TO_QUEUE_IDX(PRI_MAX_BATCH))
+#define RQ_ID_POL_MIN (RQ_PRI_TO_QUEUE_IDX(PRI_MIN_IDLE))
+#define RQ_ID_POL_MAX (RQ_PRI_TO_QUEUE_IDX(PRI_MAX_IDLE))
+
+_Static_assert(RQ_RT_POL_MAX != RQ_TS_POL_MIN,
+ "ULE's realtime and timeshare policies' runqueue ranges overlap");
+_Static_assert(RQ_TS_POL_MAX != RQ_ID_POL_MIN,
+ "ULE's timeshare and idle policies' runqueue ranges overlap");
+
+/* Helper to treat the timeshare range as a circular group of queues. */
+#define RQ_TS_POL_MODULO (RQ_TS_POL_MAX - RQ_TS_POL_MIN + 1)
+
+/*
+ * Cpu percentage computation macros and defines.
+ *
+ * SCHED_TICK_SECS: Number of seconds to average the cpu usage across.
+ * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across.
+ * SCHED_TICK_MAX: Maximum number of ticks before scaling back.
+ * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results.
+ * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count.
+ * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks.
+ */
+#define SCHED_TICK_SECS 10
+#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS)
+#define SCHED_TICK_MAX (SCHED_TICK_TARG + hz)
+#define SCHED_TICK_SHIFT 10
+#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT)
+#define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz))
+
/*
* These determine the interactivity of a process. Interactivity differs from
* cpu utilization in that it expresses the voluntary time slept vs time ran
@@ -252,12 +277,10 @@
short tdq_oldswitchcnt; /* (l) Switches last tick. */
u_char tdq_lowpri; /* (ts) Lowest priority thread. */
u_char tdq_owepreempt; /* (f) Remote preemption pending. */
- u_char tdq_idx; /* (t) Current insert index. */
- u_char tdq_ridx; /* (t) Current removal index. */
+ u_char tdq_ts_off; /* (t) TS insertion offset. */
+ u_char tdq_ts_deq_off; /* (t) TS dequeue offset. */
int tdq_id; /* (c) cpuid. */
- struct runq tdq_realtime; /* (t) real-time run queue. */
- struct runq tdq_timeshare; /* (t) timeshare run queue. */
- struct runq tdq_idle; /* (t) Queue of IDLE threads. */
+ struct runq tdq_runq; /* (t) Run queue. */
char tdq_name[TDQ_NAME_LEN];
#ifdef KTR
char tdq_loadname[TDQ_LOADNAME_LEN];
@@ -329,12 +352,17 @@
static void sched_pctcpu_update(struct td_sched *, int);
/* Operations on per processor queues */
+static inline struct thread *runq_choose_realtime(struct runq *const rq);
+static inline struct thread *runq_choose_timeshare(struct runq *const rq,
+ int off);
+static inline struct thread *runq_choose_idle(struct runq *const rq);
static struct thread *tdq_choose(struct tdq *);
+
static void tdq_setup(struct tdq *, int i);
static void tdq_load_add(struct tdq *, struct thread *);
static void tdq_load_rem(struct tdq *, struct thread *);
-static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
-static __inline void tdq_runq_rem(struct tdq *, struct thread *);
+static inline void tdq_runq_add(struct tdq *, struct thread *, int);
+static inline void tdq_runq_rem(struct tdq *, struct thread *);
static inline int sched_shouldpreempt(int, int, int);
static void tdq_print(int cpu);
static void runq_print(struct runq *rq);
@@ -343,8 +371,19 @@
static int tdq_move(struct tdq *, struct tdq *);
static int tdq_idled(struct tdq *);
static void tdq_notify(struct tdq *, int lowpri);
+
+static bool runq_steal_pred(const int idx, struct rq_queue *const q,
+ void *const data);
+static inline struct thread *runq_steal_range(struct runq *const rq,
+ const int lvl_min, const int lvl_max, int cpu);
+static inline struct thread *runq_steal_realtime(struct runq *const rq,
+ int cpu);
+static inline struct thread *runq_steal_timeshare(struct runq *const rq,
+ int cpu, int off);
+static inline struct thread *runq_steal_idle(struct runq *const rq,
+ int cpu);
static struct thread *tdq_steal(struct tdq *, int);
-static struct thread *runq_steal(struct runq *, int);
+
static int sched_pickcpu(struct thread *, int);
static void sched_balance(void);
static bool sched_balance_pair(struct tdq *, struct tdq *);
@@ -386,20 +425,20 @@
static void
runq_print(struct runq *rq)
{
- struct rqhead *rqh;
+ struct rq_queue *rqq;
struct thread *td;
int pri;
int j;
int i;
- for (i = 0; i < RQB_LEN; i++) {
+ for (i = 0; i < RQSW_NB; i++) {
printf("\t\trunq bits %d 0x%zx\n",
- i, rq->rq_status.rqb_bits[i]);
- for (j = 0; j < RQB_BPW; j++)
- if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
- pri = j + (i << RQB_L2BPW);
- rqh = &rq->rq_queues[pri];
- TAILQ_FOREACH(td, rqh, td_runq) {
+ i, rq->rq_status.rq_sw[i]);
+ for (j = 0; j < RQSW_BPW; j++)
+ if (rq->rq_status.rq_sw[i] & (1ul << j)) {
+ pri = RQSW_TO_QUEUE_IDX(i, j);
+ rqq = &rq->rq_queues[pri];
+ TAILQ_FOREACH(td, rqq, td_runq) {
printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
td, td->td_name, td->td_priority,
td->td_rqindex, pri);
@@ -419,21 +458,17 @@
tdq = TDQ_CPU(cpu);
printf("tdq %d:\n", TDQ_ID(tdq));
- printf("\tlock %p\n", TDQ_LOCKPTR(tdq));
- printf("\tLock name: %s\n", tdq->tdq_name);
- printf("\tload: %d\n", tdq->tdq_load);
- printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt);
- printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt);
- printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
- printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
+ printf("\tlock %p\n", TDQ_LOCKPTR(tdq));
+ printf("\tLock name: %s\n", tdq->tdq_name);
+ printf("\tload: %d\n", tdq->tdq_load);
+ printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt);
+ printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt);
+ printf("\tTS insert offset: %d\n", tdq->tdq_ts_off);
+ printf("\tTS dequeue offset: %d\n", tdq->tdq_ts_deq_off);
printf("\tload transferable: %d\n", tdq->tdq_transferable);
printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
- printf("\trealtime runq:\n");
- runq_print(&tdq->tdq_realtime);
- printf("\ttimeshare runq:\n");
- runq_print(&tdq->tdq_timeshare);
- printf("\tidle runq:\n");
- runq_print(&tdq->tdq_idle);
+ printf("\trunq:\n");
+ runq_print(&tdq->tdq_runq);
}
static inline int
@@ -474,11 +509,11 @@
* date with what is actually on the run-queue. Selects the correct
* queue position for timeshare threads.
*/
-static __inline void
+static inline void
tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
{
struct td_sched *ts;
- u_char pri;
+ u_char pri, idx;
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
@@ -490,62 +525,68 @@
tdq->tdq_transferable++;
ts->ts_flags |= TSF_XFERABLE;
}
- if (pri < PRI_MIN_BATCH) {
- ts->ts_runq = &tdq->tdq_realtime;
- } else if (pri <= PRI_MAX_BATCH) {
- ts->ts_runq = &tdq->tdq_timeshare;
- KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH,
- ("Invalid priority %d on timeshare runq", pri));
+ if (PRI_MIN_BATCH <= pri && pri <= PRI_MAX_BATCH) {
/*
- * This queue contains only priorities between MIN and MAX
- * batch. Use the whole queue to represent these values.
+ * The queues allocated to the batch range are not used as
+ * a simple array but as a "circular" one where the insertion
+ * index (derived from 'pri') is offset by 'tdq_ts_off'. 'idx'
+ * is first set to the offset of the wanted queue in the TS'
+ * selection policy range.
*/
- if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
- pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE;
- pri = (pri + tdq->tdq_idx) % RQ_NQS;
+ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) != 0)
+ /* Current queue from which processes are being run. */
+ idx = tdq->tdq_ts_deq_off;
+ else {
+ idx = (RQ_PRI_TO_QUEUE_IDX(pri) - RQ_TS_POL_MIN +
+ tdq->tdq_ts_off) % RQ_TS_POL_MODULO;
/*
- * This effectively shortens the queue by one so we
- * can have a one slot difference between idx and
- * ridx while we wait for threads to drain.
+ * We avoid enqueuing low priority threads in the queue
+ * that we are still draining, effectively shortening
+ * the runqueue by one queue.
*/
- if (tdq->tdq_ridx != tdq->tdq_idx &&
- pri == tdq->tdq_ridx)
- pri = (unsigned char)(pri - 1) % RQ_NQS;
- } else
- pri = tdq->tdq_ridx;
- runq_add_pri(ts->ts_runq, td, pri, flags);
- return;
+ if (tdq->tdq_ts_deq_off != tdq->tdq_ts_off &&
+ idx == tdq->tdq_ts_deq_off)
+ /* Ensure the dividend is positive. */
+ idx = (idx - 1 + RQ_TS_POL_MODULO) %
+ RQ_TS_POL_MODULO;
+ }
+ /* Absolute queue index. */
+ idx += RQ_TS_POL_MIN;
+ runq_add_idx(&tdq->tdq_runq, td, idx, flags);
} else
- ts->ts_runq = &tdq->tdq_idle;
- runq_add(ts->ts_runq, td, flags);
+ runq_add(&tdq->tdq_runq, td, flags);
}
-/*
+/*
* Remove a thread from a run-queue. This typically happens when a thread
* is selected to run. Running threads are not on the queue and the
* transferable count does not reflect them.
*/
-static __inline void
+static inline void
tdq_runq_rem(struct tdq *tdq, struct thread *td)
{
struct td_sched *ts;
+ bool queue_empty;
ts = td_get_sched(td);
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED);
- KASSERT(ts->ts_runq != NULL,
- ("tdq_runq_remove: thread %p null ts_runq", td));
if (ts->ts_flags & TSF_XFERABLE) {
tdq->tdq_transferable--;
ts->ts_flags &= ~TSF_XFERABLE;
}
- if (ts->ts_runq == &tdq->tdq_timeshare) {
- if (tdq->tdq_idx != tdq->tdq_ridx)
- runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx);
- else
- runq_remove_idx(ts->ts_runq, td, NULL);
- } else
- runq_remove(ts->ts_runq, td);
+ queue_empty = runq_remove(&tdq->tdq_runq, td);
+ /*
+ * If thread has a batch priority and the queue from which it was
+ * removed is now empty, advance the batch's queue removal index if it
+ * lags with respect to the batch's queue insertion index.
+ */
+ if (queue_empty && PRI_MIN_BATCH <= td->td_priority &&
+ td->td_priority <= PRI_MAX_BATCH &&
+ tdq->tdq_ts_off != tdq->tdq_ts_deq_off &&
+ tdq->tdq_ts_deq_off == td->td_rqindex)
+ tdq->tdq_ts_deq_off = (tdq->tdq_ts_deq_off + 1) %
+ RQ_TS_POL_MODULO;
}
/*
@@ -1178,82 +1219,84 @@
ipi_cpu(cpu, IPI_PREEMPT);
}
+struct runq_steal_pred_data {
+ struct thread *td;
+ int cpu;
+};
+
+static bool
+runq_steal_pred(const int idx, struct rq_queue *const q, void *const data)
+{
+ struct runq_steal_pred_data *const d = data;
+ struct thread *td;
+
+ TAILQ_FOREACH(td, q, td_runq) {
+ if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, d->cpu)) {
+ d->td = td;
+ return (true);
+ }
+ }
+
+ return (false);
+}
+
+/*
+ * Steals load contained in queues with indices in the specified range.
+ */
+static inline struct thread *
+runq_steal_range(struct runq *const rq, const int lvl_min, const int lvl_max,
+ int cpu)
+{
+ struct runq_steal_pred_data data = {
+ .td = NULL,
+ .cpu = cpu,
+ };
+ int idx;
+
+ idx = runq_findq(rq, lvl_min, lvl_max, &runq_steal_pred, &data);
+ if (idx != -1) {
+ MPASS(data.td != NULL);
+ return (data.td);
+ }
+
+ MPASS(data.td == NULL);
+ return (NULL);
+}
+
+static inline struct thread *
+runq_steal_realtime(struct runq *const rq, int cpu)
+{
+
+ return (runq_steal_range(rq, RQ_RT_POL_MIN, RQ_RT_POL_MAX, cpu));
+}
+
/*
* Steals load from a timeshare queue. Honors the rotating queue head
* index.
*/
-static struct thread *
-runq_steal_from(struct runq *rq, int cpu, u_char start)
+static inline struct thread *
+runq_steal_timeshare(struct runq *const rq, int cpu, int off)
{
- struct rqbits *rqb;
- struct rqhead *rqh;
- struct thread *td, *first;
- int bit;
- int i;
-
- rqb = &rq->rq_status;
- bit = start & (RQB_BPW -1);
- first = NULL;
-again:
- for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
- if (rqb->rqb_bits[i] == 0)
- continue;
- if (bit == 0)
- bit = RQB_FFS(rqb->rqb_bits[i]);
- for (; bit < RQB_BPW; bit++) {
- if ((rqb->rqb_bits[i] & (1ul << bit)) == 0)
- continue;
- rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)];
- TAILQ_FOREACH(td, rqh, td_runq) {
- if (first) {
- if (THREAD_CAN_MIGRATE(td) &&
- THREAD_CAN_SCHED(td, cpu))
- return (td);
- } else
- first = td;
- }
- }
- }
- if (start != 0) {
- start = 0;
- goto again;
- }
-
- if (first && THREAD_CAN_MIGRATE(first) &&
- THREAD_CAN_SCHED(first, cpu))
- return (first);
- return (NULL);
-}
-
-/*
- * Steals load from a standard linear queue.
- */
-static struct thread *
-runq_steal(struct runq *rq, int cpu)
-{
- struct rqhead *rqh;
- struct rqbits *rqb;
struct thread *td;
- int word;
- int bit;
- rqb = &rq->rq_status;
- for (word = 0; word < RQB_LEN; word++) {
- if (rqb->rqb_bits[word] == 0)
- continue;
- for (bit = 0; bit < RQB_BPW; bit++) {
- if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
- continue;
- rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
- TAILQ_FOREACH(td, rqh, td_runq)
- if (THREAD_CAN_MIGRATE(td) &&
- THREAD_CAN_SCHED(td, cpu))
- return (td);
- }
- }
- return (NULL);
+ MPASS(0 <= off && off < RQ_TS_POL_MODULO);
+
+ td = runq_steal_range(rq, RQ_TS_POL_MIN + off, RQ_TS_POL_MAX, cpu);
+ if (td != NULL || off == 0)
+ return (td);
+
+ td = runq_steal_range(rq, RQ_TS_POL_MIN, RQ_TS_POL_MIN + off - 1, cpu);
+ return (td);
}
+static inline struct thread *
+runq_steal_idle(struct runq *const rq, int cpu)
+{
+
+ return (runq_steal_range(rq, RQ_ID_POL_MIN, RQ_ID_POL_MAX, cpu));
+}
+
+
/*
* Attempt to steal a thread in priority order from a thread queue.
*/
@@ -1263,12 +1306,13 @@
struct thread *td;
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
- if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
+ td = runq_steal_realtime(&tdq->tdq_runq, cpu);
+ if (td != NULL)
return (td);
- if ((td = runq_steal_from(&tdq->tdq_timeshare,
- cpu, tdq->tdq_ridx)) != NULL)
+ td = runq_steal_timeshare(&tdq->tdq_runq, cpu, tdq->tdq_ts_deq_off);
+ if (td != NULL)
return (td);
- return (runq_steal(&tdq->tdq_idle, cpu));
+ return (runq_steal_idle(&tdq->tdq_runq, cpu));
}
/*
@@ -1450,6 +1494,35 @@
}
#endif
+static inline struct thread *
+runq_choose_realtime(struct runq *const rq)
+{
+
+ return (runq_first_thread_range(rq, RQ_RT_POL_MIN, RQ_RT_POL_MAX));
+}
+
+static struct thread *
+runq_choose_timeshare(struct runq *const rq, int off)
+{
+ struct thread *td;
+
+ MPASS(0 <= off && off < RQ_TS_POL_MODULO);
+
+ td = runq_first_thread_range(rq, RQ_TS_POL_MIN + off, RQ_TS_POL_MAX);
+ if (td != NULL || off == 0)
+ return (td);
+
+ td = runq_first_thread_range(rq, RQ_TS_POL_MIN, RQ_TS_POL_MIN + off - 1);
+ return (td);
+}
+
+static inline struct thread *
+runq_choose_idle(struct runq *const rq)
+{
+
+ return (runq_first_thread_range(rq, RQ_ID_POL_MIN, RQ_ID_POL_MAX));
+}
+
/*
* Pick the highest priority task we have and return it.
*/
@@ -1459,17 +1532,17 @@
struct thread *td;
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
- td = runq_choose(&tdq->tdq_realtime);
+ td = runq_choose_realtime(&tdq->tdq_runq);
if (td != NULL)
return (td);
- td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
+ td = runq_choose_timeshare(&tdq->tdq_runq, tdq->tdq_ts_deq_off);
if (td != NULL) {
KASSERT(td->td_priority >= PRI_MIN_BATCH,
("tdq_choose: Invalid priority on timeshare queue %d",
td->td_priority));
return (td);
}
- td = runq_choose(&tdq->tdq_idle);
+ td = runq_choose_idle(&tdq->tdq_runq);
if (td != NULL) {
KASSERT(td->td_priority >= PRI_MIN_IDLE,
("tdq_choose: Invalid priority on idle queue %d",
@@ -1489,9 +1562,7 @@
if (bootverbose)
printf("ULE: setup cpu %d\n", id);
- runq_init(&tdq->tdq_realtime);
- runq_init(&tdq->tdq_timeshare);
- runq_init(&tdq->tdq_idle);
+ runq_init(&tdq->tdq_runq);
tdq->tdq_id = id;
snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
"sched lock %d", (int)TDQ_ID(tdq));
@@ -2595,13 +2666,14 @@
tdq->tdq_switchcnt = tdq->tdq_load;
/*
- * Advance the insert index once for each tick to ensure that all
+ * Advance the insert offset once for each tick to ensure that all
* threads get a chance to run.
*/
- if (tdq->tdq_idx == tdq->tdq_ridx) {
- tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
- if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
- tdq->tdq_ridx = tdq->tdq_idx;
+ if (tdq->tdq_ts_off == tdq->tdq_ts_deq_off) {
+ tdq->tdq_ts_off = (tdq->tdq_ts_off + 1) % RQ_TS_POL_MODULO;
+ if (runq_is_queue_empty(&tdq->tdq_runq,
+ tdq->tdq_ts_deq_off + RQ_TS_POL_MIN))
+ tdq->tdq_ts_deq_off = tdq->tdq_ts_off;
}
ts = td_get_sched(td);
sched_pctcpu_update(ts, 1);
@@ -2655,24 +2727,20 @@
* Return whether the current CPU has runnable tasks. Used for in-kernel
* cooperative idle threads.
*/
-int
+bool
sched_runnable(void)
{
struct tdq *tdq;
- int load;
-
- load = 1;
tdq = TDQ_SELF();
if ((curthread->td_flags & TDF_IDLETD) != 0) {
if (TDQ_LOAD(tdq) > 0)
- goto out;
+ return (true);
} else
if (TDQ_LOAD(tdq) - 1 > 0)
- goto out;
- load = 0;
-out:
- return (load);
+ return (true);
+
+ return (false);
}
/*
diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c
--- a/sys/kern/subr_log.c
+++ b/sys/kern/subr_log.c
@@ -47,7 +47,7 @@
#include <sys/filedesc.h>
#include <sys/sysctl.h>
-#define LOG_RDPRI (PZERO + 1)
+#define LOG_RDPRI PZERO
#define LOG_ASYNC 0x04
diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c
--- a/sys/kern/sysv_msg.c
+++ b/sys/kern/sysv_msg.c
@@ -894,7 +894,7 @@
we_own_it = 1;
}
DPRINTF(("msgsnd: goodnight\n"));
- error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
+ error = msleep(msqkptr, &msq_mtx, PVFS | PCATCH,
"msgsnd", hz);
DPRINTF(("msgsnd: good morning, error=%d\n", error));
if (we_own_it)
@@ -1303,7 +1303,7 @@
*/
DPRINTF(("msgrcv: goodnight\n"));
- error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
+ error = msleep(msqkptr, &msq_mtx, PVFS | PCATCH,
"msgrcv", 0);
DPRINTF(("msgrcv: good morning (error=%d)\n", error));
diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c
--- a/sys/kern/sysv_sem.c
+++ b/sys/kern/sysv_sem.c
@@ -1309,7 +1309,7 @@
semptr->semncnt++;
DPRINTF(("semop: good night!\n"));
- error = msleep_sbt(semakptr, sema_mtxp, (PZERO - 4) | PCATCH,
+ error = msleep_sbt(semakptr, sema_mtxp, PVFS | PCATCH,
"semwait", sbt, precision, C_ABSOLUTE);
DPRINTF(("semop: good morning (error=%d)!\n", error));
/* return code is checked below, after sem[nz]cnt-- */
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -756,7 +756,7 @@
break;
}
error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
- (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
+ PVFS | slpflag, "newbuf", slptimeo);
if (error != 0)
break;
}
@@ -2654,8 +2654,7 @@
mtx_lock(&bdirtylock);
while (buf_dirty_count_severe()) {
bdirtywait = 1;
- msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
- "flswai", 0);
+ msleep(&bdirtywait, &bdirtylock, PVFS, "flswai", 0);
}
mtx_unlock(&bdirtylock);
}
@@ -5234,7 +5233,7 @@
while (bo->bo_numoutput) {
bo->bo_flag |= BO_WWAIT;
error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
- slpflag | (PRIBIO + 1), "bo_wwait", timeo);
+ slpflag | PRIBIO, "bo_wwait", timeo);
if (error)
break;
}
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -787,7 +787,7 @@
}
DROP_GIANT();
sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
- sleepq_wait(&fp->f_vnread_flags, PUSER -1);
+ sleepq_wait(&fp->f_vnread_flags, PRI_MAX_KERN);
PICKUP_GIANT();
sleepq_lock(&fp->f_vnread_flags);
state = atomic_load_16(flagsp);
@@ -849,7 +849,7 @@
if ((flags & FOF_NOLOCK) == 0) {
while (fp->f_vnread_flags & FOFFSET_LOCKED) {
fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
- msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+ msleep(&fp->f_vnread_flags, mtxp, PRI_MAX_KERN,
"vofflock", 0);
}
fp->f_vnread_flags |= FOFFSET_LOCKED;
@@ -1897,7 +1897,7 @@
if (flags & V_PCATCH)
mflags |= PCATCH;
}
- mflags |= (PUSER - 1);
+ mflags |= PRI_MAX_KERN;
while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
if ((flags & V_NOWAIT) != 0) {
error = EWOULDBLOCK;
@@ -2022,7 +2022,7 @@
if ((flags & V_PCATCH) != 0)
mflags |= PCATCH;
}
- mflags |= (PUSER - 1) | PDROP;
+ mflags |= PRI_MAX_KERN | PDROP;
error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0);
vfs_rel(mp);
if (error == 0)
@@ -2107,7 +2107,7 @@
return (EALREADY);
}
while (mp->mnt_kern_flag & MNTK_SUSPEND)
- msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
+ msleep(&mp->mnt_flag, MNT_MTX(mp), PRI_MAX_KERN, "wsuspfs", 0);
/*
* Unmount holds a write reference on the mount point. If we
@@ -2128,7 +2128,7 @@
mp->mnt_susp_owner = curthread;
if (mp->mnt_writeopcount > 0)
(void) msleep(&mp->mnt_writeopcount,
- MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
+ MNT_MTX(mp), PRI_MAX_KERN | PDROP, "suspwt", 0);
else
MNT_IUNLOCK(mp);
if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) {
diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c
--- a/sys/net/if_tuntap.c
+++ b/sys/net/if_tuntap.c
@@ -834,7 +834,7 @@
tp->tun_flags &= ~TUN_RWAIT;
wakeup(tp);
}
- selwakeuppri(&tp->tun_rsel, PZERO + 1);
+ selwakeuppri(&tp->tun_rsel, PZERO);
KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) {
TUN_UNLOCK(tp);
@@ -1172,7 +1172,7 @@
CURVNET_RESTORE();
funsetown(&tp->tun_sigio);
- selwakeuppri(&tp->tun_rsel, PZERO + 1);
+ selwakeuppri(&tp->tun_rsel, PZERO);
KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
TUNDEBUG (ifp, "closed\n");
tp->tun_flags &= ~TUN_OPEN;
@@ -1706,7 +1706,7 @@
return (EWOULDBLOCK);
}
tp->tun_flags |= TUN_RWAIT;
- error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1),
+ error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | PZERO,
"tunread", 0);
if (error != 0) {
TUN_UNLOCK(tp);
diff --git a/sys/netgraph/ng_device.c b/sys/netgraph/ng_device.c
--- a/sys/netgraph/ng_device.c
+++ b/sys/netgraph/ng_device.c
@@ -462,7 +462,7 @@
mtx_lock(&priv->ngd_mtx);
priv->flags |= NGDF_RWAIT;
if ((error = msleep(priv, &priv->ngd_mtx,
- PDROP | PCATCH | (PZERO + 1),
+ PDROP | PCATCH | PZERO,
"ngdread", 0)) != 0)
return (error);
}
diff --git a/sys/powerpc/include/runq.h b/sys/powerpc/include/runq.h
deleted file mode 100644
--- a/sys/powerpc/include/runq.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause
- *
- * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _MACHINE_RUNQ_H_
-#define _MACHINE_RUNQ_H_
-
-#ifdef __powerpc64__
-#define RQB_LEN (1UL) /* Number of priority status words. */
-#define RQB_L2BPW (6UL) /* Log2(sizeof(rqb_word_t) * NBBY)). */
-#else
-#define RQB_LEN (2) /* Number of priority status words. */
-#define RQB_L2BPW (5) /* Log2(sizeof(rqb_word_t) * NBBY)). */
-#endif
-#define RQB_BPW (1UL<<RQB_L2BPW) /* Bits in an rqb_word_t. */
-
-#define RQB_BIT(pri) (1UL << ((pri) & (RQB_BPW - 1)))
-#define RQB_WORD(pri) ((pri) >> RQB_L2BPW)
-
-#define RQB_FFS(word) (ffsl(word) - 1)
-
-/*
- * Type of run queue status word.
- */
-#ifdef __powerpc64__
-typedef u_int64_t rqb_word_t;
-#else
-typedef u_int32_t rqb_word_t;
-#endif
-
-#endif
diff --git a/sys/riscv/include/runq.h b/sys/riscv/include/runq.h
deleted file mode 100644
--- a/sys/riscv/include/runq.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*-
- * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _MACHINE_RUNQ_H_
-#define _MACHINE_RUNQ_H_
-
-#define RQB_LEN (1) /* Number of priority status words. */
-#define RQB_L2BPW (6) /* Log2(sizeof(rqb_word_t) * NBBY)). */
-#define RQB_BPW (1<<RQB_L2BPW) /* Bits in an rqb_word_t. */
-
-#define RQB_BIT(pri) (1ul << ((pri) & (RQB_BPW - 1)))
-#define RQB_WORD(pri) ((pri) >> RQB_L2BPW)
-
-#define RQB_FFS(word) (ffsl(word) - 1)
-
-/*
- * Type of run queue status word.
- */
-typedef unsigned long rqb_word_t;
-
-#endif
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -296,7 +296,7 @@
* Initialize a lock.
*/
#define BUF_LOCKINIT(bp, wmesg) \
- lockinit(&(bp)->b_lock, PRIBIO + 4, wmesg, 0, LK_NEW)
+ lockinit(&(bp)->b_lock, PVFS, wmesg, 0, LK_NEW)
/*
*
* Get a lock sleeping non-interruptably until it becomes available.
@@ -311,7 +311,7 @@
*/
#define BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo) \
_lockmgr_args_rw(&(bp)->b_lock, (locktype) | LK_TIMELOCK, \
- (interlock), (wmesg), (PRIBIO + 4) | (catch), (timo), \
+ (interlock), (wmesg), PVFS | (catch), (timo), \
LOCK_FILE, LOCK_LINE)
/*
diff --git a/sys/sys/param.h b/sys/sys/param.h
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -73,7 +73,7 @@
* cannot include sys/param.h and should only be updated here.
*/
#undef __FreeBSD_version
-#define __FreeBSD_version 1500018
+#define __FreeBSD_version 1500019
/*
* __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
diff --git a/sys/sys/priority.h b/sys/sys/priority.h
--- a/sys/sys/priority.h
+++ b/sys/sys/priority.h
@@ -64,17 +64,23 @@
*/
/*
- * Priorities range from 0 to 255, but differences of less then 4 (RQ_PPQ)
- * are insignificant. Ranges are as follows:
+ * Priorities range from 0 to 255. Ranges are as follows:
*
- * Interrupt threads: 0 - 15
- * Realtime user threads: 16 - 47
- * Top half kernel threads: 48 - 87
- * Time sharing user threads: 88 - 223
+ * Interrupt threads: 0 - 7
+ * Realtime user threads: 8 - 39
+ * Top half kernel threads: 40 - 55
+ * Time sharing user threads: 56 - 223
* Idle user threads: 224 - 255
*
- * XXX If/When the specific interrupt thread and top half thread ranges
- * disappear, a larger range can be used for user processes.
+ * Priority levels of rtprio(2)'s RTP_PRIO_FIFO and RTP_PRIO_REALTIME and
+ * POSIX's SCHED_FIFO and SCHED_RR are directly mapped to the internal realtime
+ * range mentioned above by a simple translation. This range's length
+ * consequently cannot be changed without impacts on the scheduling priority
+ * code, and in any case must never be smaller than 32 for POSIX compliance and
+ * rtprio(2) backwards compatibility. Similarly, priority levels of rtprio(2)'s
+ * RTP_PRIO_IDLE are directly mapped to the internal idle range above (and,
+ * soon, those of the to-be-introduced SCHED_IDLE policy as well), so changing
+ * that range is subject to the same caveats and restrictions.
*/
#define PRI_MIN (0) /* Highest priority. */
@@ -88,34 +94,34 @@
* decay to lower priorities if they run for full time slices.
*/
#define PI_REALTIME (PRI_MIN_ITHD + 0)
-#define PI_INTR (PRI_MIN_ITHD + 4)
+#define PI_INTR (PRI_MIN_ITHD + 1)
#define PI_AV PI_INTR
#define PI_NET PI_INTR
#define PI_DISK PI_INTR
#define PI_TTY PI_INTR
#define PI_DULL PI_INTR
-#define PI_SOFT (PRI_MIN_ITHD + 8)
+#define PI_SOFT (PRI_MIN_ITHD + 2)
#define PI_SOFTCLOCK PI_SOFT
#define PI_SWI(x) PI_SOFT
-#define PRI_MIN_REALTIME (16)
+#define PRI_MIN_REALTIME (8)
#define PRI_MAX_REALTIME (PRI_MIN_KERN - 1)
-#define PRI_MIN_KERN (48)
+#define PRI_MIN_KERN (40)
#define PRI_MAX_KERN (PRI_MIN_TIMESHARE - 1)
#define PSWP (PRI_MIN_KERN + 0)
-#define PVM (PRI_MIN_KERN + 4)
-#define PINOD (PRI_MIN_KERN + 8)
-#define PRIBIO (PRI_MIN_KERN + 12)
-#define PVFS (PRI_MIN_KERN + 16)
-#define PZERO (PRI_MIN_KERN + 20)
-#define PSOCK (PRI_MIN_KERN + 24)
-#define PWAIT (PRI_MIN_KERN + 28)
-#define PLOCK (PRI_MIN_KERN + 32)
-#define PPAUSE (PRI_MIN_KERN + 36)
+#define PVM (PRI_MIN_KERN + 1)
+#define PINOD (PRI_MIN_KERN + 2)
+#define PRIBIO (PRI_MIN_KERN + 3)
+#define PVFS (PRI_MIN_KERN + 4)
+#define PZERO (PRI_MIN_KERN + 5)
+#define PSOCK (PRI_MIN_KERN + 6)
+#define PWAIT (PRI_MIN_KERN + 7)
+#define PLOCK (PRI_MIN_KERN + 8)
+#define PPAUSE (PRI_MIN_KERN + 9)
-#define PRI_MIN_TIMESHARE (88)
+#define PRI_MIN_TIMESHARE (56)
#define PRI_MAX_TIMESHARE (PRI_MIN_IDLE - 1)
#define PUSER (PRI_MIN_TIMESHARE)
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -53,7 +53,6 @@
#include <sys/osd.h>
#include <sys/priority.h>
#include <sys/rtprio.h> /* XXX. */
-#include <sys/runq.h>
#include <sys/resource.h>
#include <sys/sigio.h>
#include <sys/signal.h>
diff --git a/sys/sys/runq.h b/sys/sys/runq.h
--- a/sys/sys/runq.h
+++ b/sys/sys/runq.h
@@ -29,7 +29,11 @@
#ifndef _RUNQ_H_
#define _RUNQ_H_
-#include <machine/runq.h>
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+#include <sys/types.h> /* For bool. */
struct thread;
@@ -37,20 +41,65 @@
* Run queue parameters.
*/
-#define RQ_NQS (64) /* Number of run queues. */
-#define RQ_PPQ (4) /* Priorities per queue. */
+#define RQ_MAX_PRIO (255) /* Maximum priority (minimum is 0). */
+#define RQ_PPQ (1) /* Priorities per queue. */
/*
- * Head of run queues.
+ * Convenience macros from <sys/param.h>.
*/
-TAILQ_HEAD(rqhead, thread);
+#ifndef NBBY
+#define NBBY 8
+#endif
+#ifndef howmany
+#define howmany(x, y) (((x)+((y)-1))/(y))
+#endif
+
+/*
+ * Deduced from the above parameters and machine ones.
+ */
+#define RQ_NQS (howmany(RQ_MAX_PRIO + 1, RQ_PPQ)) /* Number of run queues. */
+#define RQ_PRI_TO_QUEUE_IDX(pri) ((pri) / RQ_PPQ) /* Priority to queue index. */
+
+typedef unsigned long rqsw_t; /* runq's status words type. */
+#define RQSW_BPW (sizeof(rqsw_t) * NBBY) /* Bits per runq word. */
+#if defined(_LP64)
+#define RQSW_L2BPW (6) /* Log2(sizeof(rqsw_t) * NBBY). */
+#elif defined(_ILP32)
+#define RQSW_L2BPW (5) /* Log2(sizeof(rqsw_t) * NBBY). */
+#else
+#error Not _LP64 nor _ILP32!
+#endif
+/*
+ * That RQSW_BPW and RQSW_L2BPW are consistent is checked by a static assertion.
+ */
+
+/* Number of status words to cover RQ_NQS queues. */
+#define RQSW_NB (howmany(RQ_NQS, RQSW_BPW))
+#define RQSW_IDX(idx) ((idx) >> RQSW_L2BPW)
+#define RQSW_BIT_IDX(idx) ((idx) & (RQSW_BPW - 1))
+#define RQSW_BIT(idx) (1ul << RQSW_BIT_IDX(idx))
+#define RQSW_BSF(word) ({ \
+ int _res = ffsl((long)(word)); /* Assumes two's complement. */ \
+ MPASS(_res > 0); \
+ _res - 1; \
+})
+#define RQSW_TO_QUEUE_IDX(word_idx, bit_idx) \
+ (((word_idx) << RQSW_L2BPW) + (bit_idx))
+#define RQSW_FIRST_QUEUE_IDX(word_idx, word) \
+ RQSW_TO_QUEUE_IDX(word_idx, RQSW_BSF(word))
+
+
+/*
+ * The queue for a given index as a list of threads.
+ */
+TAILQ_HEAD(rq_queue, thread);
/*
* Bit array which maintains the status of a run queue. When a queue is
* non-empty the bit corresponding to the queue number will be set.
*/
-struct rqbits {
- rqb_word_t rqb_bits[RQB_LEN];
+struct rq_status {
+ rqsw_t rq_sw[RQSW_NB];
};
/*
@@ -58,18 +107,29 @@
* are placed, and a structure to maintain the status of each queue.
*/
struct runq {
- struct rqbits rq_status;
- struct rqhead rq_queues[RQ_NQS];
+ struct rq_status rq_status;
+ struct rq_queue rq_queues[RQ_NQS];
};
-void runq_add(struct runq *, struct thread *, int);
-void runq_add_pri(struct runq *, struct thread *, u_char, int);
-int runq_check(struct runq *);
-struct thread *runq_choose(struct runq *);
-struct thread *runq_choose_from(struct runq *, u_char);
-struct thread *runq_choose_fuzz(struct runq *, int);
void runq_init(struct runq *);
-void runq_remove(struct runq *, struct thread *);
-void runq_remove_idx(struct runq *, struct thread *, u_char *);
+bool runq_is_queue_empty(struct runq *, int _idx);
+void runq_add(struct runq *, struct thread *, int _flags);
+void runq_add_idx(struct runq *, struct thread *, int _idx, int _flags);
+bool runq_remove(struct runq *, struct thread *);
+
+/*
+ * Implementation helpers for common and scheduler-specific runq_choose*()
+ * functions.
+ */
+typedef bool runq_pred_t(int _idx, struct rq_queue *, void *_data);
+int runq_findq(struct runq *const rq, const int lvl_min,
+ const int lvl_max,
+ runq_pred_t *const pred, void *const pred_data);
+struct thread *runq_first_thread_range(struct runq *const rq,
+ const int lvl_min, const int lvl_max);
+
+bool runq_not_empty(struct runq *);
+struct thread *runq_choose(struct runq *);
+struct thread *runq_choose_fuzz(struct runq *, int _fuzz);
#endif
diff --git a/sys/sys/sched.h b/sys/sys/sched.h
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -63,6 +63,9 @@
#define _SCHED_H_
#ifdef _KERNEL
+
+#include <sys/types.h> /* For bool. */
+
/*
* General scheduling info.
*
@@ -74,7 +77,7 @@
*/
int sched_load(void);
int sched_rr_interval(void);
-int sched_runnable(void);
+bool sched_runnable(void);
/*
* Proc related scheduling hooks.
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -497,7 +497,7 @@
while (mp->mnt_secondary_writes != 0) {
BO_UNLOCK(bo);
msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
- (PUSER - 1) | PDROP, "secwr", 0);
+ PRI_MAX_KERN | PDROP, "secwr", 0);
BO_LOCK(bo);
MNT_ILOCK(mp);
}
@@ -14561,7 +14561,7 @@
while (mp->mnt_secondary_writes != 0) {
BO_UNLOCK(bo);
msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
- (PUSER - 1) | PDROP, "secwr", 0);
+ PRI_MAX_KERN | PDROP, "secwr", 0);
BO_LOCK(bo);
MNT_ILOCK(mp);
}
@@ -14601,7 +14601,7 @@
BO_UNLOCK(bo);
msleep(&mp->mnt_secondary_writes,
MNT_MTX(mp),
- (PUSER - 1) | PDROP, "secwr", 0);
+ PRI_MAX_KERN | PDROP, "secwr", 0);
BO_LOCK(bo);
continue;
}
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -1399,8 +1399,7 @@
VI_LOCK(vp);
while (ip->i_flag & IN_EA_LOCKED) {
UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT);
- msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
- 0);
+ msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD, "ufs_ea", 0);
}
UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED);
VI_UNLOCK(vp);
diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c
--- a/sys/ufs/ufs/ufs_quota.c
+++ b/sys/ufs/ufs/ufs_quota.c
@@ -179,7 +179,7 @@
if ((dq = ip->i_dquot[i]) == NODQUOT)
continue;
DQI_LOCK(dq);
- DQI_WAIT(dq, PINOD+1, "chkdq1");
+ DQI_WAIT(dq, PINOD, "chkdq1");
ncurblocks = dq->dq_curblocks + change;
if (ncurblocks >= 0)
dq->dq_curblocks = ncurblocks;
@@ -201,7 +201,7 @@
continue;
warn = 0;
DQI_LOCK(dq);
- DQI_WAIT(dq, PINOD+1, "chkdq2");
+ DQI_WAIT(dq, PINOD, "chkdq2");
if (do_check) {
error = chkdqchg(ip, change, cred, i, &warn);
if (error) {
@@ -215,7 +215,7 @@
if (dq == NODQUOT)
continue;
DQI_LOCK(dq);
- DQI_WAIT(dq, PINOD+1, "chkdq3");
+ DQI_WAIT(dq, PINOD, "chkdq3");
ncurblocks = dq->dq_curblocks - change;
if (ncurblocks >= 0)
dq->dq_curblocks = ncurblocks;
@@ -320,7 +320,7 @@
if ((dq = ip->i_dquot[i]) == NODQUOT)
continue;
DQI_LOCK(dq);
- DQI_WAIT(dq, PINOD+1, "chkiq1");
+ DQI_WAIT(dq, PINOD, "chkiq1");
if (dq->dq_curinodes >= -change)
dq->dq_curinodes += change;
else
@@ -341,7 +341,7 @@
continue;
warn = 0;
DQI_LOCK(dq);
- DQI_WAIT(dq, PINOD+1, "chkiq2");
+ DQI_WAIT(dq, PINOD, "chkiq2");
if (do_check) {
error = chkiqchg(ip, change, cred, i, &warn);
if (error) {
@@ -355,7 +355,7 @@
if (dq == NODQUOT)
continue;
DQI_LOCK(dq);
- DQI_WAIT(dq, PINOD+1, "chkiq3");
+ DQI_WAIT(dq, PINOD, "chkiq3");
if (dq->dq_curinodes >= change)
dq->dq_curinodes -= change;
else
@@ -855,7 +855,7 @@
return (error);
dq = ndq;
DQI_LOCK(dq);
- DQI_WAIT(dq, PINOD+1, "setqta");
+ DQI_WAIT(dq, PINOD, "setqta");
/*
* Copy all but the current values.
* Reset time limit if previously had no soft limit or were
@@ -918,7 +918,7 @@
return (error);
dq = ndq;
DQI_LOCK(dq);
- DQI_WAIT(dq, PINOD+1, "setuse");
+ DQI_WAIT(dq, PINOD, "setuse");
/*
* Reset time limit if have a soft limit and were
* previously under it, but are now over it.
@@ -1314,7 +1314,7 @@
if (dq != NULL) {
DQH_UNLOCK();
hfound: DQI_LOCK(dq);
- DQI_WAIT(dq, PINOD+1, "dqget");
+ DQI_WAIT(dq, PINOD, "dqget");
DQI_UNLOCK(dq);
if (dq->dq_ump == NULL) {
dqrele(vp, dq);
@@ -1588,7 +1588,7 @@
vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY);
DQI_LOCK(dq);
- DQI_WAIT(dq, PINOD+2, "dqsync");
+ DQI_WAIT(dq, PINOD, "dqsync");
if ((dq->dq_flags & DQ_MOD) == 0)
goto out;
dq->dq_flags |= DQ_LOCK;
@@ -1742,7 +1742,7 @@
if ((dq = qrp[i]) == NODQUOT)
continue;
DQI_LOCK(dq);
- DQI_WAIT(dq, PINOD+1, "adjqta");
+ DQI_WAIT(dq, PINOD, "adjqta");
ncurblocks = dq->dq_curblocks + blkcount;
if (ncurblocks >= 0)
dq->dq_curblocks = ncurblocks;
diff --git a/tests/sys/kern/ptrace_test.c b/tests/sys/kern/ptrace_test.c
--- a/tests/sys/kern/ptrace_test.c
+++ b/tests/sys/kern/ptrace_test.c
@@ -34,7 +34,6 @@
#include <sys/ptrace.h>
#include <sys/procfs.h>
#include <sys/queue.h>
-#include <sys/runq.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/user.h>
@@ -2027,7 +2026,7 @@
sched_get_priority_min(SCHED_FIFO)) / 2;
CHILD_REQUIRE(pthread_setschedparam(pthread_self(),
SCHED_FIFO, &sched_param) == 0);
- sched_param.sched_priority -= RQ_PPQ;
+ sched_param.sched_priority -= 1;
CHILD_REQUIRE(pthread_setschedparam(t, SCHED_FIFO,
&sched_param) == 0);
@@ -2130,7 +2129,7 @@
sched_get_priority_min(SCHED_FIFO)) / 2;
CHILD_REQUIRE(pthread_setschedparam(pthread_self(),
SCHED_FIFO, &sched_param) == 0);
- sched_param.sched_priority -= RQ_PPQ;
+ sched_param.sched_priority -= 1;
CHILD_REQUIRE(pthread_setschedparam(t, SCHED_FIFO,
&sched_param) == 0);

File Metadata

Mime Type
text/plain
Expires
Sun, Sep 22, 12:38 PM (22 h, 10 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
12451471
Default Alt Text
D45393.diff (80 KB)

Event Timeline