D14633.diff

Index: gnu/usr.bin/gdb/kgdb/trgt_i386.c
===================================================================
--- gnu/usr.bin/gdb/kgdb/trgt_i386.c
+++ gnu/usr.bin/gdb/kgdb/trgt_i386.c
@@ -29,6 +29,8 @@
#include <sys/param.h>
#include <sys/proc.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
#include <machine/pcb.h>
#include <machine/frame.h>
#include <machine/segments.h>
@@ -279,12 +281,26 @@
char buf[MAX_REGISTER_SIZE];
struct kgdb_frame_cache *cache;
char *pname;
+ CORE_ADDR pcx;
+ uintptr_t addr, setidt_disp;
cache = *this_cache;
if (cache == NULL) {
cache = FRAME_OBSTACK_ZALLOC(struct kgdb_frame_cache);
*this_cache = cache;
- cache->pc = frame_func_unwind(next_frame);
+ pcx = frame_pc_unwind(next_frame);
+ if (pcx >= PMAP_TRM_MIN_ADDRESS) {
+ addr = kgdb_lookup("setidt_disp");
+ if (addr != 0) {
+ if (kvm_read(kvm, addr, &setidt_disp,
+ sizeof(setidt_disp)) !=
+ sizeof(setidt_disp))
+ warnx("kvm_read: %s", kvm_geterr(kvm));
+ else
+ pcx -= setidt_disp;
+ }
+ }
+ cache->pc = pcx;
find_pc_partial_function(cache->pc, &pname, NULL, NULL);
if (pname[0] != 'X')
cache->frame_type = FT_NORMAL;
@@ -373,6 +389,8 @@
CORE_ADDR pc;
pc = frame_pc_unwind(next_frame);
+ if (pc >= PMAP_TRM_MIN_ADDRESS)
+ return (&kgdb_trgt_trapframe_unwind);
pname = NULL;
find_pc_partial_function(pc, &pname, NULL, NULL);
if (pname == NULL)
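
The two hunks above implement the same idea: a program counter that falls into the trampoline copy of the exception code (at or above PMAP_TRM_MIN_ADDRESS) must be shifted back by setidt_disp before symbol lookup can succeed. A minimal sketch of that fixup, with a hypothetical helper name and assuming setidt_disp has already been read from the kernel image, not part of the diff:

/* Illustrative only. */
static CORE_ADDR
trampoline_pc_to_linked(CORE_ADDR pc, uintptr_t setidt_disp)
{

        if (pc >= PMAP_TRM_MIN_ADDRESS)
                pc -= setidt_disp;      /* undo the trampoline relocation */
        return (pc);
}
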
Index: sys/conf/files.i386
===================================================================
--- sys/conf/files.i386
+++ sys/conf/files.i386
@@ -483,6 +483,7 @@
i386/i386/bios.c standard
i386/i386/bioscall.s standard
i386/i386/bpf_jit_machdep.c optional bpf_jitter
+i386/i386/copyout.c standard
i386/i386/db_disasm.c optional ddb
i386/i386/db_interface.c optional ddb
i386/i386/db_trace.c optional ddb
Index: sys/conf/ldscript.i386
===================================================================
--- sys/conf/ldscript.i386
+++ sys/conf/ldscript.i386
@@ -6,7 +6,7 @@
SECTIONS
{
/* Read-only sections, merged into text segment: */
- . = kernbase + kernload + SIZEOF_HEADERS;
+ . = kernbase + SIZEOF_HEADERS;
.interp : { *(.interp) }
.hash : { *(.hash) }
.gnu.hash : { *(.gnu.hash) }
Index: sys/dev/dcons/dcons_crom.c
===================================================================
--- sys/dev/dcons/dcons_crom.c
+++ sys/dev/dcons/dcons_crom.c
@@ -109,7 +109,11 @@
static off_t idt_paddr;
/* XXX */
+#ifdef __amd64__
idt_paddr = (char *)idt - (char *)KERNBASE;
+#else /* __i386__ */
+ idt_paddr = (off_t)pmap_kextract((vm_offset_t)idt);
+#endif
crom_add_entry(&sc->unit, DCONS_CSR_KEY_RESET_HI, ADDR_HI(idt_paddr));
crom_add_entry(&sc->unit, DCONS_CSR_KEY_RESET_LO, ADDR_LO(idt_paddr));
Index: sys/dev/dcons/dcons_os.c
===================================================================
--- sys/dev/dcons/dcons_os.c
+++ sys/dev/dcons/dcons_os.c
@@ -309,11 +309,16 @@
* Allow read/write access to dcons buffer.
*/
for (pa = trunc_page(addr); pa < addr + size; pa += PAGE_SIZE)
- *vtopte(KERNBASE + pa) |= PG_RW;
+ *vtopte(PMAP_MAP_LOW + pa) |= PG_RW;
invltlb();
#endif
/* XXX P to V */
+#ifdef __amd64__
dg.buf = (struct dcons_buf *)(vm_offset_t)(KERNBASE + addr);
+#else /* __i386__ */
+ dg.buf = (struct dcons_buf *)((vm_offset_t)PMAP_MAP_LOW +
+ addr);
+#endif
dg.size = size;
if (dcons_load_buffer(dg.buf, dg.size, sc) < 0)
dg.buf = NULL;
Index: sys/dev/hyperv/vmbus/i386/vmbus_vector.S
===================================================================
--- sys/dev/hyperv/vmbus/i386/vmbus_vector.S
+++ sys/dev/hyperv/vmbus/i386/vmbus_vector.S
@@ -26,11 +26,12 @@
* $FreeBSD$
*/
+#include "assym.inc"
+
+#include <machine/psl.h>
#include <machine/asmacros.h>
#include <machine/specialreg.h>
-#include "assym.inc"
-
/*
* This is the Hyper-V vmbus channel direct callback interrupt.
* Only used when it is running on Hyper-V.
@@ -42,6 +43,7 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
+ KENTER
FAKE_MCOUNT(TF_EIP(%esp))
pushl %esp
call vmbus_handle_intr
Index: sys/dev/ppc/ppc.c
===================================================================
--- sys/dev/ppc/ppc.c
+++ sys/dev/ppc/ppc.c
@@ -51,6 +51,7 @@
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/vmparam.h>
+#include <machine/pc/bios.h>
#endif
#include <dev/ppbus/ppbconf.h>
@@ -121,7 +122,7 @@
* BIOS printer list - used by BIOS probe.
*/
#define BIOS_PPC_PORTS 0x408
-#define BIOS_PORTS (short *)(KERNBASE+BIOS_PPC_PORTS)
+#define BIOS_PORTS ((short *)BIOS_PADDRTOVADDR(BIOS_PPC_PORTS))
#define BIOS_MAX_PPC 4
#endif
Index: sys/dev/syscons/syscons.c
===================================================================
--- sys/dev/syscons/syscons.c
+++ sys/dev/syscons/syscons.c
@@ -288,7 +288,11 @@
* This is enough for ec_putc() to work very early on x86
* if the kernel starts in normal color text mode.
*/
+#ifdef __amd64__
fb = KERNBASE + 0xb8000;
+#else /* __i386__ */
+ fb = PMAP_MAP_LOW + 0xb8000;
+#endif
xsize = 80;
ysize = 25;
#endif
Index: sys/i386/conf/NOTES
===================================================================
--- sys/i386/conf/NOTES
+++ sys/i386/conf/NOTES
@@ -894,19 +894,6 @@
#
options PMAP_SHPGPERPROC=201
-#
-# Change the size of the kernel virtual address space. Due to
-# constraints in loader(8) on i386, this must be a multiple of 4.
-# 256 = 1 GB of kernel address space. Increasing this also causes
-# a reduction of the address space in user processes. 512 splits
-# the 4GB cpu address space in half (2GB user, 2GB kernel). For PAE
-# kernels, the value will need to be double non-PAE. A value of 1024
-# for PAE kernels is necessary to split the address space in half.
-# This will likely need to be increased to handle memory sizes >4GB.
-# PAE kernels default to a value of 512.
-#
-options KVA_PAGES=260
-
#
# Number of initial kernel page table pages used for early bootstrap.
# This number should include enough pages to map the kernel, any
@@ -951,22 +938,6 @@
#####################################################################
# VM OPTIONS
-# Disable the 4 MByte page PSE CPU feature. The PSE feature allows the
-# kernel to use 4 MByte pages to map the kernel instead of 4k pages.
-# This saves on the amount of memory needed for page tables needed to
-# map the kernel. You should only disable this feature as a temporary
-# workaround if you are having problems with it enabled.
-#
-#options DISABLE_PSE
-
-# Disable the global pages PGE CPU feature. The PGE feature allows pages
-# to be marked with the PG_G bit. TLB entries for these pages are not
-# flushed from the cache when %cr3 is reloaded. This can make context
-# switches less expensive. You should only disable this feature as a
-# temporary workaround if you are having problems with it enabled.
-#
-#options DISABLE_PG_G
-
# KSTACK_PAGES is the number of memory pages to assign to the kernel
# stack of each thread.
Index: sys/i386/i386/apic_vector.s
===================================================================
--- sys/i386/i386/apic_vector.s
+++ sys/i386/i386/apic_vector.s
@@ -39,6 +39,7 @@
#include "opt_smp.h"
#include <machine/asmacros.h>
+#include <machine/psl.h>
#include <machine/specialreg.h>
#include <x86/apicreg.h>
@@ -67,34 +68,39 @@
* translates that into a vector, and passes the vector to the
* lapic_handle_intr() function.
*/
-#define ISR_VEC(index, vec_name) \
- .text ; \
- SUPERALIGN_TEXT ; \
-IDTVEC(vec_name ## _pti) ; \
-IDTVEC(vec_name) ; \
- PUSH_FRAME ; \
- SET_KERNEL_SREGS ; \
- cld ; \
- FAKE_MCOUNT(TF_EIP(%esp)) ; \
- cmpl $0,x2apic_mode ; \
- je 1f ; \
- movl $(MSR_APIC_ISR0 + index),%ecx ; \
- rdmsr ; \
- jmp 2f ; \
-1: ; \
- movl lapic_map, %edx ;/* pointer to local APIC */ \
- movl LA_ISR + 16 * (index)(%edx), %eax ; /* load ISR */ \
-2: ; \
- bsrl %eax, %eax ; /* index of highest set bit in ISR */ \
- jz 3f ; \
- addl $(32 * index),%eax ; \
- pushl %esp ; \
- pushl %eax ; /* pass the IRQ */ \
- call lapic_handle_intr ; \
- addl $8, %esp ; /* discard parameter */ \
-3: ; \
- MEXITCOUNT ; \
+ .macro ISR_VEC index, vec_name
+ .text
+ SUPERALIGN_TEXT
+ .globl X\()\vec_name\()_pti, X\()\vec_name
+
+X\()\vec_name\()_pti:
+X\()\vec_name:
+ PUSH_FRAME
+ SET_KERNEL_SREGS
+ cld
+ KENTER
+ FAKE_MCOUNT(TF_EIP(%esp))
+ cmpl $0,x2apic_mode
+ je 2f
+ movl $(MSR_APIC_ISR0 + \index),%ecx
+ rdmsr
+ jmp 3f
+2:
+ movl lapic_map, %edx /* pointer to local APIC */
+ movl LA_ISR + 16 * \index(%edx), %eax /* load ISR */
+3:
+ bsrl %eax, %eax /* index of highest set bit in ISR */
+ jz 4f
+ addl $(32 * \index),%eax
+ pushl %esp
+ pushl %eax /* pass the IRQ */
+ movl $lapic_handle_intr, %eax
+ call *%eax
+ addl $8, %esp /* discard parameter */
+4:
+ MEXITCOUNT
jmp doreti
+ .endm
/*
* Handle "spurious INTerrupts".
@@ -111,13 +117,13 @@
iret
- ISR_VEC(1, apic_isr1)
- ISR_VEC(2, apic_isr2)
- ISR_VEC(3, apic_isr3)
- ISR_VEC(4, apic_isr4)
- ISR_VEC(5, apic_isr5)
- ISR_VEC(6, apic_isr6)
- ISR_VEC(7, apic_isr7)
+ ISR_VEC 1, apic_isr1
+ ISR_VEC 2, apic_isr2
+ ISR_VEC 3, apic_isr3
+ ISR_VEC 4, apic_isr4
+ ISR_VEC 5, apic_isr5
+ ISR_VEC 6, apic_isr6
+ ISR_VEC 7, apic_isr7
/*
* Local APIC periodic timer handler.
@@ -129,9 +135,11 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
+ KENTER
FAKE_MCOUNT(TF_EIP(%esp))
pushl %esp
- call lapic_handle_timer
+ movl $lapic_handle_timer, %eax
+ call *%eax
add $4, %esp
MEXITCOUNT
jmp doreti
@@ -146,8 +154,10 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
+ KENTER
FAKE_MCOUNT(TF_EIP(%esp))
- call lapic_handle_cmc
+ movl $lapic_handle_cmc, %eax
+ call *%eax
MEXITCOUNT
jmp doreti
@@ -161,8 +171,10 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
+ KENTER
FAKE_MCOUNT(TF_EIP(%esp))
- call lapic_handle_error
+ movl $lapic_handle_error, %eax
+ call *%eax
MEXITCOUNT
jmp doreti
@@ -177,9 +189,11 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
+ KENTER
FAKE_MCOUNT(TF_EIP(%esp))
pushl %esp
- call xen_intr_handle_upcall
+ movl $xen_intr_handle_upcall, %eax
+ call *%eax
add $4, %esp
MEXITCOUNT
jmp doreti
@@ -200,9 +214,9 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
-
- call invltlb_handler
-
+ KENTER
+ movl $invltlb_handler, %eax
+ call *%eax
jmp invltlb_ret
/*
@@ -214,9 +228,9 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
-
- call invlpg_handler
-
+ KENTER
+ movl $invlpg_handler, %eax
+ call *%eax
jmp invltlb_ret
/*
@@ -228,9 +242,9 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
-
- call invlrng_handler
-
+ KENTER
+ movl $invlrng_handler, %eax
+ call *%eax
jmp invltlb_ret
/*
@@ -242,9 +256,9 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
-
- call invlcache_handler
-
+ KENTER
+ movl $invlcache_handler, %eax
+ call *%eax
jmp invltlb_ret
/*
@@ -256,12 +270,11 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
-
+ KENTER
call as_lapic_eoi
-
FAKE_MCOUNT(TF_EIP(%esp))
-
- call ipi_bitmap_handler
+ movl $ipi_bitmap_handler, %eax
+ call *%eax
MEXITCOUNT
jmp doreti
@@ -274,9 +287,10 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
-
+ KENTER
call as_lapic_eoi
- call cpustop_handler
+ movl $cpustop_handler, %eax
+ call *%eax
jmp doreti
/*
@@ -288,9 +302,10 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
-
+ KENTER
call as_lapic_eoi
- call cpususpend_handler
+ movl $cpususpend_handler, %eax
+ call *%eax
jmp doreti
/*
@@ -304,14 +319,14 @@
PUSH_FRAME
SET_KERNEL_SREGS
cld
-
+ KENTER
#ifdef COUNT_IPIS
movl PCPU(CPUID), %eax
movl ipi_rendezvous_counts(,%eax,4), %eax
incl (%eax)
#endif
- call smp_rendezvous_action
-
+ movl $smp_rendezvous_action, %eax
+ call *%eax
call as_lapic_eoi
jmp doreti
Index: sys/i386/i386/atpic_vector.s
===================================================================
--- sys/i386/i386/atpic_vector.s
+++ sys/i386/i386/atpic_vector.s
@@ -36,6 +36,7 @@
* master and slave interrupt controllers.
*/
+#include <machine/psl.h>
#include <machine/asmacros.h>
#include "assym.inc"
@@ -43,37 +44,41 @@
/*
* Macros for interrupt entry, call to handler, and exit.
*/
-#define INTR(irq_num, vec_name) \
- .text ; \
- SUPERALIGN_TEXT ; \
-IDTVEC(vec_name ##_pti) ; \
-IDTVEC(vec_name) ; \
- PUSH_FRAME ; \
- SET_KERNEL_SREGS ; \
- cld ; \
-; \
- FAKE_MCOUNT(TF_EIP(%esp)) ; \
- pushl %esp ; \
- pushl $irq_num; /* pass the IRQ */ \
- call atpic_handle_intr ; \
- addl $8, %esp ; /* discard the parameters */ \
-; \
- MEXITCOUNT ; \
+ .macro INTR irq_num, vec_name
+ .text
+ SUPERALIGN_TEXT
+ .globl X\()\vec_name\()_pti, X\()\vec_name
+
+X\()\vec_name\()_pti:
+X\()\vec_name:
+ PUSH_FRAME
+ SET_KERNEL_SREGS
+ cld
+ KENTER
+ FAKE_MCOUNT(TF_EIP(%esp))
+ pushl %esp
+ pushl $\irq_num /* pass the IRQ */
+ movl $atpic_handle_intr, %eax
+ call *%eax
+ addl $8, %esp /* discard the parameters */
+
+ MEXITCOUNT
jmp doreti
+ .endm
- INTR(0, atpic_intr0)
- INTR(1, atpic_intr1)
- INTR(2, atpic_intr2)
- INTR(3, atpic_intr3)
- INTR(4, atpic_intr4)
- INTR(5, atpic_intr5)
- INTR(6, atpic_intr6)
- INTR(7, atpic_intr7)
- INTR(8, atpic_intr8)
- INTR(9, atpic_intr9)
- INTR(10, atpic_intr10)
- INTR(11, atpic_intr11)
- INTR(12, atpic_intr12)
- INTR(13, atpic_intr13)
- INTR(14, atpic_intr14)
- INTR(15, atpic_intr15)
+ INTR 0, atpic_intr0
+ INTR 1, atpic_intr1
+ INTR 2, atpic_intr2
+ INTR 3, atpic_intr3
+ INTR 4, atpic_intr4
+ INTR 5, atpic_intr5
+ INTR 6, atpic_intr6
+ INTR 7, atpic_intr7
+ INTR 8, atpic_intr8
+ INTR 9, atpic_intr9
+ INTR 10, atpic_intr10
+ INTR 11, atpic_intr11
+ INTR 12, atpic_intr12
+ INTR 13, atpic_intr13
+ INTR 14, atpic_intr14
+ INTR 15, atpic_intr15
Index: sys/i386/i386/bios.c
===================================================================
--- sys/i386/i386/bios.c
+++ sys/i386/i386/bios.c
@@ -305,6 +305,7 @@
}
extern int vm86pa;
+extern u_long vm86phystk;
extern void bios16_jmp(void);
/*
@@ -329,7 +330,7 @@
int flags = BIOSCODE_FLAG | BIOSDATA_FLAG;
u_int i, arg_start, arg_end;
pt_entry_t *pte;
- pd_entry_t *ptd;
+ pd_entry_t *ptd, orig_ptd;
arg_start = 0xffffffff;
arg_end = 0;
@@ -390,27 +391,14 @@
args->seg.code32.base = (u_int)&bios16_jmp & PG_FRAME;
args->seg.code32.limit = 0xffff;
- ptd = (pd_entry_t *)rcr3();
-#if defined(PAE) || defined(PAE_TABLES)
- if (ptd == IdlePDPT)
-#else
- if (ptd == IdlePTD)
-#endif
- {
- /*
- * no page table, so create one and install it.
- */
- pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
- ptd = (pd_entry_t *)((u_int)IdlePTD + KERNBASE);
- *pte = (vm86pa - PAGE_SIZE) | PG_RW | PG_V;
- *ptd = vtophys(pte) | PG_RW | PG_V;
- } else {
- /*
- * this is a user-level page table
- */
- pte = PTmap;
- *pte = (vm86pa - PAGE_SIZE) | PG_RW | PG_V;
- }
+ /*
+ * no page table, so create one and install it.
+ */
+ pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
+ ptd = IdlePTD;
+ *pte = vm86phystk | PG_RW | PG_V;
+ orig_ptd = *ptd;
+ *ptd = vtophys(pte) | PG_RW | PG_V;
pmap_invalidate_all(kernel_pmap); /* XXX insurance for now */
stack_top = stack;
@@ -464,20 +452,12 @@
i = bios16_call(&args->r, stack_top);
- if (pte == PTmap) {
- *pte = 0; /* remove entry */
- /*
- * XXX only needs to be invlpg(0) but that doesn't work on the 386
- */
- pmap_invalidate_all(kernel_pmap);
- } else {
- *ptd = 0; /* remove page table */
- /*
- * XXX only needs to be invlpg(0) but that doesn't work on the 386
- */
- pmap_invalidate_all(kernel_pmap);
- free(pte, M_TEMP); /* ... and free it */
- }
+ *ptd = orig_ptd; /* remove page table */
+ /*
+ * XXX only needs to be invlpg(0) but that doesn't work on the 386
+ */
+ pmap_invalidate_all(kernel_pmap);
+ free(pte, M_TEMP); /* ... and free it */
return (i);
}
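
As a side note (not part of the diff), the rewritten bios16() mapping code above reduces to a symmetric save/install/restore of a single page-directory entry, which is what lets the old IdlePDPT/PTmap branches disappear. A compressed restatement of the sequence, reusing the names from the hunk:

pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
*pte = vm86phystk | PG_RW | PG_V;       /* map the vm86/BIOS stack page */
orig_ptd = *IdlePTD;                    /* save the original PDE */
*IdlePTD = vtophys(pte) | PG_RW | PG_V; /* install the temporary page table */
pmap_invalidate_all(kernel_pmap);
/* ... bios16_call() runs with the temporary low mapping in place ... */
*IdlePTD = orig_ptd;                    /* restore the original PDE */
pmap_invalidate_all(kernel_pmap);
free(pte, M_TEMP);
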
Index: sys/i386/i386/copyout.c
===================================================================
--- /dev/null
+++ sys/i386/i386/copyout.c
@@ -0,0 +1,489 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+
+#if defined(PAE) || defined(PAE_TABLES)
+#define KCR3 ((u_int)IdlePDPT)
+#else
+#define KCR3 ((u_int)IdlePTD)
+#endif
+
+int copyin_fast(const void *udaddr, void *kaddr, size_t len, u_int);
+static int (*copyin_fast_tramp)(const void *, void *, size_t, u_int);
+int copyout_fast(const void *kaddr, void *udaddr, size_t len, u_int);
+static int (*copyout_fast_tramp)(const void *, void *, size_t, u_int);
+int fubyte_fast(volatile const void *base, u_int kcr3);
+static int (*fubyte_fast_tramp)(volatile const void *, u_int);
+int fuword16_fast(volatile const void *base, u_int kcr3);
+static int (*fuword16_fast_tramp)(volatile const void *, u_int);
+int fueword_fast(volatile const void *base, long *val, u_int kcr3);
+static int (*fueword_fast_tramp)(volatile const void *, long *, u_int);
+int subyte_fast(volatile void *base, int val, u_int kcr3);
+static int (*subyte_fast_tramp)(volatile void *, int, u_int);
+int suword16_fast(volatile void *base, int val, u_int kcr3);
+static int (*suword16_fast_tramp)(volatile void *, int, u_int);
+int suword_fast(volatile void *base, long val, u_int kcr3);
+static int (*suword_fast_tramp)(volatile void *, long, u_int);
+
+static int fast_copyout = 1;
+SYSCTL_INT(_machdep, OID_AUTO, fast_copyout, CTLFLAG_RWTUN,
+ &fast_copyout, 0,
+ "");
+
+void
+copyout_init_tramp(void)
+{
+
+ copyin_fast_tramp = (int (*)(const void *, void *, size_t, u_int))(
+ (uintptr_t)copyin_fast + setidt_disp);
+ copyout_fast_tramp = (int (*)(const void *, void *, size_t, u_int))(
+ (uintptr_t)copyout_fast + setidt_disp);
+ fubyte_fast_tramp = (int (*)(volatile const void *, u_int))(
+ (uintptr_t)fubyte_fast + setidt_disp);
+ fuword16_fast_tramp = (int (*)(volatile const void *, u_int))(
+ (uintptr_t)fuword16_fast + setidt_disp);
+ fueword_fast_tramp = (int (*)(volatile const void *, long *, u_int))(
+ (uintptr_t)fueword_fast + setidt_disp);
+ subyte_fast_tramp = (int (*)(volatile void *, int, u_int))(
+ (uintptr_t)subyte_fast + setidt_disp);
+ suword16_fast_tramp = (int (*)(volatile void *, int, u_int))(
+ (uintptr_t)suword16_fast + setidt_disp);
+ suword_fast_tramp = (int (*)(volatile void *, long, u_int))(
+ (uintptr_t)suword_fast + setidt_disp);
+}
+
+static int
+cp_slow0(vm_offset_t uva, size_t len, bool write,
+ void (*f)(vm_offset_t, void *), void *arg)
+{
+ struct pcpu *pc;
+ vm_page_t m[2];
+ pt_entry_t *pte;
+ vm_offset_t kaddr;
+ int error, i, plen;
+ bool sleepable;
+
+ plen = howmany(uva - trunc_page(uva) + len, PAGE_SIZE);
+ MPASS(plen <= nitems(m));
+ error = 0;
+ i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, uva, len,
+ (write ? VM_PROT_WRITE : VM_PROT_READ) | VM_PROT_QUICK_NOFAULT,
+ m, nitems(m));
+ if (i != plen)
+ return (EFAULT);
+ sched_pin();
+ pc = get_pcpu();
+ if (!THREAD_CAN_SLEEP() || curthread->td_vslock_sz > 0 ||
+ (curthread->td_pflags & TDP_NOFAULTING) != 0) {
+ sleepable = false;
+ mtx_lock(&pc->pc_copyout_mlock);
+ kaddr = pc->pc_copyout_maddr;
+ } else {
+ sleepable = true;
+ sx_xlock(&pc->pc_copyout_slock);
+ kaddr = pc->pc_copyout_saddr;
+ }
+ for (i = 0, pte = vtopte(kaddr); i < plen; i++, pte++) {
+ *pte = PG_V | PG_RW | PG_A | PG_M | VM_PAGE_TO_PHYS(m[i]) |
+ pmap_cache_bits(pmap_page_get_memattr(m[i]), FALSE);
+ invlpg(kaddr + ptoa(i));
+ }
+ kaddr += uva - trunc_page(uva);
+ f(kaddr, arg);
+ sched_unpin();
+ if (sleepable)
+ sx_xunlock(&pc->pc_copyout_slock);
+ else
+ mtx_unlock(&pc->pc_copyout_mlock);
+ for (i = 0; i < plen; i++) {
+ vm_page_lock(m[i]);
+ vm_page_unhold(m[i]);
+ vm_page_unlock(m[i]);
+ }
+ return (error);
+}
+
+struct copyinstr_arg0 {
+ vm_offset_t kc;
+ size_t len;
+ size_t alen;
+ bool end;
+};
+
+static void
+copyinstr_slow0(vm_offset_t kva, void *arg)
+{
+ struct copyinstr_arg0 *ca;
+ char c;
+
+ ca = arg;
+ MPASS(ca->alen == 0 && ca->len > 0 && !ca->end);
+ while (ca->alen < ca->len && !ca->end) {
+ c = *(char *)(kva + ca->alen);
+ *(char *)ca->kc = c;
+ ca->alen++;
+ ca->kc++;
+ if (c == '\0')
+ ca->end = true;
+ }
+}
+
+int
+copyinstr(const void *udaddr, void *kaddr, size_t maxlen, size_t *lencopied)
+{
+ struct copyinstr_arg0 ca;
+ vm_offset_t uc;
+ size_t plen;
+ int error;
+
+ error = 0;
+ ca.end = false;
+ for (plen = 0, uc = (vm_offset_t)udaddr, ca.kc = (vm_offset_t)kaddr;
+ plen < maxlen && !ca.end; uc += ca.alen, plen += ca.alen) {
+ ca.len = round_page(uc) - uc;
+ if (ca.len == 0)
+ ca.len = PAGE_SIZE;
+ if (plen + ca.len > maxlen)
+ ca.len = maxlen - plen;
+ ca.alen = 0;
+ if (cp_slow0(uc, ca.len, false, copyinstr_slow0, &ca) != 0) {
+ error = EFAULT;
+ break;
+ }
+ }
+ if (!ca.end && plen == maxlen && error == 0)
+ error = ENAMETOOLONG;
+ if (lencopied != NULL)
+ *lencopied = plen;
+ return (error);
+}
+
+struct copyin_arg0 {
+ vm_offset_t kc;
+ size_t len;
+};
+
+static void
+copyin_slow0(vm_offset_t kva, void *arg)
+{
+ struct copyin_arg0 *ca;
+
+ ca = arg;
+ bcopy((void *)kva, (void *)ca->kc, ca->len);
+}
+
+int
+copyin(const void *udaddr, void *kaddr, size_t len)
+{
+ struct copyin_arg0 ca;
+ vm_offset_t uc;
+ size_t plen;
+
+ if ((uintptr_t)udaddr + len < (uintptr_t)udaddr ||
+ (uintptr_t)udaddr + len > VM_MAXUSER_ADDRESS)
+ return (-1);
+ if (len == 0 || (fast_copyout && len <= TRAMP_COPYOUT_SZ &&
+ copyin_fast_tramp(udaddr, kaddr, len, KCR3) == 0))
+ return (0);
+ for (plen = 0, uc = (vm_offset_t)udaddr, ca.kc = (vm_offset_t)kaddr;
+ plen < len; uc += ca.len, ca.kc += ca.len, plen += ca.len) {
+ ca.len = round_page(uc) - uc;
+ if (ca.len == 0)
+ ca.len = PAGE_SIZE;
+ if (plen + ca.len > len)
+ ca.len = len - plen;
+ if (cp_slow0(uc, ca.len, false, copyin_slow0, &ca) != 0)
+ return (EFAULT);
+ }
+ return (0);
+}
+
+static void
+copyout_slow0(vm_offset_t kva, void *arg)
+{
+ struct copyin_arg0 *ca;
+
+ ca = arg;
+ bcopy((void *)ca->kc, (void *)kva, ca->len);
+}
+
+int
+copyout(const void *kaddr, void *udaddr, size_t len)
+{
+ struct copyin_arg0 ca;
+ vm_offset_t uc;
+ size_t plen;
+
+ if ((uintptr_t)udaddr + len < (uintptr_t)udaddr ||
+ (uintptr_t)udaddr + len > VM_MAXUSER_ADDRESS)
+ return (-1);
+ if (len == 0 || (fast_copyout && len <= TRAMP_COPYOUT_SZ &&
+ copyout_fast_tramp(kaddr, udaddr, len, KCR3) == 0))
+ return (0);
+ for (plen = 0, uc = (vm_offset_t)udaddr, ca.kc = (vm_offset_t)kaddr;
+ plen < len; uc += ca.len, ca.kc += ca.len, plen += ca.len) {
+ ca.len = round_page(uc) - uc;
+ if (ca.len == 0)
+ ca.len = PAGE_SIZE;
+ if (plen + ca.len > len)
+ ca.len = len - plen;
+ if (cp_slow0(uc, ca.len, true, copyout_slow0, &ca) != 0)
+ return (EFAULT);
+ }
+ return (0);
+}
+
+/*
+ * Fetch (load) a 32-bit word, a 16-bit word, or an 8-bit byte from user
+ * memory.
+ */
+
+static void
+fubyte_slow0(vm_offset_t kva, void *arg)
+{
+
+ *(int *)arg = *(u_char *)kva;
+}
+
+int
+fubyte(volatile const void *base)
+{
+ int res;
+
+ if ((uintptr_t)base + sizeof(uint8_t) < (uintptr_t)base ||
+ (uintptr_t)base + sizeof(uint8_t) > VM_MAXUSER_ADDRESS)
+ return (-1);
+ if (fast_copyout) {
+ res = fubyte_fast_tramp(base, KCR3);
+ if (res != -1)
+ return (res);
+ }
+ if (cp_slow0((vm_offset_t)base, sizeof(char), false, fubyte_slow0,
+ &res) != 0)
+ return (-1);
+ return (res);
+}
+
+static void
+fuword16_slow0(vm_offset_t kva, void *arg)
+{
+
+ *(int *)arg = *(uint16_t *)kva;
+}
+
+int
+fuword16(volatile const void *base)
+{
+ int res;
+
+ if ((uintptr_t)base + sizeof(uint16_t) < (uintptr_t)base ||
+ (uintptr_t)base + sizeof(uint16_t) > VM_MAXUSER_ADDRESS)
+ return (-1);
+ if (fast_copyout) {
+ res = fuword16_fast_tramp(base, KCR3);
+ if (res != -1)
+ return (res);
+ }
+ if (cp_slow0((vm_offset_t)base, sizeof(uint16_t), false,
+ fuword16_slow0, &res) != 0)
+ return (-1);
+ return (res);
+}
+
+static void
+fueword_slow0(vm_offset_t kva, void *arg)
+{
+
+ *(uint32_t *)arg = *(uint32_t *)kva;
+}
+
+int
+fueword(volatile const void *base, long *val)
+{
+ uint32_t res;
+
+ if ((uintptr_t)base + sizeof(*val) < (uintptr_t)base ||
+ (uintptr_t)base + sizeof(*val) > VM_MAXUSER_ADDRESS)
+ return (-1);
+ if (fast_copyout) {
+ if (fueword_fast_tramp(base, val, KCR3) == 0)
+ return (0);
+ }
+ if (cp_slow0((vm_offset_t)base, sizeof(long), false, fueword_slow0,
+ &res) != 0)
+ return (-1);
+ *val = res;
+ return (0);
+}
+
+int
+fueword32(volatile const void *base, int32_t *val)
+{
+
+ return (fueword(base, (long *)val));
+}
+
+/*
+ * Store a 32-bit word, a 16-bit word, or an 8-bit byte to user memory.
+ */
+
+static void
+subyte_slow0(vm_offset_t kva, void *arg)
+{
+
+ *(u_char *)kva = *(int *)arg;
+}
+
+int
+subyte(volatile void *base, int byte)
+{
+
+ if ((uintptr_t)base + sizeof(uint8_t) < (uintptr_t)base ||
+ (uintptr_t)base + sizeof(uint8_t) > VM_MAXUSER_ADDRESS)
+ return (-1);
+ if (fast_copyout && subyte_fast_tramp(base, byte, KCR3) == 0)
+ return (0);
+ return (cp_slow0((vm_offset_t)base, sizeof(u_char), true, subyte_slow0,
+ &byte) != 0 ? -1 : 0);
+}
+
+static void
+suword16_slow0(vm_offset_t kva, void *arg)
+{
+
+ *(int *)kva = *(uint16_t *)arg;
+}
+
+int
+suword16(volatile void *base, int word)
+{
+
+ if ((uintptr_t)base + sizeof(uint16_t) < (uintptr_t)base ||
+ (uintptr_t)base + sizeof(uint16_t) > VM_MAXUSER_ADDRESS)
+ return (-1);
+ if (fast_copyout && suword16_fast_tramp(base, word, KCR3) == 0)
+ return (0);
+ return (cp_slow0((vm_offset_t)base, sizeof(int16_t), true,
+ suword16_slow0, &word) != 0 ? -1 : 0);
+}
+
+static void
+suword_slow0(vm_offset_t kva, void *arg)
+{
+
+ *(int *)kva = *(uint32_t *)arg;
+}
+
+int
+suword(volatile void *base, long word)
+{
+
+ if ((uintptr_t)base + sizeof(word) < (uintptr_t)base ||
+ (uintptr_t)base + sizeof(word) > VM_MAXUSER_ADDRESS)
+ return (-1);
+ if (fast_copyout && suword_fast_tramp(base, word, KCR3) == 0)
+ return (0);
+ return (cp_slow0((vm_offset_t)base, sizeof(long), true,
+ suword_slow0, &word) != 0 ? -1 : 0);
+}
+
+int
+suword32(volatile void *base, int32_t word)
+{
+
+ return (suword(base, word));
+}
+
+struct casueword_arg0 {
+ uint32_t oldval;
+ uint32_t newval;
+};
+
+static void
+casueword_slow0(vm_offset_t kva, void *arg)
+{
+ struct casueword_arg0 *ca;
+
+ ca = arg;
+ atomic_fcmpset_int((u_int *)kva, &ca->oldval, ca->newval);
+}
+
+int
+casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp,
+ uint32_t newval)
+{
+ struct casueword_arg0 ca;
+ int res;
+
+ ca.oldval = oldval;
+ ca.newval = newval;
+ res = cp_slow0((vm_offset_t)base, sizeof(int32_t), true,
+ casueword_slow0, &ca);
+ if (res == 0) {
+ *oldvalp = ca.oldval;
+ return (0);
+ }
+ return (-1);
+}
+
+int
+casueword(volatile u_long *base, u_long oldval, u_long *oldvalp, u_long newval)
+{
+ struct casueword_arg0 ca;
+ int res;
+
+ ca.oldval = oldval;
+ ca.newval = newval;
+ res = cp_slow0((vm_offset_t)base, sizeof(int32_t), true,
+ casueword_slow0, &ca);
+ if (res == 0) {
+ *oldvalp = ca.oldval;
+ return (0);
+ }
+ return (-1);
+}
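
The copyinstr(), copyin() and copyout() slow paths above all size each iteration the same way, so that a single cp_slow0() call never crosses a page boundary. A self-contained sketch of that chunk computation (hypothetical helper name, not part of the diff):

static size_t
next_chunk(vm_offset_t uva, size_t done, size_t total)
{
        size_t len;

        len = round_page(uva) - uva;    /* bytes left in the current page */
        if (len == 0)
                len = PAGE_SIZE;        /* uva is page-aligned: take a full page */
        if (done + len > total)
                len = total - done;     /* clamp to the remaining request */
        return (len);
}
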
Index: sys/i386/i386/copyout_fast.s
===================================================================
--- /dev/null
+++ sys/i386/i386/copyout_fast.s
@@ -0,0 +1,362 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+#include <machine/cputypes.h>
+#include <machine/pmap.h>
+#include <machine/specialreg.h>
+
+#include "assym.inc"
+
+ .text
+
+ENTRY(copyout_fast)
+ pushl %ebp
+ movl %esp, %ebp
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+
+ movl $copyout_fault,%edx
+ movl 20(%ebp),%ebx /* KCR3 */
+
+ movl PCPU(CURPCB),%eax
+ movl PCB_CR3(%eax),%edi
+
+ cli
+ movl PCPU(TRAMPSTK),%esi
+ movl PCPU(COPYOUT_BUF),%eax
+ subl $4,%esi
+ movl %eax,(%esi)
+ movl 12(%ebp),%eax /* udaddr */
+ subl $4,%esi
+ movl %eax,(%esi)
+ movl 16(%ebp),%eax /* len */
+ subl $4,%esi
+ movl %eax,(%esi)
+
+ subl $4, %esi
+ movl %edi, (%esi)
+
+ movl 8(%ebp),%eax /* kaddr */
+ subl $4,%esi
+ movl %eax,(%esi)
+ movl PCPU(COPYOUT_BUF),%eax
+ subl $4,%esi
+ movl %eax,(%esi)
+ movl 16(%ebp),%eax /* len */
+ subl $4,%esi
+ movl %eax,(%esi)
+
+ movl %esp,%eax
+ movl %esi,%esp
+
+ /* bcopy(%esi = kaddr, %edi = PCPU(copyout_buf), %ecx = len) */
+ popl %ecx
+ popl %edi
+ popl %esi
+ rep; movsb
+
+ popl %edi
+ movl %edi,%cr3
+
+ /* bcopy(%esi = PCPU(copyout_buf), %edi = udaddr, %ecx = len) */
+ popl %ecx
+ popl %edi
+ popl %esi
+ rep; movsb
+
+ movl %ebx,%cr3
+ movl %eax,%esp
+ sti
+
+ xorl %eax,%eax
+ popl %ebx
+ popl %edi
+ popl %esi
+ leave
+ ret
+END(copyout_fast)
+
+ENTRY(copyin_fast)
+ pushl %ebp
+ movl %esp, %ebp
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+
+ movl $copyout_fault,%edx
+ movl 20(%ebp),%ebx /* KCR3 */
+
+ movl PCPU(CURPCB),%eax
+ movl PCB_CR3(%eax),%edi
+
+ cli
+ movl PCPU(TRAMPSTK),%esi
+ movl PCPU(COPYOUT_BUF),%eax
+ subl $4,%esi
+ movl %eax,(%esi)
+ movl 12(%ebp),%eax /* kaddr */
+ subl $4,%esi
+ movl %eax,(%esi)
+ movl 16(%ebp),%eax /* len */
+ subl $4,%esi
+ movl %eax,(%esi)
+
+ movl 8(%ebp),%eax /* udaddr */
+ subl $4,%esi
+ movl %eax,(%esi)
+ movl PCPU(COPYOUT_BUF),%eax
+ subl $4,%esi
+ movl %eax,(%esi)
+ movl 16(%ebp),%eax /* len */
+ subl $4,%esi
+ movl %eax,(%esi)
+
+ movl %esp,%eax
+ movl %esi,%esp
+ movl %edi,%cr3
+
+ /* bcopy(%esi = udaddr, %edi = PCPU(copyout_buf), %ecx = len) */
+ popl %ecx
+ popl %edi
+ popl %esi
+ rep; movsb
+
+ movl %ebx,%cr3
+
+ /* bcopy(%esi = PCPU(copyout_buf), %edi = kaddr, %ecx = len) */
+ popl %ecx
+ popl %edi
+ popl %esi
+ rep; movsb
+
+ movl %eax,%esp
+ sti
+
+ xorl %eax,%eax
+ popl %ebx
+ popl %edi
+ popl %esi
+ leave
+ ret
+END(copyin_fast)
+
+ ALIGN_TEXT
+copyout_fault:
+ movl %eax,%esp
+ sti
+ movl $EFAULT,%eax
+ popl %ebx
+ popl %edi
+ popl %esi
+ leave
+ ret
+
+ENTRY(fueword_fast)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 8(%ebp),%ecx /* from */
+ movl PCPU(CURPCB),%eax
+ movl PCB_CR3(%eax),%eax
+ movl $fusufault,%edx
+ movl 16(%ebp),%ebx
+ movl %esp,%esi
+ cli
+ movl PCPU(TRAMPSTK),%esp
+ movl %eax,%cr3
+ movl (%ecx),%eax
+ movl %ebx,%cr3
+ movl %esi,%esp
+ sti
+ movl 12(%ebp),%edx
+ movl %eax,(%edx)
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ leave
+ ret
+END(fueword_fast)
+
+ENTRY(fuword16_fast)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 8(%ebp),%ecx /* from */
+ movl PCPU(CURPCB),%eax
+ movl PCB_CR3(%eax),%eax
+ movl $fusufault,%edx
+ movl 12(%ebp),%ebx
+ movl %esp,%esi
+ cli
+ movl PCPU(TRAMPSTK),%esp
+ movl %eax,%cr3
+ movzwl (%ecx),%eax
+ movl %ebx,%cr3
+ movl %esi,%esp
+ sti
+ popl %edi
+ popl %esi
+ popl %ebx
+ leave
+ ret
+END(fuword16_fast)
+
+ENTRY(fubyte_fast)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 8(%ebp),%ecx /* from */
+ movl PCPU(CURPCB),%eax
+ movl PCB_CR3(%eax),%eax
+ movl $fusufault,%edx
+ movl 12(%ebp),%ebx
+ movl %esp,%esi
+ cli
+ movl PCPU(TRAMPSTK),%esp
+ movl %eax,%cr3
+ movzbl (%ecx),%eax
+ movl %ebx,%cr3
+ movl %esi,%esp
+ sti
+ popl %edi
+ popl %esi
+ popl %ebx
+ leave
+ ret
+END(fubyte_fast)
+
+ ALIGN_TEXT
+fusufault:
+ movl %esi,%esp
+ sti
+ xorl %eax,%eax
+ decl %eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ leave
+ ret
+
+ENTRY(suword_fast)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl PCPU(CURPCB),%eax
+ movl PCB_CR3(%eax),%eax
+ movl $fusufault,%edx
+ movl 8(%ebp),%ecx /* to */
+ movl 12(%ebp),%edi /* val */
+ movl 16(%ebp),%ebx
+ movl %esp,%esi
+ cli
+ movl PCPU(TRAMPSTK),%esp
+ movl %eax,%cr3
+ movl %edi,(%ecx)
+ movl %ebx,%cr3
+ movl %esi,%esp
+ sti
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ leave
+ ret
+END(suword_fast)
+
+ENTRY(suword16_fast)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl PCPU(CURPCB),%eax
+ movl PCB_CR3(%eax),%eax
+ movl $fusufault,%edx
+ movl 8(%ebp),%ecx /* to */
+ movl 12(%ebp),%edi /* val */
+ movl 16(%ebp),%ebx
+ movl %esp,%esi
+ cli
+ movl PCPU(TRAMPSTK),%esp
+ movl %eax,%cr3
+ movw %di,(%ecx)
+ movl %ebx,%cr3
+ movl %esi,%esp
+ sti
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ leave
+ ret
+END(suword16_fast)
+
+ENTRY(subyte_fast)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl PCPU(CURPCB),%eax
+ movl PCB_CR3(%eax),%eax
+ movl $fusufault,%edx
+ movl 8(%ebp),%ecx /* to */
+ movl 12(%ebp),%edi /* val */
+ movl 16(%ebp),%ebx
+ movl %esp,%esi
+ cli
+ movl PCPU(TRAMPSTK),%esp
+ movl %eax,%cr3
+ movl %edi,%eax
+ movb %al,(%ecx)
+ movl %ebx,%cr3
+ movl %esi,%esp
+ sti
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ leave
+ ret
+END(subyte_fast)
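
For readers following the assembly, a rough C-level paraphrase (not part of the diff) of copyout_fast above; the pcpu fields and pcb_cr3 are the ones the assembly references, load_cr3()/intr_disable()/intr_restore() are the usual i386 cpufunc helpers, and the function name is hypothetical:

static int
copyout_fast_sketch(const void *kaddr, void *udaddr, size_t len, u_int kcr3)
{
        char *buf;
        u_int ucr3;
        register_t saved;

        buf = PCPU_GET(copyout_buf);            /* per-CPU bounce buffer */
        ucr3 = PCPU_GET(curpcb)->pcb_cr3;       /* user page table root */
        saved = intr_disable();                 /* the assembly runs under cli */
        bcopy(kaddr, buf, len);                 /* stage under the kernel %cr3 */
        load_cr3(ucr3);                         /* switch to the user page tables */
        bcopy(buf, udaddr, len);                /* done on the trampoline stack in asm */
        load_cr3(kcr3);                         /* back to the kernel page tables */
        intr_restore(saved);
        return (0);     /* a user-address fault unwinds to copyout_fault and EFAULT */
}
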
Index: sys/i386/i386/db_interface.c
===================================================================
--- sys/i386/i386/db_interface.c
+++ sys/i386/i386/db_interface.c
@@ -115,4 +115,7 @@
db_printf("APIC ID = %d\n", pc->pc_apic_id);
db_printf("currentldt = 0x%x\n", pc->pc_currentldt);
+ db_printf("trampstk = 0x%x\n", pc->pc_trampstk);
+ db_printf("kesp0 = 0x%x\n", pc->pc_kesp0);
+ db_printf("common_tssp = 0x%x\n", (u_int)pc->pc_common_tssp);
}
Index: sys/i386/i386/db_trace.c
===================================================================
--- sys/i386/i386/db_trace.c
+++ sys/i386/i386/db_trace.c
@@ -317,7 +317,12 @@
* actually made the call.
*/
frame_type = NORMAL;
- sym = db_search_symbol(eip - 1, DB_STGY_ANY, &offset);
+ if (eip >= PMAP_TRM_MIN_ADDRESS) {
+ sym = db_search_symbol(eip - 1 - setidt_disp, DB_STGY_ANY,
+ &offset);
+ } else {
+ sym = db_search_symbol(eip - 1, DB_STGY_ANY, &offset);
+ }
db_symbol_values(sym, &name, NULL);
if (name != NULL) {
if (strcmp(name, "calltrap") == 0 ||
@@ -357,9 +362,9 @@
* switch to a known good state.
*/
if (frame_type == DOUBLE_FAULT) {
- esp = PCPU_GET(common_tss.tss_esp);
- eip = PCPU_GET(common_tss.tss_eip);
- ebp = PCPU_GET(common_tss.tss_ebp);
+ esp = PCPU_GET(common_tssp)->tss_esp;
+ eip = PCPU_GET(common_tssp)->tss_eip;
+ ebp = PCPU_GET(common_tssp)->tss_ebp;
db_printf(
"--- trap 0x17, eip = %#r, esp = %#r, ebp = %#r ---\n",
eip, esp, ebp);
@@ -379,30 +384,41 @@
else
tf = (struct trapframe *)((int)*fp + 12);
- if (INKERNEL((int) tf)) {
- esp = get_esp(tf);
- eip = tf->tf_eip;
- ebp = tf->tf_ebp;
- switch (frame_type) {
- case TRAP:
- db_printf("--- trap %#r", tf->tf_trapno);
- break;
- case SYSCALL:
- db_printf("--- syscall");
- decode_syscall(tf->tf_eax, td);
- break;
- case TRAP_TIMERINT:
- case TRAP_INTERRUPT:
- case INTERRUPT:
- db_printf("--- interrupt");
- break;
- default:
- panic("The moon has moved again.");
- }
- db_printf(", eip = %#r, esp = %#r, ebp = %#r ---\n", eip,
- esp, ebp);
+ esp = get_esp(tf);
+ eip = tf->tf_eip;
+ ebp = tf->tf_ebp;
+ switch (frame_type) {
+ case TRAP:
+ db_printf("--- trap %#r", tf->tf_trapno);
+ break;
+ case SYSCALL:
+ db_printf("--- syscall");
+ decode_syscall(tf->tf_eax, td);
+ break;
+ case TRAP_TIMERINT:
+ case TRAP_INTERRUPT:
+ case INTERRUPT:
+ db_printf("--- interrupt");
+ break;
+ default:
+ panic("The moon has moved again.");
}
-
+ db_printf(", eip = %#r, esp = %#r, ebp = %#r ---\n", eip, esp, ebp);
+
+ switch (frame_type) {
+ case TRAP:
+ case TRAP_TIMERINT:
+ case TRAP_INTERRUPT:
+ case INTERRUPT:
+ if ((tf->tf_eflags & PSL_VM) != 0 ||
+ (tf->tf_cs & SEL_RPL_MASK) != 0)
+ ebp = 0;
+ break;
+ case SYSCALL:
+ ebp = 0;
+ break;
+ }
+
*ip = (db_addr_t) eip;
*fp = (struct i386_frame *) ebp;
}
@@ -432,6 +448,10 @@
return (0);
}
+ /* 'frame' can be null initially. Just print the pc then. */
+ if (frame == NULL)
+ goto out;
+
/*
* If an indirect call via an invalid pointer caused a trap,
* %pc contains the invalid address while the return address
@@ -540,15 +560,20 @@
db_nextframe(&frame, &pc, td);
- if (INKERNEL((int)pc) && !INKERNEL((int) frame)) {
+out:
+ /*
+ * 'frame' can be null here, either because it was initially
+ * null or because db_nextframe() found no frame.
+ * db_nextframe() may also have found a non-kernel frame.
+ * !INKERNEL() classifies both. Stop tracing if either,
+ * after printing the pc if it is the kernel.
+ */
+ if (frame == NULL || frame <= actframe) {
sym = db_search_symbol(pc, DB_STGY_ANY, &offset);
db_symbol_values(sym, &name, NULL);
db_print_stack_entry(name, 0, 0, 0, pc, frame);
break;
}
- if (!INKERNEL((int) frame)) {
- break;
- }
}
return (0);
Index: sys/i386/i386/elf_machdep.c
===================================================================
--- sys/i386/i386/elf_machdep.c
+++ sys/i386/i386/elf_machdep.c
@@ -137,7 +137,6 @@
(sysinit_cfunc_t) elf32_insert_brand_entry,
&kfreebsd_brand_info);
-
void
elf32_dump_thread(struct thread *td, void *dst, size_t *off)
{
Index: sys/i386/i386/exception.s
===================================================================
--- sys/i386/i386/exception.s
+++ sys/i386/i386/exception.s
@@ -1,11 +1,13 @@
/*-
* Copyright (c) 1989, 1990 William F. Jolitz.
* Copyright (c) 1990 The Regents of the University of California.
- * Copyright (c) 2007 The FreeBSD Foundation
+ * Copyright (c) 2007, 2018 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by A. Joseph Koshy under
* sponsorship from the FreeBSD Foundation and Google, Inc.
+ * Portions of this software were developed by Konstantin Belousov
+ * <kib@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -38,14 +40,11 @@
#include "opt_atpic.h"
#include "opt_hwpmc_hooks.h"
-#include <machine/asmacros.h>
-#include <machine/psl.h>
-#include <machine/trap.h>
-
#include "assym.inc"
-#define SEL_RPL_MASK 0x0003
-#define GSEL_KPL 0x0020 /* GSEL(GCODE_SEL, SEL_KPL) */
+#include <machine/psl.h>
+#include <machine/asmacros.h>
+#include <machine/trap.h>
#ifdef KDTRACE_HOOKS
.bss
@@ -63,20 +62,19 @@
.zero 8
#endif
.text
-#ifdef HWPMC_HOOKS
- ENTRY(start_exceptions)
-#endif
+ENTRY(start_exceptions)
+ .globl tramp_idleptd
+tramp_idleptd: .long 0
+
/*****************************************************************************/
/* Trap handling */
/*****************************************************************************/
/*
* Trap and fault vector routines.
*
- * Most traps are 'trap gates', SDT_SYS386TGT. A trap gate pushes state on
- * the stack that mostly looks like an interrupt, but does not disable
- * interrupts. A few of the traps we are use are interrupt gates,
- * SDT_SYS386IGT, which are nearly the same thing except interrupts are
- * disabled on entry.
+ * All traps are 'interrupt gates', SDT_SYS386IGT. Interrupts are disabled
+ * by the hardware gate, so that no interrupt can be taken until the code
+ * has switched to the kernel address space and the kernel thread stack.
*
* The cpu will push a certain amount of state onto the kernel stack for
* the current process. The amount of state depends on the type of trap
@@ -92,6 +90,10 @@
* must restore them prior to calling 'iret'. The cpu adjusts the %cs and
* %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we
* must load them with appropriate values for supervisor mode operation.
+ *
+ * This code is not executed at the linked address; it is copied to the
+ * trampoline area. As a consequence, all code there and in included files
+ * must be PIC.
*/
MCOUNT_LABEL(user)
@@ -103,8 +105,6 @@
pushl $0; TRAP(T_DIVIDE)
IDTVEC(dbg)
pushl $0; TRAP(T_TRCTRAP)
-IDTVEC(nmi)
- pushl $0; TRAP(T_NMI)
IDTVEC(bpt)
pushl $0; TRAP(T_BPTFLT)
IDTVEC(dtrace_ret)
@@ -124,15 +124,23 @@
IDTVEC(tss)
TRAP(T_TSSFLT)
IDTVEC(missing)
- TRAP(T_SEGNPFLT)
+ pushl $T_SEGNPFLT
+ jmp irettraps
IDTVEC(stk)
- TRAP(T_STKFLT)
+ pushl $T_STKFLT
+ jmp irettraps
IDTVEC(prot)
- TRAP(T_PROTFLT)
+ pushl $T_PROTFLT
+ jmp irettraps
IDTVEC(page)
- TRAP(T_PAGEFLT)
-IDTVEC(mchk)
- pushl $0; TRAP(T_MCHK)
+ cmpl $PMAP_TRM_MIN_ADDRESS, TF_EIP-TF_ERR(%esp)
+ jb 1f
+ movl %ebx, %cr3
+ movl %edx, TF_EIP-TF_ERR(%esp)
+ addl $4, %esp
+ iret
+1: pushl $T_PAGEFLT
+ jmp alltraps
IDTVEC(rsvd_pti)
IDTVEC(rsvd)
pushl $0; TRAP(T_RESERVED)
@@ -144,7 +152,8 @@
pushl $0; TRAP(T_XMMFLT)
/*
- * All traps except ones for syscalls jump to alltraps. If
+ * All traps except the ones for syscalls or an invalid segment
+ * jump to alltraps. If
* interrupts were enabled when the trap occurred, then interrupts
* are enabled now if the trap was through a trap gate, else
* disabled if the trap was through an interrupt gate. Note that
@@ -156,20 +165,16 @@
.globl alltraps
.type alltraps,@function
alltraps:
- pushal
- pushl $0
- movw %ds,(%esp)
- pushl $0
- movw %es,(%esp)
- pushl $0
- movw %fs,(%esp)
+ PUSH_FRAME2
alltraps_with_regs_pushed:
SET_KERNEL_SREGS
cld
+ KENTER
FAKE_MCOUNT(TF_EIP(%esp))
calltrap:
pushl %esp
- call trap
+ movl $trap,%eax
+ call *%eax
add $4, %esp
/*
@@ -178,28 +183,84 @@
MEXITCOUNT
jmp doreti
+ .globl irettraps
+ .type irettraps,@function
+irettraps:
+ testl $PSL_VM, TF_EFLAGS-TF_TRAPNO(%esp)
+ jnz alltraps
+ testb $SEL_RPL_MASK, TF_CS-TF_TRAPNO(%esp)
+ jnz alltraps
+
+ /*
+ * Kernel mode.
+ * The special case there is the kernel mode with user %cr3 and
+ * trampoline stack. We need to copy both current frame and the
+ * hardware portion of the frame we tried to return to, to the
+ * normal stack. This logic must follow the stack unwind order
+ * in doreti.
+ */
+ PUSH_FRAME2
+ SET_KERNEL_SREGS
+ cld
+ call 1f
+1: popl %ebx
+ leal (doreti_iret - 1b)(%ebx), %edx
+ cmpl %edx, TF_EIP(%esp)
+ jne 2f
+ movl $(2 * TF_SZ - TF_EIP), %ecx
+ jmp 6f
+2: leal (doreti_popl_ds - 1b)(%ebx), %edx
+ cmpl %edx, TF_EIP(%esp)
+ jne 3f
+ movl $(2 * TF_SZ - TF_DS), %ecx
+ jmp 6f
+3: leal (doreti_popl_es - 1b)(%ebx), %edx
+ cmpl %edx, TF_EIP(%esp)
+ jne 4f
+ movl $(2 * TF_SZ - TF_ES), %ecx
+ jmp 6f
+4: leal (doreti_popl_fs - 1b)(%ebx), %edx
+ cmpl %edx, TF_EIP(%esp)
+ jne 5f
+ movl $(2 * TF_SZ - TF_FS), %ecx
+ jmp 6f
+ /* kernel mode, normal */
+5: FAKE_MCOUNT(TF_EIP(%esp))
+ jmp calltrap
+6: cmpl $PMAP_TRM_MIN_ADDRESS, %esp /* trampoline stack ? */
+ jb 5b /* if not, no need to change stacks */
+ movl (tramp_idleptd - 1b)(%ebx), %eax
+ movl %eax, %cr3
+ movl PCPU(KESP0), %edx
+ subl %ecx, %edx
+ movl %edx, %edi
+ movl %esp, %esi
+ rep; movsb
+ movl %edx, %esp
+ FAKE_MCOUNT(TF_EIP(%esp))
+ jmp calltrap
+
/*
* Privileged instruction fault.
*/
#ifdef KDTRACE_HOOKS
SUPERALIGN_TEXT
IDTVEC(ill)
- /*
- * Check if a DTrace hook is registered. The default (data) segment
- * cannot be used for this since %ds is not known good until we
- * verify that the entry was from kernel mode.
- */
- cmpl $0,%ss:dtrace_invop_jump_addr
- je norm_ill
-
/*
* Check if this is a user fault. If so, just handle it as a normal
* trap.
*/
- cmpl $GSEL_KPL, 4(%esp) /* Check the code segment */
- jne norm_ill
testl $PSL_VM, 8(%esp) /* and vm86 mode. */
jnz norm_ill
+ cmpl $GSEL_KPL, 4(%esp) /* Check the code segment */
+ jne norm_ill
+
+ /*
+ * Check if a DTrace hook is registered. The trampoline cannot
+ * be instrumented.
+ */
+ cmpl $0, dtrace_invop_jump_addr
+ je norm_ill
/*
* This is a kernel instruction fault that might have been caused
@@ -221,47 +282,43 @@
* Process the instruction fault in the normal way.
*/
norm_ill:
- pushl $0
- TRAP(T_PRIVINFLT)
+ pushl $0
+ pushl $T_PRIVINFLT
+ jmp alltraps
#endif
-/*
- * Call gate entry for syscalls (lcall 7,0).
- * This is used by FreeBSD 1.x a.out executables and "old" NetBSD executables.
- *
- * The intersegment call has been set up to specify one dummy parameter.
- * This leaves a place to put eflags so that the call frame can be
- * converted to a trap frame. Note that the eflags is (semi-)bogusly
- * pushed into (what will be) tf_err and then copied later into the
- * final spot. It has to be done this way because esp can't be just
- * temporarily altered for the pushfl - an interrupt might come in
- * and clobber the saved cs/eip.
- */
- SUPERALIGN_TEXT
-IDTVEC(lcall_syscall)
- pushfl /* save eflags */
- popl 8(%esp) /* shuffle into tf_eflags */
- pushl $7 /* sizeof "lcall 7,0" */
- pushl $0 /* tf_trapno */
- pushal
- pushl $0
- movw %ds,(%esp)
+IDTVEC(mchk)
pushl $0
- movw %es,(%esp)
+ pushl $T_MCHK
+ jmp nmi_mchk_common
+
+IDTVEC(nmi)
pushl $0
- movw %fs,(%esp)
+ pushl $T_NMI
+nmi_mchk_common:
+ PUSH_FRAME2
SET_KERNEL_SREGS
cld
+ /*
+ * Save %cr3 into tf_err. There is no good place to put it.
+ * Always reload %cr3, since we might have interrupted the
+ * kernel entry or exit.
+ * Do not switch to the thread kernel stack, otherwise we might
+ * obliterate the previous context partially copied from the
+ * trampoline stack.
+ */
+ movl %cr3, %eax
+ movl %eax, TF_ERR(%esp)
+ call 1f
+1: popl %eax
+ movl (tramp_idleptd - 1b)(%eax), %eax
+ movl %eax, %cr3
FAKE_MCOUNT(TF_EIP(%esp))
- pushl %esp
- call syscall
- add $4, %esp
- MEXITCOUNT
- jmp doreti
+ jmp calltrap
/*
* Trap gate entry for syscalls (int 0x80).
- * This is used by FreeBSD ELF executables, "new" NetBSD executables, and all
+ * This is used by FreeBSD ELF executables, "new" a.out executables, and all
* Linux executables.
*
* Even though the name says 'int0x80', this is actually a trap gate, not an
@@ -272,18 +329,15 @@
IDTVEC(int0x80_syscall)
pushl $2 /* sizeof "int 0x80" */
pushl $0 /* tf_trapno */
- pushal
- pushl $0
- movw %ds,(%esp)
- pushl $0
- movw %es,(%esp)
- pushl $0
- movw %fs,(%esp)
+ PUSH_FRAME2
SET_KERNEL_SREGS
cld
+ MOVE_STACKS
+ sti
FAKE_MCOUNT(TF_EIP(%esp))
pushl %esp
- call syscall
+ movl $syscall, %eax
+ call *%eax
add $4, %esp
MEXITCOUNT
jmp doreti
@@ -292,7 +346,8 @@
pushl %esp /* trapframe pointer */
pushl %ebx /* arg1 */
pushl %esi /* function */
- call fork_exit
+ movl $fork_exit, %eax
+ call *%eax
addl $12,%esp
/* cut from syscall */
@@ -343,6 +398,8 @@
.text
MCOUNT_LABEL(eintr)
+#include <i386/i386/copyout_fast.s>
+
/*
* void doreti(struct trapframe)
*
@@ -375,7 +432,7 @@
movl PCPU(CURPCB),%ecx
testl $PCB_VM86CALL,PCB_FLAGS(%ecx)
jz doreti_ast
- jmp doreti_exit
+ jmp doreti_popl_fs
doreti_notvm86:
testb $SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */
@@ -393,7 +450,8 @@
je doreti_exit
sti
pushl %esp /* pass a pointer to the trapframe */
- call ast
+ movl $ast, %eax
+ call *%eax
add $4,%esp
jmp doreti_ast
@@ -407,6 +465,23 @@
doreti_exit:
MEXITCOUNT
+ cmpl $T_NMI, TF_TRAPNO(%esp)
+ je doreti_iret_nmi
+ cmpl $T_MCHK, TF_TRAPNO(%esp)
+ je doreti_iret_nmi
+ testl $SEL_RPL_MASK, TF_CS(%esp)
+ jz doreti_popl_fs
+ movl %esp, %esi
+ movl PCPU(TRAMPSTK), %edx
+ movl $TF_SZ, %ecx
+ subl %ecx, %edx
+ movl %edx, %edi
+ rep; movsb
+ movl %edx, %esp
+ movl PCPU(CURPCB),%eax
+ movl PCB_CR3(%eax), %eax
+ movl %eax, %cr3
+
.globl doreti_popl_fs
doreti_popl_fs:
popl %fs
@@ -422,6 +497,11 @@
doreti_iret:
iret
+doreti_iret_nmi:
+ movl TF_ERR(%esp), %eax
+ movl %eax, %cr3
+ jmp doreti_popl_fs
+
/*
* doreti_iret_fault and friends. Alternative return code for
* the case where we get a fault in the doreti_exit code
@@ -440,7 +520,8 @@
ALIGN_TEXT
.globl doreti_iret_fault
doreti_iret_fault:
- subl $8,%esp
+ pushl $0 /* tf_err */
+ pushl $0 /* tf_trapno XXXKIB: provide more useful value ? */
pushal
pushl $0
movw %ds,(%esp)
@@ -460,10 +541,10 @@
doreti_popl_fs_fault:
testb $SEL_RPL_MASK,TF_CS-TF_FS(%esp)
jz doreti_popl_fs_kfault
- sti
movl $0,TF_ERR(%esp) /* XXX should be the error code */
movl $T_PROTFLT,TF_TRAPNO(%esp)
- jmp alltraps_with_regs_pushed
+ SET_KERNEL_SREGS
+ jmp calltrap
doreti_popl_ds_kfault:
movl $0,(%esp)
@@ -474,7 +555,7 @@
doreti_popl_fs_kfault:
movl $0,(%esp)
jmp doreti_popl_fs
-
+
#ifdef HWPMC_HOOKS
doreti_nmi:
/*
@@ -482,6 +563,8 @@
* was from user mode and if so whether the current thread
* needs a user call chain capture.
*/
+ testl $PSL_VM, TF_EFLAGS(%esp)
+ jnz doreti_exit
testb $SEL_RPL_MASK,TF_CS(%esp)
jz doreti_exit
movl PCPU(CURTHREAD),%eax /* curthread present? */
@@ -489,12 +572,21 @@
jz doreti_exit
testl $TDP_CALLCHAIN,TD_PFLAGS(%eax) /* flagged for capture? */
jz doreti_exit
+ /*
+ * Switch to the thread stack. Reset tf_trapno so that it does not
+ * indicate NMI, causing a normal userspace exit.
+ */
+ movl $T_RESERVED, TF_TRAPNO(%esp)
+ NMOVE_STACKS
/*
* Take the processor out of NMI mode by executing a fake "iret".
*/
pushfl
pushl %cs
- pushl $outofnmi
+ call 1f
+1: popl %eax
+ leal (outofnmi-1b)(%eax),%eax
+ pushl %eax
iret
outofnmi:
/*
@@ -511,5 +603,6 @@
call *%ecx
addl $12,%esp
jmp doreti_ast
- ENTRY(end_exceptions)
#endif
+
+ENTRY(end_exceptions)
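
Similarly, a rough C-level paraphrase (not part of the diff, hypothetical function name) of the user-mode return path added to doreti_exit above: the trapframe is moved onto the per-CPU trampoline stack and %cr3 is switched to the user page tables before the register pops and the final iret.

static void
doreti_user_return_sketch(struct trapframe *tf)
{
        struct trapframe *ttf;

        /* Relocate the frame onto the per-CPU trampoline stack. */
        ttf = (struct trapframe *)(PCPU_GET(trampstk) - sizeof(*ttf));
        bcopy(tf, ttf, sizeof(*ttf));
        /* Switch to the user page tables; the pops and iret follow in asm. */
        load_cr3(PCPU_GET(curpcb)->pcb_cr3);
}
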
Index: sys/i386/i386/genassym.c
===================================================================
--- sys/i386/i386/genassym.c
+++ sys/i386/i386/genassym.c
@@ -74,6 +74,7 @@
#include <x86/apicreg.h>
#endif
#include <machine/cpu.h>
+#include <machine/pcb_ext.h>
#include <machine/pcb.h>
#include <machine/sigframe.h>
#include <machine/vm86.h>
@@ -141,6 +142,8 @@
ASSYM(PCB_DBREGS, PCB_DBREGS);
ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext));
+ASSYM(PCB_EXT_TSS, offsetof(struct pcb_ext, ext_tss));
+
ASSYM(PCB_FSD, offsetof(struct pcb, pcb_fsd));
ASSYM(PCB_GSD, offsetof(struct pcb, pcb_gsd));
ASSYM(PCB_VM86, offsetof(struct pcb, pcb_vm86));
@@ -164,6 +167,7 @@
ASSYM(TF_EIP, offsetof(struct trapframe, tf_eip));
ASSYM(TF_CS, offsetof(struct trapframe, tf_cs));
ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags));
+ASSYM(TF_SZ, sizeof(struct trapframe));
ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler));
#ifdef COMPAT_43
@@ -206,7 +210,7 @@
ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread));
ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread));
ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb));
-ASSYM(PC_COMMON_TSS, offsetof(struct pcpu, pc_common_tss));
+ASSYM(PC_COMMON_TSSP, offsetof(struct pcpu, pc_common_tssp));
ASSYM(PC_COMMON_TSSD, offsetof(struct pcpu, pc_common_tssd));
ASSYM(PC_TSS_GDT, offsetof(struct pcpu, pc_tss_gdt));
ASSYM(PC_FSGS_GDT, offsetof(struct pcpu, pc_fsgs_gdt));
@@ -214,6 +218,9 @@
ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
ASSYM(PC_PRIVATE_TSS, offsetof(struct pcpu, pc_private_tss));
+ASSYM(PC_KESP0, offsetof(struct pcpu, pc_kesp0));
+ASSYM(PC_TRAMPSTK, offsetof(struct pcpu, pc_trampstk));
+ASSYM(PC_COPYOUT_BUF, offsetof(struct pcpu, pc_copyout_buf));
#ifdef DEV_APIC
ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL);
@@ -227,6 +234,10 @@
ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL));
ASSYM(GPROC0_SEL, GPROC0_SEL);
ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame));
+ASSYM(VM86_STACK_SPACE, VM86_STACK_SPACE);
+
+ASSYM(PMAP_TRM_MIN_ADDRESS, PMAP_TRM_MIN_ADDRESS);
+ASSYM(TRAMP_COPYOUT_SZ, TRAMP_COPYOUT_SZ);
#ifdef HWPMC_HOOKS
ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN);
Index: sys/i386/i386/locore.s
===================================================================
--- sys/i386/i386/locore.s
+++ sys/i386/i386/locore.s
@@ -53,14 +53,6 @@
#include "assym.inc"
-/*
- * XXX
- *
- * Note: This version greatly munged to avoid various assembler errors
- * that may be fixed in newer versions of gas. Perhaps newer versions
- * will have more pleasant appearance.
- */
-
/*
* PTmap is recursive pagemap at top of virtual address space.
* Within PTmap, the page directory can be found (third indirection).
@@ -71,7 +63,7 @@
.set PTDpde,PTD + (PTDPTDI * PDESIZE)
/*
- * Compiled KERNBASE location and the kernel load address
+ * Compiled KERNBASE location and the kernel load address, now identical.
*/
.globl kernbase
.set kernbase,KERNBASE
@@ -90,83 +82,6 @@
.globl bootinfo
bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */
- .globl KERNend
-KERNend: .long 0 /* phys addr end of kernel (just after bss) */
-physfree: .long 0 /* phys addr of next free page */
-
- .globl IdlePTD
-IdlePTD: .long 0 /* phys addr of kernel PTD */
-
-#if defined(PAE) || defined(PAE_TABLES)
- .globl IdlePDPT
-IdlePDPT: .long 0 /* phys addr of kernel PDPT */
-#endif
-
- .globl KPTmap
-KPTmap: .long 0 /* address of kernel page tables */
-
- .globl KPTphys
-KPTphys: .long 0 /* phys addr of kernel page tables */
-
- .globl proc0kstack
-proc0kstack: .long 0 /* address of proc 0 kstack space */
-p0kpa: .long 0 /* phys addr of proc0's STACK */
-
-vm86phystk: .long 0 /* PA of vm86/bios stack */
-
- .globl vm86paddr, vm86pa
-vm86paddr: .long 0 /* address of vm86 region */
-vm86pa: .long 0 /* phys addr of vm86 region */
-
-/**********************************************************************
- *
- * Some handy macros
- *
- */
-
-#define R(foo) ((foo)-KERNBASE)
-
-#define ALLOCPAGES(foo) \
- movl R(physfree), %esi ; \
- movl $((foo)*PAGE_SIZE), %eax ; \
- addl %esi, %eax ; \
- movl %eax, R(physfree) ; \
- movl %esi, %edi ; \
- movl $((foo)*PAGE_SIZE),%ecx ; \
- xorl %eax,%eax ; \
- cld ; \
- rep ; \
- stosb
-
-/*
- * fillkpt
- * eax = page frame address
- * ebx = index into page table
- * ecx = how many pages to map
- * base = base address of page dir/table
- * prot = protection bits
- */
-#define fillkpt(base, prot) \
- shll $PTESHIFT,%ebx ; \
- addl base,%ebx ; \
- orl $PG_V,%eax ; \
- orl prot,%eax ; \
-1: movl %eax,(%ebx) ; \
- addl $PAGE_SIZE,%eax ; /* increment physical address */ \
- addl $PTESIZE,%ebx ; /* next pte */ \
- loop 1b
-
-/*
- * fillkptphys(prot)
- * eax = physical address
- * ecx = how many pages to map
- * prot = protection bits
- */
-#define fillkptphys(prot) \
- movl %eax, %ebx ; \
- shrl $PAGE_SHIFT, %ebx ; \
- fillkpt(R(KPTphys), prot)
-
.text
/**********************************************************************
*
@@ -179,6 +94,7 @@
movw $0x1234,0x472
/* Set up a real frame in case the double return in newboot is executed. */
+ xorl %ebp,%ebp
pushl %ebp
movl %esp, %ebp
@@ -204,8 +120,8 @@
* inactive from now until we switch to new ones, since we don't load any
* more segment registers or permit interrupts until after the switch.
*/
- movl $R(end),%ecx
- movl $R(edata),%edi
+ movl $end,%ecx
+ movl $edata,%edi
subl %edi,%ecx
xorl %eax,%eax
cld
@@ -220,48 +136,10 @@
* the old stack, but it need not be, since recover_bootinfo actually
* returns via the old frame.
*/
- movl $R(tmpstk),%esp
+ movl $tmpstk,%esp
call identify_cpu
- call create_pagetables
-
-/*
- * If the CPU has support for VME, turn it on.
- */
- testl $CPUID_VME, R(cpu_feature)
- jz 1f
- movl %cr4, %eax
- orl $CR4_VME, %eax
- movl %eax, %cr4
-1:
-
-/* Now enable paging */
-#if defined(PAE) || defined(PAE_TABLES)
- movl R(IdlePDPT), %eax
- movl %eax, %cr3
- movl %cr4, %edx
- orl $CR4_PAE, %edx
- movl %edx, %cr4
-#else
- movl R(IdlePTD), %eax
- movl %eax,%cr3 /* load ptd addr into mmu */
-#endif
- movl %cr0,%edx /* get control word */
- orl $CR0_PE|CR0_PG,%edx /* enable paging */
- movl %edx,%cr0 /* and let's page NOW! */
-
- pushl $begin /* jump to high virtualized address */
- ret
-
-begin:
- /*
- * Now running relocated at KERNBASE where the system is linked to run.
- *
- * Remove the lowest part of the double mapping of low memory to get
- * some null pointer checks.
- */
- movl $0,PTD
- movl %eax,%cr3 /* invalidate TLB */
+ call pmap_cold
/* set up bootstrap stack */
movl proc0kstack,%eax /* location of in-kernel stack */
@@ -375,7 +253,7 @@
cmpl $0,%esi
je 2f /* No kernelname */
movl $MAXPATHLEN,%ecx /* Brute force!!! */
- movl $R(kernelname),%edi
+ movl $kernelname,%edi
cmpb $'/',(%esi) /* Make sure it starts with a slash */
je 1f
movb $'/',(%edi)
@@ -403,7 +281,7 @@
* Copy the common part of the bootinfo struct
*/
movl %ebx,%esi
- movl $R(bootinfo),%edi
+ movl $bootinfo,%edi
cmpl $BOOTINFO_SIZE,%ecx
jbe got_common_bi_size
movl $BOOTINFO_SIZE,%ecx
@@ -420,12 +298,12 @@
movl BI_NFS_DISKLESS(%ebx),%esi
cmpl $0,%esi
je olddiskboot
- movl $R(nfs_diskless),%edi
+ movl $nfs_diskless,%edi
movl $NFSDISKLESS_SIZE,%ecx
cld
rep
movsb
- movl $R(nfs_diskless_valid),%edi
+ movl $nfs_diskless_valid,%edi
movl $1,(%edi)
#endif
#endif
@@ -438,9 +316,9 @@
*/
olddiskboot:
movl 8(%ebp),%eax
- movl %eax,R(boothowto)
+ movl %eax,boothowto
movl 12(%ebp),%eax
- movl %eax,R(bootdev)
+ movl %eax,bootdev
ret
@@ -478,16 +356,16 @@
divl %ecx
jz trynexgen
popfl
- movl $CPU_386,R(cpu)
+ movl $CPU_386,cpu
jmp 3f
trynexgen:
popfl
- movl $CPU_NX586,R(cpu)
- movl $0x4778654e,R(cpu_vendor) # store vendor string
- movl $0x72446e65,R(cpu_vendor+4)
- movl $0x6e657669,R(cpu_vendor+8)
- movl $0,R(cpu_vendor+12)
+ movl $CPU_NX586,cpu
+ movl $0x4778654e,cpu_vendor # store vendor string
+ movl $0x72446e65,cpu_vendor+4
+ movl $0x6e657669,cpu_vendor+8
+ movl $0,cpu_vendor+12
jmp 3f
try486: /* Try to toggle identification flag; does not exist on early 486s. */
@@ -506,7 +384,7 @@
testl %eax,%eax
jnz trycpuid
- movl $CPU_486,R(cpu)
+ movl $CPU_486,cpu
/*
* Check Cyrix CPU
@@ -533,250 +411,46 @@
* CPU, we couldn't distinguish it from Cyrix's (including IBM
* brand of Cyrix CPUs).
*/
- movl $0x69727943,R(cpu_vendor) # store vendor string
- movl $0x736e4978,R(cpu_vendor+4)
- movl $0x64616574,R(cpu_vendor+8)
+ movl $0x69727943,cpu_vendor # store vendor string
+ movl $0x736e4978,cpu_vendor+4
+ movl $0x64616574,cpu_vendor+8
jmp 3f
trycpuid: /* Use the `cpuid' instruction. */
xorl %eax,%eax
cpuid # cpuid 0
- movl %eax,R(cpu_high) # highest capability
- movl %ebx,R(cpu_vendor) # store vendor string
- movl %edx,R(cpu_vendor+4)
- movl %ecx,R(cpu_vendor+8)
- movb $0,R(cpu_vendor+12)
+ movl %eax,cpu_high # highest capability
+ movl %ebx,cpu_vendor # store vendor string
+ movl %edx,cpu_vendor+4
+ movl %ecx,cpu_vendor+8
+ movb $0,cpu_vendor+12
movl $1,%eax
cpuid # cpuid 1
- movl %eax,R(cpu_id) # store cpu_id
- movl %ebx,R(cpu_procinfo) # store cpu_procinfo
- movl %edx,R(cpu_feature) # store cpu_feature
- movl %ecx,R(cpu_feature2) # store cpu_feature2
+ movl %eax,cpu_id # store cpu_id
+ movl %ebx,cpu_procinfo # store cpu_procinfo
+ movl %edx,cpu_feature # store cpu_feature
+ movl %ecx,cpu_feature2 # store cpu_feature2
rorl $8,%eax # extract family type
andl $15,%eax
cmpl $5,%eax
jae 1f
/* less than Pentium; must be 486 */
- movl $CPU_486,R(cpu)
+ movl $CPU_486,cpu
jmp 3f
1:
/* a Pentium? */
cmpl $5,%eax
jne 2f
- movl $CPU_586,R(cpu)
+ movl $CPU_586,cpu
jmp 3f
2:
/* Greater than Pentium...call it a Pentium Pro */
- movl $CPU_686,R(cpu)
+ movl $CPU_686,cpu
3:
ret
-
-/**********************************************************************
- *
- * Create the first page directory and its page tables.
- *
- */
-
-create_pagetables:
-
-/* Find end of kernel image (rounded up to a page boundary). */
- movl $R(_end),%esi
-
-/* Include symbols, if any. */
- movl R(bootinfo+BI_ESYMTAB),%edi
- testl %edi,%edi
- je over_symalloc
- movl %edi,%esi
- movl $KERNBASE,%edi
- addl %edi,R(bootinfo+BI_SYMTAB)
- addl %edi,R(bootinfo+BI_ESYMTAB)
-over_symalloc:
-
-/* If we are told where the end of the kernel space is, believe it. */
- movl R(bootinfo+BI_KERNEND),%edi
- testl %edi,%edi
- je no_kernend
- movl %edi,%esi
-no_kernend:
-
- addl $PDRMASK,%esi /* Play conservative for now, and */
- andl $~PDRMASK,%esi /* ... round up to PDR boundary */
- movl %esi,R(KERNend) /* save end of kernel */
- movl %esi,R(physfree) /* next free page is at end of kernel */
-
-/* Allocate Kernel Page Tables */
- ALLOCPAGES(NKPT)
- movl %esi,R(KPTphys)
- addl $(KERNBASE-(KPTDI<<(PDRSHIFT-PAGE_SHIFT+PTESHIFT))),%esi
- movl %esi,R(KPTmap)
-
-/* Allocate Page Table Directory */
-#if defined(PAE) || defined(PAE_TABLES)
- /* XXX only need 32 bytes (easier for now) */
- ALLOCPAGES(1)
- movl %esi,R(IdlePDPT)
-#endif
- ALLOCPAGES(NPGPTD)
- movl %esi,R(IdlePTD)
-
-/* Allocate KSTACK */
- ALLOCPAGES(TD0_KSTACK_PAGES)
- movl %esi,R(p0kpa)
- addl $KERNBASE, %esi
- movl %esi, R(proc0kstack)
-
- ALLOCPAGES(1) /* vm86/bios stack */
- movl %esi,R(vm86phystk)
-
- ALLOCPAGES(3) /* pgtable + ext + IOPAGES */
- movl %esi,R(vm86pa)
- addl $KERNBASE, %esi
- movl %esi, R(vm86paddr)
-
-/*
- * Enable PSE and PGE.
- */
-#ifndef DISABLE_PSE
- testl $CPUID_PSE, R(cpu_feature)
- jz 1f
- movl $PG_PS, R(pseflag)
- movl %cr4, %eax
- orl $CR4_PSE, %eax
- movl %eax, %cr4
-1:
-#endif
-#ifndef DISABLE_PG_G
- testl $CPUID_PGE, R(cpu_feature)
- jz 2f
- movl $PG_G, R(pgeflag)
- movl %cr4, %eax
- orl $CR4_PGE, %eax
- movl %eax, %cr4
-2:
-#endif
-
-/*
- * Initialize page table pages mapping physical address zero through the
- * (physical) end of the kernel. Many of these pages must be reserved,
- * and we reserve them all and map them linearly for convenience. We do
- * this even if we've enabled PSE above; we'll just switch the corresponding
- * kernel PDEs before we turn on paging.
- *
- * XXX: We waste some pages here in the PSE case!
- *
- * This and all other page table entries allow read and write access for
- * various reasons. Kernel mappings never have any access restrictions.
- */
- xorl %eax, %eax
- movl R(KERNend),%ecx
- shrl $PAGE_SHIFT,%ecx
- fillkptphys($PG_RW)
-
-/* Map page table pages. */
- movl R(KPTphys),%eax
- movl $NKPT,%ecx
- fillkptphys($PG_RW)
-
-/* Map page directory. */
-#if defined(PAE) || defined(PAE_TABLES)
- movl R(IdlePDPT), %eax
- movl $1, %ecx
- fillkptphys($PG_RW)
-#endif
-
- movl R(IdlePTD), %eax
- movl $NPGPTD, %ecx
- fillkptphys($PG_RW)
-
-/* Map proc0's KSTACK in the physical way ... */
- movl R(p0kpa), %eax
- movl $(TD0_KSTACK_PAGES), %ecx
- fillkptphys($PG_RW)
-
-/* Map ISA hole */
- movl $ISA_HOLE_START, %eax
- movl $ISA_HOLE_LENGTH>>PAGE_SHIFT, %ecx
- fillkptphys($PG_RW)
-
-/* Map space for the vm86 region */
- movl R(vm86phystk), %eax
- movl $4, %ecx
- fillkptphys($PG_RW)
-
-/* Map page 0 into the vm86 page table */
- movl $0, %eax
- movl $0, %ebx
- movl $1, %ecx
- fillkpt(R(vm86pa), $PG_RW|PG_U)
-
-/* ...likewise for the ISA hole */
- movl $ISA_HOLE_START, %eax
- movl $ISA_HOLE_START>>PAGE_SHIFT, %ebx
- movl $ISA_HOLE_LENGTH>>PAGE_SHIFT, %ecx
- fillkpt(R(vm86pa), $PG_RW|PG_U)
-
-/*
- * Create an identity mapping for low physical memory, including the kernel.
- * This is only used to map the 2 instructions for jumping to 'begin' in
- * locore (we map everything to avoid having to determine where these
- * instructions are). ACPI resume will transiently restore the first PDE in
- * this mapping (and depend on this PDE's page table created here not being
- * destroyed). See pmap_bootstrap() for more details.
- *
- * Note: There are errata concerning large pages and physical address zero,
- * so a PG_PS mapping should not be used for PDE 0. Our double mapping
- * avoids this automatically by not using PG_PS for PDE #KPDI so that PAT
- * bits can be set at the page level for i/o pages below 1 MB.
- */
- movl R(KPTphys), %eax
- xorl %ebx, %ebx
- movl $NKPT, %ecx
- fillkpt(R(IdlePTD), $PG_RW)
-
-/*
- * Install PDEs for PTs covering enough kva to bootstrap. Then for the PSE
- * case, replace the PDEs whose coverage is strictly within the kernel
- * (between KERNLOAD (rounded up) and KERNend) by large-page PDEs.
- */
- movl R(KPTphys), %eax
- movl $KPTDI, %ebx
- movl $NKPT, %ecx
- fillkpt(R(IdlePTD), $PG_RW)
- cmpl $0,R(pseflag)
- je done_pde
-
- movl R(KERNend), %ecx
- movl $(KERNLOAD + PDRMASK) & ~PDRMASK, %eax
- subl %eax, %ecx
- shrl $PDRSHIFT, %ecx
- movl $KPTDI + ((KERNLOAD + PDRMASK) >> PDRSHIFT), %ebx
- shll $PDESHIFT, %ebx
- addl R(IdlePTD), %ebx
- orl $(PG_V|PG_RW|PG_PS), %eax
-1: movl %eax, (%ebx)
- addl $(1 << PDRSHIFT), %eax
- addl $PDESIZE, %ebx
- loop 1b
-
-done_pde:
-/* install a pde recursively mapping page directory as a page table */
- movl R(IdlePTD), %eax
- movl $PTDPTDI, %ebx
- movl $NPGPTD,%ecx
- fillkpt(R(IdlePTD), $PG_RW)
-
-#if defined(PAE) || defined(PAE_TABLES)
- movl R(IdlePTD), %eax
- xorl %ebx, %ebx
- movl $NPGPTD, %ecx
- fillkpt(R(IdlePDPT), $0x0)
-#endif
-
- ret
-
#ifdef XENHVM
/* Xen Hypercall page */
.text
Index: sys/i386/i386/machdep.c
===================================================================
--- sys/i386/i386/machdep.c
+++ sys/i386/i386/machdep.c
@@ -1,6 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
+ * Copyright (c) 2018 The FreeBSD Foundation
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
@@ -8,6 +9,9 @@
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
+ * Portions of this software were developed by A. Joseph Koshy under
+ * sponsorship from the FreeBSD Foundation and Google, Inc.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -81,9 +85,7 @@
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
-#ifdef SMP
#include <sys/smp.h>
-#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
@@ -128,6 +130,7 @@
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
+#include <machine/sysarch.h>
#include <machine/trap.h>
#include <machine/vm86.h>
#include <x86/init.h>
@@ -152,8 +155,8 @@
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
-extern register_t init386(int first);
-extern void dblfault_handler(void);
+register_t init386(int first);
+void dblfault_handler(void);
static void cpu_startup(void *);
static void fpstate_drop(struct thread *td);
@@ -210,14 +213,18 @@
struct mem_range_softc mem_range_softc;
- /* Default init_ops implementation. */
- struct init_ops init_ops = {
+extern char start_exceptions[], end_exceptions[];
+
+extern struct sysentvec elf32_freebsd_sysvec;
+
+/* Default init_ops implementation. */
+struct init_ops init_ops = {
.early_clock_source_init = i8254_init,
.early_delay = i8254_delay,
#ifdef DEV_APIC
.msi_init = msi_init,
#endif
- };
+};
static void
cpu_startup(dummy)
@@ -1098,24 +1105,59 @@
return (EJUSTRETURN);
}
+#ifdef COMPAT_43
+static void
+setup_priv_lcall_gate(struct proc *p)
+{
+ struct i386_ldt_args uap;
+ union descriptor desc;
+ u_int lcall_addr;
+
+ bzero(&uap, sizeof(uap));
+ uap.start = 0;
+ uap.num = 1;
+ lcall_addr = p->p_sysent->sv_psstrings - sz_lcall_tramp;
+ bzero(&desc, sizeof(desc));
+ desc.sd.sd_type = SDT_MEMERA;
+ desc.sd.sd_dpl = SEL_UPL;
+ desc.sd.sd_p = 1;
+ desc.sd.sd_def32 = 1;
+ desc.sd.sd_gran = 1;
+ desc.sd.sd_lolimit = 0xffff;
+ desc.sd.sd_hilimit = 0xf;
+ desc.sd.sd_lobase = lcall_addr;
+ desc.sd.sd_hibase = lcall_addr >> 24;
+ i386_set_ldt(curthread, &uap, &desc);
+}
+#endif
+
/*
* Reset registers to default values on exec.
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
- struct trapframe *regs = td->td_frame;
- struct pcb *pcb = td->td_pcb;
+ struct trapframe *regs;
+ struct pcb *pcb;
+
+ regs = td->td_frame;
+ pcb = td->td_pcb;
/* Reset pc->pcb_gs and %gs before possibly invalidating it. */
pcb->pcb_gs = _udatasel;
load_gs(_udatasel);
mtx_lock_spin(&dt_lock);
- if (td->td_proc->p_md.md_ldt)
+ if (td->td_proc->p_md.md_ldt != NULL)
user_ldt_free(td);
else
mtx_unlock_spin(&dt_lock);
+
+#ifdef COMPAT_43
+ if (td->td_proc->p_sysent->sv_psstrings !=
+ elf32_freebsd_sysvec.sv_psstrings)
+ setup_priv_lcall_gate(td->td_proc);
+#endif
/*
* Reset the fs and gs bases. The values from the old address
@@ -1217,18 +1259,22 @@
int _default_ldt;
-union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */
-union descriptor ldt[NLDT]; /* local descriptor table */
+struct mtx dt_lock; /* lock for GDT and LDT */
+
+union descriptor gdt0[NGDT]; /* initial global descriptor table */
+union descriptor *gdt = gdt0; /* global descriptor table */
+
+union descriptor *ldt; /* local descriptor table */
+
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
-struct region_descriptor r_gdt, r_idt; /* table descriptors */
-struct mtx dt_lock; /* lock for GDT and LDT */
-static struct i386tss dblfault_tss;
-static char dblfault_stack[PAGE_SIZE];
+static struct i386tss *dblfault_tss;
+static char *dblfault_stack;
-extern vm_offset_t proc0kstack;
+static struct i386tss common_tss0;
+vm_offset_t proc0kstack;
/*
* software prototypes -- in more palatable form.
@@ -1329,8 +1375,8 @@
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GLDT_SEL 10 LDT Descriptor */
-{ .ssd_base = (int) ldt,
- .ssd_limit = sizeof(ldt)-1,
+{ .ssd_base = 0,
+ .ssd_limit = sizeof(union descriptor) * NLDT - 1,
.ssd_type = SDT_SYSLDT,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
@@ -1338,7 +1384,7 @@
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 11 User LDT Descriptor per process */
-{ .ssd_base = (int) ldt,
+{ .ssd_base = 0,
.ssd_limit = (512 * sizeof(union descriptor)-1),
.ssd_type = SDT_SYSLDT,
.ssd_dpl = 0,
@@ -1347,7 +1393,7 @@
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPANIC_SEL 12 Panic Tss Descriptor */
-{ .ssd_base = (int) &dblfault_tss,
+{ .ssd_base = 0,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
@@ -1468,25 +1514,31 @@
.ssd_gran = 1 },
};
+uintptr_t setidt_disp;
+
void
-setidt(idx, func, typ, dpl, selec)
- int idx;
- inthand_t *func;
- int typ;
- int dpl;
- int selec;
+setidt(int idx, inthand_t *func, int typ, int dpl, int selec)
+{
+ uintptr_t off;
+
+ off = func != NULL ? (uintptr_t)func + setidt_disp : 0;
+ setidt_nodisp(idx, off, typ, dpl, selec);
+}
+
+void
+setidt_nodisp(int idx, uintptr_t off, int typ, int dpl, int selec)
{
struct gate_descriptor *ip;
ip = idt + idx;
- ip->gd_looffset = (int)func;
+ ip->gd_looffset = off;
ip->gd_selector = selec;
ip->gd_stkcpy = 0;
ip->gd_xx = 0;
ip->gd_type = typ;
ip->gd_dpl = dpl;
ip->gd_p = 1;
- ip->gd_hioffset = ((int)func)>>16 ;
+ ip->gd_hioffset = ((u_int)off) >> 16 ;
}
extern inthand_t
@@ -1501,7 +1553,7 @@
#ifdef XENHVM
IDTVEC(xen_intr_upcall),
#endif
- IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
+ IDTVEC(int0x80_syscall);
#ifdef DDB
/*
@@ -1512,15 +1564,29 @@
{
struct gate_descriptor *ip;
int idx;
- uintptr_t func;
+ uintptr_t func, func_trm;
+ bool trm;
ip = idt;
for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
- func = (ip->gd_hioffset << 16 | ip->gd_looffset);
- if (func != (uintptr_t)&IDTVEC(rsvd)) {
- db_printf("%3d\t", idx);
- db_printsym(func, DB_STGY_PROC);
- db_printf("\n");
+ if (ip->gd_type == SDT_SYSTASKGT) {
+ db_printf("%3d\t<TASK>\n", idx);
+ } else {
+ func = (ip->gd_hioffset << 16 | ip->gd_looffset);
+ if (func >= PMAP_TRM_MIN_ADDRESS) {
+ func_trm = func;
+ func -= setidt_disp;
+ trm = true;
+ } else
+ trm = false;
+ if (func != (uintptr_t)&IDTVEC(rsvd)) {
+ db_printf("%3d\t", idx);
+ db_printsym(func, DB_STGY_PROC);
+ if (trm)
+ db_printf(" (trampoline %#x)",
+ func_trm);
+ db_printf("\n");
+ }
}
ip++;
}
@@ -1567,6 +1633,24 @@
db_printf("dr6\t0x%08x\n", rdr6());
db_printf("dr7\t0x%08x\n", rdr7());
}
+
+DB_SHOW_COMMAND(frame, db_show_frame)
+{
+ struct trapframe *frame;
+
+ frame = have_addr ? (struct trapframe *)addr : curthread->td_frame;
+ printf("ss %#x esp %#x efl %#x cs %#x eip %#x\n",
+ frame->tf_ss, frame->tf_esp, frame->tf_eflags, frame->tf_cs,
+ frame->tf_eip);
+ printf("err %#x trapno %d\n", frame->tf_err, frame->tf_trapno);
+ printf("ds %#x es %#x fs %#x\n",
+ frame->tf_ds, frame->tf_es, frame->tf_fs);
+ printf("eax %#x ecx %#x edx %#x ebx %#x\n",
+ frame->tf_eax, frame->tf_ecx, frame->tf_edx, frame->tf_ebx);
+ printf("ebp %#x esi %#x edi %#x\n",
+ frame->tf_ebp, frame->tf_esi, frame->tf_edi);
+
+}
#endif
void
@@ -1693,7 +1777,6 @@
static void
basemem_setup(void)
{
- vm_paddr_t pa;
pt_entry_t *pte;
int i;
@@ -1703,30 +1786,6 @@
basemem = 640;
}
- /*
- * XXX if biosbasemem is now < 640, there is a `hole'
- * between the end of base memory and the start of
- * ISA memory. The hole may be empty or it may
- * contain BIOS code or data. Map it read/write so
- * that the BIOS can write to it. (Memory from 0 to
- * the physical end of the kernel is mapped read-only
- * to begin with and then parts of it are remapped.
- * The parts that aren't remapped form holes that
- * remain read-only and are unused by the kernel.
- * The base memory area is below the physical end of
- * the kernel and right now forms a read-only hole.
- * The part of it from PAGE_SIZE to
- * (trunc_page(biosbasemem * 1024) - 1) will be
- * remapped and used by the kernel later.)
- *
- * This code is similar to the code used in
- * pmap_mapdev, but since no memory needs to be
- * allocated we simply change the mapping.
- */
- for (pa = trunc_page(basemem * 1024);
- pa < ISA_HOLE_START; pa += PAGE_SIZE)
- pmap_kenter(KERNBASE + pa, pa);
-
/*
* Map pages between basemem and ISA_HOLE_START, if any, r/w into
* the vm86 page table so that vm86 can scribble on them using
@@ -1807,9 +1866,8 @@
* the kernel page table so we can use it as a buffer. The
* kernel will unmap this page later.
*/
- pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
vmc.npages = 0;
- smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
+ smap = (void *)vm86_addpage(&vmc, 1, PMAP_MAP_LOW + ptoa(1));
res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
KASSERT(res != 0, ("vm86_getptr() failed: address not found"));
@@ -2130,13 +2188,119 @@
#endif
}
+static void
+fixup_idt(void)
+{
+ struct gate_descriptor *ip;
+ uintptr_t off;
+ int x;
+
+ for (x = 0; x < NIDT; x++) {
+ ip = &idt[x];
+ if (ip->gd_type != SDT_SYS386IGT &&
+ ip->gd_type != SDT_SYS386TGT)
+ continue;
+ off = ip->gd_looffset + (((u_int)ip->gd_hioffset) << 16);
+ KASSERT(off >= (uintptr_t)start_exceptions &&
+ off < (uintptr_t)end_exceptions,
+ ("IDT[%d] type %d off %#x", x, ip->gd_type, off));
+ off += setidt_disp;
+ MPASS(off >= PMAP_TRM_MIN_ADDRESS &&
+ off < PMAP_TRM_MAX_ADDRESS);
+ ip->gd_looffset = off;
+ ip->gd_hioffset = off >> 16;
+ }
+}
+
+static void
+i386_setidt1(void)
+{
+ int x;
+
+ /* exceptions */
+ for (x = 0; x < NIDT; x++)
+ setidt(x, &IDTVEC(rsvd), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_DE, &IDTVEC(div), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386IGT, SEL_UPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL,
+ SEL_KPL));
+ setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386IGT,
+ SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_AC, &IDTVEC(align), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall),
+ SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
+#ifdef KDTRACE_HOOKS
+ setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret),
+ SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
+#endif
+#ifdef XENHVM
+ setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall),
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+#endif
+}
+
+static void
+i386_setidt2(void)
+{
+
+ setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
+ GSEL(GCODE_SEL, SEL_KPL));
+}
+
+#if defined(DEV_ISA) && !defined(DEV_ATPIC)
+static void
+i386_setidt3(void)
+{
+
+ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint),
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+ setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint),
+ SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+}
+#endif
+
register_t
init386(int first)
{
- struct gate_descriptor *gdp;
+ struct region_descriptor r_gdt, r_idt; /* table descriptors */
int gsel_tss, metadata_missing, x, pa;
struct pcpu *pc;
struct xstate_hdr *xhdr;
+ vm_offset_t addend;
int late_console;
thread0.td_kstack = proc0kstack;
@@ -2148,18 +2312,23 @@
*/
proc_linkup0(&proc0, &thread0);
- metadata_missing = 0;
if (bootinfo.bi_modulep) {
- preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
- preload_bootstrap_relocate(KERNBASE);
+ metadata_missing = 0;
+ addend = (vm_paddr_t)bootinfo.bi_modulep < KERNBASE ?
+ PMAP_MAP_LOW : 0;
+ preload_metadata = (caddr_t)bootinfo.bi_modulep + addend;
+ preload_bootstrap_relocate(addend);
} else {
metadata_missing = 1;
}
- if (bootinfo.bi_envp != 0)
- init_static_kenv((char *)bootinfo.bi_envp + KERNBASE, 0);
- else
+ if (bootinfo.bi_envp != 0) {
+ addend = (vm_paddr_t)bootinfo.bi_envp < KERNBASE ?
+ PMAP_MAP_LOW : 0;
+ init_static_kenv((char *)bootinfo.bi_envp + addend, 0);
+ } else {
init_static_kenv(NULL, 0);
+ }
identify_hypervisor();
@@ -2179,21 +2348,21 @@
pc = &__pcpu[0];
gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
- gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
- gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
+ gdt_segs[GPRIV_SEL].ssd_base = (int)pc;
+ gdt_segs[GPROC0_SEL].ssd_base = (int)&common_tss0;
for (x = 0; x < NGDT; x++)
- ssdtosd(&gdt_segs[x], &gdt[x].sd);
+ ssdtosd(&gdt_segs[x], &gdt0[x].sd);
- r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
- r_gdt.rd_base = (int) gdt;
+ r_gdt.rd_limit = NGDT * sizeof(gdt0[0]) - 1;
+ r_gdt.rd_base = (int)gdt0;
mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
lgdt(&r_gdt);
pcpu_init(pc, 0, sizeof(struct pcpu));
for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
- pmap_kenter(pa + KERNBASE, pa);
- dpcpu_init((void *)(first + KERNBASE), 0);
+ pmap_kenter(pa, pa);
+ dpcpu_init((void *)first, 0);
first += DPCPU_SIZE;
PCPU_SET(prvspace, pc);
PCPU_SET(curthread, &thread0);
@@ -2210,67 +2379,7 @@
mutex_init();
mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
- /* make ldt memory segments */
- ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
- ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
- for (x = 0; x < nitems(ldt_segs); x++)
- ssdtosd(&ldt_segs[x], &ldt[x].sd);
-
- _default_ldt = GSEL(GLDT_SEL, SEL_KPL);
- lldt(_default_ldt);
- PCPU_SET(currentldt, _default_ldt);
-
- /* exceptions */
- for (x = 0; x < NIDT; x++)
- setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL
- , GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
- setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
- GSEL(GCODE_SEL, SEL_KPL));
-#ifdef KDTRACE_HOOKS
- setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
- GSEL(GCODE_SEL, SEL_KPL));
-#endif
-#ifdef XENHVM
- setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
-#endif
+ i386_setidt1();
r_idt.rd_limit = sizeof(idt0) - 1;
r_idt.rd_base = (int) idt;
@@ -2283,41 +2392,21 @@
clock_init();
finishidentcpu(); /* Final stage of CPU initialization */
- setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
+ i386_setidt2();
initializecpu(); /* Initialize CPU registers */
initializecpucache();
/* pointer to selector slot for %fs/%gs */
PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
- dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
- dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
- dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
- dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
-#if defined(PAE) || defined(PAE_TABLES)
- dblfault_tss.tss_cr3 = (int)IdlePDPT;
-#else
- dblfault_tss.tss_cr3 = (int)IdlePTD;
-#endif
- dblfault_tss.tss_eip = (int)dblfault_handler;
- dblfault_tss.tss_eflags = PSL_KERNEL;
- dblfault_tss.tss_ds = dblfault_tss.tss_es =
- dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
- dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
- dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
- dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
-
/* Initialize the tss (except for the final esp0) early for vm86. */
- PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
- thread0.td_kstack_pages * PAGE_SIZE - 16);
- PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
+ common_tss0.tss_esp0 = thread0.td_kstack + thread0.td_kstack_pages *
+ PAGE_SIZE - VM86_STACK_SPACE;
+ common_tss0.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
+ common_tss0.tss_ioopt = sizeof(struct i386tss) << 16;
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
- PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
ltr(gsel_tss);
/* Initialize the PIC early for vm86 calls. */
@@ -2333,10 +2422,7 @@
* Point the ICU spurious interrupt vectors at the APIC spurious
* interrupt handler.
*/
- setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
- GSEL(GCODE_SEL, SEL_KPL));
+ i386_setidt3();
#endif
#endif
@@ -2386,22 +2472,11 @@
PCPU_SET(curpcb, thread0.td_pcb);
/* Move esp0 in the tss to its final place. */
/* Note: -16 is so we can grow the trapframe if we came from vm86 */
- PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16);
+ common_tss0.tss_esp0 = (vm_offset_t)thread0.td_pcb - VM86_STACK_SPACE;
+ PCPU_SET(kesp0, common_tss0.tss_esp0);
gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */
ltr(gsel_tss);
- /* make a call gate to reenter kernel with */
- gdp = &ldt[LSYS5CALLS_SEL].gd;
-
- x = (int) &IDTVEC(lcall_syscall);
- gdp->gd_looffset = x;
- gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
- gdp->gd_stkcpy = 1;
- gdp->gd_type = SDT_SYS386CGT;
- gdp->gd_dpl = SEL_UPL;
- gdp->gd_p = 1;
- gdp->gd_hioffset = x >> 16;
-
/* transfer to user mode */
_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
@@ -2427,6 +2502,133 @@
return ((register_t)thread0.td_pcb);
}
+extern u_int tramp_idleptd;
+
+static void
+machdep_init_trampoline(void)
+{
+ struct region_descriptor r_gdt, r_idt;
+ struct i386tss *tss;
+ char *copyout_buf, *trampoline, *tramp_stack_base;
+ u_int *tramp_idleptd_reloced;
+ int x;
+
+ gdt = pmap_trm_alloc(sizeof(union descriptor) * NGDT * mp_ncpus,
+ M_NOWAIT | M_ZERO);
+ bcopy(gdt0, gdt, sizeof(union descriptor) * NGDT);
+ r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
+ r_gdt.rd_base = (int)gdt;
+ lgdt(&r_gdt);
+
+ tss = pmap_trm_alloc(sizeof(struct i386tss) * mp_ncpus,
+ M_NOWAIT | M_ZERO);
+ bcopy(&common_tss0, tss, sizeof(struct i386tss));
+ gdt[GPROC0_SEL].sd.sd_lobase = (int)tss;
+ gdt[GPROC0_SEL].sd.sd_hibase = (u_int)tss >> 24;
+ gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
+ ltr(GSEL(GPROC0_SEL, SEL_KPL));
+
+ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
+ PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
+ PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
+ PCPU_SET(common_tssp, tss);
+
+ trampoline = pmap_trm_alloc(end_exceptions - start_exceptions,
+ M_NOWAIT);
+ bcopy(start_exceptions, trampoline, end_exceptions - start_exceptions);
+ tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT);
+ PCPU_SET(trampstk, (uintptr_t)tramp_stack_base + TRAMP_STACK_SZ -
+ VM86_STACK_SPACE);
+ tss[0].tss_esp0 = PCPU_GET(trampstk);
+
+ idt = pmap_trm_alloc(sizeof(idt0), M_NOWAIT | M_ZERO);
+ bcopy(idt0, idt, sizeof(idt0));
+
+ /* Re-initialize new IDT since the handlers were relocated */
+ setidt_disp = trampoline - start_exceptions;
+ fixup_idt();
+
+ tramp_idleptd_reloced = (u_int *)((uintptr_t)&tramp_idleptd +
+ setidt_disp);
+#if defined(PAE) || defined(PAE_TABLES)
+ *tramp_idleptd_reloced = (u_int)IdlePDPT;
+#else
+ *tramp_idleptd_reloced = (u_int)IdlePTD;
+#endif
+
+ r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1;
+ r_idt.rd_base = (int)idt;
+ lidt(&r_idt);
+
+ /* dblfault TSS */
+ dblfault_tss = pmap_trm_alloc(sizeof(struct i386tss), M_NOWAIT | M_ZERO);
+ dblfault_stack = pmap_trm_alloc(PAGE_SIZE, M_NOWAIT);
+ dblfault_tss->tss_esp = dblfault_tss->tss_esp0 =
+ dblfault_tss->tss_esp1 = dblfault_tss->tss_esp2 =
+ (int)dblfault_stack + PAGE_SIZE;
+ dblfault_tss->tss_ss = dblfault_tss->tss_ss0 = dblfault_tss->tss_ss1 =
+ dblfault_tss->tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
+#if defined(PAE) || defined(PAE_TABLES)
+ dblfault_tss->tss_cr3 = (int)IdlePDPT;
+#else
+ dblfault_tss->tss_cr3 = (int)IdlePTD;
+#endif
+ dblfault_tss->tss_eip = (int)dblfault_handler;
+ dblfault_tss->tss_eflags = PSL_KERNEL;
+ dblfault_tss->tss_ds = dblfault_tss->tss_es =
+ dblfault_tss->tss_gs = GSEL(GDATA_SEL, SEL_KPL);
+ dblfault_tss->tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
+ dblfault_tss->tss_cs = GSEL(GCODE_SEL, SEL_KPL);
+ dblfault_tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
+ gdt[GPANIC_SEL].sd.sd_lobase = (int)dblfault_tss;
+ gdt[GPANIC_SEL].sd.sd_hibase = (u_int)dblfault_tss >> 24;
+
+ /* make ldt memory segments */
+ ldt = pmap_trm_alloc(sizeof(union descriptor) * NLDT,
+ M_NOWAIT | M_ZERO);
+ gdt[GLDT_SEL].sd.sd_lobase = (int)ldt;
+ gdt[GLDT_SEL].sd.sd_hibase = (u_int)ldt >> 24;
+ ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
+ ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
+ for (x = 0; x < nitems(ldt_segs); x++)
+ ssdtosd(&ldt_segs[x], &ldt[x].sd);
+
+ _default_ldt = GSEL(GLDT_SEL, SEL_KPL);
+ lldt(_default_ldt);
+ PCPU_SET(currentldt, _default_ldt);
+
+ copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT);
+ PCPU_SET(copyout_buf, copyout_buf);
+ copyout_init_tramp();
+}
+SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_SECOND, machdep_init_trampoline, NULL);
+
+#ifdef COMPAT_43
+static void
+i386_setup_lcall_gate(void)
+{
+ struct sysentvec *sv;
+ struct user_segment_descriptor desc;
+ u_int lcall_addr;
+
+ sv = &elf32_freebsd_sysvec;
+ lcall_addr = (uintptr_t)sv->sv_psstrings - sz_lcall_tramp;
+
+ bzero(&desc, sizeof(desc));
+ desc.sd_type = SDT_MEMERA;
+ desc.sd_dpl = SEL_UPL;
+ desc.sd_p = 1;
+ desc.sd_def32 = 1;
+ desc.sd_gran = 1;
+ desc.sd_lolimit = 0xffff;
+ desc.sd_hilimit = 0xf;
+ desc.sd_lobase = lcall_addr;
+ desc.sd_hibase = lcall_addr >> 24;
+ bcopy(&desc, &ldt[LSYS5CALLS_SEL], sizeof(desc));
+}
+SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_ANY, i386_setup_lcall_gate, NULL);
+#endif
+
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
@@ -2507,6 +2709,7 @@
static void
f00f_hack(void *unused)
{
+ struct region_descriptor r_idt;
struct gate_descriptor *new_idt;
vm_offset_t tmp;
@@ -2517,16 +2720,19 @@
printf("Intel Pentium detected, installing workaround for F00F bug\n");
- tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO);
+ tmp = (vm_offset_t)pmap_trm_alloc(PAGE_SIZE * 3, M_NOWAIT | M_ZERO);
if (tmp == 0)
panic("kmem_malloc returned 0");
+ tmp = round_page(tmp);
/* Put the problematic entry (#6) at the end of the lower page. */
- new_idt = (struct gate_descriptor*)
+ new_idt = (struct gate_descriptor *)
(tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
bcopy(idt, new_idt, sizeof(idt0));
r_idt.rd_base = (u_int)new_idt;
+ r_idt.rd_limit = sizeof(idt0) - 1;
lidt(&r_idt);
+ /* SMP machines do not need the F00F hack. */
idt = new_idt;
pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ);
}
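
The machdep.c hunks above split IDT setup into setidt()/setidt_nodisp() and relocate the exception text into the trampoline region, recording the shift in setidt_disp. A minimal sketch of that displacement arithmetic follows, assuming the kernel headers touched by this patch; the helper names are illustrative and not part of the change:

/* Illustrative only; setidt_disp and PMAP_TRM_MIN_ADDRESS are the
 * symbols introduced or used by the hunks above. */
extern uintptr_t setidt_disp;	/* trampoline copy - start_exceptions */

/* Offset programmed into an IDT gate for a handler linked at "func". */
static uintptr_t
idt_gate_offset(uintptr_t func)
{
	return (func != 0 ? func + setidt_disp : 0);
}

/* Recover the linked (symbol) address from a gate offset, as ddb does. */
static uintptr_t
idt_gate_symbol(uintptr_t off)
{
	return (off >= PMAP_TRM_MIN_ADDRESS ? off - setidt_disp : off);
}

fixup_idt() applies the same addition in place to gates that were filled in before the trampoline copy existed, and db_show_idt() subtracts the displacement so ddb prints the linked symbol rather than the trampoline address.
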
Index: sys/i386/i386/mem.c
===================================================================
--- sys/i386/i386/mem.c
+++ sys/i386/i386/mem.c
@@ -92,9 +92,6 @@
return EIO;
if (dev2unit(dev) == CDEV_MINOR_KMEM && uio->uio_resid > 0) {
- if (uio->uio_offset < (vm_offset_t)VADDR(PTDPTDI, 0))
- return (EFAULT);
-
if (!kernacc((caddr_t)(int)uio->uio_offset, uio->uio_resid,
uio->uio_rw == UIO_READ ? VM_PROT_READ : VM_PROT_WRITE))
return (EFAULT);
Index: sys/i386/i386/minidump_machdep.c
===================================================================
--- sys/i386/i386/minidump_machdep.c
+++ sys/i386/i386/minidump_machdep.c
@@ -190,7 +190,7 @@
* page written corresponds to 2MB of space
*/
ptesize += PAGE_SIZE;
- pd = (pd_entry_t *)((uintptr_t)IdlePTD + KERNBASE); /* always mapped! */
+ pd = IdlePTD; /* always mapped! */
j = va >> PDRSHIFT;
if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
/* This is an entire 2M page. */
@@ -281,7 +281,7 @@
/* Dump kernel page table pages */
for (va = KERNBASE; va < kernel_vm_end; va += NBPDR) {
/* We always write a page, even if it is zero */
- pd = (pd_entry_t *)((uintptr_t)IdlePTD + KERNBASE); /* always mapped! */
+ pd = IdlePTD; /* always mapped! */
j = va >> PDRSHIFT;
if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
/* This is a single 2M block. Generate a fake PTP */
Index: sys/i386/i386/mp_machdep.c
===================================================================
--- sys/i386/i386/mp_machdep.c
+++ sys/i386/i386/mp_machdep.c
@@ -83,8 +83,8 @@
#include <machine/cpu.h>
#define WARMBOOT_TARGET 0
-#define WARMBOOT_OFF (KERNBASE + 0x0467)
-#define WARMBOOT_SEG (KERNBASE + 0x0469)
+#define WARMBOOT_OFF (PMAP_MAP_LOW + 0x0467)
+#define WARMBOOT_SEG (PMAP_MAP_LOW + 0x0469)
#define CMOS_REG (0x70)
#define CMOS_DATA (0x71)
@@ -139,6 +139,8 @@
static int start_all_aps(void);
static int start_ap(int apic_id);
+static char *ap_copyout_buf;
+static char *ap_tramp_stack_base;
/*
* Initialize the IPI handlers and start up the AP's.
*/
@@ -207,10 +209,10 @@
init_secondary(void)
{
struct pcpu *pc;
- vm_offset_t addr;
- int gsel_tss;
- int x, myid;
- u_int cr0;
+ struct i386tss *common_tssp;
+ struct region_descriptor r_gdt, r_idt;
+ int gsel_tss, myid, x;
+ u_int cr0;
/* bootAP is set in start_ap() to our ID. */
myid = bootAP;
@@ -224,11 +226,13 @@
pc->pc_apic_id = cpu_apic_ids[myid];
pc->pc_prvspace = pc;
pc->pc_curthread = 0;
+ pc->pc_common_tssp = common_tssp = &(__pcpu[0].pc_common_tssp)[myid];
fix_cpuid();
- gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
- gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
+ gdt_segs[GPRIV_SEL].ssd_base = (int)pc;
+ gdt_segs[GPROC0_SEL].ssd_base = (int)common_tssp;
+ gdt_segs[GLDT_SEL].ssd_base = (int)ldt;
for (x = 0; x < NGDT; x++) {
ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
@@ -238,21 +242,27 @@
r_gdt.rd_base = (int) &gdt[myid * NGDT];
lgdt(&r_gdt); /* does magic intra-segment return */
+ r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1;
+ r_idt.rd_base = (int)idt;
lidt(&r_idt);
lldt(_default_ldt);
PCPU_SET(currentldt, _default_ldt);
+ PCPU_SET(trampstk, (uintptr_t)ap_tramp_stack_base + TRAMP_STACK_SZ -
+ VM86_STACK_SPACE);
+
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
- PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
- PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
- PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
+ common_tssp->tss_esp0 = PCPU_GET(trampstk);
+ common_tssp->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
+ common_tssp->tss_ioopt = sizeof(struct i386tss) << 16;
PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
ltr(gsel_tss);
PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd);
+ PCPU_SET(copyout_buf, ap_copyout_buf);
/*
* Set to a known state:
@@ -274,8 +284,6 @@
/* BSP may have changed PTD while we were waiting */
invltlb();
- for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
- invlpg(addr);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
lidt(&r_idt);
@@ -287,17 +295,20 @@
/*
* start each AP in our list
*/
-/* Lowest 1MB is already mapped: don't touch*/
#define TMPMAP_START 1
static int
start_all_aps(void)
{
u_char mpbiosreason;
u_int32_t mpbioswarmvec;
- int apic_id, cpu, i;
+ int apic_id, cpu;
mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
+ /* Remap lowest 1MB */
+ IdlePTD[0] = IdlePTD[1];
+ load_cr3(rcr3()); /* invalidate TLB */
+
/* install the AP 1st level boot code */
install_ap_tramp();
@@ -306,11 +317,7 @@
outb(CMOS_REG, BIOS_RESET);
mpbiosreason = inb(CMOS_DATA);
- /* set up temporary P==V mapping for AP boot */
- /* XXX this is a hack, we should boot the AP on its own stack/PTD */
- for (i = TMPMAP_START; i < NKPT; i++)
- PTD[i] = PTD[KPTDI + i];
- invltlb();
+ /* take advantage of the P==V mapping for PTD[0] for AP boot */
/* start each AP */
for (cpu = 1; cpu < mp_ncpus; cpu++) {
@@ -332,6 +339,9 @@
PAGE_SIZE - 4;
bootAP = cpu;
+ ap_tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT);
+ ap_copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT);
+
/* attempt to start the Application Processor */
CHECK_INIT(99); /* setup checkpoints */
if (!start_ap(apic_id)) {
@@ -347,17 +357,16 @@
CPU_SET(cpu, &all_cpus); /* record AP in CPU map */
}
+ /* Unmap lowest 1MB again */
+ IdlePTD[0] = 0;
+ load_cr3(rcr3());
+
/* restore the warmstart vector */
*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
outb(CMOS_REG, BIOS_RESET);
outb(CMOS_DATA, mpbiosreason);
- /* Undo V==P hack from above */
- for (i = TMPMAP_START; i < NKPT; i++)
- PTD[i] = 0;
- pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);
-
/* number of APs actually started */
return mp_naps;
}
@@ -379,7 +388,7 @@
{
int x;
int size = *(int *) ((u_long) & bootMP_size);
- vm_offset_t va = boot_address + KERNBASE;
+ vm_offset_t va = boot_address;
u_char *src = (u_char *) ((u_long) bootMP);
u_char *dst = (u_char *) va;
u_int boot_base = (u_int) bootMP;
@@ -409,7 +418,7 @@
/* modify the ljmp target for MPentry() */
dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
- *dst32 = ((u_int) MPentry - KERNBASE);
+ *dst32 = (u_int)MPentry;
/* modify the target for boot code segment */
dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
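
The start_all_aps() change above replaces the old per-PDE copy loop with a single transient identity window: pmap_cold() maps physical 0-4MB at both VA 0 and VA NBPDR, so aliasing PDE 0 to PDE 1's page table makes the sub-1MB real-mode AP trampoline reachable at its physical address, and clearing PDE 0 again once the APs are up restores null-pointer trapping. A rough sketch, assuming IdlePTD and load_cr3()/rcr3() as used by this patch:

/* Sketch only: open or close the low-memory identity window used
 * while the application processors run their real-mode trampoline. */
static void
ap_boot_low_window(bool enable)
{
	IdlePTD[0] = enable ? IdlePTD[1] : 0;
	load_cr3(rcr3());	/* flush stale low-memory TLB entries */
}
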
Index: sys/i386/i386/mpboot.s
===================================================================
--- sys/i386/i386/mpboot.s
+++ sys/i386/i386/mpboot.s
@@ -37,8 +37,6 @@
#include "assym.inc"
-#define R(x) ((x)-KERNBASE)
-
/*
* this code MUST be enabled here and in mp_machdep.c
* it follows the very early stages of AP boot by placing values in CMOS ram.
@@ -80,18 +78,14 @@
movl $1,%eax
cpuid /* Retrieve features */
movl %cr4,%eax
-#ifndef DISABLE_PSE
testl $CPUID_PSE,%edx
jz 1f
orl $CR4_PSE,%eax /* Enable PSE */
1:
-#endif
-#ifndef DISABLE_PG_G
testl $CPUID_PGE,%edx
jz 1f
orl $CR4_PGE,%eax /* Enable PGE */
1:
-#endif
testl $CPUID_VME,%edx
jz 1f
orl $CR4_VME,%eax /* Enable VME */
@@ -100,13 +94,13 @@
/* Now enable paging mode */
#if defined(PAE) || defined(PAE_TABLES)
- movl R(IdlePDPT), %eax
+ movl IdlePDPT, %eax
movl %eax, %cr3
movl %cr4, %eax
orl $CR4_PAE, %eax
movl %eax, %cr4
#else
- movl R(IdlePTD), %eax
+ movl IdlePTD, %eax
movl %eax,%cr3
#endif
movl %cr0,%eax
Index: sys/i386/i386/pmap.c
===================================================================
--- sys/i386/i386/pmap.c
+++ sys/i386/i386/pmap.c
@@ -47,6 +47,8 @@
/*-
* Copyright (c) 2003 Networks Associates Technology, Inc.
* All rights reserved.
+ * Copyright (c) 2018 The FreeBSD Foundation
+ * All rights reserved.
*
* This software was developed for the FreeBSD Project by Jake Burkholder,
* Safeport Network Services, and Network Associates Laboratories, the
@@ -54,6 +56,10 @@
* DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
* CHATS research program.
*
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -121,6 +127,7 @@
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
+#include <sys/vmem.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -141,6 +148,7 @@
#include <machine/intr_machdep.h>
#include <x86/apicvar.h>
#endif
+#include <machine/bootinfo.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
@@ -190,9 +198,6 @@
#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
struct pmap kernel_pmap_store;
-LIST_HEAD(pmaplist, pmap);
-static struct pmaplist allpmaps;
-static struct mtx allpmaps_lock;
vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
@@ -200,9 +205,7 @@
int pseflag = 0; /* PG_PS or-in */
static int nkpt = NKPT;
-vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
-extern u_int32_t KERNend;
-extern u_int32_t KPTphys;
+vm_offset_t kernel_vm_end = /* 0 + */ NKPT * NBPDR;
#if defined(PAE) || defined(PAE_TABLES)
pt_entry_t pg_nx;
@@ -343,29 +346,213 @@
static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain,
uint8_t *flags, int wait);
#endif
-static void pmap_set_pg(void);
+static void pmap_init_trm(void);
static __inline void pagezero(void *page);
CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
+void pmap_cold(void);
+extern char _end[];
+u_long physfree; /* phys addr of next free page */
+u_long vm86phystk; /* PA of vm86/bios stack */
+u_long vm86paddr; /* address of vm86 region */
+int vm86pa; /* phys addr of vm86 region */
+u_long KERNend; /* phys addr end of kernel (just after bss) */
+pd_entry_t *IdlePTD; /* phys addr of kernel PTD */
+#if defined(PAE) || defined(PAE_TABLES)
+pdpt_entry_t *IdlePDPT; /* phys addr of kernel PDPT */
+#endif
+pt_entry_t *KPTmap; /* address of kernel page tables */
+u_long KPTphys; /* phys addr of kernel page tables */
+
+static u_long
+allocpages(u_int cnt, u_long *physfree)
+{
+ u_long res;
+
+ res = *physfree;
+ *physfree += PAGE_SIZE * cnt;
+ bzero((void *)res, PAGE_SIZE * cnt);
+ return (res);
+}
+
+static void
+pmap_cold_map(u_long pa, u_long va, u_long cnt)
+{
+ pt_entry_t *pt;
+
+ for (pt = (pt_entry_t *)KPTphys + atop(va); cnt > 0;
+ cnt--, pt++, va += PAGE_SIZE, pa += PAGE_SIZE)
+ *pt = pa | PG_V | PG_RW | PG_A | PG_M;
+}
+
+static void
+pmap_cold_mapident(u_long pa, u_long cnt)
+{
+
+ pmap_cold_map(pa, pa, cnt);
+}
+
+_Static_assert(2 * NBPDR == KERNBASE, "Broken double-map of zero PTD");
+
/*
- * If you get an error here, then you set KVA_PAGES wrong! See the
- * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
- * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE.
+ * Called from locore.s before paging is enabled. Sets up the first
+ * kernel page table. Since kernel is mapped with PA == VA, this code
+ * does not require relocations.
*/
-CTASSERT(KERNBASE % (1 << 24) == 0);
+void
+pmap_cold(void)
+{
+ pt_entry_t *pt;
+ u_long a;
+ u_int cr3, ncr4;
+
+ physfree = (u_long)&_end;
+ if (bootinfo.bi_esymtab != 0)
+ physfree = bootinfo.bi_esymtab;
+ if (bootinfo.bi_kernend != 0)
+ physfree = bootinfo.bi_kernend;
+ physfree = roundup2(physfree, NBPDR);
+ KERNend = physfree;
+
+ /* Allocate Kernel Page Tables */
+ KPTphys = allocpages(NKPT, &physfree);
+ KPTmap = (pt_entry_t *)KPTphys;
+
+ /* Allocate Page Table Directory */
+#if defined(PAE) || defined(PAE_TABLES)
+ /* XXX only need 32 bytes (easier for now) */
+ IdlePDPT = (pdpt_entry_t *)allocpages(1, &physfree);
+#endif
+ IdlePTD = (pd_entry_t *)allocpages(NPGPTD, &physfree);
+
+ /*
+ * Allocate KSTACK. Leave a guard page between IdlePTD and
+ * proc0kstack, to control stack overflow for thread0 and
+ * prevent corruption of the page table. We leak the guard
+ * physical memory due to 1:1 mappings.
+ */
+ allocpages(1, &physfree);
+ proc0kstack = allocpages(TD0_KSTACK_PAGES, &physfree);
+
+ /* vm86/bios stack */
+ vm86phystk = allocpages(1, &physfree);
+
+ /* pgtable + ext + IOPAGES */
+ vm86paddr = vm86pa = allocpages(3, &physfree);
+
+ /* Install page tables into PTD. Page table page 1 is wasted. */
+ for (a = 0; a < NKPT; a++)
+ IdlePTD[a] = (KPTphys + ptoa(a)) | PG_V | PG_RW | PG_A | PG_M;
+
+#if defined(PAE) || defined(PAE_TABLES)
+ /* PAE install PTD pointers into PDPT */
+ for (a = 0; a < NPGPTD; a++)
+ IdlePDPT[a] = ((u_int)IdlePTD + ptoa(a)) | PG_V;
+#endif
+
+ /*
+ * Install recursive mapping for kernel page tables into
+ * itself.
+ */
+ for (a = 0; a < NPGPTD; a++)
+ IdlePTD[PTDPTDI + a] = ((u_int)IdlePTD + ptoa(a)) | PG_V |
+ PG_RW;
+
+ /*
+ * Initialize page table pages mapping physical address zero
+ * through the (physical) end of the kernel. Many of these
+ * pages must be reserved, and we reserve them all and map
+ * them linearly for convenience. We do this even if we've
+ * enabled PSE above; we'll just switch the corresponding
+ * kernel PDEs before we turn on paging.
+ *
+ * This and all other page table entries allow read and write
+ * access for various reasons. Kernel mappings never have any
+ * access restrictions.
+ */
+ pmap_cold_mapident(0, atop(NBPDR));
+ pmap_cold_map(0, NBPDR, atop(NBPDR));
+ pmap_cold_mapident(KERNBASE, atop(KERNend - KERNBASE));
+
+ /* Map page table directory */
+#if defined(PAE) || defined(PAE_TABLES)
+ pmap_cold_mapident((u_long)IdlePDPT, 1);
+#endif
+ pmap_cold_mapident((u_long)IdlePTD, NPGPTD);
+
+ /* Map early KPTmap. It is really pmap_cold_mapident. */
+ pmap_cold_map(KPTphys, (u_long)KPTmap, NKPT);
+
+ /* Map proc0kstack */
+ pmap_cold_mapident(proc0kstack, TD0_KSTACK_PAGES);
+ /* ISA hole already mapped */
+
+ pmap_cold_mapident(vm86phystk, 1);
+ pmap_cold_mapident(vm86pa, 3);
+
+ /* Map page 0 into the vm86 page table */
+ *(pt_entry_t *)vm86pa = 0 | PG_RW | PG_U | PG_A | PG_M | PG_V;
+
+ /* ...likewise for the ISA hole for vm86 */
+ for (pt = (pt_entry_t *)vm86pa + atop(ISA_HOLE_START), a = 0;
+ a < atop(ISA_HOLE_LENGTH); a++, pt++)
+ *pt = (ISA_HOLE_START + ptoa(a)) | PG_RW | PG_U | PG_A |
+ PG_M | PG_V;
+
+ /* Enable PSE, PGE, VME, and PAE if configured. */
+ ncr4 = 0;
+ if ((cpu_feature & CPUID_PSE) != 0) {
+ ncr4 |= CR4_PSE;
+ /*
+ * Superpage mapping of the kernel text. Existing 4k
+ * page table pages are wasted.
+ */
+ for (a = KERNBASE; a < KERNend; a += NBPDR)
+ IdlePTD[a >> PDRSHIFT] = a | PG_PS | PG_A | PG_M |
+ PG_RW | PG_V;
+ }
+ if ((cpu_feature & CPUID_PGE) != 0) {
+ ncr4 |= CR4_PGE;
+ pgeflag = PG_G;
+ }
+ ncr4 |= (cpu_feature & CPUID_VME) != 0 ? CR4_VME : 0;
+#if defined(PAE) || defined(PAE_TABLES)
+ ncr4 |= CR4_PAE;
+#endif
+ if (ncr4 != 0)
+ load_cr4(rcr4() | ncr4);
+
+ /* Now enable paging */
+#if defined(PAE) || defined(PAE_TABLES)
+ cr3 = (u_int)IdlePDPT;
+#else
+ cr3 = (u_int)IdlePTD;
+#endif
+ load_cr3(cr3);
+ load_cr0(rcr0() | CR0_PG);
+
+ /*
+ * Now running relocated at KERNBASE where the system is
+ * linked to run.
+ */
+
+ /*
+ * Remove the lowest part of the double mapping of low memory
+ * to get some null pointer checks.
+ */
+ IdlePTD[0] = 0;
+ load_cr3(cr3); /* invalidate TLB */
+}
/*
* Bootstrap the system enough to run with virtual memory.
*
* On the i386 this is called after mapping has already been enabled
+ * in locore.s with the page table created in pmap_cold(),
* and just syncs the pmap module with what has already been done.
- * [We can't call it easily with mapping off since the kernel is not
- * mapped with PA == VA, hence we would have to relocate every address
- * from the linked base (virtual) address "KERNBASE" to the actual
- * (physical) address starting relative to 0]
*/
void
pmap_bootstrap(vm_paddr_t firstaddr)
@@ -391,7 +578,7 @@
* page that it allocated. Preferably, locore would provide a first
* unused virtual address in addition to "firstaddr".
*/
- virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
+ virtual_avail = (vm_offset_t)firstaddr;
virtual_end = VM_MAX_KERNEL_ADDRESS;
@@ -399,9 +586,9 @@
* Initialize the kernel pmap (which is statically allocated).
*/
PMAP_LOCK_INIT(kernel_pmap);
- kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
+ kernel_pmap->pm_pdir = IdlePTD;
#if defined(PAE) || defined(PAE_TABLES)
- kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
+ kernel_pmap->pm_pdpt = IdlePDPT;
#endif
CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
TAILQ_INIT(&kernel_pmap->pm_pvchunk);
@@ -411,19 +598,6 @@
*/
rw_init(&pvh_global_lock, "pmap pv global");
- LIST_INIT(&allpmaps);
-
- /*
- * Request a spin mutex so that changes to allpmaps cannot be
- * preempted by smp_rendezvous_cpus(). Otherwise,
- * pmap_update_pde_kernel() could access allpmaps while it is
- * being changed.
- */
- mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
- mtx_lock_spin(&allpmaps_lock);
- LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
- mtx_unlock_spin(&allpmaps_lock);
-
/*
* Reserve some special page table entries/VA space for temporary
* mapping of pages.
@@ -474,14 +648,7 @@
SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
for (i = 0; i < NKPT; i++)
- KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
-
- /*
- * Adjust the start of the KPTD and KPTmap so that the implementation
- * of pmap_kextract() and pmap_growkernel() can be made simpler.
- */
- KPTD -= KPTDI;
- KPTmap -= i386_btop(KPTDI << PDRSHIFT);
+ KPTD[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V;
/*
* PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
@@ -494,18 +661,6 @@
virtual_avail = va;
- /*
- * Finish removing the identity mapping (virt == phys) of low memory.
- * It was only used for 2 instructions in locore. locore then
- * unmapped the first PTD to get some null pointer checks. ACPI
- * wakeup will map the first PTD transiently to use it for 1
- * instruction. The double mapping for low memory is not usable in
- * normal operation since it breaks trapping of null pointers and
- * causes inconsistencies in page tables when combined with PG_G.
- */
- for (i = 1; i < NKPT; i++)
- PTD[i] = 0;
-
/*
* Initialize the PAT MSR if present.
* pmap_init_pat() clears and sets CR4_PGE, which, as a
@@ -515,9 +670,6 @@
* comes with PAT. Both features were added for Pentium Pro.
*/
pmap_init_pat();
-
- /* Turn on PG_G on kernel page(s) */
- pmap_set_pg();
}
static void
@@ -529,21 +681,32 @@
CPU_FOREACH(i) {
pc = pcpu_find(i);
+ mtx_init(&pc->pc_copyout_mlock, "cpmlk", NULL, MTX_DEF |
+ MTX_NEW);
+ pc->pc_copyout_maddr = kva_alloc(ptoa(2));
+ if (pc->pc_copyout_maddr == 0)
+ panic("unable to allocate non-sleepable copyout KVA");
+ sx_init(&pc->pc_copyout_slock, "cpslk");
+ pc->pc_copyout_saddr = kva_alloc(ptoa(2));
+ if (pc->pc_copyout_saddr == 0)
+ panic("unable to allocate sleepable copyout KVA");
+
/*
- * Skip if the mapping has already been initialized,
+ * Skip if the mappings have already been initialized,
* i.e. this is the BSP.
*/
if (pc->pc_cmap_addr1 != 0)
continue;
+
mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
pages = kva_alloc(PAGE_SIZE * 3);
if (pages == 0)
- panic("%s: unable to allocate KVA", __func__);
+ panic("unable to allocate CMAP KVA");
pc->pc_cmap_pte1 = vtopte(pages);
pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE);
pc->pc_cmap_addr1 = (caddr_t)pages;
pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE);
- pc->pc_qmap_addr = pages + (PAGE_SIZE * 2);
+ pc->pc_qmap_addr = pages + ptoa(2);
}
}
@@ -653,39 +816,6 @@
load_cr4(cr4);
}
-/*
- * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on.
- */
-static void
-pmap_set_pg(void)
-{
- pt_entry_t *pte;
- vm_offset_t va, endva;
-
- if (pgeflag == 0)
- return;
-
- endva = KERNBASE + KERNend;
-
- if (pseflag) {
- va = KERNBASE + roundup2(KERNLOAD, NBPDR);
- while (va < endva) {
- pdir_pde(PTD, va) |= pgeflag;
- invltlb(); /* Flush non-PG_G entries. */
- va += NBPDR;
- }
- } else {
- va = (vm_offset_t)btext;
- while (va < endva) {
- pte = vtopte(va);
- if (*pte)
- *pte |= pgeflag;
- invltlb(); /* Flush non-PG_G entries. */
- va += PAGE_SIZE;
- }
- }
-}
-
/*
* Initialize a vm_page's machine-dependent fields.
*/
@@ -783,12 +913,12 @@
* page table pages.
*/
for (i = 0; i < NKPT; i++) {
- mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
+ mpte = PHYS_TO_VM_PAGE(KPTphys + ptoa(i));
KASSERT(mpte >= vm_page_array &&
mpte < &vm_page_array[vm_page_array_size],
("pmap_init: page table page is out of range"));
mpte->pindex = i + KPTDI;
- mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
+ mpte->phys_addr = KPTphys + ptoa(i);
}
/*
@@ -859,6 +989,8 @@
#endif
pmap_initialized = 1;
+ pmap_init_trm();
+
if (!bootverbose)
return;
for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
@@ -868,6 +1000,7 @@
printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
(uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
}
+
}
@@ -935,21 +1068,9 @@
pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
{
pd_entry_t *pde;
- pmap_t pmap;
- boolean_t PTD_updated;
-
- PTD_updated = FALSE;
- mtx_lock_spin(&allpmaps_lock);
- LIST_FOREACH(pmap, &allpmaps, pm_list) {
- if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
- PG_FRAME))
- PTD_updated = TRUE;
- pde = pmap_pde(pmap, va);
- pde_store(pde, newpde);
- }
- mtx_unlock_spin(&allpmaps_lock);
- KASSERT(PTD_updated,
- ("pmap_kenter_pde: current page table is not in allpmaps"));
+
+ pde = pmap_pde(kernel_pmap, va);
+ pde_store(pde, newpde);
}
/*
@@ -962,47 +1083,23 @@
static void
pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
{
- u_long cr4;
if ((newpde & PG_PS) == 0)
/* Demotion: flush a specific 2MB page mapping. */
invlpg(va);
- else if ((newpde & PG_G) == 0)
+ else /* if ((newpde & PG_G) == 0) */
/*
* Promotion: flush every 4KB page mapping from the TLB
* because there are too many to flush individually.
*/
invltlb();
- else {
- /*
- * Promotion: flush every 4KB page mapping from the TLB,
- * including any global (PG_G) mappings.
- */
- cr4 = rcr4();
- load_cr4(cr4 & ~CR4_PGE);
- /*
- * Although preemption at this point could be detrimental to
- * performance, it would not lead to an error. PG_G is simply
- * ignored if CR4.PGE is clear. Moreover, in case this block
- * is re-entered, the load_cr4() either above or below will
- * modify CR4.PGE flushing the TLB.
- */
- load_cr4(cr4 | CR4_PGE);
- }
}
void
invltlb_glob(void)
{
- uint64_t cr4;
- if (pgeflag == 0) {
- invltlb();
- } else {
- cr4 = rcr4();
- load_cr4(cr4 & ~CR4_PGE);
- load_cr4(cr4 | CR4_PGE);
- }
+ invltlb();
}
@@ -1033,15 +1130,15 @@
u_int cpuid;
sched_pin();
- if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+ if (pmap == kernel_pmap) {
invlpg(va);
mask = &all_cpus;
+ } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
+ mask = &all_cpus;
} else {
cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
- if (CPU_ISSET(cpuid, &pmap->pm_active))
- invlpg(va);
CPU_AND(&other_cpus, &pmap->pm_active);
mask = &other_cpus;
}
@@ -1065,17 +1162,16 @@
}
sched_pin();
- if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+ if (pmap == kernel_pmap) {
for (addr = sva; addr < eva; addr += PAGE_SIZE)
invlpg(addr);
mask = &all_cpus;
+ } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
+ mask = &all_cpus;
} else {
cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
- if (CPU_ISSET(cpuid, &pmap->pm_active))
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
CPU_AND(&other_cpus, &pmap->pm_active);
mask = &other_cpus;
}
@@ -1091,17 +1187,14 @@
sched_pin();
if (pmap == kernel_pmap) {
- invltlb_glob();
+ invltlb();
mask = &all_cpus;
} else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
- invltlb();
mask = &all_cpus;
} else {
cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
- if (CPU_ISSET(cpuid, &pmap->pm_active))
- invltlb();
CPU_AND(&other_cpus, &pmap->pm_active);
mask = &other_cpus;
}
@@ -1132,19 +1225,10 @@
{
struct pde_action *act = arg;
pd_entry_t *pde;
- pmap_t pmap;
if (act->store == PCPU_GET(cpuid)) {
-
- /*
- * Elsewhere, this operation requires allpmaps_lock for
- * synchronization. Here, it does not because it is being
- * performed in the context of an all_cpus rendezvous.
- */
- LIST_FOREACH(pmap, &allpmaps, pm_list) {
- pde = pmap_pde(pmap, act->va);
- pde_store(pde, act->newpde);
- }
+ pde = pmap_pde(kernel_pmap, act->va);
+ pde_store(pde, act->newpde);
}
}
@@ -1219,7 +1303,7 @@
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
- if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ if (pmap == kernel_pmap)
invlpg(va);
}
@@ -1228,7 +1312,7 @@
{
vm_offset_t addr;
- if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ if (pmap == kernel_pmap)
for (addr = sva; addr < eva; addr += PAGE_SIZE)
invlpg(addr);
}
@@ -1238,8 +1322,6 @@
{
if (pmap == kernel_pmap)
- invltlb_glob();
- else if (!CPU_EMPTY(&pmap->pm_active))
invltlb();
}
@@ -1371,8 +1453,7 @@
pmap_is_current(pmap_t pmap)
{
- return (pmap == kernel_pmap || pmap ==
- vmspace_pmap(curthread->td_proc->p_vmspace));
+ return (pmap == kernel_pmap);
}
/*
@@ -1570,7 +1651,7 @@
pt_entry_t *pte;
pte = vtopte(va);
- pte_store(pte, pa | PG_RW | PG_V | pgeflag);
+ pte_store(pte, pa | PG_RW | PG_V);
}
static __inline void
@@ -1579,7 +1660,7 @@
pt_entry_t *pte;
pte = vtopte(va);
- pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
+ pte_store(pte, pa | PG_RW | PG_V | pmap_cache_bits(mode, 0));
}
/*
@@ -1638,7 +1719,7 @@
pseflag) {
KASSERT((va & PDRMASK) == 0,
("pmap_map: misaligned va %#x", va));
- newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
+ newpde = start | PG_PS | PG_RW | PG_V;
pmap_kenter_pde(va, newpde);
va += NBPDR;
start += NBPDR;
@@ -1678,9 +1759,9 @@
if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
oldpte |= *pte;
#if defined(PAE) || defined(PAE_TABLES)
- pte_store(pte, pa | pgeflag | pg_nx | PG_RW | PG_V);
+ pte_store(pte, pa | pg_nx | PG_RW | PG_V);
#else
- pte_store(pte, pa | pgeflag | PG_RW | PG_V);
+ pte_store(pte, pa | PG_RW | PG_V);
#endif
}
pte++;
@@ -1809,7 +1890,7 @@
pd_entry_t ptepde;
vm_page_t mpte;
- if (va >= VM_MAXUSER_ADDRESS)
+ if (pmap == kernel_pmap)
return (0);
ptepde = *pmap_pde(pmap, va);
mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
@@ -1824,14 +1905,9 @@
{
PMAP_LOCK_INIT(pmap);
- /*
- * Since the page table directory is shared with the kernel pmap,
- * which is already included in the list "allpmaps", this pmap does
- * not need to be inserted into that list.
- */
- pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
+ pmap->pm_pdir = IdlePTD;
#if defined(PAE) || defined(PAE_TABLES)
- pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
+ pmap->pm_pdpt = IdlePDPT;
#endif
pmap->pm_root.rt_root = 0;
CPU_ZERO(&pmap->pm_active);
@@ -1847,8 +1923,7 @@
int
pmap_pinit(pmap_t pmap)
{
- vm_page_t m, ptdpg[NPGPTD];
- vm_paddr_t pa;
+ vm_page_t m;
int i;
/*
@@ -1878,32 +1953,25 @@
for (i = 0; i < NPGPTD;) {
m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED | VM_ALLOC_ZERO);
- if (m == NULL)
+ if (m == NULL) {
vm_wait(NULL);
- else
- ptdpg[i++] = m;
+ } else {
+ pmap->pm_ptdpg[i] = m;
+#if defined(PAE) || defined(PAE_TABLES)
+ pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(m) | PG_V;
+#endif
+ i++;
+ }
}
- pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
+ pmap_qenter((vm_offset_t)pmap->pm_pdir, pmap->pm_ptdpg, NPGPTD);
for (i = 0; i < NPGPTD; i++)
- if ((ptdpg[i]->flags & PG_ZERO) == 0)
+ if ((pmap->pm_ptdpg[i]->flags & PG_ZERO) == 0)
pagezero(pmap->pm_pdir + (i * NPDEPG));
- mtx_lock_spin(&allpmaps_lock);
- LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
- /* Copy the kernel page table directory entries. */
- bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
- mtx_unlock_spin(&allpmaps_lock);
-
- /* install self-referential address mapping entry(s) */
- for (i = 0; i < NPGPTD; i++) {
- pa = VM_PAGE_TO_PHYS(ptdpg[i]);
- pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
-#if defined(PAE) || defined(PAE_TABLES)
- pmap->pm_pdpt[i] = pa | PG_V;
-#endif
- }
+ /* Install the trampoline mapping. */
+ pmap->pm_pdir[TRPTDI] = PTD[TRPTDI];
CPU_ZERO(&pmap->pm_active);
TAILQ_INIT(&pmap->pm_pvchunk);
@@ -2016,7 +2084,7 @@
void
pmap_release(pmap_t pmap)
{
- vm_page_t m, ptdpg[NPGPTD];
+ vm_page_t m;
int i;
KASSERT(pmap->pm_stats.resident_count == 0,
@@ -2027,27 +2095,16 @@
KASSERT(CPU_EMPTY(&pmap->pm_active),
("releasing active pmap %p", pmap));
- mtx_lock_spin(&allpmaps_lock);
- LIST_REMOVE(pmap, pm_list);
- mtx_unlock_spin(&allpmaps_lock);
-
- for (i = 0; i < NPGPTD; i++)
- ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
- PG_FRAME);
-
- bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
- sizeof(*pmap->pm_pdir));
-
pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
for (i = 0; i < NPGPTD; i++) {
- m = ptdpg[i];
+ m = pmap->pm_ptdpg[i];
#if defined(PAE) || defined(PAE_TABLES)
KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
("pmap_release: got wrong ptd page"));
#endif
vm_page_unwire_noq(m);
- vm_page_free_zero(m);
+ vm_page_free(m);
}
}
@@ -2107,7 +2164,7 @@
pmap_zero_page(nkpg);
ptppaddr = VM_PAGE_TO_PHYS(nkpg);
newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
- pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
+ pdir_pde(KPTD, kernel_vm_end) = newpdir;
pmap_kenter_pde(kernel_vm_end, newpdir);
kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
@@ -2665,7 +2722,7 @@
" in pmap %p", va, pmap);
return (FALSE);
}
- if (va < VM_MAXUSER_ADDRESS)
+ if (pmap != kernel_pmap)
pmap->pm_stats.resident_count++;
}
mptepa = VM_PAGE_TO_PHYS(mpte);
@@ -2676,7 +2733,7 @@
* temporarily map the page table page (mpte) into the kernel's
* address space at either PADDR1 or PADDR2.
*/
- if (va >= KERNBASE)
+ if (pmap == kernel_pmap)
firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
if ((*PMAP1 & PG_FRAME) != mptepa) {
@@ -3471,9 +3528,11 @@
mpte = NULL;
wired = (flags & PMAP_ENTER_WIRED) != 0;
- KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
- KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
- ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
+ KASSERT((pmap == kernel_pmap && va < VM_MAX_KERNEL_ADDRESS) ||
+ (pmap != kernel_pmap && va < VM_MAXUSER_ADDRESS),
+ ("pmap_enter: toobig k%d %#x", pmap == kernel_pmap, va));
+ KASSERT(va < PMAP_TRM_MIN_ADDRESS,
+ ("pmap_enter: invalid to pmap_enter into trampoline (va: 0x%x)",
va));
if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
VM_OBJECT_ASSERT_LOCKED(m->object);
@@ -3483,7 +3542,7 @@
sched_pin();
pde = pmap_pde(pmap, va);
- if (va < VM_MAXUSER_ADDRESS) {
+ if (pmap != kernel_pmap) {
/*
* va is for UVA.
* In the case that a page table page is not resident,
@@ -3582,7 +3641,8 @@
* Enter on the PV list if part of our managed memory.
*/
if ((m->oflags & VPO_UNMANAGED) == 0) {
- KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
+ KASSERT(pmap != kernel_pmap || va < kmi.clean_sva ||
+ va >= kmi.clean_eva,
("pmap_enter: managed mapping within the clean submap"));
if (pv == NULL)
pv = get_pv_entry(pmap, FALSE);
@@ -3614,10 +3674,8 @@
#endif
if (wired)
newpte |= PG_W;
- if (va < VM_MAXUSER_ADDRESS)
+ if (pmap != kernel_pmap)
newpte |= PG_U;
- if (pmap == kernel_pmap)
- newpte |= pgeflag;
/*
* if the mapping or permission bits are different, we need
@@ -3802,8 +3860,8 @@
vm_paddr_t pa;
struct spglist free;
- KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
- (m->oflags & VPO_UNMANAGED) != 0,
+ KASSERT(pmap != kernel_pmap || va < kmi.clean_sva ||
+ va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0,
("pmap_enter_quick_locked: managed mapping within the clean submap"));
rw_assert(&pvh_global_lock, RA_WLOCKED);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -3812,7 +3870,7 @@
* In the case that a page table page is not
* resident, we are creating it here.
*/
- if (va < VM_MAXUSER_ADDRESS) {
+ if (pmap != kernel_pmap) {
u_int ptepindex;
pd_entry_t ptepa;
@@ -3848,18 +3906,14 @@
mpte = NULL;
}
- /*
- * This call to vtopte makes the assumption that we are
- * entering the page into the current pmap. In order to support
- * quick entry into any pmap, one would likely use pmap_pte_quick.
- * But that isn't as quick as vtopte.
- */
- pte = vtopte(va);
+ /* XXXKIB: pmap_pte_quick() instead ? */
+ pte = pmap_pte(pmap, va);
if (*pte) {
if (mpte != NULL) {
mpte->wire_count--;
mpte = NULL;
}
+ pmap_pte_release(pte);
return (mpte);
}
@@ -3877,6 +3931,7 @@
mpte = NULL;
}
+ pmap_pte_release(pte);
return (mpte);
}
@@ -3898,6 +3953,7 @@
pte_store(pte, pa | PG_V | PG_U);
else
pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
+ pmap_pte_release(pte);
return (mpte);
}
@@ -4089,122 +4145,18 @@
* from the source map to the range dst_addr/len
* in the destination map.
*
- * This routine is only advisory and need not do anything.
+ * This routine is only advisory and need not do anything. Since
+ * the current pmap is always the kernel pmap while executing in
+ * the kernel, and we do not copy from the kernel pmap to a user
+ * pmap, this optimization is not usable in the 4/4G full-split
+ * i386 world.
*/
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
vm_offset_t src_addr)
{
- struct spglist free;
- vm_offset_t addr;
- vm_offset_t end_addr = src_addr + len;
- vm_offset_t pdnxt;
-
- if (dst_addr != src_addr)
- return;
-
- if (!pmap_is_current(src_pmap))
- return;
-
- rw_wlock(&pvh_global_lock);
- if (dst_pmap < src_pmap) {
- PMAP_LOCK(dst_pmap);
- PMAP_LOCK(src_pmap);
- } else {
- PMAP_LOCK(src_pmap);
- PMAP_LOCK(dst_pmap);
- }
- sched_pin();
- for (addr = src_addr; addr < end_addr; addr = pdnxt) {
- pt_entry_t *src_pte, *dst_pte;
- vm_page_t dstmpte, srcmpte;
- pd_entry_t srcptepaddr;
- u_int ptepindex;
-
- KASSERT(addr < UPT_MIN_ADDRESS,
- ("pmap_copy: invalid to pmap_copy page tables"));
-
- pdnxt = (addr + NBPDR) & ~PDRMASK;
- if (pdnxt < addr)
- pdnxt = end_addr;
- ptepindex = addr >> PDRSHIFT;
-
- srcptepaddr = src_pmap->pm_pdir[ptepindex];
- if (srcptepaddr == 0)
- continue;
-
- if (srcptepaddr & PG_PS) {
- if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
- continue;
- if (dst_pmap->pm_pdir[ptepindex] == 0 &&
- ((srcptepaddr & PG_MANAGED) == 0 ||
- pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
- PG_PS_FRAME))) {
- dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
- ~PG_W;
- dst_pmap->pm_stats.resident_count +=
- NBPDR / PAGE_SIZE;
- pmap_pde_mappings++;
- }
- continue;
- }
-
- srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
- KASSERT(srcmpte->wire_count > 0,
- ("pmap_copy: source page table page is unused"));
-
- if (pdnxt > end_addr)
- pdnxt = end_addr;
-
- src_pte = vtopte(addr);
- while (addr < pdnxt) {
- pt_entry_t ptetemp;
- ptetemp = *src_pte;
- /*
- * we only virtual copy managed pages
- */
- if ((ptetemp & PG_MANAGED) != 0) {
- dstmpte = pmap_allocpte(dst_pmap, addr,
- PMAP_ENTER_NOSLEEP);
- if (dstmpte == NULL)
- goto out;
- dst_pte = pmap_pte_quick(dst_pmap, addr);
- if (*dst_pte == 0 &&
- pmap_try_insert_pv_entry(dst_pmap, addr,
- PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
- /*
- * Clear the wired, modified, and
- * accessed (referenced) bits
- * during the copy.
- */
- *dst_pte = ptetemp & ~(PG_W | PG_M |
- PG_A);
- dst_pmap->pm_stats.resident_count++;
- } else {
- SLIST_INIT(&free);
- if (pmap_unwire_ptp(dst_pmap, dstmpte,
- &free)) {
- pmap_invalidate_page(dst_pmap,
- addr);
- vm_page_free_pages_toq(&free,
- true);
- }
- goto out;
- }
- if (dstmpte->wire_count >= srcmpte->wire_count)
- break;
- }
- addr += PAGE_SIZE;
- src_pte++;
- }
- }
-out:
- sched_unpin();
- rw_wunlock(&pvh_global_lock);
- PMAP_UNLOCK(src_pmap);
- PMAP_UNLOCK(dst_pmap);
-}
+}
/*
* Zero 1 page of virtual memory mapped from a hardware page by the caller.
@@ -4519,7 +4471,7 @@
pte = pmap_pde(pmap, pv->pv_va);
tpte = *pte;
if ((tpte & PG_PS) == 0) {
- pte = vtopte(pv->pv_va);
+ pte = pmap_pte_quick(pmap, pv->pv_va);
tpte = *pte & ~PG_PTE_PAT;
}
@@ -4685,8 +4637,10 @@
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, addr);
if (*pde != 0 && (*pde & PG_PS) == 0) {
- pte = vtopte(addr);
- rv = *pte == 0;
+ pte = pmap_pte(pmap, addr);
+ if (pte != NULL)
+ rv = *pte == 0;
+ pmap_pte_release(pte);
}
PMAP_UNLOCK(pmap);
return (rv);
@@ -5188,8 +5142,8 @@
size = round_page(offset + size);
pa = pa & PG_FRAME;
- if (pa < KERNLOAD && pa + size <= KERNLOAD)
- va = KERNBASE + pa;
+ if (pa < PMAP_MAP_LOW && pa + size <= PMAP_MAP_LOW)
+ va = pa + PMAP_MAP_LOW;
else if (!pmap_initialized) {
va = 0;
for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
@@ -5248,7 +5202,7 @@
vm_offset_t offset;
int i;
- if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
+ if (va >= PMAP_MAP_LOW && va <= KERNBASE && va + size <= KERNBASE)
return;
offset = va & PAGE_MASK;
size = round_page(offset + size);
@@ -5545,7 +5499,6 @@
* pmap_activate is for the current thread on the current cpu
*/
td->td_pcb->pcb_cr3 = cr3;
- load_cr3(cr3);
PCPU_SET(curpmap, pmap);
critical_exit();
}
@@ -5613,6 +5566,80 @@
critical_exit();
}
+static vmem_t *pmap_trm_arena;
+static vmem_addr_t pmap_trm_arena_last = PMAP_TRM_MIN_ADDRESS;
+static int trm_guard = PAGE_SIZE;
+
+static int
+pmap_trm_import(void *unused __unused, vmem_size_t size, int flags,
+ vmem_addr_t *addrp)
+{
+ vm_page_t m;
+ vmem_addr_t af, addr, prev_addr;
+ pt_entry_t *trm_pte;
+
+ prev_addr = atomic_load_long(&pmap_trm_arena_last);
+ size = round_page(size) + trm_guard;
+ for (;;) {
+ if (prev_addr + size < prev_addr || prev_addr + size < size ||
+ prev_addr + size > PMAP_TRM_MAX_ADDRESS)
+ return (ENOMEM);
+ addr = prev_addr + size;
+ if (atomic_fcmpset_int(&pmap_trm_arena_last, &prev_addr, addr))
+ break;
+ }
+ prev_addr += trm_guard;
+ trm_pte = PTmap + atop(prev_addr);
+ for (af = prev_addr; af < addr; af += PAGE_SIZE) {
+ m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY |
+ VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
+ pte_store(&trm_pte[atop(af - prev_addr)], VM_PAGE_TO_PHYS(m) |
+ PG_M | PG_A | PG_RW | PG_V | pgeflag |
+ pmap_cache_bits(VM_MEMATTR_DEFAULT, FALSE));
+ }
+ *addrp = prev_addr;
+ return (0);
+}
+
+static void
+pmap_init_trm(void)
+{
+ vm_page_t pd_m;
+
+ TUNABLE_INT_FETCH("machdep.trm_guard", &trm_guard);
+ if ((trm_guard & PAGE_MASK) != 0)
+ trm_guard = 0;
+ pmap_trm_arena = vmem_create("i386trampoline", 0, 0, 1, 0, M_WAITOK);
+ vmem_set_import(pmap_trm_arena, pmap_trm_import, NULL, NULL, PAGE_SIZE);
+ pd_m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY |
+ VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK | VM_ALLOC_ZERO);
+ if ((pd_m->flags & PG_ZERO) == 0)
+ pmap_zero_page(pd_m);
+ PTD[TRPTDI] = VM_PAGE_TO_PHYS(pd_m) | PG_M | PG_A | PG_RW | PG_V |
+ pmap_cache_bits(VM_MEMATTR_DEFAULT, TRUE);
+}
+
+void *
+pmap_trm_alloc(size_t size, int flags)
+{
+ vmem_addr_t res;
+ int error;
+
+ MPASS((flags & ~(M_WAITOK | M_NOWAIT | M_ZERO)) == 0);
+ error = vmem_xalloc(pmap_trm_arena, roundup2(size, 4), sizeof(int),
+ 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_FIRSTFIT, &res);
+ if (error != 0)
+ return (NULL);
+ return ((void *)res);
+}
+
+void
+pmap_trm_free(void *addr, size_t size)
+{
+
+ vmem_free(pmap_trm_arena, (uintptr_t)addr, roundup2(size, 4));
+}
+
#if defined(PMAP_DEBUG)
pmap_pid_dump(int pid)
{
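
pmap_trm_import() above reserves trampoline KVA with a lock-free bump pointer: pmap_trm_arena_last is advanced with atomic_fcmpset so concurrent importers obtain disjoint ranges, a guard gap is left in front of each allocation, and the range is then backed by wired pages through the recursive PTmap. A minimal standalone sketch of just the reservation step, under illustrative names (reserve_range(), last, TRM_MIN/TRM_MAX stand in for the real symbols):

#include <errno.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

#define TRM_MIN 0xffc00000u	/* stand-in for PMAP_TRM_MIN_ADDRESS (non-PAE) */
#define TRM_MAX 0xffffffffu	/* stand-in for PMAP_TRM_MAX_ADDRESS */

static _Atomic uintptr_t last = TRM_MIN;

/* Reserve a disjoint [*start, *start + size) range without taking a lock. */
static int
reserve_range(size_t size, uintptr_t *start)
{
	uintptr_t prev, next;

	prev = atomic_load(&last);
	for (;;) {
		next = prev + size;
		if (next < prev || next > TRM_MAX)
			return (ENOMEM);	/* wrapped or region exhausted */
		/* On failure, prev is refreshed with the current cursor. */
		if (atomic_compare_exchange_weak(&last, &prev, next))
			break;
	}
	*start = prev;
	return (0);
}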
Index: sys/i386/i386/sigtramp.s
===================================================================
--- sys/i386/i386/sigtramp.s
+++ sys/i386/i386/sigtramp.s
@@ -95,6 +95,25 @@
pushl %eax /* junk to fake return addr. */
int $0x80 /* enter kernel with args */
0: jmp 0b
+
+/*
+ * Our lcall $7,$0 handler remains in user mode (ring 3), since lcalls
+ * don't change the interrupt mask, so if this one went directly to the
+ * kernel then there would be a window with interrupts enabled in kernel
+ * mode, and all interrupt handlers would have to be almost as complicated
+ * as the NMI handler to support this.
+ *
+ * Instead, convert the lcall to an int0x80 call. The kernel does most
+ * of the conversion by popping the lcall return values off the user
+ * stack and returning to them instead of to here, except when the
+ * conversion itself fails. Adjusting the stack here is impossible for
+ * vfork() and harder for other syscalls.
+ */
+ ALIGN_TEXT
+lcall_tramp:
+ int $0x80
+1: jmp 1b
+
#endif /* COMPAT_43 */
ALIGN_TEXT
@@ -113,4 +132,7 @@
.globl szosigcode
szosigcode:
.long esigcode-osigcode
+ .globl sz_lcall_tramp
+sz_lcall_tramp:
+ .long esigcode-lcall_tramp
#endif
Index: sys/i386/i386/support.s
===================================================================
--- sys/i386/i386/support.s
+++ sys/i386/i386/support.s
@@ -251,196 +251,6 @@
ret
END(memcpy)
-/*****************************************************************************/
-/* copyout and fubyte family */
-/*****************************************************************************/
-/*
- * Access user memory from inside the kernel. These routines and possibly
- * the math- and DOS emulators should be the only places that do this.
- *
- * We have to access the memory with user's permissions, so use a segment
- * selector with RPL 3. For writes to user space we have to additionally
- * check the PTE for write permission, because the 386 does not check
- * write permissions when we are executing with EPL 0. The 486 does check
- * this if the WP bit is set in CR0, so we can use a simpler version here.
- *
- * These routines set curpcb->pcb_onfault for the time they execute. When a
- * protection violation occurs inside the functions, the trap handler
- * returns to *curpcb->pcb_onfault instead of the function.
- */
-
-/*
- * copyout(from_kernel, to_user, len) - MP SAFE
- */
-ENTRY(copyout)
- movl PCPU(CURPCB),%eax
- movl $copyout_fault,PCB_ONFAULT(%eax)
- pushl %esi
- pushl %edi
- pushl %ebx
- movl 16(%esp),%esi
- movl 20(%esp),%edi
- movl 24(%esp),%ebx
- testl %ebx,%ebx /* anything to do? */
- jz done_copyout
-
- /*
- * Check explicitly for non-user addresses. This check is essential
- * because it prevents usermode from writing into the kernel. We do
- * not verify anywhere else that the user did not specify a rogue
- * address.
- */
- /*
- * First, prevent address wrapping.
- */
- movl %edi,%eax
- addl %ebx,%eax
- jc copyout_fault
-/*
- * XXX STOP USING VM_MAXUSER_ADDRESS.
- * It is an end address, not a max, so every time it is used correctly it
- * looks like there is an off by one error, and of course it caused an off
- * by one error in several places.
- */
- cmpl $VM_MAXUSER_ADDRESS,%eax
- ja copyout_fault
-
- /* bcopy(%esi, %edi, %ebx) */
- movl %ebx,%ecx
-
- shrl $2,%ecx
- rep
- movsl
- movb %bl,%cl
- andb $3,%cl
- rep
- movsb
-
-done_copyout:
- popl %ebx
- popl %edi
- popl %esi
- xorl %eax,%eax
- movl PCPU(CURPCB),%edx
- movl %eax,PCB_ONFAULT(%edx)
- ret
-END(copyout)
-
- ALIGN_TEXT
-copyout_fault:
- popl %ebx
- popl %edi
- popl %esi
- movl PCPU(CURPCB),%edx
- movl $0,PCB_ONFAULT(%edx)
- movl $EFAULT,%eax
- ret
-
-/*
- * copyin(from_user, to_kernel, len) - MP SAFE
- */
-ENTRY(copyin)
- movl PCPU(CURPCB),%eax
- movl $copyin_fault,PCB_ONFAULT(%eax)
- pushl %esi
- pushl %edi
- movl 12(%esp),%esi /* caddr_t from */
- movl 16(%esp),%edi /* caddr_t to */
- movl 20(%esp),%ecx /* size_t len */
-
- /*
- * make sure address is valid
- */
- movl %esi,%edx
- addl %ecx,%edx
- jc copyin_fault
- cmpl $VM_MAXUSER_ADDRESS,%edx
- ja copyin_fault
-
- movb %cl,%al
- shrl $2,%ecx /* copy longword-wise */
- rep
- movsl
- movb %al,%cl
- andb $3,%cl /* copy remaining bytes */
- rep
- movsb
-
- popl %edi
- popl %esi
- xorl %eax,%eax
- movl PCPU(CURPCB),%edx
- movl %eax,PCB_ONFAULT(%edx)
- ret
-END(copyin)
-
- ALIGN_TEXT
-copyin_fault:
- popl %edi
- popl %esi
- movl PCPU(CURPCB),%edx
- movl $0,PCB_ONFAULT(%edx)
- movl $EFAULT,%eax
- ret
-
-/*
- * casueword. Compare and set user word. Returns -1 on fault,
- * 0 on non-faulting access. The current value is in *oldp.
- */
-ALTENTRY(casueword32)
-ENTRY(casueword)
- movl PCPU(CURPCB),%ecx
- movl $fusufault,PCB_ONFAULT(%ecx)
- movl 4(%esp),%edx /* dst */
- movl 8(%esp),%eax /* old */
- movl 16(%esp),%ecx /* new */
-
- cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */
- ja fusufault
-
-#ifdef SMP
- lock
-#endif
- cmpxchgl %ecx,(%edx) /* Compare and set. */
-
- /*
- * The old value is in %eax. If the store succeeded it will be the
- * value we expected (old) from before the store, otherwise it will
- * be the current value.
- */
-
- movl PCPU(CURPCB),%ecx
- movl $0,PCB_ONFAULT(%ecx)
- movl 12(%esp),%edx /* oldp */
- movl %eax,(%edx)
- xorl %eax,%eax
- ret
-END(casueword32)
-END(casueword)
-
-/*
- * Fetch (load) a 32-bit word, a 16-bit word, or an 8-bit byte from user
- * memory.
- */
-
-ALTENTRY(fueword32)
-ENTRY(fueword)
- movl PCPU(CURPCB),%ecx
- movl $fusufault,PCB_ONFAULT(%ecx)
- movl 4(%esp),%edx /* from */
-
- cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */
- ja fusufault
-
- movl (%edx),%eax
- movl $0,PCB_ONFAULT(%ecx)
- movl 8(%esp),%edx
- movl %eax,(%edx)
- xorl %eax,%eax
- ret
-END(fueword32)
-END(fueword)
-
/*
* fuswintr() and suswintr() are specialized variants of fuword16() and
* suword16(), respectively. They are called from the profiling code,
@@ -455,167 +265,6 @@
END(suswintr)
END(fuswintr)
-ENTRY(fuword16)
- movl PCPU(CURPCB),%ecx
- movl $fusufault,PCB_ONFAULT(%ecx)
- movl 4(%esp),%edx
-
- cmpl $VM_MAXUSER_ADDRESS-2,%edx
- ja fusufault
-
- movzwl (%edx),%eax
- movl $0,PCB_ONFAULT(%ecx)
- ret
-END(fuword16)
-
-ENTRY(fubyte)
- movl PCPU(CURPCB),%ecx
- movl $fusufault,PCB_ONFAULT(%ecx)
- movl 4(%esp),%edx
-
- cmpl $VM_MAXUSER_ADDRESS-1,%edx
- ja fusufault
-
- movzbl (%edx),%eax
- movl $0,PCB_ONFAULT(%ecx)
- ret
-END(fubyte)
-
- ALIGN_TEXT
-fusufault:
- movl PCPU(CURPCB),%ecx
- xorl %eax,%eax
- movl %eax,PCB_ONFAULT(%ecx)
- decl %eax
- ret
-
-/*
- * Store a 32-bit word, a 16-bit word, or an 8-bit byte to user memory.
- * All these functions are MPSAFE.
- */
-
-ALTENTRY(suword32)
-ENTRY(suword)
- movl PCPU(CURPCB),%ecx
- movl $fusufault,PCB_ONFAULT(%ecx)
- movl 4(%esp),%edx
-
- cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */
- ja fusufault
-
- movl 8(%esp),%eax
- movl %eax,(%edx)
- xorl %eax,%eax
- movl PCPU(CURPCB),%ecx
- movl %eax,PCB_ONFAULT(%ecx)
- ret
-END(suword32)
-END(suword)
-
-ENTRY(suword16)
- movl PCPU(CURPCB),%ecx
- movl $fusufault,PCB_ONFAULT(%ecx)
- movl 4(%esp),%edx
-
- cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */
- ja fusufault
-
- movw 8(%esp),%ax
- movw %ax,(%edx)
- xorl %eax,%eax
- movl PCPU(CURPCB),%ecx /* restore trashed register */
- movl %eax,PCB_ONFAULT(%ecx)
- ret
-END(suword16)
-
-ENTRY(subyte)
- movl PCPU(CURPCB),%ecx
- movl $fusufault,PCB_ONFAULT(%ecx)
- movl 4(%esp),%edx
-
- cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */
- ja fusufault
-
- movb 8(%esp),%al
- movb %al,(%edx)
- xorl %eax,%eax
- movl PCPU(CURPCB),%ecx /* restore trashed register */
- movl %eax,PCB_ONFAULT(%ecx)
- ret
-END(subyte)
-
-/*
- * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE
- *
- * copy a string from 'from' to 'to', stop when a 0 character is reached.
- * return ENAMETOOLONG if string is longer than maxlen, and
- * EFAULT on protection violations. If lencopied is non-zero,
- * return the actual length in *lencopied.
- */
-ENTRY(copyinstr)
- pushl %esi
- pushl %edi
- movl PCPU(CURPCB),%ecx
- movl $cpystrflt,PCB_ONFAULT(%ecx)
-
- movl 12(%esp),%esi /* %esi = from */
- movl 16(%esp),%edi /* %edi = to */
- movl 20(%esp),%edx /* %edx = maxlen */
-
- movl $VM_MAXUSER_ADDRESS,%eax
-
- /* make sure 'from' is within bounds */
- subl %esi,%eax
- jbe cpystrflt
-
- /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
- cmpl %edx,%eax
- jae 1f
- movl %eax,%edx
- movl %eax,20(%esp)
-1:
- incl %edx
-
-2:
- decl %edx
- jz 3f
-
- lodsb
- stosb
- orb %al,%al
- jnz 2b
-
- /* Success -- 0 byte reached */
- decl %edx
- xorl %eax,%eax
- jmp cpystrflt_x
-3:
- /* edx is zero - return ENAMETOOLONG or EFAULT */
- cmpl $VM_MAXUSER_ADDRESS,%esi
- jae cpystrflt
-4:
- movl $ENAMETOOLONG,%eax
- jmp cpystrflt_x
-
-cpystrflt:
- movl $EFAULT,%eax
-
-cpystrflt_x:
- /* set *lencopied and return %eax */
- movl PCPU(CURPCB),%ecx
- movl $0,PCB_ONFAULT(%ecx)
- movl 20(%esp),%ecx
- subl %edx,%ecx
- movl 24(%esp),%edx
- testl %edx,%edx
- jz 1f
- movl %ecx,(%edx)
-1:
- popl %edi
- popl %esi
- ret
-END(copyinstr)
-
/*
* copystr(from, to, maxlen, int *lencopied) - MP SAFE
*/
Index: sys/i386/i386/swtch.s
===================================================================
--- sys/i386/i386/swtch.s
+++ sys/i386/i386/swtch.s
@@ -86,8 +86,6 @@
1:
movl 8(%esp),%ecx /* New thread */
movl TD_PCB(%ecx),%edx
- movl PCB_CR3(%edx),%eax
- movl %eax,%cr3
/* set bit in new pm_active */
movl TD_PROC(%ecx),%eax
movl P_VMSPACE(%eax), %ebx
@@ -157,7 +155,7 @@
popl %eax
1:
- /* Save is done. Now fire up new thread. Leave old vmspace. */
+ /* Save is done. Now fire up new thread. */
movl 4(%esp),%edi
movl 8(%esp),%ecx /* New thread */
movl 12(%esp),%esi /* New lock */
@@ -167,15 +165,10 @@
#endif
movl TD_PCB(%ecx),%edx
- /* switch address space */
- movl PCB_CR3(%edx),%eax
- movl %cr3,%ebx /* The same address space? */
- cmpl %ebx,%eax
- je sw0
- movl %eax,%cr3 /* new address space */
+ /* Switchout td_lock */
movl %esi,%eax
movl PCPU(CPUID),%esi
- SETOP %eax,TD_LOCK(%edi) /* Switchout td_lock */
+ SETOP %eax,TD_LOCK(%edi)
/* Release bit from old pmap->pm_active */
movl PCPU(CURPMAP), %ebx
@@ -200,26 +193,28 @@
sw1:
BLOCK_SPIN(%ecx)
/*
- * At this point, we've switched address spaces and are ready
+ * At this point, we have managed thread locks and are ready
* to load up the rest of the next context.
*/
+
+ /* Load a pointer to the thread kernel stack into PCPU. */
+ leal -VM86_STACK_SPACE(%edx), %eax /* leave space for vm86 */
+ movl %eax, PCPU(KESP0)
+
cmpl $0, PCB_EXT(%edx) /* has pcb extension? */
je 1f /* If not, use the default */
movl $1, PCPU(PRIVATE_TSS) /* mark use of private tss */
movl PCB_EXT(%edx), %edi /* new tss descriptor */
+ movl PCPU(TRAMPSTK), %ebx
+ movl %ebx, PCB_EXT_TSS+TSS_ESP0(%edi)
jmp 2f /* Load it up */
1: /*
* Use the common default TSS instead of our own.
- * Set our stack pointer into the TSS, it's set to just
- * below the PCB. In C, common_tss.tss_esp0 = &pcb - 16;
- */
- leal -16(%edx), %ebx /* leave space for vm86 */
- movl %ebx, PCPU(COMMON_TSS) + TSS_ESP0
-
- /*
- * Test this CPU's bit in the bitmap to see if this
- * CPU was using a private TSS.
+ * The stack pointer in the common TSS already points to the
+ * trampoline stack and should not be changed.
+ *
+ * Test this CPU's flag to see if this CPU was using a private TSS.
*/
cmpl $0, PCPU(PRIVATE_TSS) /* Already using the common? */
je 3f /* if so, skip reloading */
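
The PCPU(KESP0) load above keeps the per-CPU kernel stack pointer just below the incoming thread's PCB, with VM86_STACK_SPACE (16) bytes of slack so a trap frame can be expanded in place into a vm86 frame; the vm_machdep.c hunk further below places td_frame the same way. A small sketch of that placement (thread_trapframe() is an illustrative helper, not patch code):

#include <machine/frame.h>	/* struct trapframe, VM86_STACK_SPACE */

/* Trap frame location for a thread whose PCB starts at "pcb". */
static struct trapframe *
thread_trapframe(char *pcb)
{
	char *kesp0;

	kesp0 = pcb - VM86_STACK_SPACE;		/* room to grow into a vm86 frame */
	return ((struct trapframe *)kesp0 - 1);
}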
Index: sys/i386/i386/sys_machdep.c
===================================================================
--- sys/i386/i386/sys_machdep.c
+++ sys/i386/i386/sys_machdep.c
@@ -294,10 +294,8 @@
0 /* granularity */
};
- ext = (struct pcb_ext *)kmem_malloc(kernel_arena, ctob(IOPAGES+1),
- M_WAITOK | M_ZERO);
+ ext = pmap_trm_alloc(ctob(IOPAGES + 1), M_WAITOK | M_ZERO);
/* -16 is so we can convert a trapframe into vm86trapframe inplace */
- ext->ext_tss.tss_esp0 = (vm_offset_t)td->td_pcb - 16;
ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
/*
* The last byte of the i/o map must be followed by an 0xff byte.
@@ -323,6 +321,7 @@
/* Switch to the new TSS. */
critical_enter();
+ ext->ext_tss.tss_esp0 = PCPU_GET(trampstk);
td->td_pcb->pcb_ext = ext;
PCPU_SET(private_tss, 1);
*PCPU_GET(tss_gdt) = ext->ext_tssd;
@@ -457,8 +456,8 @@
new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK);
new_ldt->ldt_len = len = NEW_MAX_LD(len);
- new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena,
- len * sizeof(union descriptor), M_WAITOK | M_ZERO);
+ new_ldt->ldt_base = pmap_trm_alloc(len * sizeof(union descriptor),
+ M_WAITOK | M_ZERO);
new_ldt->ldt_refcnt = 1;
new_ldt->ldt_active = 0;
@@ -473,7 +472,7 @@
bcopy(pldt->ldt_base, new_ldt->ldt_base,
len * sizeof(union descriptor));
} else
- bcopy(ldt, new_ldt->ldt_base, sizeof(ldt));
+ bcopy(ldt, new_ldt->ldt_base, sizeof(union descriptor) * NLDT);
return (new_ldt);
}
@@ -510,8 +509,8 @@
mtx_assert(&dt_lock, MA_OWNED);
if (--pldt->ldt_refcnt == 0) {
mtx_unlock_spin(&dt_lock);
- kmem_free(kernel_arena, (vm_offset_t)pldt->ldt_base,
- pldt->ldt_len * sizeof(union descriptor));
+ pmap_trm_free(pldt->ldt_base, pldt->ldt_len *
+ sizeof(union descriptor));
free(pldt, M_SUBPROC);
} else
mtx_unlock_spin(&dt_lock);
@@ -767,8 +766,7 @@
* free the new object and return.
*/
mtx_unlock_spin(&dt_lock);
- kmem_free(kernel_arena,
- (vm_offset_t)new_ldt->ldt_base,
+ pmap_trm_free(new_ldt->ldt_base,
new_ldt->ldt_len * sizeof(union descriptor));
free(new_ldt, M_SUBPROC);
mtx_lock_spin(&dt_lock);
@@ -801,8 +799,8 @@
mtx_unlock_spin(&dt_lock);
#endif
if (old_ldt_base != NULL_LDT_BASE) {
- kmem_free(kernel_arena, (vm_offset_t)old_ldt_base,
- old_ldt_len * sizeof(union descriptor));
+ pmap_trm_free(old_ldt_base, old_ldt_len *
+ sizeof(union descriptor));
free(new_ldt, M_SUBPROC);
}
mtx_lock_spin(&dt_lock);
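
The bcopy() length change in user_ldt_alloc() above is forced by the segments.h hunk later in this diff, where the default ldt turns from an array into a pointer: sizeof(ldt) then yields the size of the pointer rather than of the descriptor table, so the byte count has to be written out as sizeof(union descriptor) * NLDT. A tiny illustration of the pitfall (stand-in types and counts, not patch code):

#include <stdio.h>

union descriptor { unsigned int raw[2]; };	/* stand-in: 8 bytes per slot */
#define NLDT	9				/* illustrative count */

static union descriptor  default_ldt[NLDT];	/* old: extern union descriptor ldt[NLDT] */
static union descriptor *ldt = default_ldt;	/* new: extern union descriptor *ldt */

int
main(void)
{
	printf("%zu\n", sizeof(default_ldt));			/* NLDT * 8 */
	printf("%zu\n", sizeof(ldt));				/* sizeof(void *) */
	printf("%zu\n", sizeof(union descriptor) * NLDT);	/* correct bcopy length */
	return (0);
}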
Index: sys/i386/i386/trap.c
===================================================================
--- sys/i386/i386/trap.c
+++ sys/i386/i386/trap.c
@@ -47,6 +47,7 @@
*/
#include "opt_clock.h"
+#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_hwpmc_hooks.h"
#include "opt_isa.h"
@@ -117,45 +118,60 @@
static void trap_fatal(struct trapframe *, vm_offset_t);
void dblfault_handler(void);
-extern inthand_t IDTVEC(lcall_syscall);
-
#define MAX_TRAP_MSG 32
-static char *trap_msg[] = {
- "", /* 0 unused */
- "privileged instruction fault", /* 1 T_PRIVINFLT */
- "", /* 2 unused */
- "breakpoint instruction fault", /* 3 T_BPTFLT */
- "", /* 4 unused */
- "", /* 5 unused */
- "arithmetic trap", /* 6 T_ARITHTRAP */
- "", /* 7 unused */
- "", /* 8 unused */
- "general protection fault", /* 9 T_PROTFLT */
- "trace trap", /* 10 T_TRCTRAP */
- "", /* 11 unused */
- "page fault", /* 12 T_PAGEFLT */
- "", /* 13 unused */
- "alignment fault", /* 14 T_ALIGNFLT */
- "", /* 15 unused */
- "", /* 16 unused */
- "", /* 17 unused */
- "integer divide fault", /* 18 T_DIVIDE */
- "non-maskable interrupt trap", /* 19 T_NMI */
- "overflow trap", /* 20 T_OFLOW */
- "FPU bounds check fault", /* 21 T_BOUND */
- "FPU device not available", /* 22 T_DNA */
- "double fault", /* 23 T_DOUBLEFLT */
- "FPU operand fetch fault", /* 24 T_FPOPFLT */
- "invalid TSS fault", /* 25 T_TSSFLT */
- "segment not present fault", /* 26 T_SEGNPFLT */
- "stack fault", /* 27 T_STKFLT */
- "machine check trap", /* 28 T_MCHK */
- "SIMD floating-point exception", /* 29 T_XMMFLT */
- "reserved (unknown) fault", /* 30 T_RESERVED */
- "", /* 31 unused (reserved) */
- "DTrace pid return trap", /* 32 T_DTRACE_RET */
+
+struct trap_data {
+ bool ei;
+ const char *msg;
+};
+
+static const struct trap_data trap_data[] = {
+ [T_PRIVINFLT] = { .ei = true, .msg = "privileged instruction fault" },
+ [T_BPTFLT] = { .ei = false, .msg = "breakpoint instruction fault" },
+ [T_ARITHTRAP] = { .ei = true, .msg = "arithmetic trap" },
+ [T_PROTFLT] = { .ei = true, .msg = "general protection fault" },
+ [T_TRCTRAP] = { .ei = false, .msg = "trace trap" },
+ [T_PAGEFLT] = { .ei = true, .msg = "page fault" },
+ [T_ALIGNFLT] = { .ei = true, .msg = "alignment fault" },
+ [T_DIVIDE] = { .ei = true, .msg = "integer divide fault" },
+ [T_NMI] = { .ei = false, .msg = "non-maskable interrupt trap" },
+ [T_OFLOW] = { .ei = true, .msg = "overflow trap" },
+ [T_BOUND] = { .ei = true, .msg = "FPU bounds check fault" },
+ [T_DNA] = { .ei = true, .msg = "FPU device not available" },
+ [T_DOUBLEFLT] = { .ei = false, .msg = "double fault" },
+ [T_FPOPFLT] = { .ei = true, .msg = "FPU operand fetch fault" },
+ [T_TSSFLT] = { .ei = true, .msg = "invalid TSS fault" },
+ [T_SEGNPFLT] = { .ei = true, .msg = "segment not present fault" },
+ [T_STKFLT] = { .ei = true, .msg = "stack fault" },
+ [T_MCHK] = { .ei = true, .msg = "machine check trap" },
+ [T_XMMFLT] = { .ei = true, .msg = "SIMD floating-point exception" },
+ [T_DTRACE_RET] = { .ei = true, .msg = "DTrace pid return trap" },
};
+static bool
+trap_enable_intr(int trapno)
+{
+
+ MPASS(trapno > 0);
+ if (trapno < nitems(trap_data) && trap_data[trapno].msg != NULL)
+ return (trap_data[trapno].ei);
+ return (false);
+}
+
+static const char *
+trap_msg(int trapno)
+{
+ const char *res;
+ static const char unkn[] = "UNKNOWN";
+
+ res = NULL;
+ if (trapno < nitems(trap_data))
+ res = trap_data[trapno].msg;
+ if (res == NULL)
+ res = unkn;
+ return (res);
+}
+
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
int has_f00f_bug = 0; /* Initialized so that it can be patched. */
#endif
@@ -201,6 +217,9 @@
VM_CNT_INC(v_trap);
type = frame->tf_trapno;
+ KASSERT((read_eflags() & PSL_I) == 0,
+ ("trap: interrupts enabled, type %d frame %p", type, frame));
+
#ifdef SMP
/* Handler for NMI IPIs used for stopping CPUs. */
if (type == T_NMI && ipi_nmi_handler() == 0)
@@ -257,53 +276,34 @@
return;
#endif
- if ((frame->tf_eflags & PSL_I) == 0) {
- /*
- * Buggy application or kernel code has disabled
- * interrupts and then trapped. Enabling interrupts
- * now is wrong, but it is better than running with
- * interrupts disabled until they are accidentally
- * enabled later.
- */
- if (TRAPF_USERMODE(frame) &&
- (curpcb->pcb_flags & PCB_VM86CALL) == 0)
- uprintf(
- "pid %ld (%s): trap %d with interrupts disabled\n",
- (long)curproc->p_pid, curthread->td_name, type);
- else if (type != T_NMI && type != T_BPTFLT &&
- type != T_TRCTRAP &&
- frame->tf_eip != (int)cpu_switch_load_gs) {
- /*
- * XXX not quite right, since this may be for a
- * multiple fault in user mode.
- */
- printf("kernel trap %d with interrupts disabled\n",
- type);
- /*
- * Page faults need interrupts disabled until later,
- * and we shouldn't enable interrupts while holding
- * a spin lock.
- */
- if (type != T_PAGEFLT &&
- td->td_md.md_spinlock_count == 0)
- enable_intr();
- }
- }
- eva = 0;
- if (type == T_PAGEFLT) {
- /*
- * For some Cyrix CPUs, %cr2 is clobbered by
- * interrupts. This problem is worked around by using
- * an interrupt gate for the pagefault handler. We
- * are finally ready to read %cr2 and conditionally
- * reenable interrupts. If we hold a spin lock, then
- * we must not reenable interrupts. This might be a
- * spurious page fault.
- */
+ /*
+ * We must not allow context switches until %cr2 is read.
+ * Also, for some Cyrix CPUs, %cr2 is clobbered by interrupts.
+ * All faults use interrupt gates, so %cr2 can be safely read
+ * now, before interrupts are optionally enabled below.
+ */
+ if (type == T_PAGEFLT)
eva = rcr2();
- if (td->td_md.md_spinlock_count == 0)
- enable_intr();
- }
+
+ /*
+ * Buggy application or kernel code has disabled interrupts
+ * and then trapped. Enabling interrupts now is wrong, but it
+ * is better than running with interrupts disabled until they
+ * are accidentally enabled later.
+ */
+ if ((frame->tf_eflags & PSL_I) == 0 && TRAPF_USERMODE(frame) &&
+ (curpcb->pcb_flags & PCB_VM86CALL) == 0)
+ uprintf("pid %ld (%s): trap %d with interrupts disabled\n",
+ (long)curproc->p_pid, curthread->td_name, type);
+
+ /*
+ * Conditionally reenable interrupts. If we hold a spin lock,
+ * then we must not reenable interrupts. This might be a
+ * spurious page fault.
+ */
+ if (trap_enable_intr(type) && td->td_md.md_spinlock_count == 0 &&
+ frame->tf_eip != (int)cpu_switch_load_gs)
+ enable_intr();
if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) {
/* user trap */
@@ -583,24 +583,40 @@
* problem here and not have to check all the
* selectors and pointers when the user changes
* them.
+ *
+ * N.B. Compared with long mode, 32-bit mode
+ * does not push %esp into the trap frame,
+ * because iretl faulted while in ring 0. As
+ * a consequence, there is no need to fix up
+ * the stack pointer for doreti_iret_fault;
+ * the fixup and the complementary trap() call
+ * are executed on the main thread stack, not
+ * on the trampoline stack.
*/
- if (frame->tf_eip == (int)doreti_iret) {
- frame->tf_eip = (int)doreti_iret_fault;
+ if (frame->tf_eip == (int)doreti_iret + setidt_disp) {
+ frame->tf_eip = (int)doreti_iret_fault +
+ setidt_disp;
return;
}
if (type == T_STKFLT)
break;
- if (frame->tf_eip == (int)doreti_popl_ds) {
- frame->tf_eip = (int)doreti_popl_ds_fault;
+ if (frame->tf_eip == (int)doreti_popl_ds +
+ setidt_disp) {
+ frame->tf_eip = (int)doreti_popl_ds_fault +
+ setidt_disp;
return;
}
- if (frame->tf_eip == (int)doreti_popl_es) {
- frame->tf_eip = (int)doreti_popl_es_fault;
+ if (frame->tf_eip == (int)doreti_popl_es +
+ setidt_disp) {
+ frame->tf_eip = (int)doreti_popl_es_fault +
+ setidt_disp;
return;
}
- if (frame->tf_eip == (int)doreti_popl_fs) {
- frame->tf_eip = (int)doreti_popl_fs_fault;
+ if (frame->tf_eip == (int)doreti_popl_fs +
+ setidt_disp) {
+ frame->tf_eip = (int)doreti_popl_fs_fault +
+ setidt_disp;
return;
}
if (curpcb->pcb_onfault != NULL) {
@@ -627,23 +643,6 @@
case T_TRCTRAP: /* trace trap */
kernel_trctrap:
- if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
- /*
- * We've just entered system mode via the
- * syscall lcall. Continue single stepping
- * silently until the syscall handler has
- * saved the flags.
- */
- return;
- }
- if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
- /*
- * The syscall handler has now saved the
- * flags. Stop single stepping it.
- */
- frame->tf_eflags &= ~PSL_T;
- return;
- }
/*
* Ignore debug register trace traps due to
* accesses in the user's address space, which
@@ -711,10 +710,11 @@
ksi.ksi_trapno = type;
if (uprintf_signal) {
uprintf("pid %d comm %s: signal %d err %x code %d type %d "
- "addr 0x%x esp 0x%08x eip 0x%08x "
+ "addr 0x%x ss 0x%04x esp 0x%08x cs 0x%04x eip 0x%08x "
"<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type,
- addr, frame->tf_esp, frame->tf_eip,
+ addr, frame->tf_ss, frame->tf_esp, frame->tf_cs,
+ frame->tf_eip,
fubyte((void *)(frame->tf_eip + 0)),
fubyte((void *)(frame->tf_eip + 1)),
fubyte((void *)(frame->tf_eip + 2)),
@@ -791,7 +791,7 @@
}
}
va = trunc_page(eva);
- if (va >= KERNBASE) {
+ if (va >= PMAP_TRM_MIN_ADDRESS) {
/*
* Don't allow user-mode faults in kernel address space.
* An exception: if the faulting address is the invalid
@@ -806,20 +806,17 @@
#endif
if (usermode)
return (SIGSEGV);
-
- map = kernel_map;
+ trap_fatal(frame, eva);
+ return (-1);
} else {
- map = &p->p_vmspace->vm_map;
+ map = usermode ? &p->p_vmspace->vm_map : kernel_map;
/*
- * When accessing a user-space address, kernel must be
- * ready to accept the page fault, and provide a
- * handling routine. Since accessing the address
- * without the handler is a bug, do not try to handle
- * it normally, and panic immediately.
+ * The kernel cannot access a user-space address
+ * directly because user pages are not mapped. Also,
+ * page faults must not be taken while handling
+ * interrupts.
*/
- if (!usermode && (td->td_intr_nesting_level != 0 ||
- curpcb->pcb_onfault == NULL)) {
+ if (!usermode && td->td_intr_nesting_level != 0) {
trap_fatal(frame, eva);
return (-1);
}
@@ -882,17 +879,12 @@
int code, ss, esp;
u_int type;
struct soft_segment_descriptor softseg;
- char *msg;
code = frame->tf_err;
type = frame->tf_trapno;
sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
- if (type <= MAX_TRAP_MSG)
- msg = trap_msg[type];
- else
- msg = "UNKNOWN";
- printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
+ printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg(type),
frame->tf_eflags & PSL_VM ? "vm86" :
ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
@@ -955,8 +947,8 @@
}
#endif
printf("trap number = %d\n", type);
- if (type <= MAX_TRAP_MSG)
- panic("%s", trap_msg[type]);
+ if (trap_msg(type) != NULL)
+ panic("%s", trap_msg(type));
else
panic("unknown/reserved trap");
}
@@ -974,16 +966,16 @@
* of this is that "trace <ebp>" in ddb won't work.
*/
void
-dblfault_handler()
+dblfault_handler(void)
{
#ifdef KDTRACE_HOOKS
if (dtrace_doubletrap_func != NULL)
(*dtrace_doubletrap_func)();
#endif
printf("\nFatal double fault:\n");
- printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
- printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
- printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
+ printf("eip = 0x%x\n", PCPU_GET(common_tssp)->tss_eip);
+ printf("esp = 0x%x\n", PCPU_GET(common_tssp)->tss_esp);
+ printf("ebp = 0x%x\n", PCPU_GET(common_tssp)->tss_ebp);
#ifdef SMP
/* two separate prints in case of a trap on an unmapped page */
printf("cpuid = %d; ", PCPU_GET(cpuid));
@@ -1001,13 +993,42 @@
caddr_t params;
long tmp;
int error;
+#ifdef COMPAT_43
+ u_int32_t eip;
+ int cs;
+#endif
p = td->td_proc;
frame = td->td_frame;
sa = &td->td_sa;
- params = (caddr_t)frame->tf_esp + sizeof(int);
+#ifdef COMPAT_43
+ if (__predict_false(frame->tf_cs == 7 && frame->tf_eip == 2)) {
+ /*
+ * In lcall $7,$0 after int $0x80. Convert the user
+ * frame to what it would be for a direct int 0x80 instead
+ * of lcall $7,$0, by popping the lcall return address.
+ */
+ error = fueword32((void *)frame->tf_esp, &eip);
+ if (error == -1)
+ return (EFAULT);
+ cs = fuword16((void *)(frame->tf_esp + sizeof(u_int32_t)));
+ if (cs == -1)
+ return (EFAULT);
+
+ /*
+ * Unwind in-kernel frame after all stack frame pieces
+ * were successfully read.
+ */
+ frame->tf_eip = eip;
+ frame->tf_cs = cs;
+ frame->tf_esp += 2 * sizeof(u_int32_t);
+ frame->tf_err = 7; /* size of lcall $7,$0 */
+ }
+#endif
+
sa->code = frame->tf_eax;
+ params = (caddr_t)frame->tf_esp + sizeof(uint32_t);
/*
* Need to check if this is a 32 bit or 64 bit syscall.
@@ -1020,7 +1041,7 @@
if (error == -1)
return (EFAULT);
sa->code = tmp;
- params += sizeof(int);
+ params += sizeof(uint32_t);
} else if (sa->code == SYS___syscall) {
/*
* Like syscall, but code is a quad, so as to maintain
@@ -1043,7 +1064,7 @@
if (params != NULL && sa->narg != 0)
error = copyin(params, (caddr_t)sa->args,
- (u_int)(sa->narg * sizeof(int)));
+ (u_int)(sa->narg * sizeof(uint32_t)));
else
error = 0;
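
The lcall $7,$0 conversion added to syscall() above unwinds the far return address that the user's lcall pushed: a 32-bit return %eip, then a 32-bit slot whose low half holds the return %cs, after which the saved user %esp is advanced past both slots. A sketch of that user stack layout (lcall_ret_frame is a hypothetical name):

#include <stdint.h>

/* What sits at frame->tf_esp on entry from lcall_tramp's int $0x80. */
struct lcall_ret_frame {
	uint32_t ret_eip;	/* read with fueword32() */
	uint32_t ret_cs;	/* low 16 bits significant, read with fuword16() */
};
/* syscall() then advances: frame->tf_esp += sizeof(struct lcall_ret_frame). */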
Index: sys/i386/i386/vm86.c
===================================================================
--- sys/i386/i386/vm86.c
+++ sys/i386/i386/vm86.c
@@ -78,6 +78,55 @@
#define PUSH_MASK ~(PSL_VM | PSL_RF | PSL_I)
#define POP_MASK ~(PSL_VIP | PSL_VIF | PSL_VM | PSL_RF | PSL_IOPL)
+static int
+vm86_suword16(volatile void *base, int word)
+{
+
+ if (curthread->td_critnest != 0) {
+ *(volatile uint16_t *)base = word;
+ return (0);
+ }
+ return (suword16(base, word));
+}
+
+static int
+vm86_suword(volatile void *base, long word)
+{
+
+ if (curthread->td_critnest != 0) {
+ *(volatile long *)base = word;
+ return (0);
+ }
+ return (suword(base, word));
+}
+
+static int
+vm86_fubyte(volatile const void *base)
+{
+
+ if (curthread->td_critnest != 0)
+ return (*(volatile const u_char *)base);
+ return (fubyte(base));
+}
+
+static int
+vm86_fuword16(volatile const void *base)
+{
+
+ if (curthread->td_critnest != 0)
+ return (*(volatile const uint16_t *)base);
+ return (fuword16(base));
+}
+
+static long
+vm86_fuword(volatile const void *base)
+{
+
+ if (curthread->td_critnest != 0)
+ return (*(volatile const long *)base);
+ return (fuword(base));
+}
+
static __inline caddr_t
MAKE_ADDR(u_short sel, u_short off)
{
@@ -101,20 +150,20 @@
PUSH(u_short x, struct vm86frame *vmf)
{
vmf->vmf_sp -= 2;
- suword16(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp), x);
+ vm86_suword16(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp), x);
}
static __inline void
PUSHL(u_int x, struct vm86frame *vmf)
{
vmf->vmf_sp -= 4;
- suword(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp), x);
+ vm86_suword(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp), x);
}
static __inline u_short
POP(struct vm86frame *vmf)
{
- u_short x = fuword16(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp));
+ u_short x = vm86_fuword16(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp));
vmf->vmf_sp += 2;
return (x);
@@ -123,7 +172,7 @@
static __inline u_int
POPL(struct vm86frame *vmf)
{
- u_int x = fuword(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp));
+ u_int x = vm86_fuword(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp));
vmf->vmf_sp += 4;
return (x);
@@ -152,16 +201,16 @@
retcode = SIGTRAP;
addr = MAKE_ADDR(vmf->vmf_cs, vmf->vmf_ip);
- i_byte = fubyte(addr);
+ i_byte = vm86_fubyte(addr);
if (i_byte == ADDRESS_SIZE_PREFIX) {
- i_byte = fubyte(++addr);
+ i_byte = vm86_fubyte(++addr);
inc_ip++;
}
if (vm86->vm86_has_vme) {
switch (i_byte) {
case OPERAND_SIZE_PREFIX:
- i_byte = fubyte(++addr);
+ i_byte = vm86_fubyte(++addr);
inc_ip++;
switch (i_byte) {
case PUSHF:
@@ -241,7 +290,7 @@
switch (i_byte) {
case OPERAND_SIZE_PREFIX:
- i_byte = fubyte(++addr);
+ i_byte = vm86_fubyte(++addr);
inc_ip++;
switch (i_byte) {
case PUSHF:
@@ -293,7 +342,7 @@
return (retcode);
case INTn:
- i_byte = fubyte(addr + 1);
+ i_byte = vm86_fubyte(addr + 1);
if ((vm86->vm86_intmap[i_byte >> 3] & (1 << (i_byte & 7))) != 0)
break;
if (vm86->vm86_eflags & PSL_VIF)
@@ -303,7 +352,7 @@
PUSH((vmf->vmf_flags & PUSH_MASK) | PSL_IOPL, vmf);
PUSH(vmf->vmf_cs, vmf);
PUSH(vmf->vmf_ip + inc_ip + 1, vmf); /* increment IP */
- GET_VEC(fuword((caddr_t)(i_byte * 4)),
+ GET_VEC(vm86_fuword((caddr_t)(i_byte * 4)),
&vmf->vmf_cs, &vmf->vmf_ip);
vmf->vmf_flags &= ~PSL_T;
vm86->vm86_eflags &= ~PSL_VIF;
@@ -548,6 +597,7 @@
void
vm86_trap(struct vm86frame *vmf)
{
+ void (*p)(struct vm86frame *);
caddr_t addr;
/* "should not happen" */
@@ -560,21 +610,26 @@
else
vmf->vmf_trapno = vmf->vmf_trapno << 16;
- vm86_biosret(vmf);
+ p = (void (*)(struct vm86frame *))((uintptr_t)vm86_biosret +
+ setidt_disp);
+ p(vmf);
}
int
vm86_intcall(int intnum, struct vm86frame *vmf)
{
+ int (*p)(struct vm86frame *);
int retval;
if (intnum < 0 || intnum > 0xff)
return (EINVAL);
vmf->vmf_trapno = intnum;
+ p = (int (*)(struct vm86frame *))((uintptr_t)vm86_bioscall +
+ setidt_disp);
mtx_lock(&vm86_lock);
critical_enter();
- retval = vm86_bioscall(vmf);
+ retval = p(vmf);
critical_exit();
mtx_unlock(&vm86_lock);
return (retval);
@@ -589,10 +644,12 @@
int
vm86_datacall(int intnum, struct vm86frame *vmf, struct vm86context *vmc)
{
- pt_entry_t *pte = (pt_entry_t *)vm86paddr;
+ pt_entry_t *pte;
+ int (*p)(struct vm86frame *);
vm_paddr_t page;
int i, entry, retval;
+ pte = (pt_entry_t *)vm86paddr;
mtx_lock(&vm86_lock);
for (i = 0; i < vmc->npages; i++) {
page = vtophys(vmc->pmap[i].kva & PG_FRAME);
@@ -603,8 +660,10 @@
}
vmf->vmf_trapno = intnum;
+ p = (int (*)(struct vm86frame *))((uintptr_t)vm86_bioscall +
+ setidt_disp);
critical_enter();
- retval = vm86_bioscall(vmf);
+ retval = p(vmf);
critical_exit();
for (i = 0; i < vmc->npages; i++) {
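
vm86_trap(), vm86_intcall(), and vm86_datacall() above all call vm86_bioscall/vm86_biosret through a pointer offset by setidt_disp, since those entry points execute at their relocated (trampoline-resident) addresses rather than at their link-time symbols. The shared pattern, as a hedged helper (call_displaced() is illustrative, not part of the patch):

#include <stdint.h>

struct vm86frame;			/* opaque for this sketch */
extern uintptr_t setidt_disp;		/* displacement of the relocated entry code */

/* Invoke fn at its address displaced by setidt_disp. */
static int
call_displaced(int (*fn)(struct vm86frame *), struct vm86frame *vmf)
{
	int (*p)(struct vm86frame *);

	p = (int (*)(struct vm86frame *))((uintptr_t)fn + setidt_disp);
	return (p(vmf));
}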
Index: sys/i386/i386/vm86bios.s
===================================================================
--- sys/i386/i386/vm86bios.s
+++ sys/i386/i386/vm86bios.s
@@ -100,9 +100,8 @@
movl %cr3,%eax
pushl %eax /* save address space */
- movl IdlePTD,%ecx
+ movl IdlePTD,%ecx /* va (and pa) of Idle PTD */
movl %ecx,%ebx
- addl $KERNBASE,%ebx /* va of Idle PTD */
movl 0(%ebx),%eax
pushl %eax /* old ptde != 0 when booting */
pushl %ebx /* keep for reuse */
@@ -119,7 +118,8 @@
movl SCR_VMFRAME(%edx),%esp /* switch to new stack */
pushl %esp
- call vm86_prepcall /* finish setup */
+ movl $vm86_prepcall, %eax
+ call *%eax /* finish setup */
add $4, %esp
/*
Index: sys/i386/i386/vm_machdep.c
===================================================================
--- sys/i386/i386/vm_machdep.c
+++ sys/i386/i386/vm_machdep.c
@@ -204,9 +204,11 @@
* Create a new fresh stack for the new process.
* Copy the trap frame for the return to user mode as if from a
* syscall. This copies most of the user mode register values.
- * The -16 is so we can expand the trapframe if we go to vm86.
+ * The -VM86_STACK_SPACE (-16) is so we can expand the trapframe
+ * if we go to vm86.
*/
- td2->td_frame = (struct trapframe *)((caddr_t)td2->td_pcb - 16) - 1;
+ td2->td_frame = (struct trapframe *)((caddr_t)td2->td_pcb -
+ VM86_STACK_SPACE) - 1;
bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe));
td2->td_frame->tf_eax = 0; /* Child returns zero */
@@ -238,7 +240,7 @@
pcb2->pcb_ebp = 0;
pcb2->pcb_esp = (int)td2->td_frame - sizeof(void *);
pcb2->pcb_ebx = (int)td2; /* fork_trampoline argument */
- pcb2->pcb_eip = (int)fork_trampoline;
+ pcb2->pcb_eip = (int)fork_trampoline + setidt_disp;
/*-
* pcb2->pcb_dr*: cloned above.
* pcb2->pcb_savefpu: cloned above.
@@ -344,8 +346,7 @@
* XXX do we need to move the TSS off the allocated pages
* before freeing them? (not done here)
*/
- kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_ext,
- ctob(IOPAGES + 1));
+ pmap_trm_free(pcb->pcb_ext, ctob(IOPAGES + 1));
pcb->pcb_ext = NULL;
}
}
@@ -367,7 +368,8 @@
struct xstate_hdr *xhdr;
td->td_pcb = pcb = get_pcb_td(td);
- td->td_frame = (struct trapframe *)((caddr_t)pcb - 16) - 1;
+ td->td_frame = (struct trapframe *)((caddr_t)pcb -
+ VM86_STACK_SPACE) - 1;
pcb->pcb_ext = NULL;
pcb->pcb_save = get_pcb_user_save_pcb(pcb);
if (use_xsave) {
@@ -462,7 +464,7 @@
pcb2->pcb_ebp = 0;
pcb2->pcb_esp = (int)td->td_frame - sizeof(void *); /* trampoline arg */
pcb2->pcb_ebx = (int)td; /* trampoline arg */
- pcb2->pcb_eip = (int)fork_trampoline;
+ pcb2->pcb_eip = (int)fork_trampoline + setidt_disp;
pcb2->pcb_gs = rgs();
/*
* If we didn't copy the pcb, we'd need to do the following registers:
@@ -581,7 +583,7 @@
*/
ptep = vtopte(sf->kva);
opte = *ptep;
- *ptep = VM_PAGE_TO_PHYS(sf->m) | pgeflag | PG_RW | PG_V |
+ *ptep = VM_PAGE_TO_PHYS(sf->m) | PG_RW | PG_V |
pmap_cache_bits(sf->m->md.pat_mode, 0);
/*
Index: sys/i386/include/asmacros.h
===================================================================
--- sys/i386/include/asmacros.h
+++ sys/i386/include/asmacros.h
@@ -1,3 +1,4 @@
+/* -*- mode: asm -*- */
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
@@ -135,6 +136,10 @@
#endif /* GPROF */
#ifdef LOCORE
+
+#define GSEL_KPL 0x0020 /* GSEL(GCODE_SEL, SEL_KPL) */
+#define SEL_RPL_MASK 0x0003
+
/*
* Convenience macro for declaring interrupt entry points.
*/
@@ -144,16 +149,21 @@
/*
* Macros to create and destroy a trap frame.
*/
-#define PUSH_FRAME \
- pushl $0 ; /* dummy error code */ \
- pushl $0 ; /* dummy trap type */ \
- pushal ; /* 8 ints */ \
- pushl $0 ; /* save data and extra segments ... */ \
- movw %ds,(%esp) ; \
- pushl $0 ; \
- movw %es,(%esp) ; \
- pushl $0 ; \
+ .macro PUSH_FRAME2
+ pushal
+ pushl $0
+ movw %ds,(%esp)
+ pushl $0
+ movw %es,(%esp)
+ pushl $0
movw %fs,(%esp)
+ .endm
+
+ .macro PUSH_FRAME
+ pushl $0 /* dummy error code */
+ pushl $0 /* dummy trap type */
+ PUSH_FRAME2
+ .endm
/*
* Access per-CPU data.
@@ -167,12 +177,43 @@
/*
* Setup the kernel segment registers.
*/
-#define SET_KERNEL_SREGS \
- movl $KDSEL, %eax ; /* reload with kernel's data segment */ \
- movl %eax, %ds ; \
- movl %eax, %es ; \
- movl $KPSEL, %eax ; /* reload with per-CPU data segment */ \
+ .macro SET_KERNEL_SREGS
+ movl $KDSEL, %eax /* reload with kernel's data segment */
+ movl %eax, %ds
+ movl %eax, %es
+ movl $KPSEL, %eax /* reload with per-CPU data segment */
movl %eax, %fs
+ .endm
+
+ .macro NMOVE_STACKS
+ movl PCPU(KESP0), %edx
+ movl $TF_SZ, %ecx
+ testl $PSL_VM, TF_EFLAGS(%esp)
+ jz 1001f
+ addl $(4*4), %ecx
+1001: subl %ecx, %edx
+ movl %edx, %edi
+ movl %esp, %esi
+ rep; movsb
+ movl %edx, %esp
+ .endm
+
+ .macro MOVE_STACKS
+ call 1000f
+1000: popl %eax
+ movl (tramp_idleptd - 1000b)(%eax), %eax
+ movl %eax, %cr3
+ NMOVE_STACKS
+ .endm
+
+ .macro KENTER
+ testl $PSL_VM, TF_EFLAGS(%esp)
+ jnz 2f
+ testb $SEL_RPL_MASK, TF_CS(%esp)
+ jz 2f
+1: MOVE_STACKS
+2:
+ .endm
#endif /* LOCORE */
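
As written in the KENTER macro above, MOVE_STACKS runs only for entries from ring 3 that are not in vm86 mode; NMOVE_STACKS separately accounts for the four extra segment pushes a vm86 frame carries. A C rendering of just the predicate, for readability (kenter_moves_frame() is illustrative):

#include <stdbool.h>

#define PSL_VM		0x00020000	/* EFLAGS.VM, vm86 mode */
#define SEL_RPL_MASK	0x0003		/* selector RPL bits, as defined above */

/* Does KENTER copy this trap frame onto the thread kernel stack? */
static bool
kenter_moves_frame(unsigned int tf_eflags, unsigned int tf_cs)
{
	if (tf_eflags & PSL_VM)
		return (false);		/* vm86 entry: MOVE_STACKS is skipped here */
	if ((tf_cs & SEL_RPL_MASK) == 0)
		return (false);		/* came from ring 0: stay on current stack */
	return (true);			/* user entry: copy frame to kernel stack */
}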
Index: sys/i386/include/frame.h
===================================================================
--- sys/i386/include/frame.h
+++ sys/i386/include/frame.h
@@ -41,4 +41,8 @@
#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
+#define TRAMP_STACK_SZ 4096
+#define TRAMP_COPYOUT_SZ 128
+#define VM86_STACK_SPACE 16
+
#endif /* _I386_FRAME_H_ */
Index: sys/i386/include/md_var.h
===================================================================
--- sys/i386/include/md_var.h
+++ sys/i386/include/md_var.h
@@ -45,14 +45,18 @@
#endif
#ifdef COMPAT_43
extern int szosigcode;
+extern int sz_lcall_tramp;
#endif
extern uint32_t *vm_page_dump;
+extern vm_offset_t proc0kstack;
+extern uintptr_t setidt_disp;
struct segment_descriptor;
union savefpu;
void bcopyb(const void *from, void *to, size_t len);
void cpu_switch_load_gs(void) __asm(__STRING(cpu_switch_load_gs));
+void copyout_init_tramp(void);
void doreti_iret(void) __asm(__STRING(doreti_iret));
void doreti_iret_fault(void) __asm(__STRING(doreti_iret_fault));
void doreti_popl_ds(void) __asm(__STRING(doreti_popl_ds));
@@ -71,6 +75,7 @@
void set_fsbase(struct thread *td, uint32_t base);
void set_gsbase(struct thread *td, uint32_t base);
void setidt(int idx, alias_for_inthand_t *func, int typ, int dpl, int selec);
+void setidt_nodisp(int idx, uintptr_t func, int typ, int dpl, int selec);
union savefpu *get_pcb_user_save_td(struct thread *td);
union savefpu *get_pcb_user_save_pcb(struct pcb *pcb);
Index: sys/i386/include/param.h
===================================================================
--- sys/i386/include/param.h
+++ sys/i386/include/param.h
@@ -164,7 +164,6 @@
#define pgtok(x) ((x) * (PAGE_SIZE / 1024))
-#define INKERNEL(va) (((vm_offset_t)(va)) >= VM_MAXUSER_ADDRESS && \
- ((vm_offset_t)(va)) < VM_MAX_KERNEL_ADDRESS)
+#define INKERNEL(va) (TRUE)
#endif /* !_I386_INCLUDE_PARAM_H_ */
Index: sys/i386/include/pc/bios.h
===================================================================
--- sys/i386/include/pc/bios.h
+++ sys/i386/include/pc/bios.h
@@ -267,8 +267,8 @@
};
#ifdef _KERNEL
-#define BIOS_PADDRTOVADDR(x) ((x) + KERNBASE)
-#define BIOS_VADDRTOPADDR(x) ((x) - KERNBASE)
+#define BIOS_PADDRTOVADDR(x) ((x) + PMAP_MAP_LOW)
+#define BIOS_VADDRTOPADDR(x) ((x) - PMAP_MAP_LOW)
struct bios_oem_signature {
char * anchor; /* search anchor string in BIOS memory */
Index: sys/i386/include/pcpu.h
===================================================================
--- sys/i386/include/pcpu.h
+++ sys/i386/include/pcpu.h
@@ -42,21 +42,23 @@
#include <sys/_mutex.h>
/*
- * The SMP parts are setup in pmap.c and locore.s for the BSP, and
- * mp_machdep.c sets up the data for the AP's to "see" when they awake.
- * The reason for doing it via a struct is so that an array of pointers
- * to each CPU's data can be set up for things like "check curproc on all
- * other processors"
+ * The SMP parts are set up in pmap.c and machdep.c for the BSP, and
+ * pmap.c and mp_machdep.c set up the data for the APs to "see" when
+ * they awake. The reason for doing it via a struct is so that an
+ * array of pointers to each CPU's data can be set up for things like
+ * "check curproc on all other processors".
*/
#define PCPU_MD_FIELDS \
char pc_monitorbuf[128] __aligned(128); /* cache line */ \
struct pcpu *pc_prvspace; /* Self-reference */ \
struct pmap *pc_curpmap; \
- struct i386tss pc_common_tss; \
struct segment_descriptor pc_common_tssd; \
struct segment_descriptor *pc_tss_gdt; \
struct segment_descriptor *pc_fsgs_gdt; \
+ struct i386tss *pc_common_tssp; \
+ u_int pc_kesp0; \
+ u_int pc_trampstk; \
int pc_currentldt; \
u_int pc_acpi_id; /* ACPI CPU id */ \
u_int pc_apic_id; \
@@ -69,8 +71,13 @@
caddr_t pc_cmap_addr1; \
caddr_t pc_cmap_addr2; \
vm_offset_t pc_qmap_addr; /* KVA for temporary mappings */\
+ vm_offset_t pc_copyout_maddr; \
+ vm_offset_t pc_copyout_saddr; \
+ struct mtx pc_copyout_mlock; \
+ struct sx pc_copyout_slock; \
+ char *pc_copyout_buf; \
uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \
- char __pad[445]
+ char __pad[550]
#ifdef _KERNEL
Index: sys/i386/include/pmap.h
===================================================================
--- sys/i386/include/pmap.h
+++ sys/i386/include/pmap.h
@@ -112,12 +112,10 @@
* For PAE, the page table page unit size is 2MB. This means that 512 pages
* is 1 Gigabyte. Double everything. It must be a multiple of 8 for PAE.
*/
-#ifndef KVA_PAGES
#if defined(PAE) || defined(PAE_TABLES)
-#define KVA_PAGES 512
+#define KVA_PAGES (512*4)
#else
-#define KVA_PAGES 256
-#endif
+#define KVA_PAGES (256*4)
#endif
/*
@@ -150,12 +148,13 @@
/*
* The *PTDI values control the layout of virtual memory
- *
- * XXX This works for now, but I am not real happy with it, I'll fix it
- * right after I fix locore.s and the magic 28K hole
*/
-#define KPTDI (NPDEPTD-NKPDE) /* start of kernel virtual pde's */
-#define PTDPTDI (KPTDI-NPGPTD) /* ptd entry that points to ptd! */
+#define KPTDI 0 /* start of kernel virtual pde's */
+#define LOWPTDI 1 /* low memory map pde */
+#define KERNPTDI 2 /* start of kernel text pde */
+#define PTDPTDI (NPDEPTD - 1 - NPGPTD) /* ptd entry that points
+ to ptd! */
+#define TRPTDI (NPDEPTD - 1) /* u/k trampoline ptd */
/*
* XXX doesn't really belong here I guess...
@@ -311,6 +310,7 @@
table */
#endif
struct vm_radix pm_root; /* spare page table pages */
+ vm_page_t pm_ptdpg[NPGPTD];
};
typedef struct pmap *pmap_t;
@@ -396,6 +396,8 @@
void pmap_invalidate_cache_pages(vm_page_t *pages, int count);
void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva,
boolean_t force);
+void *pmap_trm_alloc(size_t size, int flags);
+void pmap_trm_free(void *addr, size_t size);
void invltlb_glob(void);
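
pmap_trm_alloc() and pmap_trm_free() manage allocations inside the shared kernel/user trampoline PDE described in vmparam.h, where objects such as the GDT, IDT, TSS, LDT and trampoline stacks must live so they stay mapped in both sets of page tables. A short usage sketch, assuming the flags argument accepts the usual malloc(9) M_* flags; the variable name is illustrative:

	/* Sketch: place a TSS in the trampoline region shared with user-mode PTDs. */
	struct i386tss *tssp;

	tssp = pmap_trm_alloc(sizeof(struct i386tss), M_WAITOK | M_ZERO);
	/* ... install the descriptor in the GDT, load it with ltr, use it ... */
	pmap_trm_free(tssp, sizeof(struct i386tss));
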
Index: sys/i386/include/segments.h
===================================================================
--- sys/i386/include/segments.h
+++ sys/i386/include/segments.h
@@ -84,11 +84,10 @@
#ifdef _KERNEL
extern int _default_ldt;
-extern union descriptor gdt[];
-extern union descriptor ldt[NLDT];
+extern union descriptor *gdt;
+extern union descriptor *ldt;
extern struct soft_segment_descriptor gdt_segs[];
extern struct gate_descriptor *idt;
-extern struct region_descriptor r_gdt, r_idt;
void lgdt(struct region_descriptor *rdp);
void sdtossd(struct segment_descriptor *sdp,
Index: sys/i386/include/vmparam.h
===================================================================
--- sys/i386/include/vmparam.h
+++ sys/i386/include/vmparam.h
@@ -136,7 +136,7 @@
* Kernel physical load address.
*/
#ifndef KERNLOAD
-#define KERNLOAD (1 << PDRSHIFT)
+#define KERNLOAD (KERNPTDI << PDRSHIFT)
#endif /* !defined(KERNLOAD) */
/*
@@ -146,23 +146,47 @@
* messy at times, but hey, we'll do anything to save a page :-)
*/
-#define VM_MAX_KERNEL_ADDRESS VADDR(KPTDI+NKPDE-1, NPTEPG-1)
+#define VM_MAX_KERNEL_ADDRESS VADDR(PTDPTDI, 0)
-#define VM_MIN_KERNEL_ADDRESS VADDR(PTDPTDI, PTDPTDI)
+#define VM_MIN_KERNEL_ADDRESS 0
-#define KERNBASE VADDR(KPTDI, 0)
+#define KERNBASE KERNLOAD
#define UPT_MAX_ADDRESS VADDR(PTDPTDI, PTDPTDI)
#define UPT_MIN_ADDRESS VADDR(PTDPTDI, 0)
-#define VM_MAXUSER_ADDRESS VADDR(PTDPTDI, 0)
+#define VM_MAXUSER_ADDRESS VADDR(TRPTDI, 0)
#define SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE)
#define USRSTACK SHAREDPAGE
-#define VM_MAX_ADDRESS VADDR(PTDPTDI, PTDPTDI)
+#define VM_MAX_ADDRESS VADDR(PTDPTDI, 0)
#define VM_MIN_ADDRESS ((vm_offset_t)0)
+#define PMAP_TRM_MIN_ADDRESS VM_MAXUSER_ADDRESS
+#define PMAP_TRM_MAX_ADDRESS 0xffffffff
+
+#define PMAP_MAP_LOW VADDR(LOWPTDI, 0)
+
+/*
+ * KVA layout. The unit of the system allocation is a single PDE, which
+ * represents NBPDR bytes, aligned to NBPDR. NBPDR is 4M for non-PAE
+ * page tables, and 2M for PAE. Addresses below are shown for non-PAE.
+ *
+ * 0x00000000 - 0x003fffff Transient identity map of low memory (0-4M),
+ * normally disabled to catch NULL derefs.
+ * 0x00400000 - 0x007fffff Fixed mapping of the low memory (0-4M).
+ * 0x00800000 - 0xffbfffff KERNBASE (VA) == KERNLOAD (PA), kernel
+ * text + data and all kernel maps. Managed
+ * by MI VM.
+ * 0xffc00000 - 0xffdfffff Recursive kernel page table mapping, pointed
+ * to by PTmap. PTD[] recursively points
+ * into PTmap.
+ * 0xffe00000 - 0xffffffff Kernel/User mode shared PDE, contains GDT,
+ * IDT, TSS, LDT, trampoline code and stacks.
+ * Managed by pmap_trm_alloc().
+ */
+
/*
* How many physical pages per kmem arena virtual page.
*/
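
As a cross-check of the layout comment above, the boundaries for the common non-PAE configuration (4M PDEs, PDRSHIFT == 22) follow directly from the new constants; a sketch of the arithmetic, assuming pmap.h's usual VADDR(pdi, pti) == ((pdi) << PDRSHIFT) | ((pti) << PAGE_SHIFT):

	/*
	 * Illustrative non-PAE arithmetic:
	 *   PMAP_MAP_LOW = VADDR(LOWPTDI, 0)    = 1 << 22 = 0x00400000
	 *   KERNLOAD     = KERNPTDI << PDRSHIFT = 2 << 22 = 0x00800000
	 *   KERNBASE     = KERNLOAD, i.e. the kernel text is mapped VA == PA.
	 */
	_Static_assert(KERNBASE == KERNLOAD, "kernel linked at its load address");
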
Index: sys/kern/imgact_aout.c
===================================================================
--- sys/kern/imgact_aout.c
+++ sys/kern/imgact_aout.c
@@ -67,7 +67,12 @@
static int exec_aout_imgact(struct image_params *imgp);
static int aout_fixup(register_t **stack_base, struct image_params *imgp);
+#define AOUT32_USRSTACK 0xbfc00000
+
#if defined(__i386__)
+
+#define AOUT32_PS_STRINGS (AOUT32_USRSTACK - sizeof(struct ps_strings))
+
struct sysentvec aout_sysvec = {
.sv_size = SYS_MAXSYSCALL,
.sv_table = sysent,
@@ -85,9 +90,9 @@
.sv_minsigstksz = MINSIGSTKSZ,
.sv_pagesize = PAGE_SIZE,
.sv_minuser = VM_MIN_ADDRESS,
- .sv_maxuser = VM_MAXUSER_ADDRESS,
- .sv_usrstack = USRSTACK,
- .sv_psstrings = PS_STRINGS,
+ .sv_maxuser = AOUT32_USRSTACK,
+ .sv_usrstack = AOUT32_USRSTACK,
+ .sv_psstrings = AOUT32_PS_STRINGS,
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_strings = exec_copyout_strings,
.sv_setregs = exec_setregs,
@@ -104,10 +109,9 @@
#elif defined(__amd64__)
-#define AOUT32_USRSTACK 0xbfc00000
#define AOUT32_PS_STRINGS \
(AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings))
-#define AOUT32_MINUSER FREEBSD32_MINUSER
+#define AOUT32_MINUSER FREEBSD32_MINUSER
extern const char *freebsd32_syscallnames[];
extern u_long ia32_maxssiz;
Index: sys/kern/subr_witness.c
===================================================================
--- sys/kern/subr_witness.c
+++ sys/kern/subr_witness.c
@@ -480,7 +480,9 @@
static unsigned int w_generation = 0;
static const char w_notrunning[] = "Witness not running\n";
static const char w_stillcold[] = "Witness is still cold\n";
-
+#ifdef __i386__
+static const char w_notallowed[] = "The sysctl is disabled on the arch\n";
+#endif
static struct witness_order_list_entry order_lists[] = {
/*
@@ -2779,6 +2781,11 @@
struct sbuf *sb;
int error;
+#ifdef __i386__
+ error = SYSCTL_OUT(req, w_notallowed, sizeof(w_notallowed));
+ return (error);
+#endif
+
if (witness_watch < 1) {
error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
return (error);
Index: sys/x86/acpica/acpi_wakeup.c
===================================================================
--- sys/x86/acpica/acpi_wakeup.c
+++ sys/x86/acpica/acpi_wakeup.c
@@ -141,8 +141,13 @@
}
#define WARMBOOT_TARGET 0
+#ifdef __amd64__
#define WARMBOOT_OFF (KERNBASE + 0x0467)
#define WARMBOOT_SEG (KERNBASE + 0x0469)
+#else /* __i386__ */
+#define WARMBOOT_OFF (PMAP_MAP_LOW + 0x0467)
+#define WARMBOOT_SEG (PMAP_MAP_LOW + 0x0469)
+#endif
#define CMOS_REG (0x70)
#define CMOS_DATA (0x71)
@@ -186,7 +191,7 @@
* cpususpend_handler() and we will release them soon. Then each
* will invalidate its TLB.
*/
- kernel_pmap->pm_pdir[0] = 0;
+ PTD[KPTDI] = 0;
invltlb_glob();
#endif
@@ -256,7 +261,7 @@
* be careful to use the kernel map (PTD[0] is for curthread
* which may be a user thread in deprecated APIs).
*/
- kernel_pmap->pm_pdir[0] = PTD[KPTDI];
+ PTD[KPTDI] = PTD[LOWPTDI];
#endif
/* Call ACPICA to enter the desired sleep state */
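
On i386 the BIOS warm-boot vector at physical 0x467/0x469 is now reached through PMAP_MAP_LOW because KERNBASE no longer maps physical zero; amd64 keeps its direct KERNBASE offset. A hedged sketch of the conventional warm-boot setup these macros serve; trampoline_pa is a placeholder for the physical address of a real-mode resume trampoline, and the CMOS indices 0x0f/0x0a (shutdown status byte, "warm boot via 40:67") are standard PC values, not code from this file:

	/* Sketch: point the BIOS warm-boot vector at the resume trampoline. */
	*(volatile uint16_t *)WARMBOOT_OFF = WARMBOOT_TARGET;	  /* offset 0 */
	*(volatile uint16_t *)WARMBOOT_SEG = trampoline_pa >> 4; /* segment */
	outb(CMOS_REG, 0x0f);	/* select the shutdown status byte */
	outb(CMOS_DATA, 0x0a);	/* "jump via 40:67 without EOI" on reset */
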
Index: sys/x86/x86/local_apic.c
===================================================================
--- sys/x86/x86/local_apic.c
+++ sys/x86/x86/local_apic.c
@@ -78,11 +78,9 @@
#ifdef __amd64__
#define SDT_APIC SDT_SYSIGT
-#define SDT_APICT SDT_SYSIGT
#define GSEL_APIC 0
#else
#define SDT_APIC SDT_SYS386IGT
-#define SDT_APICT SDT_SYS386TGT
#define GSEL_APIC GSEL(GCODE_SEL, SEL_KPL)
#endif
@@ -517,7 +515,7 @@
/* Local APIC CMCI. */
setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint),
- SDT_APICT, SEL_KPL, GSEL_APIC);
+ SDT_APIC, SEL_KPL, GSEL_APIC);
if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
arat = 0;
@@ -1605,7 +1603,7 @@
* We can not currently clear the idt entry because other cpus
* may have a valid vector at this offset.
*/
- setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+ setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APIC,
SEL_KPL, GSEL_APIC);
#endif
}
@@ -2146,7 +2144,7 @@
KASSERT(func != (uintptr_t)&IDTVEC(rsvd) &&
func != (uintptr_t)&IDTVEC(rsvd_pti),
("invalid idtfunc %#lx", func));
- setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+ setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APIC,
SEL_KPL, GSEL_APIC);
mtx_unlock_spin(&icu_lock);
}
Index: sys/x86/x86/mp_x86.c
===================================================================
--- sys/x86/x86/mp_x86.c
+++ sys/x86/x86/mp_x86.c
@@ -1686,8 +1686,10 @@
generation = smp_tlb_generation;
if (smp_tlb_pmap == kernel_pmap)
invltlb_glob();
+#ifdef __amd64__
else
invltlb();
+#endif
PCPU_SET(smp_tlb_done, generation);
}
@@ -1704,7 +1706,10 @@
#endif /* COUNT_IPIS */
generation = smp_tlb_generation; /* Overlap with serialization */
- invlpg(smp_tlb_addr1);
+#ifdef __i386__
+ if (smp_tlb_pmap == kernel_pmap)
+#endif
+ invlpg(smp_tlb_addr1);
PCPU_SET(smp_tlb_done, generation);
}
@@ -1724,10 +1729,13 @@
addr = smp_tlb_addr1;
addr2 = smp_tlb_addr2;
generation = smp_tlb_generation; /* Overlap with serialization */
- do {
- invlpg(addr);
- addr += PAGE_SIZE;
- } while (addr < addr2);
+#ifdef __i386__
+ if (smp_tlb_pmap == kernel_pmap)
+#endif
+ do {
+ invlpg(addr);
+ addr += PAGE_SIZE;
+ } while (addr < addr2);
PCPU_SET(smp_tlb_done, generation);
}
Index: sys/x86/x86/mptable.c
===================================================================
--- sys/x86/x86/mptable.c
+++ sys/x86/x86/mptable.c
@@ -221,8 +221,13 @@
search_for_sig(u_int32_t target, int count)
{
int x;
- u_int32_t *addr = (u_int32_t *) (KERNBASE + target);
+ u_int32_t *addr;
+#ifdef __amd64__
+ addr = (u_int32_t *) (KERNBASE + target);
+#else /* __i386__ */
+ addr = (u_int32_t *) (PMAP_MAP_LOW + target);
+#endif
for (x = 0; x < count; x += 4)
if (addr[x] == MP_SIG)
/* make array index a byte index */
@@ -253,7 +258,13 @@
u_int32_t target;
/* see if EBDA exists */
- if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) {
+ if ((segment = (u_long) * (u_short *) (
+#ifdef __amd64__
+ KERNBASE
+#else /* __i386__ */
+ PMAP_MAP_LOW
+#endif
+ + 0x40e)) != 0) {
/* search first 1K of EBDA */
target = (u_int32_t) (segment << 4);
if ((x = search_for_sig(target, 1024 / 4)) >= 0)
