Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F101981652
D46397.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
27 KB
Referenced Files
None
Subscribers
None
D46397.diff
View Options
diff --git a/sys/amd64/pt/pt.h b/sys/amd64/pt/pt.h
new file mode 100644
--- /dev/null
+++ b/sys/amd64/pt/pt.h
@@ -0,0 +1,57 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _AMD64_PT_PT_H_
+#define _AMD64_PT_PT_H_
+
+#include <sys/types.h>
+
+#include <x86/include/specialreg.h>
+
+#define PT_IP_FILTER_MAX_RANGES (2) /* Intel SDM Vol. 3C, 33-29 */
+
+struct pt_cpu_config {
+ uint64_t rtit_ctl;
+ register_t cr3_filter;
+ int nranges;
+ struct ipf_range {
+ vm_offset_t start;
+ vm_offset_t end;
+ } ip_ranges[PT_IP_FILTER_MAX_RANGES];
+ uint32_t mtc_freq;
+ uint32_t cyc_thresh;
+ uint32_t psb_freq;
+};
+
+#ifdef _KERNEL
+
+#define PT_SUPPORTED_FLAGS \
+ (RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT | \
+ RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
+
+#endif /* _KERNEL */
+#endif /* !_AMD64_PT_PT_H_ */
diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c
new file mode 100644
--- /dev/null
+++ b/sys/amd64/pt/pt.c
@@ -0,0 +1,941 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Intel Processor Trace (PT) backend
+ *
+ * Driver Design Overview
+ *
+ * - Since PT is configured on a per-core basis, the driver uses
+ * 'smp_rendezvous' to start and disable tracing on each target core.
+ * - PT-specific resources are stored in a 'struct pt_ctx' context structure for
+ * each traced CPU core or thread. Upon initialization, a ToPA configuration
+ * is generated for each 'pt_ctx' structure using the HWT tracing buffers.
+ * The HWT tracing buffer is split into 4K ToPA entries. Currently, each
+ * 4K ToPA entry is configured to trigger an interrupt after it is filled.
+ * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
+ * relevant PT registers. Every time a traced thread is switched
+ * out or in, its state will be saved to or loaded from its corresponding
+ * 'pt_ctx' context.
+ * - When tracing starts, the PT hardware will start writing data into the
+ * tracing buffer. When a TOPA_INT entry is filled, it will trigger an
+ * interrupt before continuing. The interrupt handler will then fetch the
+ * last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
+ * The driver is currently configured to use the NMI interrupt line.
+ * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
+ * and uses the offsets to decode data from the tracing buffer.
+ *
+ * Future improvements and limitations
+ *
+ * - We currently configure the PT hardware to trigger an interrupt whenever
+ * a 4K ToPA entry is filled. While this is fine when tracing smaller
+ * functions or infrequent code paths, this will generate too much interrupt
+ * traffic when tracing hotter functions. A proper solution for this issue
+ * should estimate the amount of data generated by the current configuration
+ * and use it to determine interrupt frequency.
+ *
+ * - Support for more tracing options and PT features.
+ *
+ */
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/event.h>
+#include <sys/hwt.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/lock.h>
+#include <sys/errno.h>
+#include <sys/taskqueue.h>
+#include <sys/domainset.h>
+#include <sys/sleepqueue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/cpufunc.h>
+#include <machine/param.h>
+#include <machine/atomic.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+
+#include <x86/apicvar.h>
+#include <x86/x86_var.h>
+
+#include <dev/hwt/hwt_vm.h>
+#include <dev/hwt/hwt_thread.h>
+#include <dev/hwt/hwt_cpu.h>
+#include <dev/hwt/hwt_config.h>
+#include <dev/hwt/hwt_context.h>
+#include <dev/hwt/hwt_backend.h>
+#include <dev/hwt/hwt_hook.h>
+#include <dev/hwt/hwt_intr.h>
+#include <dev/hwt/hwt_record.h>
+
+#include <amd64/pt/pt.h>
+
+#ifdef PT_DEBUG
+#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define dprintf(fmt, ...)
+#endif
+
+#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
+#define PT_MAX_IP_RANGES 2
+
+MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");
+
+SDT_PROVIDER_DEFINE(pt);
+SDT_PROBE_DEFINE(pt, , , topa__intr);
+
+TASKQUEUE_FAST_DEFINE_THREAD(pt);
+
+static void pt_send_buffer_record(void *arg, int pending __unused);
+static int pt_topa_intr(struct trapframe *tf);
+
+struct pt_buffer {
+ uint64_t *topa_hw; /* ToPA table entries. */
+ size_t size;
+ struct mtx lock; /* Lock for fields below. */
+ vm_offset_t offset;
+ uint64_t wrap_count;
+ int curpage;
+};
+
+struct pt_ctx {
+ struct pt_buffer buf; /* ToPA buffer metadata */
+ struct task task; /* ToPA buffer notification task */
+ struct hwt_context *hwt_ctx;
+ struct pt_save_area save_area; /* PT XSAVE area */
+ int id;
+};
+/* PT tracing contexts used for CPU mode. */
+static struct pt_ctx *pt_pcpu_ctx;
+
+enum pt_cpu_state {
+ PT_DISABLED = 0,
+ PT_STOPPED,
+ PT_ACTIVE
+};
+
+static struct pt_cpu {
+ struct pt_ctx *ctx; /* active PT tracing context */
+ enum pt_cpu_state state; /* used as part of trace stop protocol */
+} *pt_pcpu;
+
+/*
+ * PT-related CPUID bits.
+ */
+static struct pt_cpu_info {
+ uint32_t l0_eax;
+ uint32_t l0_ebx;
+ uint32_t l0_ecx;
+ uint32_t l1_eax;
+ uint32_t l1_ebx;
+} pt_info;
+
+static bool initialized = false;
+
+static __inline enum pt_cpu_state
+pt_cpu_get_state(int cpu_id)
+{
+ return (atomic_load_int(&pt_pcpu[cpu_id].state));
+}
+
+static __inline void
+pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
+{
+ atomic_store_int(&pt_pcpu[cpu_id].state, state);
+}
+
+/*
+ * Updates current trace buffer offset from the
+ * ToPA MSRs. Records if the trace buffer wrapped.
+ */
+static __inline void
+pt_update_buffer(struct pt_buffer *buf)
+{
+ uint64_t reg;
+ int curpage;
+
+ /* Update buffer offset. */
+ reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
+ curpage = (reg & 0xffffff80) >> 7;
+ mtx_lock_spin(&buf->lock);
+ /* Check if the output wrapped. */
+ if (buf->curpage > curpage)
+ buf->wrap_count++;
+ buf->curpage = curpage;
+ buf->offset = reg >> 32;
+ mtx_unlock_spin(&buf->lock);
+
+ dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
+ buf->wrap_count, buf->curpage, buf->offset);
+}
+
+static __inline void
+pt_fill_buffer_record(int id, struct pt_buffer *buf,
+ struct hwt_record_entry *rec)
+{
+ rec->record_type = HWT_RECORD_BUFFER;
+ rec->buf_id = id;
+ rec->curpage = buf->curpage;
+ rec->offset = buf->offset + (buf->wrap_count * buf->size);
+}
+
+/*
+ * Enables or disables tracing on curcpu
+ * using the XSAVE/XRSTOR PT extensions.
+ */
+static void
+pt_cpu_toggle_local(struct pt_save_area *save_area, bool enable)
+{
+ u_long xcr0, cr0;
+ u_long xss;
+
+ cr0 = rcr0();
+ if (cr0 & CR0_TS)
+ clts();
+ xcr0 = rxcr(XCR0);
+ if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+ load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
+ xss = rdmsr(MSR_IA32_XSS);
+ wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);
+
+ if (!enable) {
+ KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
+ ("%s: PT is disabled", __func__));
+ xsaves((uint8_t *)save_area, XFEATURE_ENABLED_PT);
+ } else {
+ KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
+ ("%s: PT is enabled", __func__));
+ xrstors((uint8_t *)save_area, XFEATURE_ENABLED_PT);
+ }
+ wrmsr(MSR_IA32_XSS, xss);
+ if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
+ load_xcr(XCR0, xcr0);
+ if (cr0 & CR0_TS)
+ load_cr0(cr0);
+}
+
+/*
+ * Starts PT tracing on 'curcpu'.
+ */
+static void
+pt_cpu_start(void *dummy)
+{
+ struct pt_cpu *cpu = &pt_pcpu[curcpu];
+
+ MPASS(cpu->ctx != NULL);
+
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+ load_cr4(rcr4() | CR4_XSAVE);
+ wrmsr(MSR_IA32_RTIT_STATUS, 0);
+ pt_cpu_set_state(curcpu, PT_ACTIVE);
+ pt_cpu_toggle_local(&cpu->ctx->save_area, true);
+}
+
+/*
+ * Stops PT tracing on 'curcpu'.
+ * Updates trace buffer offset to ensure
+ * any data generated between the last interrupt
+ * and the trace stop gets picked up by userspace.
+ */
+static void
+pt_cpu_stop(void *dummy)
+{
+ struct pt_cpu *cpu;
+ struct pt_ctx *ctx;
+
+ /* Shutdown may occur before PT gets properly configured. */
+ if (pt_cpu_get_state(curcpu) == PT_DISABLED)
+ return;
+
+ cpu = &pt_pcpu[curcpu];
+ ctx = cpu->ctx;
+ MPASS(ctx != NULL);
+ dprintf("%s: curcpu %d\n", __func__, curcpu);
+
+ pt_cpu_set_state(curcpu, PT_STOPPED);
+ pt_cpu_toggle_local(&cpu->ctx->save_area, false);
+ pt_update_buffer(&ctx->buf);
+}
+
+/*
+ * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
+ * The HWT trace buffer is split into 4K ToPA table entries and used
+ * as a circular buffer, meaning that the last ToPA entry points to
+ * the first ToPA entry. Each entry is configured to raise an
+ * interrupt after being filled.
+ */
+static int
+pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
+{
+ struct pt_buffer *buf;
+ size_t topa_size;
+ int i;
+
+ topa_size = TOPA_SIZE_4K;
+ buf = &ctx->buf;
+
+ KASSERT(buf->topa_hw == NULL,
+ ("%s: ToPA info already exists", __func__));
+ buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
+ M_ZERO | M_WAITOK);
+ if (buf->topa_hw == NULL)
+ return (ENOMEM);
+
+ dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
+ buf->size = vm->npages * PAGE_SIZE;
+ for (i = 0; i < vm->npages; i++) {
+ buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
+ /*
+ * XXX: TOPA_INT should ideally be set according to
+ * expected amount of incoming trace data. Too few TOPA_INT
+ * entries will not trigger interrupts often enough when tracing
+ * smaller functions.
+ */
+ buf->topa_hw[i] |= TOPA_INT;
+ }
+ buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;
+
+ return (0);
+}
+
+/*
+ * Configures IP filtering for trace generation.
+ * A maximum of 2 ranges can be specified due to
+ * limitations imposed by the XSAVE/XRSTOR PT extensions.
+ */
+static int
+pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
+{
+ struct pt_ext_area *pt_ext;
+ struct pt_save_area *save_area;
+ int nranges_supp, n, error = 0;
+
+ save_area = &ctx->save_area;
+ pt_ext = &save_area->pt_ext_area;
+
+ if (pt_info.l0_ebx & CPUPT_IPF) {
+ nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
+ CPUPT_NADDR_S;
+
+ if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
+ nranges_supp = PT_IP_FILTER_MAX_RANGES;
+ n = cfg->nranges;
+ if (n > nranges_supp) {
+ printf("%s: %d IP filtering ranges requested, CPU "
+ "supports %d, truncating\n",
+ __func__, n, nranges_supp);
+ n = nranges_supp;
+ }
+
+ switch (n) {
+ case 2:
+ pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
+ pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
+ pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
+ case 1:
+ pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
+ pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
+ pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
+ break;
+ default:
+ error = (EINVAL);
+ break;
+ };
+ } else
+ error = (ENXIO);
+
+ return (error);
+}
+
+static int
+pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
+{
+
+ dprintf("%s: ctx id %d\n", __func__, ctx_id);
+
+ KASSERT(pt_ctx->buf.topa_hw == NULL,
+ ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));
+
+ memset(pt_ctx, 0, sizeof(struct pt_ctx));
+ mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
+ dprintf("%s: preparing ToPA buffer\n", __func__);
+ if (pt_topa_prepare(pt_ctx, vm) != 0) {
+ dprintf("%s: failed to prepare ToPA buffer\n", __func__);
+ return (ENOMEM);
+ }
+
+ pt_ctx->id = ctx_id;
+ TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);
+
+ return (0);
+}
+
+static void
+pt_deinit_ctx(struct pt_ctx *pt_ctx)
+{
+
+ if (pt_ctx->buf.topa_hw != NULL)
+ free(pt_ctx->buf.topa_hw, M_PT);
+ memset(pt_ctx, 0, sizeof(*pt_ctx));
+ pt_ctx->buf.topa_hw = NULL;
+}
+
+/*
+ * HWT backend configuration method.
+ *
+ * Checks and translates the user-defined configuration to a
+ * set of PT tracing features. Uses the feature set to initialize
+ * the tracing context for the target CPU or thread.
+ */
+static int
+pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
+{
+ struct hwt_cpu *hwt_cpu;
+ struct hwt_thread *thr;
+ struct pt_ctx *pt_ctx;
+ struct pt_cpu_config *cfg;
+ struct pt_ext_area *pt_ext;
+ struct xstate_hdr *hdr;
+ int error;
+
+ dprintf("%s\n", __func__);
+
+ cfg = (struct pt_cpu_config *)ctx->config;
+ pt_ctx = NULL;
+
+ /* Clear any flags we don't support yet. */
+ cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
+ if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
+ if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
+ printf("%s: CPU does not support generating MTC "
+ "packets\n", __func__);
+ return (ENXIO);
+ }
+ }
+
+ if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
+ if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
+ printf("%s: CPU does not support CR3 filtering\n",
+ __func__);
+ return (ENXIO);
+ }
+ }
+
+ if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
+ if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
+ printf("%s: CPU does not support TNT\n", __func__);
+ return (ENXIO);
+ }
+ }
+ /* TODO: support for more config bits. */
+
+ if (ctx->mode == HWT_MODE_CPU) {
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ if (hwt_cpu->cpu_id != cpu_id)
+ continue;
+ pt_ctx = &pt_pcpu_ctx[cpu_id];
+ break;
+ }
+ } else {
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ if (thr->thread_id != thread_id)
+ continue;
+ KASSERT(thr->private != NULL,
+ ("%s: hwt thread private"
+ " not set, thr %p",
+ __func__, thr));
+ pt_ctx = (struct pt_ctx *)thr->private;
+ break;
+ }
+ }
+ if (pt_ctx == NULL)
+ return (ENOENT);
+
+ dprintf("%s: preparing MSRs\n", __func__);
+ pt_ext = &pt_ctx->save_area.pt_ext_area;
+ hdr = &pt_ctx->save_area.header;
+
+ pt_ext->rtit_ctl |= cfg->rtit_ctl;
+ if (cfg->nranges != 0) {
+ dprintf("%s: preparing IPF ranges\n", __func__);
+ if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
+ return (error);
+ }
+ pt_ctx->hwt_ctx = ctx;
+ pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
+ pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
+ pt_ext->rtit_output_mask_ptrs = 0x7f;
+ hdr->xstate_bv = XFEATURE_ENABLED_PT;
+ hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
+ (1ULL << 63);
+ pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
+ pt_pcpu[cpu_id].ctx = pt_ctx;
+ pt_cpu_set_state(cpu_id, PT_STOPPED);
+
+ return (0);
+}
+
+/*
+ * hwt backend trace start operation. CPU affine.
+ */
+static void
+pt_backend_enable(struct hwt_context *ctx, int cpu_id)
+{
+
+ KASSERT(curcpu == cpu_id,
+ ("%s: attempting to start PT on another cpu", __func__));
+ pt_cpu_start(NULL);
+ CPU_SET(cpu_id, &ctx->cpu_map);
+}
+
+/*
+ * hwt backend trace stop operation. CPU affine.
+ */
+static void
+pt_backend_disable(struct hwt_context *ctx, int cpu_id)
+{
+ struct pt_cpu *cpu;
+
+ KASSERT(curcpu == cpu_id,
+ ("%s: attempting to disable PT on another cpu", __func__));
+ pt_cpu_stop(NULL);
+ CPU_CLR(cpu_id, &ctx->cpu_map);
+ cpu = &pt_pcpu[cpu_id];
+ cpu->ctx = NULL;
+}
+
+/*
+ * hwt backend trace start operation for remote CPUs.
+ */
+static void
+pt_backend_enable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+ KASSERT(ctx->mode == HWT_MODE_CPU,
+ ("%s: should only be used for CPU mode", __func__));
+ smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);
+}
+
+/*
+ * hwt backend trace stop operation for remote CPUs.
+ */
+static void
+pt_backend_disable_smp(struct hwt_context *ctx)
+{
+
+ dprintf("%s\n", __func__);
+ if (CPU_EMPTY(&ctx->cpu_map)) {
+ dprintf("%s: empty cpu map\n", __func__);
+ return;
+ }
+ smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);
+}
+
+/*
+ * HWT backend initialization method.
+ *
+ * Installs the ToPA interrupt handler and initializes
+ * the tracing contexts used for HWT_MODE_CPU.
+ */
+static int
+pt_backend_init(struct hwt_context *ctx)
+{
+ struct hwt_cpu *hwt_cpu;
+ int error;
+
+ dprintf("%s\n", __func__);
+ if (ctx->mode == HWT_MODE_CPU) {
+ TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
+ error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
+ hwt_cpu->vm, hwt_cpu->cpu_id);
+ if (error)
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * HWT backend teardown method.
+ *
+ * Removes the ToPA interrupt handler, stops tracing on all active CPUs,
+ * and releases all previously allocated ToPA metadata.
+ */
+static void
+pt_backend_deinit(struct hwt_context *ctx)
+{
+ struct pt_ctx *pt_ctx;
+ struct hwt_thread *thr;
+ int cpu_id;
+
+ dprintf("%s\n", __func__);
+
+ pt_backend_disable_smp(ctx);
+ if (ctx->mode == HWT_MODE_THREAD) {
+ TAILQ_FOREACH(thr, &ctx->threads, next) {
+ KASSERT(thr->private != NULL,
+ ("%s: thr->private not set", __func__));
+ pt_ctx = (struct pt_ctx *)thr->private;
+ pt_deinit_ctx(pt_ctx);
+ }
+ } else {
+ CPU_FOREACH(cpu_id) {
+ if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
+ continue;
+ if (pt_pcpu[cpu_id].ctx != NULL) {
+ KASSERT(pt_pcpu[cpu_id].ctx ==
+ &pt_pcpu_ctx[cpu_id],
+ ("%s: CPU mode tracing with non-cpu mode PT"
+ "context active",
+ __func__));
+ pt_pcpu[cpu_id].ctx = NULL;
+ }
+ pt_ctx = &pt_pcpu_ctx[cpu_id];
+ pt_deinit_ctx(pt_ctx);
+ memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
+ }
+ }
+}
+
+/*
+ * Fetches current offset into the tracing buffer.
+ */
+static int
+pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
+ uint64_t *data)
+{
+ struct pt_buffer *buf;
+
+ if (vm->ctx->mode == HWT_MODE_THREAD)
+ buf = &((struct pt_ctx *)vm->thr->private)->buf;
+ else
+ buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
+ mtx_lock_spin(&buf->lock);
+ *curpage = buf->curpage;
+ *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
+ mtx_unlock_spin(&buf->lock);
+
+ return (0);
+}
+
+/*
+ * HWT thread creation hook.
+ * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
+ */
+static int
+pt_backend_alloc_thread(struct hwt_thread *thr)
+{
+ struct pt_ctx *pt_ctx;
+ int error;
+
+ /* Omit M_WAITOK since this might get invoked a non-sleepable context */
+ pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
+ if (pt_ctx == NULL)
+ return (ENOMEM);
+
+ error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
+ if (error)
+ return (error);
+
+ thr->private = pt_ctx;
+ return (0);
+}
+/*
+ * HWT thread teardown hook.
+ */
+static void
+pt_backend_free_thread(struct hwt_thread *thr)
+{
+ struct pt_ctx *ctx;
+
+ ctx = (struct pt_ctx *)thr->private;
+
+ pt_deinit_ctx(ctx);
+ free(ctx, M_PT);
+}
+
+static void
+pt_backend_dump(int cpu_id)
+{
+}
+
+static struct hwt_backend_ops pt_ops = {
+ .hwt_backend_init = pt_backend_init,
+ .hwt_backend_deinit = pt_backend_deinit,
+
+ .hwt_backend_configure = pt_backend_configure,
+
+ .hwt_backend_enable = pt_backend_enable,
+ .hwt_backend_disable = pt_backend_disable,
+
+#ifdef SMP
+ .hwt_backend_enable_smp = pt_backend_enable_smp,
+ .hwt_backend_disable_smp = pt_backend_disable_smp,
+#endif
+
+ .hwt_backend_read = pt_backend_read,
+ .hwt_backend_dump = pt_backend_dump,
+
+ .hwt_backend_thread_alloc = pt_backend_alloc_thread,
+ .hwt_backend_thread_free = pt_backend_free_thread,
+};
+
+static struct hwt_backend backend = {
+ .ops = &pt_ops,
+ .name = "pt",
+ .kva_req = 1,
+};
+
+/*
+ * Reads the latest valid trace buffer offset and enqueues
+ * a HWT_RECORD_BUFFER record.
+ * Used as a taskqueue routine from the ToPA interrupt handler.
+ */
+static void
+pt_send_buffer_record(void *arg, int pending __unused)
+{
+ struct hwt_record_entry record;
+ struct pt_ctx *ctx = (struct pt_ctx *)arg;
+
+ /* Prepare buffer record. */
+ mtx_lock_spin(&ctx->buf.lock);
+ pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
+ mtx_unlock_spin(&ctx->buf.lock);
+ hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
+}
+static void
+pt_topa_status_clear(void)
+{
+ uint64_t reg;
+
+ reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
+ reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+ reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
+ wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
+}
+
+/*
+ * ToPA PMI handler.
+ *
+ * Invoked every time a ToPA entry marked with TOPA_INT is filled.
+ * Uses taskqueue to enqueue a buffer record for userspace.
+ * Re-enables the PC interrupt line as long as tracing is active.
+ */
+static int
+pt_topa_intr(struct trapframe *tf)
+{
+ struct pt_buffer *buf;
+ struct pt_ctx *ctx;
+ uint64_t reg;
+
+ SDT_PROBE0(pt, , , topa__intr);
+
+ if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
+ return (0);
+ }
+ reg = rdmsr(MSR_IA_GLOBAL_STATUS);
+ if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
+ /* ACK spurious or leftover interrupt. */
+ pt_topa_status_clear();
+ return (1);
+ }
+
+ ctx = pt_pcpu[curcpu].ctx;
+ buf = &ctx->buf;
+ KASSERT(buf->topa_hw != NULL,
+ ("%s: ToPA PMI interrupt with invalid buffer", __func__));
+
+ pt_cpu_toggle_local(&ctx->save_area, false);
+ pt_update_buffer(buf);
+ pt_topa_status_clear();
+ taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
+ TASKQUEUE_FAIL_IF_PENDING);
+
+ if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
+ pt_cpu_toggle_local(&ctx->save_area, true);
+ lapic_reenable_pcint();
+ }
+ return (1);
+}
+
+/*
+ * Module initialization.
+ *
+ * Saves all PT-related cpuid info, registers itself as a HWT backend,
+ * and allocates metadata required to keep track of tracing operations
+ * on each CPU.
+ */
+static int
+pt_init(void)
+{
+ u_int cp[4];
+ int error;
+
+ dprintf("pt: Enumerating part 1\n");
+ cpuid_count(CPUID_PT_LEAF, 0, cp);
+ dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
+ dprintf("pt: ebx %x\n", cp[1]);
+ dprintf("pt: ecx %x\n", cp[2]);
+
+ pt_info.l0_eax = cp[0];
+ pt_info.l0_ebx = cp[1];
+ pt_info.l0_ecx = cp[2];
+
+ dprintf("pt: Enumerating part 2\n");
+ cpuid_count(CPUID_PT_LEAF, 1, cp);
+ dprintf("pt: eax %x\n", cp[0]);
+ dprintf("pt: ebx %x\n", cp[1]);
+
+ pt_info.l1_eax = cp[0];
+ pt_info.l1_ebx = cp[1];
+
+ error = hwt_backend_register(&backend);
+ if (error != 0) {
+ printf("pt: unable to register hwt backend, error %d\n", error);
+ return (error);
+ }
+ pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
+ M_ZERO | M_WAITOK);
+ pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
+ M_ZERO | M_WAITOK);
+
+ nmi_register_handler(pt_topa_intr);
+ if (!lapic_enable_pcint()) {
+ printf("pt: failed to setup interrupt line\n");
+ return (error);
+ }
+ initialized = true;
+
+ return (0);
+}
+
+/*
+ * Checks whether the CPU support Intel PT.
+ *
+ * The driver relies on XSAVE/XRSTOR PT extensions,
+ * Table of Physical Addresses (ToPA) support, and
+ * support for multiple ToPA entries.
+ */
+static bool
+pt_supported(void)
+{
+ u_int cp[4];
+
+ /* Intel SDM Vol. 3C, 33-30 */
+ if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
+ printf("pt: CPU does not support Intel Processor Trace\n");
+ return (false);
+ }
+
+ /* Require XSAVE support. */
+ if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
+ printf("pt: XSAVE is not supported\n");
+ return (false);
+ }
+
+ cpuid_count(0xd, 0x0, cp);
+ if ((cp[0] & PT_XSAVE_MASK) != PT_XSAVE_MASK) {
+ printf("pt: CPU does not support X87 or SSE: %x", cp[0]);
+ return (false);
+ }
+
+ cpuid_count(0xd, 0x1, cp);
+ if ((cp[0] & (1 << 0)) == 0) {
+ printf("pt: XSAVE compaction is not supported\n");
+ return (false);
+ }
+ if ((cp[0] & (1 << 3)) == 0) {
+ printf("pt: XSAVES/XRSTORS are not supported\n");
+ return (false);
+ }
+
+ /* Require ToPA support. */
+ cpuid_count(PT_CPUID, 0, cp);
+ if ((cp[2] & CPUPT_TOPA) == 0) {
+ printf("pt: ToPA is not supported\n");
+ return (false);
+ }
+ if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
+ printf("pt: multiple ToPA outputs are not supported\n");
+ return (false);
+ }
+
+ return (true);
+}
+
+static void
+pt_deinit(void)
+{
+ if (!initialized)
+ return;
+ nmi_remove_handler(pt_topa_intr);
+ lapic_disable_pcint();
+ hwt_backend_unregister(&backend);
+ free(pt_pcpu, M_PT);
+ free(pt_pcpu_ctx, M_PT);
+ pt_pcpu = NULL;
+ initialized = false;
+}
+
+static int
+pt_modevent(module_t mod, int type, void *data)
+{
+ switch (type) {
+ case MOD_LOAD:
+ if (!pt_supported() || pt_init() != 0) {
+ return (ENXIO);
+ }
+ break;
+ case MOD_UNLOAD:
+ pt_deinit();
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };
+
+DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
+MODULE_VERSION(intel_pt, 1);
diff --git a/sys/modules/pt/Makefile b/sys/modules/pt/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/pt/Makefile
@@ -0,0 +1,8 @@
+
+.PATH: ${SRCTOP}/sys/amd64/pt
+
+KMOD= pt
+SRCS= pt.c pt.h device_if.h bus_if.h
+SRCS+= opt_hwpmc_hooks.h opt_kstack_pages.h
+
+.include <bsd.kmod.mk>
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Nov 7, 3:31 AM (21 h, 35 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14503976
Default Alt Text
D46397.diff (27 KB)
Attached To
Mode
D46397: amd64: Intel Processor Trace support
Attached
Detach File
Event Timeline
Log In to Comment