Commit 5aa90a84 authored by Linus Torvalds

Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 page table isolation updates from Thomas Gleixner:
 "This is the final set of enabling page table isolation on x86:

   - Infrastructure patches for handling the extra page tables.

   - Patches which map the various bits and pieces required to get in
     and out of user space into the user space visible page tables.

   - The required changes to have CR3 switching in the entry/exit code.

   - Optimizations for the CR3 switching, along with documentation of
     how the ASID/PCID mechanism works.

   - Updates to dump_pagetables to cover the user space page tables for
     W+X scans, and extra debugfs files to analyze both the kernel and
     the user space visible page tables.

  The whole functionality is compile-time controlled via a config switch
  and can be turned on/off on the command line as well"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  x86/ldt: Make the LDT mapping RO
  x86/mm/dump_pagetables: Allow dumping current pagetables
  x86/mm/dump_pagetables: Check user space page table for WX pages
  x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
  x86/mm/pti: Add Kconfig
  x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
  x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
  x86/mm: Use INVPCID for __native_flush_tlb_single()
  x86/mm: Optimize RESTORE_CR3
  x86/mm: Use/Fix PCID to optimize user/kernel switches
  x86/mm: Abstract switching CR3
  x86/mm: Allow flushing for future ASID switches
  x86/pti: Map the vsyscall page if needed
  x86/pti: Put the LDT in its own PGD if PTI is on
  x86/mm/64: Make a full PGD-entry size hole in the memory map
  x86/events/intel/ds: Map debug buffers in cpu_entry_area
  x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
  x86/mm/pti: Map ESPFIX into user space
  x86/mm/pti: Share entry text PMD
  x86/entry: Align entry text section to PMD boundary
  ...
parents 61233580 9f5cb6b3
@@ -2708,6 +2708,8 @@
steal time is computed, but won't influence scheduler
behaviour
nopti [X86-64] Disable kernel page table isolation
nolapic [X86-32,APIC] Do not enable or use the local APIC.
nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
@@ -3282,6 +3284,12 @@
pt. [PARIDE]
See Documentation/blockdev/paride.txt.
pti= [X86_64]
Control user/kernel address space isolation:
on - enable
off - disable
auto - default setting
pty.legacy_count=
[KNL] Number of legacy pty's. Overwrites compiled-in
default number.
......
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
@@ -29,8 +30,8 @@ Virtual memory map with 5 level page tables:
hole caused by [56:63] sign extension
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
ff90000000000000 - ff91ffffffffffff (=49 bits) hole
ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
... unused hole ...
......
@@ -23,6 +23,9 @@
*/
#undef CONFIG_AMD_MEM_ENCRYPT
/* No PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_PAGE_TABLE_ISOLATION
#include "misc.h"
/* These actually do the work of building the kernel identity maps. */
......
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/jump_label.h>
#include <asm/unwind_hints.h>
#include <asm/cpufeatures.h>
#include <asm/page_types.h>
#include <asm/percpu.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
/*
@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
* halves:
*/
#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
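/*
 * Editor's sketch, not part of the patch: the CR3 arithmetic performed
 * by ADJUST_KERNEL_CR3 and the user-side switch below, as C pseudocode.
 * Bit 12 (PAGE_SHIFT) picks the user half of the 8k PGD and bit 11
 * (X86_CR3_PTI_SWITCH_BIT in this series) picks the user PCID:
 *
 *	unsigned long kernel_cr3(unsigned long cr3)
 *	{
 *		return cr3 & ~PTI_SWITCH_MASK;	// andq $(~PTI_SWITCH_MASK)
 *	}
 *
 *	unsigned long user_cr3(unsigned long cr3)
 *	{
 *		return cr3 | PTI_SWITCH_MASK;	// orq $(PTI_SWITCH_MASK)
 *	}
 */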
.macro SET_NOFLUSH_BIT reg:req
bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
.endm
.macro ADJUST_KERNEL_CR3 reg:req
ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
andq $(~PTI_SWITCH_MASK), \reg
.endm
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
mov %cr3, \scratch_reg
ADJUST_KERNEL_CR3 \scratch_reg
mov \scratch_reg, %cr3
.Lend_\@:
.endm
#define THIS_CPU_user_pcid_flush_mask \
PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
mov %cr3, \scratch_reg
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
/*
* Test if the ASID needs a flush.
*/
movq \scratch_reg, \scratch_reg2
andq $(0x7FF), \scratch_reg /* mask ASID */
bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
jnc .Lnoflush_\@
/* Flush needed, clear the bit */
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
movq \scratch_reg2, \scratch_reg
jmp .Lwrcr3_\@
.Lnoflush_\@:
movq \scratch_reg2, \scratch_reg
SET_NOFLUSH_BIT \scratch_reg
.Lwrcr3_\@:
/* Flip the PGD and ASID to the user version */
orq $(PTI_SWITCH_MASK), \scratch_reg
mov \scratch_reg, %cr3
.Lend_\@:
.endm
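/*
 * Editor's sketch of the deferred-flush logic above, in C pseudocode
 * (user_pcid_flush_mask is the per-CPU bitmap addressed via
 * THIS_CPU_user_pcid_flush_mask, one bit per ASID):
 *
 *	asid = cr3 & 0x7ff;
 *	if (test_and_clear_bit(asid, user_pcid_flush_mask)) {
 *		// flush queued: write CR3 without NOFLUSH so the
 *		// implicit flush of this ASID happens now
 *	} else {
 *		cr3 |= 1UL << X86_CR3_PCID_NOFLUSH_BIT;	// keep the TLB
 *	}
 *	write_cr3(user_cr3(cr3));
 */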
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
pushq %rax
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
popq %rax
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
movq %cr3, \scratch_reg
movq \scratch_reg, \save_reg
/*
* Is the "switch mask" all zero? That means that both of
* these are zero:
*
* 1. The user/kernel PCID bit, and
* 2. The user/kernel "bit" that points CR3 to the
* bottom half of the 8k PGD
*
* That indicates a kernel CR3 value, not a user CR3.
*/
testq $(PTI_SWITCH_MASK), \scratch_reg
jz .Ldone_\@
ADJUST_KERNEL_CR3 \scratch_reg
movq \scratch_reg, %cr3
.Ldone_\@:
.endm
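/*
 * Sketch in C pseudocode: save the live CR3 and switch only when it is
 * a user CR3, i.e. when any PTI_SWITCH_MASK bit is set:
 *
 *	*save = cr3 = __read_cr3();
 *	if (cr3 & PTI_SWITCH_MASK)
 *		write_cr3(kernel_cr3(cr3));
 */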
.macro RESTORE_CR3 scratch_reg:req save_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
/*
* KERNEL pages can always resume with NOFLUSH as we do
* explicit flushes.
*/
bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
jnc .Lnoflush_\@
/*
* Check if there's a pending flush for the user ASID we're
* about to set.
*/
movq \save_reg, \scratch_reg
andq $(0x7FF), \scratch_reg
bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
jnc .Lnoflush_\@
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
jmp .Lwrcr3_\@
.Lnoflush_\@:
SET_NOFLUSH_BIT \save_reg
.Lwrcr3_\@:
/*
* The CR3 write could be avoided when not changing its value,
* but would require a CR3 read *and* a scratch register.
*/
movq \save_reg, %cr3
.Lend_\@:
.endm
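/*
 * Sketch in C pseudocode: a saved kernel CR3 always resumes with
 * NOFLUSH; a saved user CR3 first consults the pending-flush bitmap
 * for its ASID:
 *
 *	if (!(save & (1UL << X86_CR3_PTI_SWITCH_BIT)) ||
 *	    !test_and_clear_bit(save & 0x7ff, user_pcid_flush_mask))
 *		save |= 1UL << X86_CR3_PCID_NOFLUSH_BIT;
 *	write_cr3(save);
 */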
#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
.endm
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
.endm
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
.endm
.macro RESTORE_CR3 scratch_reg:req save_reg:req
.endm
#endif
#endif /* CONFIG_X86_64 */
/*
......
@@ -23,7 +23,6 @@
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
@@ -40,6 +39,8 @@
#include <asm/frame.h>
#include <linux/err.h>
#include "calling.h"
.code64
.section .entry.text, "ax"
@@ -168,6 +169,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
/* Stash the user RSP. */
movq %rsp, RSP_SCRATCH
/* Note: using %rsp as a scratch reg. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
/* Load the top of the task stack into RSP */
movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
@@ -207,6 +211,10 @@ ENTRY(entry_SYSCALL_64)
*/
swapgs
/*
* This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
* is not required to switch CR3.
*/
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -403,6 +411,7 @@ syscall_return_via_sysret:
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
popq %rsp
@@ -740,6 +749,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
* We can do future final exit work right here.
*/
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
/* Restore RDI. */
popq %rdi
SWAPGS
@@ -822,7 +833,9 @@ native_irq_return_ldt:
*/
pushq %rdi /* Stash user RDI */
SWAPGS
SWAPGS /* to kernel GS */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* user RAX */
movq (1*8)(%rsp), %rax /* user RIP */
@@ -838,7 +851,6 @@ native_irq_return_ldt:
/* Now RAX == RSP. */
andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
popq %rdi /* Restore user RDI */
/*
* espfix_stack[31:16] == 0. The page tables are set up such that
@@ -849,7 +861,11 @@ native_irq_return_ldt:
* still points to an RO alias of the ESPFIX stack.
*/
orq PER_CPU_VAR(espfix_stack), %rax
SWAPGS
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
SWAPGS /* to user GS */
popq %rdi /* Restore user RDI */
movq %rax, %rsp
UNWIND_HINT_IRET_REGS offset=8
@@ -949,6 +965,8 @@ ENTRY(switch_to_thread_stack)
UNWIND_HINT_FUNC
pushq %rdi
/* Need to switch before accessing the thread stack. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
@@ -1250,7 +1268,11 @@ ENTRY(paranoid_entry)
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
1: ret
1:
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
ret
END(paranoid_entry)
/*
@@ -1272,6 +1294,7 @@ ENTRY(paranoid_exit)
testl %ebx, %ebx /* swapgs needed? */
jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
SWAPGS_UNSAFE_STACK
jmp .Lparanoid_exit_restore
.Lparanoid_exit_no_swapgs:
@@ -1299,6 +1322,8 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
/* We have user CR3. Change to kernel CR3. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
.Lerror_entry_from_usermode_after_swapgs:
/* Put us onto the real thread stack. */
@@ -1345,6 +1370,7 @@ ENTRY(error_entry)
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
jmp .Lerror_entry_done
.Lbstep_iret:
@@ -1354,10 +1380,11 @@ ENTRY(error_entry)
.Lerror_bad_iret:
/*
* We came from an IRET to user mode, so we have user gsbase.
* Switch to kernel gsbase:
* We came from an IRET to user mode, so we have user
* gsbase and CR3. Switch to kernel gsbase and CR3:
*/
SWAPGS
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1389,6 +1416,10 @@ END(error_exit)
/*
* Runs on exception stack. Xen PV does not go through this path at all,
* so we can use real assembly here.
*
* Registers:
* %r14: Used to save/restore the CR3 of the interrupted context
* when PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
ENTRY(nmi)
UNWIND_HINT_IRET_REGS
@@ -1452,6 +1483,7 @@ ENTRY(nmi)
swapgs
cld
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1704,6 +1736,8 @@ end_repeat_nmi:
movq $-1, %rsi
call do_nmi
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
......
@@ -49,6 +49,10 @@
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS
/* We are about to clobber %rsp anyway, clobbering here is OK */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@@ -215,6 +219,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
pushq $0 /* pt_regs->r14 = 0 */
pushq $0 /* pt_regs->r15 = 0 */
/*
* We just saved %rdi so it is safe to clobber. It is not
* preserved during the C calls inside TRACE_IRQS_OFF anyway.
*/
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
/*
* User mode is traced as though IRQs are on, and SYSENTER
* turned them off.
@@ -256,10 +266,22 @@ sysret32_from_system_call:
* when the system call started, which is already known to user
* code. We zero R8-R10 to avoid info leaks.
*/
movq RSP-ORIG_RAX(%rsp), %rsp
/*
* The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
* on the process stack which is not mapped to userspace and
* not readable after we SWITCH_TO_USER_CR3. Delay the CR3
* switch until after the last reference to the process
* stack.
*
* %r8/%r9 are zeroed before the sysret, thus safe to clobber.
*/
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
xorq %r8, %r8
xorq %r9, %r9
xorq %r10, %r10
movq RSP-ORIG_RAX(%rsp), %rsp
swapgs
sysretl
END(entry_SYSCALL_compat)
......
@@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
* vsyscalls but leave the page not present. If so, we skip calling
* this.
*/
static void __init set_vsyscall_pgtable_user_bits(void)
void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset_k(VSYSCALL_ADDR);
pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
p4d = p4d_offset(pgd, VSYSCALL_ADDR);
#if CONFIG_PGTABLE_LEVELS >= 5
@@ -373,7 +373,7 @@ void __init map_vsyscall(void)
vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR);
set_vsyscall_pgtable_user_bits();
set_vsyscall_pgtable_user_bits(swapper_pg_dir);
}
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
......
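The pgd_offset_pgd() helper used above is plain index arithmetic into
whichever PGD root it is handed (swapper_pg_dir here). A minimal sketch
of the idea, assuming 4-level paging (PGDIR_SHIFT == 39,
PTRS_PER_PGD == 512):

	static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
	{
		return pgd + ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1));
	}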
@@ -3,16 +3,18 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <asm/cpu_entry_area.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include "../perf_event.h"
/* Waste a full page so it can be mapped into the cpu_entry_area */
DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_FIXUP_SIZE PAGE_SIZE
/*
@@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
static DEFINE_PER_CPU(void *, insn_buffer);
static int alloc_pebs_buffer(int cpu)
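/*
 * Map the physical pages backing @addr into the cpu_entry_area alias
 * @cea, one 4k PTE at a time.
 */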
static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
phys_addr_t pa;
size_t msz = 0;
pa = virt_to_phys(addr);
for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
cea_set_pte(cea, pa, prot);
}
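/* Unmap a cpu_entry_area alias again by pointing its PTEs at nothing. */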
static void ds_clear_cea(void *cea, size_t size)
{
size_t msz = 0;
for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
cea_set_pte(cea, 0, PAGE_NONE);
}
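/* Allocate zeroed pages for a DS buffer, local to @cpu's node. */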
static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
{
unsigned int order = get_order(size);
int node = cpu_to_node(cpu);
int max;
void *buffer, *ibuffer;
struct page *page;
page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
return page ? page_address(page) : NULL;
}
static void dsfree_pages(const void *buffer, size_t size)
{
if (buffer)
free_pages((unsigned long)buffer, get_order(size));
}
static int alloc_pebs_buffer(int cpu)
{
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
size_t bsiz = x86_pmu.pebs_buffer_size;
int max, node = cpu_to_node(cpu);
void *buffer, *ibuffer, *cea;
if (!x86_pmu.pebs)
return 0;
buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
if (unlikely(!buffer))
return -ENOMEM;
@@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
if (x86_pmu.intel_cap.pebs_format < 2) {
ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
if (!ibuffer) {
kfree(buffer);
dsfree_pages(buffer, bsiz);
return -ENOMEM;
}
per_cpu(insn_buffer, cpu) = ibuffer;
}
max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
ds->pebs_buffer_base = (u64)(unsigned long)buffer;
hwev->ds_pebs_vaddr = buffer;
/* Update the cpu entry area mapping */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
ds->pebs_buffer_base = (unsigned long) cea;
ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
ds->pebs_index = ds->pebs_buffer_base;
ds->pebs_absolute_maximum = ds->pebs_buffer_base +
max * x86_pmu.pebs_record_size;
max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
return 0;
}
static void release_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
void *cea;
if (!ds || !x86_pmu.pebs)
return;
@@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
kfree(per_cpu(insn_buffer, cpu));
per_cpu(insn_buffer, cpu) = NULL;
kfree((void *)(unsigned long)ds->pebs_buffer_base);
/* Clear the fixmap */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
ds->pebs_buffer_base = 0;
dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
hwev->ds_pebs_vaddr = NULL;
}
static int alloc_bts_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
int node = cpu_to_node(cpu);
int max, thresh;
void *buffer;
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
void *buffer, *cea;
int max;
if (!x86_pmu.bts)
return 0;
buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
if (unlikely(!buffer)) {
WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
}
max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
thresh = max / 16;
ds->bts_buffer_base = (u64)(unsigned long)buffer;
hwev->ds_bts_vaddr = buffer;
/* Update the fixmap */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
ds->bts_buffer_base = (unsigned long) cea;
ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
ds->bts_index = ds->bts_buffer_base;
ds->bts_absolute_maximum = ds->bts_buffer_base +
max * BTS_RECORD_SIZE;
ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
thresh * BTS_RECORD_SIZE;
max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
ds->bts_absolute_maximum = ds->bts_buffer_base + max;
ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
return 0;
}
static void release_bts_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
void *cea;
if (!ds || !x86_pmu.bts)
return;
kfree((void *)(unsigned long)ds->bts_buffer_base);
/* Clear the fixmap */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
ds_clear_cea(cea, BTS_BUFFER_SIZE);
ds->bts_buffer_base = 0;
dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
hwev->ds_bts_vaddr = NULL;
}
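/*
 * The debug_store struct itself now lives in the cpu_entry_area, so it
 * only needs to be zeroed and hooked up here, not allocated.
 */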
static int alloc_ds_buffer(int cpu)
{
int node = cpu_to_node(cpu);
struct debug_store *ds;
ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
if (unlikely(!ds))
return -ENOMEM;
struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
memset(ds, 0, sizeof(*ds));
per_cpu(cpu_hw_events, cpu).ds = ds;
return 0;
}
static void release_ds_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
if (!ds)
return;
per_cpu(cpu_hw_events, cpu).ds = NULL;
kfree(ds);
}
void release_ds_buffers(void)
......
@@ -14,6 +14,8 @@
#include <linux/perf_event.h>
#include <asm/intel_ds.h>
/* To enable MSR tracing please use the generic trace points. */
/*
@@ -77,8 +79,6 @@ struct amd_nb {
struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS 8
#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
/*
@@ -95,23 +95,6 @@ struct amd_nb {
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
/*
* A debug store configuration.
*
* We only support architectures that use 64bit fields.
*/
struct debug_store {
u64 bts_buffer_base;
u64 bts_index;
u64 bts_absolute_maximum;
u64 bts_interrupt_threshold;
u64 pebs_buffer_base;
u64 pebs_index;
u64 pebs_absolute_maximum;
u64 pebs_interrupt_threshold;
u64 pebs_event_reset[MAX_PEBS_EVENTS];
};
#define PEBS_REGS \
(PERF_REG_X86_AX | \
PERF_REG_X86_BX | \
@@ -216,6 +199,8 @@ struct cpu_hw_events {
* Intel DebugStore bits
*/
struct debug_store *ds;
void *ds_pebs_vaddr;
void *ds_bts_vaddr;
u64 pebs_enabled;
int n_pebs;
int n_large_pebs;
......