Commit d22fff81 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://

Pull x86 mm updates from Ingo Molnar:

 - Extend the memmap= boot parameter syntax to allow the redeclaration
   and dropping of existing ranges, and to support all e820 range types
   (Jan H. Schönherr)

 - Improve the W+X boot time security checks to remove false positive
   warnings on Xen (Jan Beulich)

 - Support booting as Xen PVH guest (Juergen Gross)

 - Improved 5-level paging (LA57) support, in particular it's possible
   now to have a single kernel image for both 4-level and 5-level
   hardware (Kirill A. Shutemov)

 - AMD hardware RAM encryption support (SME/SEV) fixes (Tom Lendacky)

 - Preparatory commits for hardware-encrypted RAM support on Intel CPUs.
   (Kirill A. Shutemov)

 - Improved Intel-MID support (Andy Shevchenko)

 - Show EFI page tables in page_tables debug files (Andy Lutomirski)

 - ... plus misc fixes and smaller cleanups

* 'x86-mm-for-linus' of git:// (56 commits)
  x86/cpu/tme: Fix spelling: "configuation" -> "configuration"
  x86/boot: Fix SEV boot failure from change to __PHYSICAL_MASK_SHIFT
  x86/mm: Update comment in detect_tme() regarding x86_phys_bits
  x86/mm/32: Remove unused node_memmap_size_bytes() & CONFIG_NEED_NODE_MEMMAP_SIZE logic
  x86/mm: Remove pointless checks in vmalloc_fault
  x86/platform/intel-mid: Add special handling for ACPI HW reduced platforms
  ACPI, x86/boot: Introduce the ->reduced_hw_early_init() ACPI callback
  ACPI, x86/boot: Split out acpi_generic_reduce_hw_init() and export
  x86/pconfig: Provide defines and helper to run MKTME_KEY_PROG leaf
  x86/pconfig: Detect PCONFIG targets
  x86/tme: Detect if TME and MKTME is activated by BIOS
  x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G
  x86/boot/compressed/64: Use page table in trampoline memory
  x86/boot/compressed/64: Use stack from trampoline memory
  x86/boot/compressed/64: Make sure we have a 32-bit code segment
  x86/mm: Do not use paravirtualized calls in native_set_p4d()
  kdump, vmcoreinfo: Export pgtable_l5_enabled value
  x86/boot/compressed/64: Prepare new top-level page table for trampoline
  x86/boot/compressed/64: Set up trampoline memory
  x86/boot/compressed/64: Save and restore trampoline memory
parents 986b37c0 eaeb8e76
......@@ -2248,6 +2248,15 @@
The memory region may be marked as e820 type 12 (0xc)
and is NVDIMM or ADR memory.
[KNL,ACPI] Convert memory within the specified region
from <oldtype> to <newtype>. If "-<oldtype>" is left
out, the whole region will be marked as <newtype>,
even if previously unavailable. If "+<newtype>" is left
out, matching memory will be removed. Types are
specified as e820 types, e.g., 1 = RAM, 2 = reserved,
3 = ACPI, 12 = PRAM.
memory_corruption_check=0/1 [X86]
Some BIOSes seem to corrupt the first 64k of
memory when doing things like suspend/resume.
......@@ -20,12 +20,9 @@ Documentation/x86/x86_64/mm.txt
CONFIG_X86_5LEVEL=y enables the feature.
So far, a kernel compiled with the option enabled will be able to boot
only on machines that support the feature -- see for 'la57' flag in
The plan is to implement boot-time switching between 4- and 5-level paging
in the future.
A kernel with CONFIG_X86_5LEVEL=y is still able to boot on 4-level hardware.
In this case the additional page table level -- p4d -- will be folded at
== User-space and large virtual address space ==
......@@ -1461,6 +1461,8 @@ config X86_PAE
config X86_5LEVEL
bool "Enable 5-level page tables support"
depends on X86_64
5-level paging enables access to larger address space:
......@@ -1469,8 +1471,8 @@ config X86_5LEVEL
It will be supported by future Intel CPUs.
Note: a kernel with this option enabled can only be booted
on machines that support the feature.
A kernel with the option enabled can be booted on machines that
support 4- or 5-level paging.
See Documentation/x86/x86_64/5level-paging.txt for more
......@@ -1595,10 +1597,6 @@ config ARCH_HAVE_MEMORY_PRESENT
def_bool y
depends on X86_32 && DISCONTIGMEM
def_bool y
depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
def_bool y
depends on X86_32 && !NUMA
......@@ -2174,10 +2172,17 @@ config PHYSICAL_ALIGN
Don't change this unless you know what you are doing.
This option makes base addresses of vmalloc and vmemmap as well as
__PAGE_OFFSET movable during boot.
bool "Randomize the kernel memory sections"
depends on X86_64
Randomizes the base virtual address of kernel memory sections
......@@ -78,7 +78,7 @@ vmlinux-objs-y := $(obj)/ $(obj)/head_$(BITS).o $(obj)/misc.o \
vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
ifdef CONFIG_X86_64
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o
vmlinux-objs-y += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o
......@@ -33,6 +33,7 @@
#include <asm/processor-flags.h>
#include <asm/asm-offsets.h>
#include <asm/bootparam.h>
#include "pgtable.h"
* Locally defined symbols should be marked hidden:
......@@ -304,55 +305,77 @@ ENTRY(startup_64)
/* Set up the stack */
leaq boot_stack_end(%rbx), %rsp
#ifdef CONFIG_X86_5LEVEL
* Check if we need to enable 5-level paging.
* RSI holds real mode data and needs to be preserved across
* a function call.
* At this point we are in long mode with 4-level paging enabled,
* but we might want to enable 5-level paging or vice versa.
* The problem is that we cannot do it directly. Setting or clearing
* CR4.LA57 in long mode would trigger #GP. So we need to switch off
* long mode and paging first.
* We also need a trampoline in lower memory to switch over from
* 4- to 5-level paging for cases when the bootloader puts the kernel
* above 4G, but didn't enable 5-level paging for us.
* The same trampoline can be used to switch from 5- to 4-level paging
* mode, like when starting 4-level paging kernel via kexec() when
* original kernel worked in 5-level paging mode.
* For the trampoline, we need the top page table to reside in lower
* memory as we don't have a way to load 64-bit values into CR3 in
* 32-bit mode.
* We go through the trampoline even if we don't have to: if we're
* already in a desired paging mode. This way the trampoline code gets
* tested on every boot.
pushq %rsi
call l5_paging_required
popq %rsi
/* If l5_paging_required() returned zero, we're done here. */
cmpq $0, %rax
je lvl5
/* Make sure we have GDT with 32-bit code segment */
leaq gdt(%rip), %rax
movq %rax, gdt64+2(%rip)
lgdt gdt64(%rip)
* At this point we are in long mode with 4-level paging enabled,
* but we want to enable 5-level paging.
* paging_prepare() sets up the trampoline and checks if we need to
* enable 5-level paging.
* The problem is that we cannot do it directly. Setting LA57 in
* long mode would trigger #GP. So we need to switch off long mode
* first.
* Address of the trampoline is returned in RAX.
* Non zero RDX on return means we need to enable 5-level paging.
* NOTE: This is not going to work if bootloader put us above 4G
* limit.
* The first step is to go into compatibility mode.
* RSI holds real mode data and needs to be preserved across
* this function call.
pushq %rsi
call paging_prepare
popq %rsi
/* Clear additional page table */
leaq lvl5_pgtable(%rbx), %rdi
xorq %rax, %rax
movq $(PAGE_SIZE/8), %rcx
rep stosq
/* Save the trampoline address in RCX */
movq %rax, %rcx
* Setup current CR3 as the first and only entry in a new top level
* page table.
* Load the address of trampoline_return() into RDI.
* It will be used by the trampoline to return to the main code.
movq %cr3, %rdi
leaq 0x7 (%rdi), %rax
movq %rax, lvl5_pgtable(%rbx)
leaq trampoline_return(%rip), %rdi
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq $__KERNEL32_CS
leaq compatible_mode(%rip), %rax
pushq %rax
/* Restore the stack, the 32-bit trampoline uses its own stack */
leaq boot_stack_end(%rbx), %rsp
* cleanup_trampoline() would restore trampoline memory.
* RSI holds real mode data and needs to be preserved across
* this function call.
pushq %rsi
call cleanup_trampoline
popq %rsi
/* Zero EFLAGS */
pushq $0
......@@ -490,46 +513,82 @@ relocated:
jmp *%rax
#ifdef CONFIG_X86_5LEVEL
/* Setup data and stack segments */
* This is the 32-bit trampoline that will be copied over to low memory.
* RDI contains the return address (might be above 4G).
* ECX contains the base address of the trampoline memory.
* Non zero RDX on return means we need to enable 5-level paging.
/* Set up data and stack segments */
movl $__KERNEL_DS, %eax
movl %eax, %ds
movl %eax, %ss
/* Set up new stack */
leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp
/* Disable paging */
movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
/* Point CR3 to 5-level paging */
leal lvl5_pgtable(%ebx), %eax
movl %eax, %cr3
/* Check what paging mode we want to be in after the trampoline */
cmpl $0, %edx
jz 1f
/* Enable PAE and LA57 mode */
/* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */
movl %cr4, %eax
testl $X86_CR4_LA57, %eax
jnz 3f
jmp 2f
/* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */
movl %cr4, %eax
orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
testl $X86_CR4_LA57, %eax
jz 3f
/* Point CR3 to the trampoline's new top level page table */
movl %eax, %cr3
/* Enable PAE and LA57 (if required) paging modes */
movl $X86_CR4_PAE, %eax
cmpl $0, %edx
jz 1f
orl $X86_CR4_LA57, %eax
movl %eax, %cr4
/* Calculate address we are running at */
call 1f
1: popl %edi
subl $1b, %edi
/* Calculate address of paging_enabled() once we are executing in the trampoline */
leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax
/* Prepare stack for far return to Long Mode */
/* Prepare the stack for far return to Long Mode */
pushl $__KERNEL_CS
leal lvl5(%edi), %eax
push %eax
pushl %eax
/* Enable paging back */
/* Enable paging again */
movl $(X86_CR0_PG | X86_CR0_PE), %eax
movl %eax, %cr0
/* Return from the trampoline */
jmp *%rdi
* The trampoline code has a size limit.
* Make sure we fail to compile if the trampoline code grows
* beyond TRAMPOLINE_32BIT_CODE_SIZE bytes.
.org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
/* This isn't an x86-64 CPU so hang */
/* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
jmp 1b
......@@ -537,6 +596,11 @@ no_longmode:
#include "../../kernel/verify_cpu.S"
.word gdt_end - gdt
.long 0
.word 0
.quad 0
.word gdt_end - gdt
.long gdt
......@@ -585,7 +649,3 @@ boot_stack_end:
.balign 4096
.fill BOOT_PGT_SIZE, 1, 0
#ifdef CONFIG_X86_5LEVEL
.fill PAGE_SIZE, 1, 0
......@@ -46,6 +46,12 @@
#define STATIC
#include <linux/decompress/mm.h>
#ifdef CONFIG_X86_5LEVEL
unsigned int pgtable_l5_enabled __ro_after_init;
unsigned int pgdir_shift __ro_after_init = 39;
unsigned int ptrs_per_p4d __ro_after_init = 1;
extern unsigned long get_cmd_line_ptr(void);
/* Simplified build-specific string for starting entropy. */
......@@ -723,6 +729,14 @@ void choose_random_location(unsigned long input,
#ifdef CONFIG_X86_5LEVEL
if (__read_cr4() & X86_CR4_LA57) {
pgtable_l5_enabled = 1;
pgdir_shift = 48;
ptrs_per_p4d = 512;
boot_params->hdr.loadflags |= KASLR_FLAG;
/* Prepare to add new identity pagetables on demand. */
......@@ -16,13 +16,6 @@
#define __pa(x) ((unsigned long)(x))
#define __va(x) ((void *)((unsigned long)(x)))
* The pgtable.h and mm/ident_map.c includes make use of the SME related
* information which is not used in the compressed image support. Un-define
* the SME support to avoid any compile and link errors.
/* No PAGE_TABLE_ISOLATION support needed either: */
......@@ -85,13 +78,14 @@ static struct x86_mapping_info mapping_info;
/* Locates and clears a region for a new top level page table. */
void initialize_identity_maps(void)
unsigned long sev_me_mask = get_sev_encryption_mask();
/* If running as an SEV guest, the encryption mask is required. */
/* Init mapping_info with run-time function/buffer pointers. */
mapping_info.alloc_pgt_page = alloc_pgt_page;
mapping_info.context = &pgt_data;
mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask;
mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask;
mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
mapping_info.kernpg_flag = _KERNPG_TABLE;
* It should be impossible for this not to already be true,
......@@ -88,9 +88,7 @@ ENTRY(get_sev_encryption_bit)
xor %rax, %rax
push %rbp
push %rdx
......@@ -101,9 +99,7 @@ ENTRY(get_sev_encryption_mask)
testl %eax, %eax
jz .Lno_sev_mask
xor %rdx, %rdx
bts %rax, %rdx /* Create the encryption mask */
mov %rdx, %rax /* ... and return it */
bts %rax, sme_me_mask(%rip) /* Create the encryption mask */
movq %rbp, %rsp /* Restore original stack pointer */
......@@ -112,9 +108,16 @@ ENTRY(get_sev_encryption_mask)
pop %rbp
xor %rax, %rax
.int 0xffffffff
.balign 8
.quad 0
......@@ -14,6 +14,7 @@
#include "misc.h"
#include "error.h"
#include "pgtable.h"
#include "../string.h"
#include "../voffset.h"
......@@ -169,16 +170,6 @@ void __puthex(unsigned long value)
static bool l5_supported(void)
/* Check if leaf 7 is supported. */
if (native_cpuid_eax(0) < 7)
return 0;
/* Check if la57 is supported. */
return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31));
static void handle_relocations(void *output, unsigned long output_len,
unsigned long virt_addr)
......@@ -376,12 +367,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
debug_putstr("early console in extract_kernel\n");
if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) {
error("This linux kernel as configured requires 5-level paging\n"
"This CPU does not support the required 'cr4.la57' feature\n"
"Unable to boot - please use a kernel appropriate for your CPU\n");
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
......@@ -392,6 +377,11 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#ifdef CONFIG_X86_64
/* Report address of 32-bit trampoline */
* The memory hole needed for the kernel is the larger of either
* the entire decompressed kernel plus relocation table, or the
......@@ -12,6 +12,11 @@
#ifdef CONFIG_X86_5LEVEL
/* cpu_feature_enabled() cannot be used that early */
#define pgtable_l5_enabled __pgtable_l5_enabled
#include <linux/linkage.h>
#include <linux/screen_info.h>
#include <linux/elf.h>
......@@ -109,6 +114,6 @@ static inline void console_init(void)
{ }
unsigned long get_sev_encryption_mask(void);
void set_sev_encryption_mask(void);
#ifndef __ASSEMBLER__
extern unsigned long *trampoline_32bit;
extern void trampoline_32bit_src(void *return_ptr);
#endif /* __ASSEMBLER__ */
#include <asm/processor.h>
#include "pgtable.h"
#include "../string.h"
* __force_order is used by special_insns.h asm code to force instruction
......@@ -9,20 +11,144 @@
unsigned long __force_order;
int l5_paging_required(void)
#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
struct paging_config {
unsigned long trampoline_start;
unsigned long l5_required;
/* Buffer to preserve trampoline memory */
static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
* The page table is going to be used instead of the page table in the trampoline
* memory.
* It must not be in BSS as BSS is cleared after cleanup_trampoline().
static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data);
* Trampoline address will be printed by extract_kernel() for debugging
* purposes.
* Avoid putting the pointer into .bss as it will be cleared between
* paging_prepare() and extract_kernel().
unsigned long *trampoline_32bit __section(.data);
struct paging_config paging_prepare(void)
/* Check if leaf 7 is supported. */
struct paging_config paging_config = {};
unsigned long bios_start, ebda_start;
* Check if LA57 is desired and supported.
* There are two parts to the check:
* - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y
* - if the machine supports 5-level paging:
* + CPUID leaf 7 is supported
* + the leaf has the feature bit set
* That's a substitute for boot_cpu_has() in early boot code.
native_cpuid_eax(0) >= 7 &&
(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
paging_config.l5_required = 1;
* Find a suitable spot for the trampoline.
* This code is based on reserve_bios_regions().
ebda_start = *(unsigned short *)0x40e << 4;
bios_start = *(unsigned short *)0x413 << 10;