/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "irq.h"
#include "mmu.h"
#include "cpuid.h"
#include "lapic.h"

#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
#include <linux/frame.h>
#include <linux/nospec.h>
#include "kvm_cache_regs.h"
#include "x86.h"

#include <asm/cpu.h>
#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
#include <asm/fpu/internal.h>
#include <asm/perf_event.h>
#include <asm/debugreg.h>
#include <asm/kexec.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
#include <asm/mmu_context.h>
#include <asm/spec-ctrl.h>
#include <asm/mshyperv.h>

#include "trace.h"
#include "pmu.h"
#include "vmx_evmcs.h"

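/*
 * Wrappers that execute a VMX instruction with an exception fixup, so that
 * a fault (e.g. because VMX was turned off by an emergency reboot) is
 * handled gracefully; __ex_clear additionally zeroes the given register in
 * the fault path.
 */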
#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
	____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_FEATURE_MATCH(X86_FEATURE_VMX),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);

static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);

static bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

static bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

static bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

static bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

static bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for their own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 0;
module_param(nested, bool, S_IRUGO);

static u64 __read_mostly host_xss;

static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);

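/*
 * MSR intercept types used when updating the MSR bitmaps, and the mode bits
 * tracked in vcpu_vmx->msr_bitmap_mode that record which layout (x2APIC,
 * x2APIC with APICv, long mode) the current bitmap reflects.
 */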
#define MSR_TYPE_R	1
#define MSR_TYPE_W	2
#define MSR_TYPE_RW	3

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2
#define MSR_BITMAP_MODE_LM		4

#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | 	\
	 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS				      \
	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
	 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

/*
 * These 2 parameters are used to configure the controls for Pause-Loop
 * Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled. Testing shows this time is usually smaller than 128
 *             cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b section 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);

extern const ulong vmx_return;

static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

static const struct {
	const char *option;
	enum vmx_l1d_flush_state cmd;
} vmentry_l1d_param[] = {
	{"auto",	VMENTER_L1D_FLUSH_AUTO},
	{"never",	VMENTER_L1D_FLUSH_NEVER},
	{"cond",	VMENTER_L1D_FLUSH_COND},
	{"always",	VMENTER_L1D_FLUSH_ALWAYS},
};

#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	struct page *page;
	unsigned int i;

	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	/* If set to auto use the default l1tf mitigation method */
	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		}
	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	}

	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(page);

		/*
		 * Initialize each page with a different pattern in
		 * order to protect against KSM in the nested
		 * virtualization case.
		 */
		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
			       PAGE_SIZE);
		}
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
		static_branch_enable(&vmx_l1d_should_flush);
	else
		static_branch_disable(&vmx_l1d_should_flush);

	if (l1tf == VMENTER_L1D_FLUSH_COND)
		static_branch_enable(&vmx_l1d_flush_cond);
	else
		static_branch_disable(&vmx_l1d_flush_cond);
	return 0;
}

static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int i;

	if (s) {
		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
			if (sysfs_streq(s, vmentry_l1d_param[i].option))
				return vmentry_l1d_param[i].cmd;
		}
	}
	return -EINVAL;
}

static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	ret = vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}

static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);

struct kvm_vmx {
	struct kvm kvm;

	unsigned int tss_addr;
	bool ept_identity_pagetable_done;
	gpa_t ept_identity_map_addr;
};

#define NR_AUTOLOAD_MSRS 8

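/*
 * Layout of the hardware VMCS region: the revision identifier, the
 * VMX-abort indicator, and then implementation-specific data.
 */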
struct vmcs {
	u32 revision_id;
	u32 abort;
	char data[0];
};

/*
 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
 * loaded on this CPU (so we can clear them if the CPU goes down).
 */
struct loaded_vmcs {
	struct vmcs *vmcs;
	struct vmcs *shadow_vmcs;
	int cpu;
	bool launched;
	bool nmi_known_unmasked;
	unsigned long vmcs_host_cr3;	/* May not match real cr3 */
	unsigned long vmcs_host_cr4;	/* May not match real cr4 */
	/* Support for vnmi-less CPUs */
	int soft_vnmi_blocked;
	ktime_t entry_time;
	s64 vnmi_blocked_time;
	unsigned long *msr_bitmap;
	struct list_head loaded_vmcss_on_cpu_link;
};

struct shared_msr_entry {
	unsigned index;
	u64 data;
	u64 mask;
};

/*
 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
 * More than one of these structures may exist, if L1 runs multiple L2 guests.
 * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
 * underlying hardware which will be used to run L2.
 * This structure is packed to ensure that its layout is identical across
 * machines (necessary for live migration).
 *
 * IMPORTANT: Changing the layout of existing fields in this structure
 * will break save/restore compatibility with older kvm releases. When
 * adding new fields, either use space in the reserved padding* arrays
 * or add the new fields to the end of the structure.
 */
typedef u64 natural_width;
struct __packed vmcs12 {
	/* According to the Intel spec, a VMCS region must start with the
	 * following two fields. Then follow implementation-specific data.
	 */
	u32 revision_id;
	u32 abort;

	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
	u32 padding[7]; /* room for future expansion */

	u64 io_bitmap_a;
	u64 io_bitmap_b;
	u64 msr_bitmap;
	u64 vm_exit_msr_store_addr;
	u64 vm_exit_msr_load_addr;
	u64 vm_entry_msr_load_addr;
	u64 tsc_offset;
	u64 virtual_apic_page_addr;
	u64 apic_access_addr;
	u64 posted_intr_desc_addr;
	u64 ept_pointer;
	u64 eoi_exit_bitmap0;
	u64 eoi_exit_bitmap1;
	u64 eoi_exit_bitmap2;
	u64 eoi_exit_bitmap3;
	u64 xss_exit_bitmap;
	u64 guest_physical_address;
	u64 vmcs_link_pointer;
	u64 guest_ia32_debugctl;
	u64 guest_ia32_pat;
	u64 guest_ia32_efer;
	u64 guest_ia32_perf_global_ctrl;
	u64 guest_pdptr0;
	u64 guest_pdptr1;
	u64 guest_pdptr2;
	u64 guest_pdptr3;
	u64 guest_bndcfgs;
	u64 host_ia32_pat;
	u64 host_ia32_efer;
	u64 host_ia32_perf_global_ctrl;
	u64 vmread_bitmap;
	u64 vmwrite_bitmap;
	u64 vm_function_control;
	u64 eptp_list_address;
	u64 pml_address;
	u64 padding64[3]; /* room for future expansion */
	/*
	 * To allow migration of L1 (complete with its L2 guests) between
	 * machines of different natural widths (32 or 64 bit), we cannot have
	 * unsigned long fields with no explict size. We use u64 (aliased
	 * natural_width) instead. Luckily, x86 is little-endian.
	 */
	natural_width cr0_guest_host_mask;
	natural_width cr4_guest_host_mask;
	natural_width cr0_read_shadow;
	natural_width cr4_read_shadow;
	natural_width cr3_target_value0;
	natural_width cr3_target_value1;
	natural_width cr3_target_value2;
	natural_width cr3_target_value3;
	natural_width exit_qualification;
	natural_width guest_linear_address;
	natural_width guest_cr0;
	natural_width guest_cr3;
	natural_width guest_cr4;
	natural_width guest_es_base;
	natural_width guest_cs_base;
	natural_width guest_ss_base;
	natural_width guest_ds_base;
	natural_width guest_fs_base;
	natural_width guest_gs_base;
	natural_width guest_ldtr_base;
	natural_width guest_tr_base;
	natural_width guest_gdtr_base;
	natural_width guest_idtr_base;
	natural_width guest_dr7;
	natural_width guest_rsp;
	natural_width guest_rip;
	natural_width guest_rflags;
	natural_width guest_pending_dbg_exceptions;
	natural_width guest_sysenter_esp;
	natural_width guest_sysenter_eip;
	natural_width host_cr0;
	natural_width host_cr3;
	natural_width host_cr4;
	natural_width host_fs_base;
	natural_width host_gs_base;
	natural_width host_tr_base;
	natural_width host_gdtr_base;
	natural_width host_idtr_base;
	natural_width host_ia32_sysenter_esp;
	natural_width host_ia32_sysenter_eip;
	natural_width host_rsp;
	natural_width host_rip;
	natural_width paddingl[8]; /* room for future expansion */
	u32 pin_based_vm_exec_control;
	u32 cpu_based_vm_exec_control;
	u32 exception_bitmap;
	u32 page_fault_error_code_mask;
	u32 page_fault_error_code_match;
	u32 cr3_target_count;
	u32 vm_exit_controls;
	u32 vm_exit_msr_store_count;
	u32 vm_exit_msr_load_count;
	u32 vm_entry_controls;
	u32 vm_entry_msr_load_count;
	u32 vm_entry_intr_info_field;
	u32 vm_entry_exception_error_code;
	u32 vm_entry_instruction_len;
	u32 tpr_threshold;
	u32 secondary_vm_exec_control;
	u32 vm_instruction_error;
	u32 vm_exit_reason;
	u32 vm_exit_intr_info;
	u32 vm_exit_intr_error_code;
	u32 idt_vectoring_info_field;
	u32 idt_vectoring_error_code;
	u32 vm_exit_instruction_len;
	u32 vmx_instruction_info;
	u32 guest_es_limit;
	u32 guest_cs_limit;
	u32 guest_ss_limit;
	u32 guest_ds_limit;
	u32 guest_fs_limit;
	u32 guest_gs_limit;
	u32 guest_ldtr_limit;
	u32 guest_tr_limit;
	u32 guest_gdtr_limit;
	u32 guest_idtr_limit;
	u32 guest_es_ar_bytes;
	u32 guest_cs_ar_bytes;
	u32 guest_ss_ar_bytes;
	u32 guest_ds_ar_bytes;
	u32 guest_fs_ar_bytes;
	u32 guest_gs_ar_bytes;
	u32 guest_ldtr_ar_bytes;
	u32 guest_tr_ar_bytes;
	u32 guest_interruptibility_info;
	u32 guest_activity_state;
	u32 guest_sysenter_cs;
	u32 host_ia32_sysenter_cs;
	u32 vmx_preemption_timer_value;
	u32 padding32[7]; /* room for future expansion */
	u16 virtual_processor_id;
	u16 posted_intr_nv;
	u16 guest_es_selector;
	u16 guest_cs_selector;
	u16 guest_ss_selector;
	u16 guest_ds_selector;
	u16 guest_fs_selector;
	u16 guest_gs_selector;
	u16 guest_ldtr_selector;
	u16 guest_tr_selector;
	u16 guest_intr_status;
	u16 host_es_selector;
	u16 host_cs_selector;
	u16 host_ss_selector;
	u16 host_ds_selector;
	u16 host_fs_selector;
	u16 host_gs_selector;
	u16 host_tr_selector;
	u16 guest_pml_index;
};

/*
 * For save/restore compatibility, the vmcs12 field offsets must not change.
 */
#define CHECK_OFFSET(field, loc)				\
	BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc),	\
		"Offset of " #field " in struct vmcs12 has changed.")

static inline void vmx_check_vmcs12_offsets(void) {
	CHECK_OFFSET(revision_id, 0);
	CHECK_OFFSET(abort, 4);
	CHECK_OFFSET(launch_state, 8);
	CHECK_OFFSET(io_bitmap_a, 40);
	CHECK_OFFSET(io_bitmap_b, 48);
	CHECK_OFFSET(msr_bitmap, 56);
	CHECK_OFFSET(vm_exit_msr_store_addr, 64);
	CHECK_OFFSET(vm_exit_msr_load_addr, 72);
	CHECK_OFFSET(vm_entry_msr_load_addr, 80);
	CHECK_OFFSET(tsc_offset, 88);
	CHECK_OFFSET(virtual_apic_page_addr, 96);
	CHECK_OFFSET(apic_access_addr, 104);
	CHECK_OFFSET(posted_intr_desc_addr, 112);
	CHECK_OFFSET(ept_pointer, 120);
	CHECK_OFFSET(eoi_exit_bitmap0, 128);
	CHECK_OFFSET(eoi_exit_bitmap1, 136);
	CHECK_OFFSET(eoi_exit_bitmap2, 144);
	CHECK_OFFSET(eoi_exit_bitmap3, 152);
	CHECK_OFFSET(xss_exit_bitmap, 160);
	CHECK_OFFSET(guest_physical_address, 168);
	CHECK_OFFSET(vmcs_link_pointer, 176);
	CHECK_OFFSET(guest_ia32_debugctl, 184);
	CHECK_OFFSET(guest_ia32_pat, 192);
	CHECK_OFFSET(guest_ia32_efer, 200);
	CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
	CHECK_OFFSET(guest_pdptr0, 216);
	CHECK_OFFSET(guest_pdptr1, 224);
	CHECK_OFFSET(guest_pdptr2, 232);
	CHECK_OFFSET(guest_pdptr3, 240);
	CHECK_OFFSET(guest_bndcfgs, 248);
	CHECK_OFFSET(host_ia32_pat, 256);
	CHECK_OFFSET(host_ia32_efer, 264);
	CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
	CHECK_OFFSET(vmread_bitmap, 280);
	CHECK_OFFSET(vmwrite_bitmap, 288);
	CHECK_OFFSET(vm_function_control, 296);
	CHECK_OFFSET(eptp_list_address, 304);
	CHECK_OFFSET(pml_address, 312);
	CHECK_OFFSET(cr0_guest_host_mask, 344);
	CHECK_OFFSET(cr4_guest_host_mask, 352);
	CHECK_OFFSET(cr0_read_shadow, 360);
	CHECK_OFFSET(cr4_read_shadow, 368);
	CHECK_OFFSET(cr3_target_value0, 376);
	CHECK_OFFSET(cr3_target_value1, 384);
	CHECK_OFFSET(cr3_target_value2, 392);
	CHECK_OFFSET(cr3_target_value3, 400);
	CHECK_OFFSET(exit_qualification, 408);
	CHECK_OFFSET(guest_linear_address, 416);
	CHECK_OFFSET(guest_cr0, 424);
	CHECK_OFFSET(guest_cr3, 432);
	CHECK_OFFSET(guest_cr4, 440);
	CHECK_OFFSET(guest_es_base, 448);
	CHECK_OFFSET(guest_cs_base, 456);
	CHECK_OFFSET(guest_ss_base, 464);
	CHECK_OFFSET(guest_ds_base, 472);
	CHECK_OFFSET(guest_fs_base, 480);
	CHECK_OFFSET(guest_gs_base, 488);
	CHECK_OFFSET(guest_ldtr_base, 496);
	CHECK_OFFSET(guest_tr_base, 504);
	CHECK_OFFSET(guest_gdtr_base, 512);
	CHECK_OFFSET(guest_idtr_base, 520);
	CHECK_OFFSET(guest_dr7, 528);
	CHECK_OFFSET(guest_rsp, 536);
	CHECK_OFFSET(guest_rip, 544);
	CHECK_OFFSET(guest_rflags, 552);
	CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
	CHECK_OFFSET(guest_sysenter_esp, 568);
	CHECK_OFFSET(guest_sysenter_eip, 576);
	CHECK_OFFSET(host_cr0, 584);
	CHECK_OFFSET(host_cr3, 592);
	CHECK_OFFSET(host_cr4, 600);
	CHECK_OFFSET(host_fs_base, 608);
	CHECK_OFFSET(host_gs_base, 616);
	CHECK_OFFSET(host_tr_base, 624);
	CHECK_OFFSET(host_gdtr_base, 632);
	CHECK_OFFSET(host_idtr_base, 640);
	CHECK_OFFSET(host_ia32_sysenter_esp, 648);
	CHECK_OFFSET(host_ia32_sysenter_eip, 656);
	CHECK_OFFSET(host_rsp, 664);
	CHECK_OFFSET(host_rip, 672);
	CHECK_OFFSET(pin_based_vm_exec_control, 744);
	CHECK_OFFSET(cpu_based_vm_exec_control, 748);
	CHECK_OFFSET(exception_bitmap, 752);
	CHECK_OFFSET(page_fault_error_code_mask, 756);
	CHECK_OFFSET(page_fault_error_code_match, 760);
	CHECK_OFFSET(cr3_target_count, 764);
	CHECK_OFFSET(vm_exit_controls, 768);
	CHECK_OFFSET(vm_exit_msr_store_count, 772);
	CHECK_OFFSET(vm_exit_msr_load_count, 776);
	CHECK_OFFSET(vm_entry_controls, 780);
	CHECK_OFFSET(vm_entry_msr_load_count, 784);
	CHECK_OFFSET(vm_entry_intr_info_field, 788);
	CHECK_OFFSET(vm_entry_exception_error_code, 792);
	CHECK_OFFSET(vm_entry_instruction_len, 796);
	CHECK_OFFSET(tpr_threshold, 800);
	CHECK_OFFSET(secondary_vm_exec_control, 804);
	CHECK_OFFSET(vm_instruction_error, 808);
	CHECK_OFFSET(vm_exit_reason, 812);
	CHECK_OFFSET(vm_exit_intr_info, 816);
	CHECK_OFFSET(vm_exit_intr_error_code, 820);
	CHECK_OFFSET(idt_vectoring_info_field, 824);
	CHECK_OFFSET(idt_vectoring_error_code, 828);
	CHECK_OFFSET(vm_exit_instruction_len, 832);
	CHECK_OFFSET(vmx_instruction_info, 836);
	CHECK_OFFSET(guest_es_limit, 840);
	CHECK_OFFSET(guest_cs_limit, 844);
	CHECK_OFFSET(guest_ss_limit, 848);
	CHECK_OFFSET(guest_ds_limit, 852);
	CHECK_OFFSET(guest_fs_limit, 856);
	CHECK_OFFSET(guest_gs_limit, 860);
	CHECK_OFFSET(guest_ldtr_limit, 864);
	CHECK_OFFSET(guest_tr_limit, 868);
	CHECK_OFFSET(guest_gdtr_limit, 872);
	CHECK_OFFSET(guest_idtr_limit, 876);
	CHECK_OFFSET(guest_es_ar_bytes, 880);
	CHECK_OFFSET(guest_cs_ar_bytes, 884);
	CHECK_OFFSET(guest_ss_ar_bytes, 888);
	CHECK_OFFSET(guest_ds_ar_bytes, 892);
	CHECK_OFFSET(guest_fs_ar_bytes, 896);
	CHECK_OFFSET(guest_gs_ar_bytes, 900);
	CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
	CHECK_OFFSET(guest_tr_ar_bytes, 908);
	CHECK_OFFSET(guest_interruptibility_info, 912);
	CHECK_OFFSET(guest_activity_state, 916);
	CHECK_OFFSET(guest_sysenter_cs, 920);
	CHECK_OFFSET(host_ia32_sysenter_cs, 924);
	CHECK_OFFSET(vmx_preemption_timer_value, 928);
	CHECK_OFFSET(virtual_processor_id, 960);
	CHECK_OFFSET(posted_intr_nv, 962);
	CHECK_OFFSET(guest_es_selector, 964);
	CHECK_OFFSET(guest_cs_selector, 966);
	CHECK_OFFSET(guest_ss_selector, 968);
	CHECK_OFFSET(guest_ds_selector, 970);
	CHECK_OFFSET(guest_fs_selector, 972);
	CHECK_OFFSET(guest_gs_selector, 974);
	CHECK_OFFSET(guest_ldtr_selector, 976);
	CHECK_OFFSET(guest_tr_selector, 978);
	CHECK_OFFSET(guest_intr_status, 980);
	CHECK_OFFSET(host_es_selector, 982);
	CHECK_OFFSET(host_cs_selector, 984);
	CHECK_OFFSET(host_ss_selector, 986);
	CHECK_OFFSET(host_ds_selector, 988);
	CHECK_OFFSET(host_fs_selector, 990);
	CHECK_OFFSET(host_gs_selector, 992);
	CHECK_OFFSET(host_tr_selector, 994);
	CHECK_OFFSET(guest_pml_index, 996);
}

/*
 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
 *
 * IMPORTANT: Changing this value will break save/restore compatibility with
 * older kvm releases.
 */
#define VMCS12_REVISION 0x11e57ed0

/*
 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
 * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used by
 * the current implementation, 4K is reserved to avoid future complications.
 */
#define VMCS12_SIZE 0x1000

/*
 * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
 * supported VMCS12 field encoding.
 */
#define VMCS12_MAX_FIELD_INDEX 0x17

struct nested_vmx_msrs {
	/*
	 * We only store the "true" versions of the VMX capability MSRs. We
	 * generate the "non-true" versions by setting the must-be-1 bits
	 * according to the SDM.
	 */
	u32 procbased_ctls_low;
	u32 procbased_ctls_high;
	u32 secondary_ctls_low;
	u32 secondary_ctls_high;
	u32 pinbased_ctls_low;
	u32 pinbased_ctls_high;
	u32 exit_ctls_low;
	u32 exit_ctls_high;
	u32 entry_ctls_low;
	u32 entry_ctls_high;
	u32 misc_low;
	u32 misc_high;
	u32 ept_caps;
	u32 vpid_caps;
	u64 basic;
	u64 cr0_fixed0;
	u64 cr0_fixed1;
	u64 cr4_fixed0;
	u64 cr4_fixed1;
	u64 vmcs_enum;
	u64 vmfunc_controls;
};

/*
 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
 */
struct nested_vmx {
	/* Has the level1 guest done vmxon? */
	bool vmxon;
	gpa_t vmxon_ptr;
	bool pml_full;

	/* The guest-physical address of the current VMCS L1 keeps for L2 */
	gpa_t current_vmptr;
	/*
	 * Cache of the guest's VMCS, existing outside of guest memory.
	 * Loaded from guest memory during VMPTRLD. Flushed to guest
	 * memory during VMCLEAR and VMPTRLD.
	 */
	struct vmcs12 *cached_vmcs12;
	/*
	 * Indicates if the shadow vmcs must be updated with the
	 * data held by vmcs12
	 */
	bool sync_shadow_vmcs;
	bool dirty_vmcs12;

	bool change_vmcs01_virtual_apic_mode;

	/* L2 must run next, and mustn't decide to exit to L1. */
	bool nested_run_pending;

	struct loaded_vmcs vmcs02;

	/*
	 * Guest pages referred to in the vmcs02 with host-physical
	 * pointers, so we must keep them pinned while L2 runs.
	 */
	struct page *apic_access_page;
	struct page *virtual_apic_page;
	struct page *pi_desc_page;
	struct pi_desc *pi_desc;
	bool pi_pending;
	u16 posted_intr_nv;

	struct hrtimer preemption_timer;
	bool preemption_timer_expired;

	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
	u64 vmcs01_debugctl;

	u16 vpid02;
	u16 last_vpid;

	struct nested_vmx_msrs msrs;

	/* SMM related state */
	struct {
		/* in VMX operation on SMM entry? */
		bool vmxon;
		/* in guest mode on SMM entry? */
		bool guest_mode;
	} smm;
};

#define POSTED_INTR_ON  0
#define POSTED_INTR_SN  1

/* Posted-Interrupt Descriptor */
struct pi_desc {
	u32 pir[8];     /* Posted interrupt requested */
	union {
		struct {
				/* bit 256 - Outstanding Notification */
			u16	on	: 1,
				/* bit 257 - Suppress Notification */
				sn	: 1,
				/* bit 271:258 - Reserved */
				rsvd_1	: 14;
				/* bit 279:272 - Notification Vector */
			u8	nv;
				/* bit 287:280 - Reserved */
			u8	rsvd_2;
				/* bit 319:288 - Notification Destination */
			u32	ndst;
		};
		u64 control;
	};
	u32 rsvd[6];
} __aligned(64);

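/*
 * Helpers that atomically manipulate the Outstanding-Notification (ON) and
 * Suppress-Notification (SN) bits and the posted-interrupt request bitmap
 * of a posted-interrupt descriptor.
 */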
static bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
	return test_and_set_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
{
	return test_and_clear_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}

static inline void pi_clear_sn(struct pi_desc *pi_desc)
{
	return clear_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}

static inline void pi_set_sn(struct pi_desc *pi_desc)
{
	return set_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}

static inline void pi_clear_on(struct pi_desc *pi_desc)
{
	clear_bit(POSTED_INTR_ON,
  		  (unsigned long *)&pi_desc->control);
}

static inline int pi_test_on(struct pi_desc *pi_desc)
{
	return test_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static inline int pi_test_sn(struct pi_desc *pi_desc)
{
	return test_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}

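/* MSR entries for the VMCS autoload/autostore lists used on VM-entry and VM-exit. */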
struct vmx_msrs {
	unsigned int		nr;
	struct vmx_msr_entry	val[NR_AUTOLOAD_MSRS];
};

struct vcpu_vmx {
	struct kvm_vcpu       vcpu;
	unsigned long         host_rsp;
	u8                    fail;
	u8		      msr_bitmap_mode;
	u32                   exit_intr_info;
	u32                   idt_vectoring_info;
	ulong                 rflags;
	struct shared_msr_entry *guest_msrs;
	int                   nmsrs;
	int                   save_nmsrs;
	unsigned long	      host_idt_base;
#ifdef CONFIG_X86_64
	u64 		      msr_host_kernel_gs_base;
	u64 		      msr_guest_kernel_gs_base;
#endif

	u64 		      arch_capabilities;
	u64 		      spec_ctrl;

	u32 vm_entry_controls_shadow;
	u32 vm_exit_controls_shadow;
	u32 secondary_exec_control;

	/*
	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
	 * non-nested (L1) guest, it always points to vmcs01. For a nested
	 * guest (L2), it points to a different VMCS.
	 */
	struct loaded_vmcs    vmcs01;
	struct loaded_vmcs   *loaded_vmcs;
	bool                  __launched; /* temporary, used in vmx_vcpu_run */
	struct msr_autoload {
		struct vmx_msrs guest;
		struct vmx_msrs host;
	} msr_autoload;
	struct {
		int           loaded;
		u16           fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64
		u16           ds_sel, es_sel;
#endif
		int           gs_ldt_reload_needed;
		int           fs_reload_needed;
		u64           msr_host_bndcfgs;
	} host_state;
	struct {
		int vm86_active;
		ulong save_rflags;
		struct kvm_segment segs[8];
	} rmode;
	struct {
		u32 bitmask; /* 4 bits per segment (1 bit per field) */
		struct kvm_save_segment {
			u16 selector;
			unsigned long base;
			u32 limit;
			u32 ar;
		} seg[8];
	} segment_cache;
	int vpid;
	bool emulation_required;

	u32 exit_reason;

	/* Posted interrupt descriptor */
	struct pi_desc pi_desc;

	/* Support for a guest hypervisor (nested VMX) */
	struct nested_vmx nested;

	/* Dynamic PLE window. */
	int ple_window;
	bool ple_window_dirty;

	/* Support for PML */
#define PML_ENTITY_NUM		512
	struct page *pml_pg;

	/* apic deadline value in host tsc */
	u64 hv_deadline_tsc;

	u64 current_tsc_ratio;

	u32 host_pkru;

	unsigned long host_debugctlmsr;

	/*
	 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
	 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
	 * in msr_ia32_feature_control_valid_bits.
	 */
	u64 msr_ia32_feature_control;
	u64 msr_ia32_feature_control_valid_bits;
};

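/* Which fields of a cached segment register are valid; see vcpu_vmx->segment_cache. */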
enum segment_cache_field {
	SEG_FIELD_SEL = 0,
	SEG_FIELD_BASE = 1,
	SEG_FIELD_LIMIT = 2,
	SEG_FIELD_AR = 3,

	SEG_FIELD_NR = 4
};

static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
	return container_of(kvm, struct kvm_vmx, kvm);
}

static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_vmx, vcpu);
}

static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
	return &(to_vmx(vcpu)->pi_desc);
}

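/*
 * Map a VMCS field encoding to the offset of the corresponding member of
 * struct vmcs12.  Rotating the 16-bit encoding left by 6 moves the width
 * and type bits into the low-order bits, yielding a compact index into
 * vmcs_field_to_offset_table.
 */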
#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name)	[ROL16(number, 6)] = VMCS12_OFFSET(name)
#define FIELD64(number, name)						\
	FIELD(number, name),						\
	[ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)

static u16 shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x) x,
#include "vmx_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static u16 shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x) x,
#include "vmx_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static const unsigned short vmcs_field_to_offset_table[] = {
	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
	FIELD(POSTED_INTR_NV, posted_intr_nv),
	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
	FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
	FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
	FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
	FIELD(GUEST_INTR_STATUS, guest_intr_status),
	FIELD(GUEST_PML_INDEX, guest_pml_index),
	FIELD(HOST_ES_SELECTOR, host_es_selector),
	FIELD(HOST_CS_SELECTOR, host_cs_selector),
	FIELD(HOST_SS_SELECTOR, host_ss_selector),
	FIELD(HOST_DS_SELECTOR, host_ds_selector),
	FIELD(HOST_FS_SELECTOR, host_fs_selector),
	FIELD(HOST_GS_SELECTOR, host_gs_selector),
	FIELD(HOST_TR_SELECTOR, host_tr_selector),
	FIELD64(IO_BITMAP_A, io_bitmap_a),
	FIELD64(IO_BITMAP_B, io_bitmap_b),
	FIELD64(MSR_BITMAP, msr_bitmap),
	FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
	FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
	FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
	FIELD64(PML_ADDRESS, pml_address),
	FIELD64(TSC_OFFSET, tsc_offset),
	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
	FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
	FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
	FIELD64(EPT_POINTER, ept_pointer),
	FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
	FIELD64(VMREAD_BITMAP, vmread_bitmap),
	FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
	FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
	FIELD64(GUEST_PDPTR0, guest_pdptr0),
	FIELD64(GUEST_PDPTR1, guest_pdptr1),
	FIELD64(GUEST_PDPTR2, guest_pdptr2),
	FIELD64(GUEST_PDPTR3, guest_pdptr3),
	FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
	FIELD64(HOST_IA32_PAT, host_ia32_pat),
	FIELD64(HOST_IA32_EFER, host_ia32_efer),
	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
	FIELD(EXCEPTION_BITMAP, exception_bitmap),
	FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
	FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
	FIELD(CR3_TARGET_COUNT, cr3_target_count),
	FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
	FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
	FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
	FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
	FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
	FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
	FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
	FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
	FIELD(TPR_THRESHOLD, tpr_threshold),
	FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
	FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
	FIELD(VM_EXIT_REASON, vm_exit_reason),
	FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
	FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
	FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
	FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
	FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
	FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
	FIELD(GUEST_ES_LIMIT, guest_es_limit),
	FIELD(GUEST_CS_LIMIT, guest_cs_limit),
	FIELD(GUEST_SS_LIMIT, guest_ss_limit),
	FIELD(GUEST_DS_LIMIT, guest_ds_limit),
	FIELD(GUEST_FS_LIMIT, guest_fs_limit),
	FIELD(GUEST_GS_LIMIT, guest_gs_limit),
	FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
	FIELD(GUEST_TR_LIMIT, guest_tr_limit),
	FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
	FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
	FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
	FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
	FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
	FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
	FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
	FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
	FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
	FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
	FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
	FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
	FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
	FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
	FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
	FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
	FIELD(CR4_READ_SHADOW, cr4_read_shadow),
	FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
	FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
	FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
	FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
	FIELD(EXIT_QUALIFICATION, exit_qualification),
	FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
	FIELD(GUEST_CR0, guest_cr0),
	FIELD(GUEST_CR3, guest_cr3),
	FIELD(GUEST_CR4, guest_cr4),
	FIELD(GUEST_ES_BASE, guest_es_base),
	FIELD(GUEST_CS_BASE, guest_cs_base),
	FIELD(GUEST_SS_BASE, guest_ss_base),
	FIELD(GUEST_DS_BASE, guest_ds_base),
	FIELD(GUEST_FS_BASE, guest_fs_base),
	FIELD(GUEST_GS_BASE, guest_gs_base),
	FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
	FIELD(GUEST_TR_BASE, guest_tr_base),
	FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
	FIELD(GUEST_IDTR_BASE, guest_idtr_base),
	FIELD(GUEST_DR7, guest_dr7),
	FIELD(GUEST_RSP, guest_rsp),
	FIELD(GUEST_RIP, guest_rip),
	FIELD(GUEST_RFLAGS, guest_rflags),
	FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
	FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
	FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
	FIELD(HOST_CR0, host_cr0),
	FIELD(HOST_CR3, host_cr3),
	FIELD(HOST_CR4, host_cr4),
	FIELD(HOST_FS_BASE, host_fs_base),
	FIELD(HOST_GS_BASE, host_gs_base),
	FIELD(HOST_TR_BASE, host_tr_base),
	FIELD(HOST_GDTR_BASE, host_gdtr_base),
	FIELD(HOST_IDTR_BASE, host_idtr_base),
	FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
	FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
	FIELD(HOST_RSP, host_rsp),
	FIELD(HOST_RIP, host_rip),
};

static inline short vmcs_field_to_offset(unsigned long field)
{
	const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
	unsigned short offset;
	unsigned index;

	if (field >> 15)
		return -ENOENT;

	index = ROL16(field, 6);
	if (index >= size)
		return -ENOENT;

	index = array_index_nospec(index, size);
	offset = vmcs_field_to_offset_table[index];
	if (offset == 0)
		return -ENOENT;
	return offset;
}

static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.cached_vmcs12;
}

static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
static bool vmx_xsaves_supported(void);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code);
static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
							  u32 msr, int type);

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

/*
 * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
 * can find which vCPU should be woken up.
 */
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);

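/*
 * Bitmaps controlling which VMCS field encodings cause VMREAD/VMWRITE to
 * VM-exit when a shadow VMCS is in use.
 */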
enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};

static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])

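/* Whether the "load IA32_EFER" and "load IA32_PERF_GLOBAL_CTRL" VM-entry/VM-exit controls are available. */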
static bool cpu_has_load_ia32_efer;
static bool cpu_has_load_perf_global_ctrl;

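/* Allocation bitmap for hardware VPIDs, protected by vmx_vpid_lock. */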
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

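/* VMX control settings and capabilities discovered from the VMX capability MSRs at module load. */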
static struct vmcs_config {
	int size;
	int order;
	u32 basic_cap;
	u32 revision_id;
	u32 pin_based_exec_ctrl;
	u32 cpu_based_exec_ctrl;
	u32 cpu_based_2nd_exec_ctrl;
	u32 vmexit_ctrl;
	u32 vmentry_ctrl;
	struct nested_vmx_msrs nested;
} vmcs_config;

static struct vmx_capability {
	u32 ept;
	u32 vpid;
} vmx_capability;

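/* VMCS field encodings (selector, base, limit, access rights) for each guest segment register. */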
#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {                                   \
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,		   	\
		.limit = GUEST_##seg##_LIMIT,		   	\
		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

static u64 host_efer;

static void ept_save_pdptrs(struct kvm_vcpu *vcpu);

/*
 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
 * away by decrementing the array size.
 */
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
