mmu.c 80.7 KB
Newer Older
Avi Kivity's avatar
Avi Kivity committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
Avi Kivity's avatar
Avi Kivity committed
19

20
#include "mmu.h"
21
#include "x86.h"
Avi Kivity's avatar
Avi Kivity committed
22
#include "kvm_cache_regs.h"
Avi Kivity's avatar
Avi Kivity committed
23

24
#include <linux/kvm_host.h>
Avi Kivity's avatar
Avi Kivity committed
25 26 27 28 29
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
30
#include <linux/swap.h>
Marcelo Tosatti's avatar
Marcelo Tosatti committed
31
#include <linux/hugetlb.h>
32
#include <linux/compiler.h>
33
#include <linux/srcu.h>
34
#include <linux/slab.h>
Avi Kivity's avatar
Avi Kivity committed
35

Avi Kivity's avatar
Avi Kivity committed
36 37
#include <asm/page.h>
#include <asm/cmpxchg.h>
38
#include <asm/io.h>
39
#include <asm/vmx.h>
Avi Kivity's avatar
Avi Kivity committed
40

41 42 43 44 45 46 47
/*
 * When setting this variable to true it enables Two-Dimensional-Paging
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that we don't need to do shadow paging.
 */
48
bool tdp_enabled = false;
49

50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
#undef MMU_DEBUG

#undef AUDIT

#ifdef AUDIT
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif

#ifdef MMU_DEBUG

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)

#else

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#endif

#if defined(MMU_DEBUG) || defined(AUDIT)
/*
 * Debug-print gate for pgprintk()/rmap_printk() above.  Must be bool, not
 * int: module_param(dbg, bool, ...) requires a bool-typed backing variable
 * or the parameter accessors write through the wrong type.
 */
static bool dbg = 0;
module_param(dbg, bool, 0644);
#endif
Avi Kivity's avatar
Avi Kivity committed
76

77 78 79
static int oos_shadow = 1;
module_param(oos_shadow, bool, 0644);

80 81 82
#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
Avi Kivity's avatar
Avi Kivity committed
83 84 85 86 87
#define ASSERT(x)							\
	if (!(x)) {							\
		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
		       __FILE__, __LINE__, #x);				\
	}
88
#endif
Avi Kivity's avatar
Avi Kivity committed
89 90 91 92 93 94 95 96 97

#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define VALID_PAGE(x) ((x) != INVALID_PAGE)

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
Mike Day's avatar
Mike Day committed
98
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
Avi Kivity's avatar
Avi Kivity committed
99 100 101 102 103 104 105 106 107 108 109

#define PT64_LEVEL_MASK(level) \
		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
Mike Day's avatar
Mike Day committed
110
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
Avi Kivity's avatar
Avi Kivity committed
111 112 113

#define PT32_LEVEL_MASK(level) \
		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
114 115 116
#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))
Avi Kivity's avatar
Avi Kivity committed
117 118 119 120 121

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


122
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
Avi Kivity's avatar
Avi Kivity committed
123 124
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
125 126 127 128 129 130
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
Avi Kivity's avatar
Avi Kivity committed
131 132 133 134

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
135 136 137
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))
Avi Kivity's avatar
Avi Kivity committed
138

139 140
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
			| PT64_NX_MASK)
Avi Kivity's avatar
Avi Kivity committed
141

142 143
#define RMAP_EXT 4

144 145 146 147 148
#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

149 150
#include <trace/events/kvm.h>

151 152 153
#define CREATE_TRACE_POINTS
#include "mmutrace.h"

154 155
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

156 157
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

158
/*
 * Overflow node of a reverse-map list: holds up to RMAP_EXT sptes that all
 * map the same gfn, chained through ->more (see rmap_add()/rmap_remove()).
 */
struct kvm_rmap_desc {
	u64 *sptes[RMAP_EXT];		/* NULL-terminated within the array */
	struct kvm_rmap_desc *more;	/* next chunk, NULL if this is last */
};

163 164 165 166 167 168 169 170 171 172 173 174 175
/* Cursor state for walking the shadow page table for one guest address. */
struct kvm_shadow_walk_iterator {
	u64 addr;		/* address being translated */
	hpa_t shadow_addr;	/* current shadow page table root/level base */
	int level;		/* current paging level */
	u64 *sptep;		/* pointer to the current shadow pte */
	unsigned index;		/* index of sptep within its table */
};

/* Iterate over every shadow pte on the walk from the root down to _addr. */
#define for_each_shadow_entry(_vcpu, _addr, _walker)    \
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

176
typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
Marcelo Tosatti's avatar
Marcelo Tosatti committed
177

178 179
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
180
static struct kmem_cache *mmu_page_header_cache;
181

182 183
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
Sheng Yang's avatar
Sheng Yang committed
184 185 186 187 188 189
static u64 __read_mostly shadow_base_present_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
190

191 192 193 194 195
/* Build a mask with bits s..e (inclusive) set. */
static inline u64 rsvd_bits(int s, int e)
{
	int nbits = e - s + 1;

	return ((1ULL << nbits) - 1) << s;
}

196 197 198 199 200 201 202
/*
 * Record the two spte patterns used for not-present shadow entries:
 * @trap_pte faults to the hypervisor, @notrap_pte does not.
 */
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
	shadow_trap_nonpresent_pte = trap_pte;
	shadow_notrap_nonpresent_pte = notrap_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);

Sheng Yang's avatar
Sheng Yang committed
203 204 205 206 207 208 209
/* Set the bits present in every present shadow pte the MMU creates. */
void kvm_mmu_set_base_ptes(u64 base_pte)
{
	shadow_base_present_pte = base_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);

/*
 * Install the per-platform spte bit masks (user/accessed/dirty/nx/x);
 * called by vendor code so EPT and shadow paging can differ.
 */
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
		u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
	shadow_user_mask = user_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_nx_mask = nx_mask;
	shadow_x_mask = x_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

220
/* True iff the guest's CR0.WP bit is set for @vcpu. */
static bool is_write_protection(struct kvm_vcpu *vcpu)
{
	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}

/* PSE-36 support is unconditionally reported to the guest. */
static int is_cpuid_PSE36(void)
{
	return 1;
}

230 231
/* Nonzero iff the guest has EFER.NX enabled. */
static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.efer & EFER_NX;
}

235 236 237 238 239 240
/*
 * A shadow pte is "present" unless it is one of the two reserved
 * not-present patterns installed by kvm_mmu_set_nonpresent_ptes().
 */
static int is_shadow_present_pte(u64 pte)
{
	return pte != shadow_trap_nonpresent_pte
		&& pte != shadow_notrap_nonpresent_pte;
}

Marcelo Tosatti's avatar
Marcelo Tosatti committed
241 242 243 244 245
/* Nonzero iff @pte maps a large page (PS bit set). */
static int is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

246
/* Nonzero iff @pte has the writable bit set. */
static int is_writable_pte(unsigned long pte)
{
	return pte & PT_WRITABLE_MASK;
}

251
/* Nonzero iff the guest pte's dirty bit is set. */
static int is_dirty_gpte(unsigned long pte)
{
	return pte & PT_DIRTY_MASK;
}

256
/* An spte participates in the reverse map iff it is shadow-present. */
static int is_rmap_spte(u64 pte)
{
	return is_shadow_present_pte(pte);
}

261 262 263 264
/*
 * An spte is terminal (maps a final page rather than a lower-level
 * table) if it sits at the 4K level or is a large-page mapping.
 */
static int is_last_spte(u64 pte, int level)
{
	return level == PT_PAGE_TABLE_LEVEL || is_large_pte(pte);
}

270
/* Extract the host page frame number from a shadow pte. */
static pfn_t spte_to_pfn(u64 pte)
{
	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

275 276 277 278 279 280 281
/*
 * PSE-36: the high physical address bits of a 4MB gpte are stored in the
 * PT32_DIR_PSE36_MASK field; reposition them as a gfn contribution.
 */
static gfn_t pse36_gfn_delta(u32 gpte)
{
	gfn_t delta = gpte & PT32_DIR_PSE36_MASK;

	return delta << (32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT);
}

Avi Kivity's avatar
Avi Kivity committed
282
/*
 * Store a shadow pte via set_64bit() so the 64-bit write is not torn,
 * even on 32-bit hosts (the cast differs only to match set_64bit's
 * per-arch prototype).
 */
static void __set_spte(u64 *sptep, u64 spte)
{
#ifdef CONFIG_X86_64
	set_64bit((unsigned long *)sptep, spte);
#else
	set_64bit((unsigned long long *)sptep, spte);
#endif
}

291
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
292
				  struct kmem_cache *base_cache, int min)
293 294 295 296
{
	void *obj;

	if (cache->nobjs >= min)
297
		return 0;
298
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
299
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
300
		if (!obj)
301
			return -ENOMEM;
302 303
		cache->objects[cache->nobjs++] = obj;
	}
304
	return 0;
305 306 307 308 309 310 311 312
}

/* Release every object still held in @mc back to the slab allocator. */
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs > 0) {
		mc->nobjs--;
		kfree(mc->objects[mc->nobjs]);
	}
}

Avi Kivity's avatar
Avi Kivity committed
313
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
314
				       int min)
Avi Kivity's avatar
Avi Kivity committed
315 316 317 318 319 320
{
	struct page *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
321
		page = alloc_page(GFP_KERNEL);
Avi Kivity's avatar
Avi Kivity committed
322 323 324 325 326 327 328 329 330 331
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page_address(page);
	}
	return 0;
}

/* Return every cached page in @mc to the page allocator. */
static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs > 0) {
		mc->nobjs--;
		free_page((unsigned long)mc->objects[mc->nobjs]);
	}
}

335
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
336
{
337 338
	int r;

339
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
340
				   pte_chain_cache, 4);
341 342
	if (r)
		goto out;
343
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
344
				   rmap_desc_cache, 4);
345 346
	if (r)
		goto out;
347
	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
348 349
	if (r)
		goto out;
350
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
351
				   mmu_page_header_cache, 4);
352 353
out:
	return r;
354 355 356 357
}

/* Drain all per-vcpu MMU object caches (counterpart of the topup above). */
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}

/*
 * Pop a preallocated object from @mc.  The cache must have been topped up
 * beforehand (BUG if empty).  @size is unused here; callers pass the
 * object size for documentation purposes only.
 */
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
				    size_t size)
{
	void *p;

	BUG_ON(!mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}

/* Take a preallocated pte-chain object from the vcpu's cache. */
static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
				      sizeof(struct kvm_pte_chain));
}

380
/* Free a pte chain (slab-allocated, so plain kfree suffices). */
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
	kfree(pc);
}

/* Take a preallocated rmap descriptor from the vcpu's cache. */
static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
				      sizeof(struct kvm_rmap_desc));
}

391
/* Free an rmap descriptor (slab-allocated, so plain kfree suffices). */
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
	kfree(rd);
}

Marcelo Tosatti's avatar
Marcelo Tosatti committed
396 397 398 399
/*
 * Return the pointer to the largepage write count for a given
 * gfn, handling slots that are not large page aligned.
 */
static int *slot_largepage_idx(gfn_t gfn,
			       struct kvm_memory_slot *slot,
			       int level)
{
	unsigned long idx;

	/* Index relative to the slot base, both rounded to @level's page size. */
	idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
	      (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
	/* lpage_info[0] is the 2M/4M level, hence the "- 2". */
	return &slot->lpage_info[level - 2][idx].write_count;
}

/*
 * Note that @gfn is now shadowed: bump the write count at every large-page
 * level covering it, which prevents large mappings of that range.
 */
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	int *write_count;
	int i;

	gfn = unalias_gfn(kvm, gfn);

	slot = gfn_to_memslot_unaliased(kvm, gfn);
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		write_count   = slot_largepage_idx(gfn, slot, i);
		*write_count += 1;
	}
}

/*
 * Undo account_shadowed(): drop the write count at every large-page level
 * covering @gfn; WARN if the count ever goes negative.
 */
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	int *write_count;
	int i;

	gfn = unalias_gfn(kvm, gfn);
	slot = gfn_to_memslot_unaliased(kvm, gfn);
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		write_count   = slot_largepage_idx(gfn, slot, i);
		*write_count -= 1;
		WARN_ON(*write_count < 0);
	}
}

443 444 445
/*
 * Nonzero iff @gfn has shadowed (write-protected) pages at @level,
 * i.e. a large mapping at that level must not be created.  A gfn with
 * no memslot is treated as write-protected (returns 1).
 */
static int has_wrprotected_page(struct kvm *kvm,
				gfn_t gfn,
				int level)
{
	struct kvm_memory_slot *slot;
	int *largepage_idx;

	gfn = unalias_gfn(kvm, gfn);
	slot = gfn_to_memslot_unaliased(kvm, gfn);
	if (slot) {
		largepage_idx = slot_largepage_idx(gfn, slot, level);
		return *largepage_idx;
	}

	return 1;
}

460
/*
 * Largest paging level whose page size fits within the host mapping
 * backing @gfn (PT_PAGE_TABLE_LEVEL at minimum when the host uses 4K).
 */
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
	unsigned long page_size;
	int level, ret = 0;

	page_size = kvm_host_page_size(kvm, gfn);

	for (level = PT_PAGE_TABLE_LEVEL;
	     level < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++level) {
		if (page_size < KVM_HPAGE_SIZE(level))
			break;
		ret = level;
	}

	return ret;
}

478
/*
 * Choose the paging level at which to map @large_gfn: the largest level
 * allowed by the host mapping, the hardware's lpage support, and the
 * absence of write-protected pages in the range.  Slots with dirty
 * logging enabled are always mapped at 4K.
 */
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
	struct kvm_memory_slot *slot;
	int host_level, level, max_level;

	/* Dirty logging requires 4K granularity. */
	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
	if (slot && slot->dirty_bitmap)
		return PT_PAGE_TABLE_LEVEL;

	host_level = host_mapping_level(vcpu->kvm, large_gfn);

	if (host_level == PT_PAGE_TABLE_LEVEL)
		return host_level;

	/* Cap by what the hardware supports for large pages. */
	max_level = kvm_x86_ops->get_lpage_level() < host_level ?
		kvm_x86_ops->get_lpage_level() : host_level;

	/* Stop at the first level containing a write-protected page. */
	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
			break;

	return level - 1;
}

502 503 504 505 506
/*
 * Take gfn and return the reverse mapping to it.
 * Note: gfn must be unaliased before this function get called
 */

static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
	struct kvm_memory_slot *slot;
	unsigned long idx;

	slot = gfn_to_memslot(kvm, gfn);
	/* 4K mappings use the per-slot rmap array directly. */
	if (likely(level == PT_PAGE_TABLE_LEVEL))
		return &slot->rmap[gfn - slot->base_gfn];

	/* Large mappings index lpage_info, as in slot_largepage_idx(). */
	idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
		(slot->base_gfn / KVM_PAGES_PER_HPAGE(level));

	return &slot->lpage_info[level - 2][idx].rmap_pde;
}

522 523 524
/*
 * Reverse mapping data structures:
 *
 * If rmapp bit zero is zero, then rmapp points to the shadow page table
 * entry that points to page_address(page).
 *
 * If rmapp bit zero is one, then (*rmapp & ~1) points to a struct
 * kvm_rmap_desc containing more mappings.
 *
 * Returns the number of rmap entries before the spte was added or zero if
 * the spte was not added.
 *
 */
535
/* Add @spte, which maps @gfn, to gfn's reverse-map list (format above). */
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct kvm_rmap_desc *desc;
	unsigned long *rmapp;
	int i, count = 0;

	if (!is_rmap_spte(*spte))
		return count;
	gfn = unalias_gfn(vcpu->kvm, gfn);
	/* Remember which gfn this slot of the shadow page maps. */
	sp = page_header(__pa(spte));
	sp->gfns[spte - sp->spt] = gfn;
	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
	if (!*rmapp) {
		/* Empty list: store the lone spte pointer directly. */
		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
		*rmapp = (unsigned long)spte;
	} else if (!(*rmapp & 1)) {
		/* Single entry: promote to a descriptor holding both. */
		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_rmap_desc(vcpu);
		desc->sptes[0] = (u64 *)*rmapp;
		desc->sptes[1] = spte;
		*rmapp = (unsigned long)desc | 1;
	} else {
		/* Descriptor chain: append to the last non-full chunk. */
		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
		while (desc->sptes[RMAP_EXT-1] && desc->more) {
			desc = desc->more;
			count += RMAP_EXT;
		}
		if (desc->sptes[RMAP_EXT-1]) {
			desc->more = mmu_alloc_rmap_desc(vcpu);
			desc = desc->more;
		}
		/* Find the first free slot in this chunk. */
		for (i = 0; desc->sptes[i]; ++i)
			;
		desc->sptes[i] = spte;
	}
	return count;
}

575
/*
 * Remove slot @i from @desc in an rmap chain: fill the hole with the
 * chunk's last entry, and if the chunk empties, unlink and free it
 * (collapsing back to a direct pointer when possible).
 */
static void rmap_desc_remove_entry(unsigned long *rmapp,
				   struct kvm_rmap_desc *desc,
				   int i,
				   struct kvm_rmap_desc *prev_desc)
{
	int j;

	/* j = index of the last used slot in this chunk (>= i). */
	for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
		;
	desc->sptes[i] = desc->sptes[j];
	desc->sptes[j] = NULL;
	if (j != 0)
		return;
	/* Chunk is now empty: unlink it from the chain. */
	if (!prev_desc && !desc->more)
		/* Only chunk, one entry left earlier: store it directly. */
		*rmapp = (unsigned long)desc->sptes[0];
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
			*rmapp = (unsigned long)desc->more | 1;
	mmu_free_rmap_desc(desc);
}

598
/*
 * Remove @spte from its gfn's reverse-map list, propagating accessed and
 * dirty state from the spte to the host pfn first.  BUGs if the spte is
 * not found in the list it should belong to.
 */
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_rmap_desc *desc;
	struct kvm_rmap_desc *prev_desc;
	struct kvm_mmu_page *sp;
	pfn_t pfn;
	unsigned long *rmapp;
	int i;

	if (!is_rmap_spte(*spte))
		return;
	sp = page_header(__pa(spte));
	pfn = spte_to_pfn(*spte);
	/* Transfer A/D state to the host page before dropping the mapping. */
	if (*spte & shadow_accessed_mask)
		kvm_set_pfn_accessed(pfn);
	if (is_writable_pte(*spte))
		kvm_set_pfn_dirty(pfn);
	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
	if (!*rmapp) {
		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
		BUG();
	} else if (!(*rmapp & 1)) {
		/* Single direct entry: it must be @spte. */
		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
		if ((u64 *)*rmapp != spte) {
			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
			       spte, *spte);
			BUG();
		}
		*rmapp = 0;
	} else {
		/* Descriptor chain: search every chunk for @spte. */
		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
				if (desc->sptes[i] == spte) {
					rmap_desc_remove_entry(rmapp,
							       desc, i,
							       prev_desc);
					return;
				}
			prev_desc = desc;
			desc = desc->more;
		}
		pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
		BUG();
	}
}

647
/*
 * Iterate an rmap list: return the first spte when @spte is NULL,
 * otherwise the entry following @spte, or NULL at the end.
 */
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
	struct kvm_rmap_desc *desc;
	u64 *prev_spte;
	int i;

	if (!*rmapp)
		return NULL;
	else if (!(*rmapp & 1)) {
		/* Single direct entry. */
		if (!spte)
			return (u64 *)*rmapp;
		return NULL;
	}
	/* Walk the descriptor chain until the entry after @spte. */
	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
	prev_spte = NULL;
	while (desc) {
		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
			if (prev_spte == spte)
				return desc->sptes[i];
			prev_spte = desc->sptes[i];
		}
		desc = desc->more;
	}
	return NULL;
}

673
/*
 * Write-protect every spte mapping @gfn: clear the writable bit on 4K
 * mappings and zap writable large-page mappings entirely.  Returns
 * nonzero if anything was changed (caller must flush TLBs).
 */
static int rmap_write_protect(struct kvm *kvm, u64 gfn)
{
	unsigned long *rmapp;
	u64 *spte;
	int i, write_protected = 0;

	gfn = unalias_gfn(kvm, gfn);
	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);

	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		BUG_ON(!spte);
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
		if (is_writable_pte(*spte)) {
			__set_spte(spte, *spte & ~PT_WRITABLE_MASK);
			write_protected = 1;
		}
		spte = rmap_next(kvm, rmapp, spte);
	}
	if (write_protected) {
		pfn_t pfn;

		/* The page was writable: mark the host pfn dirty. */
		spte = rmap_next(kvm, rmapp, NULL);
		pfn = spte_to_pfn(*spte);
		kvm_set_pfn_dirty(pfn);
	}

	/* check for huge page mappings */
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		rmapp = gfn_to_rmap(kvm, gfn, i);
		spte = rmap_next(kvm, rmapp, NULL);
		while (spte) {
			BUG_ON(!spte);
			BUG_ON(!(*spte & PT_PRESENT_MASK));
			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
			if (is_writable_pte(*spte)) {
				/* Large mappings can't be write-protected in
				 * place here; drop them and restart the walk. */
				rmap_remove(kvm, spte);
				--kvm->stat.lpages;
				__set_spte(spte, shadow_trap_nonpresent_pte);
				spte = NULL;
				write_protected = 1;
			}
			spte = rmap_next(kvm, rmapp, spte);
		}
	}

	return write_protected;
}

Frederik Deweerdt's avatar
Frederik Deweerdt committed
725 726
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
			   unsigned long data)
727 728 729 730 731 732 733 734
{
	u64 *spte;
	int need_tlb_flush = 0;

	while ((spte = rmap_next(kvm, rmapp, NULL))) {
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
		rmap_remove(kvm, spte);
Avi Kivity's avatar
Avi Kivity committed
735
		__set_spte(spte, shadow_trap_nonpresent_pte);
736 737 738 739 740
		need_tlb_flush = 1;
	}
	return need_tlb_flush;
}

Frederik Deweerdt's avatar
Frederik Deweerdt committed
741 742
/*
 * mmu-notifier change_pte handler: @data carries a pte_t* for the new host
 * pte.  Writable host ptes force a zap; read-only ones are rewritten in
 * place with the new pfn and write permission removed.  Always returns 0.
 */
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
			     unsigned long data)
{
	int need_flush = 0;
	u64 *spte, new_spte;
	pte_t *ptep = (pte_t *)data;
	pfn_t new_pfn;

	WARN_ON(pte_huge(*ptep));
	new_pfn = pte_pfn(*ptep);
	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		BUG_ON(!is_shadow_present_pte(*spte));
		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
		need_flush = 1;
		if (pte_write(*ptep)) {
			/* Removal shortens the list: restart from the head. */
			rmap_remove(kvm, spte);
			__set_spte(spte, shadow_trap_nonpresent_pte);
			spte = rmap_next(kvm, rmapp, NULL);
		} else {
			/* Retarget the spte at the new pfn, read-only. */
			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
			new_spte |= (u64)new_pfn << PAGE_SHIFT;

			new_spte &= ~PT_WRITABLE_MASK;
			new_spte &= ~SPTE_HOST_WRITEABLE;
			if (is_writable_pte(*spte))
				kvm_set_pfn_dirty(spte_to_pfn(*spte));
			__set_spte(spte, new_spte);
			spte = rmap_next(kvm, rmapp, spte);
		}
	}
	if (need_flush)
		kvm_flush_remote_tlbs(kvm);

	return 0;
}

Frederik Deweerdt's avatar
Frederik Deweerdt committed
778 779
/*
 * Apply @handler to the rmap chains (all page sizes) of every memslot gfn
 * backed by host address @hva, passing @data through.  Returns the OR of
 * the handler results.
 */
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  unsigned long data,
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
					 unsigned long data))
{
	int i, j;
	int ret;
	int retval = 0;
	struct kvm_memslots *slots;

	slots = kvm_memslots(kvm);

	for (i = 0; i < slots->nmemslots; i++) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];
		unsigned long start = memslot->userspace_addr;
		unsigned long end;

		end = start + (memslot->npages << PAGE_SHIFT);
		if (hva >= start && hva < end) {
			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;

			/* 4K-level rmap chain first ... */
			ret = handler(kvm, &memslot->rmap[gfn_offset], data);

			/* ... then each large-page level's chain. */
			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
				int idx = gfn_offset;
				idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
				ret |= handler(kvm,
					&memslot->lpage_info[j][idx].rmap_pde,
					data);
			}
			trace_kvm_age_page(hva, memslot, ret);
			retval |= ret;
		}
	}

	return retval;
}

/* mmu-notifier entry point: zap all sptes backed by host address @hva. */
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}

/*
 * mmu-notifier change_pte entry point: rewrite sptes backed by @hva to
 * follow the new host pte (passed to the handler via the data argument).
 */
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}

Frederik Deweerdt's avatar
Frederik Deweerdt committed
826 827
/*
 * Aging handler: report (and clear) the accessed state of sptes on @rmapp.
 * Returns nonzero if any spte was recently accessed.
 */
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			 unsigned long data)
{
	u64 *spte;
	int young = 0;

	/*
	 * Emulate the accessed bit for EPT, by checking if this page has
	 * an EPT mapping, and clearing it if it does. On the next access,
	 * a new EPT mapping will be established.
	 * This has some overhead, but not as much as the cost of swapping
	 * out actively used pages or breaking up actively used hugepages.
	 */
	if (!shadow_accessed_mask)
		return kvm_unmap_rmapp(kvm, rmapp, data);

	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		int _young;
		u64 _spte = *spte;
		BUG_ON(!(_spte & PT_PRESENT_MASK));
		_young = _spte & PT_ACCESSED_MASK;
		if (_young) {
			young = 1;
			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
		}
		spte = rmap_next(kvm, rmapp, spte);
	}
	return young;
}

857 858
#define RMAP_RECYCLE_THRESHOLD 1000

859
/*
 * Zap all sptes mapping @gfn at @spte's level and flush TLBs; used when a
 * gfn's rmap list grows past RMAP_RECYCLE_THRESHOLD.
 */
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	unsigned long *rmapp;
	struct kvm_mmu_page *sp;

	sp = page_header(__pa(spte));

	gfn = unalias_gfn(vcpu->kvm, gfn);
	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);

	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
	kvm_flush_remote_tlbs(vcpu->kvm);
}

873 874
/* mmu-notifier entry point: test-and-clear "young" for mappings of @hva. */
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
}

878
#ifdef MMU_DEBUG
/*
 * Debug check: return 1 iff no entry in the shadow page table at @spt is
 * shadow-present; logs the first offending entry otherwise.
 */
static int is_empty_shadow_page(u64 *spt)
{
	u64 *pos = spt;
	u64 *end = spt + PAGE_SIZE / sizeof(u64);

	while (pos != end) {
		if (is_shadow_present_pte(*pos)) {
			printk(KERN_ERR "%s: %p %llx\n", __func__,
			       pos, *pos);
			return 0;
		}
		pos++;
	}
	return 1;
}
#endif
Avi Kivity's avatar
Avi Kivity committed
893

894
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
895