/*
 * QEMU KVM support
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/utsname.h>

#include <linux/kvm.h>
#include <linux/kvm_para.h>

#include "qemu-common.h"
#include "sysemu/sysemu.h"
#include "sysemu/kvm.h"
#include "kvm_i386.h"
#include "cpu.h"
#include "exec/gdbstub.h"
#include "qemu/host-utils.h"
#include "qemu/config-file.h"
#include "hw/i386/pc.h"
#include "hw/i386/apic.h"
#include "exec/ioport.h"
#include <asm/hyperv.h>
#include "hw/pci/pci.h"

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

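/* Legacy kvmclock MSR numbers, re-defined here in case the kernel headers
 * being built against are too old to provide them. */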
#define MSR_KVM_WALL_CLOCK  0x11
#define MSR_KVM_SYSTEM_TIME 0x12

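/* SIGBUS si_code values for machine-check errors: AR means "action
 * required" (a synchronous fault), AO "action optional" (an asynchronous
 * scrubbing hit).  Defined locally for older kernel headers. */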
#ifndef BUS_MCEERR_AR
#define BUS_MCEERR_AR 4
#endif
#ifndef BUS_MCEERR_AO
#define BUS_MCEERR_AO 5
#endif

const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
    KVM_CAP_INFO(SET_TSS_ADDR),
    KVM_CAP_INFO(EXT_CPUID),
    KVM_CAP_INFO(MP_STATE),
    KVM_CAP_LAST_INFO
};

static bool has_msr_star;
static bool has_msr_hsave_pa;
static bool has_msr_tsc_adjust;
static bool has_msr_tsc_deadline;
static bool has_msr_feature_control;
static bool has_msr_async_pf_en;
static bool has_msr_pv_eoi_en;
static bool has_msr_misc_enable;
static bool has_msr_bndcfgs;
static bool has_msr_kvm_steal_time;
static int lm_capable_kernel;
static bool has_msr_hv_hypercall;
static bool has_msr_hv_vapic;
static bool has_msr_hv_tsc;

static bool has_msr_architectural_pmu;
static uint32_t num_architectural_pmu_counters;

bool kvm_allows_irq0_override(void)
{
    return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
}

static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
{
    struct kvm_cpuid2 *cpuid;
    int r, size;

    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
    cpuid = (struct kvm_cpuid2 *)g_malloc0(size);
    cpuid->nent = max;
    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
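    /* A table that comes back exactly full may have been truncated, so
     * treat that case as "buffer too small" and make the caller retry
     * with a larger allocation. */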
    if (r == 0 && cpuid->nent >= max) {
        r = -E2BIG;
    }
    if (r < 0) {
        if (r == -E2BIG) {
            g_free(cpuid);
            return NULL;
        } else {
            fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
                    strerror(-r));
            exit(1);
        }
    }
    return cpuid;
}

/* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
 * for all entries.
 */
static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
{
    struct kvm_cpuid2 *cpuid;
    int max = 1;
    while ((cpuid = try_get_cpuid(s, max)) == NULL) {
        max *= 2;
    }
    return cpuid;
}

struct kvm_para_features {
    int cap;
    int feature;
} para_features[] = {
    { KVM_CAP_CLOCKSOURCE, KVM_FEATURE_CLOCKSOURCE },
    { KVM_CAP_NOP_IO_DELAY, KVM_FEATURE_NOP_IO_DELAY },
    { KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
    { KVM_CAP_ASYNC_PF, KVM_FEATURE_ASYNC_PF },
    { -1, -1 }
};

static int get_para_features(KVMState *s)
{
    int i, features = 0;

    for (i = 0; i < ARRAY_SIZE(para_features) - 1; i++) {
        if (kvm_check_extension(s, para_features[i].cap)) {
            features |= (1 << para_features[i].feature);
        }
    }

    return features;
}


/* Returns the value for a specific register on the cpuid entry
 */
static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
{
    uint32_t ret = 0;
    switch (reg) {
    case R_EAX:
        ret = entry->eax;
        break;
    case R_EBX:
        ret = entry->ebx;
        break;
    case R_ECX:
        ret = entry->ecx;
        break;
    case R_EDX:
        ret = entry->edx;
        break;
    }
    return ret;
}

/* Find matching entry for function/index on kvm_cpuid2 struct
 */
static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
                                                 uint32_t function,
                                                 uint32_t index)
{
    int i;
    for (i = 0; i < cpuid->nent; ++i) {
        if (cpuid->entries[i].function == function &&
            cpuid->entries[i].index == index) {
            return &cpuid->entries[i];
        }
    }
    /* not found: */
    return NULL;
}

uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
                                      uint32_t index, int reg)
{
    struct kvm_cpuid2 *cpuid;
    uint32_t ret = 0;
    uint32_t cpuid_1_edx;
    bool found = false;

    cpuid = get_supported_cpuid(s);

    struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
    if (entry) {
        found = true;
        ret = cpuid_entry_get_reg(entry, reg);
    }

    /* Fixups for the data returned by KVM, below */

    if (function == 1 && reg == R_EDX) {
        /* KVM before 2.6.30 misreports the following features */
        ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
    } else if (function == 1 && reg == R_ECX) {
        /* We can set the hypervisor flag, even if KVM does not return it on
         * GET_SUPPORTED_CPUID
         */
        ret |= CPUID_EXT_HYPERVISOR;
        /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
         * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
         * and the irqchip is in the kernel.
         */
        if (kvm_irqchip_in_kernel() &&
                kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
            ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
        }

        /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
         * without the in-kernel irqchip
         */
        if (!kvm_irqchip_in_kernel()) {
            ret &= ~CPUID_EXT_X2APIC;
        }
    } else if (function == 0x80000001 && reg == R_EDX) {
        /* On Intel, kvm returns cpuid according to the Intel spec,
         * so add missing bits according to the AMD spec:
         */
        cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
        ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
    }

    g_free(cpuid);

    /* fallback for older kernels */
    if ((function == KVM_CPUID_FEATURES) && !found) {
        ret = get_para_features(s);
    }

    return ret;
}

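/* Guest pages that took a hardware memory error.  The list lets
 * kvm_unpoison_all() remap them with fresh pages on reset, so the guest
 * sees usable RAM again after a reboot. */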
typedef struct HWPoisonPage {
    ram_addr_t ram_addr;
    QLIST_ENTRY(HWPoisonPage) list;
} HWPoisonPage;

static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
    QLIST_HEAD_INITIALIZER(hwpoison_page_list);

static void kvm_unpoison_all(void *param)
{
    HWPoisonPage *page, *next_page;

    QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
        QLIST_REMOVE(page, list);
        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
        g_free(page);
    }
}

static void kvm_hwpoison_page_add(ram_addr_t ram_addr)
{
    HWPoisonPage *page;

    QLIST_FOREACH(page, &hwpoison_page_list, list) {
        if (page->ram_addr == ram_addr) {
            return;
        }
    }
    page = g_malloc(sizeof(HWPoisonPage));
    page->ram_addr = ram_addr;
    QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
}

static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
                                     int *max_banks)
{
    int r;

    r = kvm_check_extension(s, KVM_CAP_MCE);
    if (r > 0) {
        *max_banks = r;
        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
    }
    return -ENOSYS;
}

static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
{
    CPUX86State *env = &cpu->env;
    uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
                      MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
    uint64_t mcg_status = MCG_STATUS_MCIP;

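    /* The magic numbers below are MCA error codes from the Intel SDM:
     * 0x134 is an SRAR data-load error, 0xc0 an SRAO memory-scrubbing
     * error. */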
    if (code == BUS_MCEERR_AR) {
        status |= MCI_STATUS_AR | 0x134;
        mcg_status |= MCG_STATUS_EIPV;
    } else {
        status |= 0xc0;
        mcg_status |= MCG_STATUS_RIPV;
    }
    cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
                       (MCM_ADDR_PHYS << 6) | 0xc,
                       cpu_x86_support_mca_broadcast(env) ?
                       MCE_INJECT_BROADCAST : 0);
}

static void hardware_memory_error(void)
{
    fprintf(stderr, "Hardware memory error!\n");
    exit(1);
}

int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
{
    X86CPU *cpu = X86_CPU(c);
    CPUX86State *env = &cpu->env;
    ram_addr_t ram_addr;
    hwaddr paddr;

    if ((env->mcg_cap & MCG_SER_P) && addr
        && (code == BUS_MCEERR_AR || code == BUS_MCEERR_AO)) {
        if (qemu_ram_addr_from_host(addr, &ram_addr) == NULL ||
            !kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
            fprintf(stderr, "Hardware memory error for memory used by "
                    "QEMU itself instead of guest system!\n");
            /* Hope we are lucky for AO MCE */
            if (code == BUS_MCEERR_AO) {
                return 0;
            } else {
                hardware_memory_error();
            }
        }
        kvm_hwpoison_page_add(ram_addr);
        kvm_mce_inject(cpu, paddr, code);
    } else {
        if (code == BUS_MCEERR_AO) {
            return 0;
        } else if (code == BUS_MCEERR_AR) {
            hardware_memory_error();
        } else {
            return 1;
        }
    }
    return 0;
}

int kvm_arch_on_sigbus(int code, void *addr)
{
    X86CPU *cpu = X86_CPU(first_cpu);

    if ((cpu->env.mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) {
        ram_addr_t ram_addr;
        hwaddr paddr;

        /* Hope we are lucky for AO MCE */
        if (qemu_ram_addr_from_host(addr, &ram_addr) == NULL ||
            !kvm_physical_memory_addr_from_host(first_cpu->kvm_state,
                                                addr, &paddr)) {
            fprintf(stderr, "Hardware memory error for memory used by "
                    "QEMU itself instead of guest system!: %p\n", addr);
            return 0;
        }
        kvm_hwpoison_page_add(ram_addr);
        kvm_mce_inject(X86_CPU(first_cpu), paddr, code);
    } else {
        if (code == BUS_MCEERR_AO) {
            return 0;
        } else if (code == BUS_MCEERR_AR) {
            hardware_memory_error();
        } else {
            return 1;
        }
    }
    return 0;
}

static int kvm_inject_mce_oldstyle(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;

    if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
        unsigned int bank, bank_num = env->mcg_cap & 0xff;
        struct kvm_x86_mce mce;

        env->exception_injected = -1;

        /*
         * There must be at least one bank in use if an MCE is pending.
         * Find it and use its values for the event injection.
         */
        for (bank = 0; bank < bank_num; bank++) {
            if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
                break;
            }
        }
        assert(bank < bank_num);

        mce.bank = bank;
        mce.status = env->mce_banks[bank * 4 + 1];
        mce.mcg_status = env->mcg_status;
        mce.addr = env->mce_banks[bank * 4 + 2];
        mce.misc = env->mce_banks[bank * 4 + 3];

        return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce);
    }
    return 0;
}

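/* When the VM starts running again, discard the TSC value that was cached
 * while it was stopped; it will be re-read from KVM on the next sync. */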
static void cpu_update_state(void *opaque, int running, RunState state)
{
    CPUX86State *env = opaque;

    if (running) {
        env->tsc_valid = false;
    }
}

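/* KVM identifies a vCPU by its APIC ID, so use that as the vcpu id. */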
unsigned long kvm_arch_vcpu_id(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    return cpu->env.cpuid_apic_id;
}

#ifndef KVM_CPUID_SIGNATURE_NEXT
#define KVM_CPUID_SIGNATURE_NEXT                0x40000100
#endif

static bool hyperv_hypercall_available(X86CPU *cpu)
{
    return cpu->hyperv_vapic ||
           (cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_RETRY);
}

static bool hyperv_enabled(X86CPU *cpu)
{
    CPUState *cs = CPU(cpu);
    return kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0 &&
           (hyperv_hypercall_available(cpu) ||
            cpu->hyperv_time ||
            cpu->hyperv_relaxed_timing);
}

#define KVM_MAX_CPUID_ENTRIES  100

int kvm_arch_init_vcpu(CPUState *cs)
{
    struct {
        struct kvm_cpuid2 cpuid;
        struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
    } QEMU_PACKED cpuid_data;
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint32_t limit, i, j, cpuid_i;
    uint32_t unused;
    struct kvm_cpuid_entry2 *c;
    uint32_t signature[3];
    int kvm_base = KVM_CPUID_SIGNATURE;
    int r;

    memset(&cpuid_data, 0, sizeof(cpuid_data));

    cpuid_i = 0;

    /* Paravirtualization CPUIDs */
    if (hyperv_enabled(cpu)) {
        c = &cpuid_data.entries[cpuid_i++];
        c->function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
        memcpy(signature, "Microsoft Hv", 12);
        c->eax = HYPERV_CPUID_MIN;
        c->ebx = signature[0];
        c->ecx = signature[1];
        c->edx = signature[2];

        c = &cpuid_data.entries[cpuid_i++];
        c->function = HYPERV_CPUID_INTERFACE;
        memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
        c->eax = signature[0];
        c->ebx = 0;
        c->ecx = 0;
        c->edx = 0;

        c = &cpuid_data.entries[cpuid_i++];
        c->function = HYPERV_CPUID_VERSION;
        c->eax = 0x00001bbc;
        c->ebx = 0x00060001;

        c = &cpuid_data.entries[cpuid_i++];
        c->function = HYPERV_CPUID_FEATURES;
        if (cpu->hyperv_relaxed_timing) {
            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
        }
        if (cpu->hyperv_vapic) {
            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
            c->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE;
            has_msr_hv_vapic = true;
        }
        if (cpu->hyperv_time &&
            kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
            c->eax |= HV_X64_MSR_TIME_REF_COUNT_AVAILABLE;
            c->eax |= 0x200;
            has_msr_hv_tsc = true;
        }
        c = &cpuid_data.entries[cpuid_i++];
        c->function = HYPERV_CPUID_ENLIGHTMENT_INFO;
        if (cpu->hyperv_relaxed_timing) {
            c->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
        }
        if (has_msr_hv_vapic) {
            c->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
        }
        c->ebx = cpu->hyperv_spinlock_attempts;

        c = &cpuid_data.entries[cpuid_i++];
        c->function = HYPERV_CPUID_IMPLEMENT_LIMITS;
        c->eax = 0x40;
        c->ebx = 0x40;

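        /* With Hyper-V occupying the 0x40000000 leaves, the KVM signature
         * moves up to the next probe location, 0x40000100. */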
        kvm_base = KVM_CPUID_SIGNATURE_NEXT;
        has_msr_hv_hypercall = true;
    }

    memcpy(signature, "KVMKVMKVM\0\0\0", 12);
    c = &cpuid_data.entries[cpuid_i++];
    c->function = KVM_CPUID_SIGNATURE | kvm_base;
    c->eax = 0;
    c->ebx = signature[0];
    c->ecx = signature[1];
    c->edx = signature[2];

    c = &cpuid_data.entries[cpuid_i++];
    c->function = KVM_CPUID_FEATURES | kvm_base;
    c->eax = env->features[FEAT_KVM];

    has_msr_async_pf_en = c->eax & (1 << KVM_FEATURE_ASYNC_PF);

    has_msr_pv_eoi_en = c->eax & (1 << KVM_FEATURE_PV_EOI);

    has_msr_kvm_steal_time = c->eax & (1 << KVM_FEATURE_STEAL_TIME);

    cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);

    for (i = 0; i <= limit; i++) {
        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
            fprintf(stderr, "unsupported level value: 0x%x\n", limit);
            abort();
        }
        c = &cpuid_data.entries[cpuid_i++];

        switch (i) {
        case 2: {
            /* Keep reading function 2 till all the input is received */
            int times;

            c->function = i;
            c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
                       KVM_CPUID_FLAG_STATE_READ_NEXT;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            times = c->eax & 0xff;

            for (j = 1; j < times; ++j) {
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    fprintf(stderr, "cpuid_data is full, no space for "
                            "cpuid(eax:2):eax & 0xf = 0x%x\n", times);
                    abort();
                }
                c = &cpuid_data.entries[cpuid_i++];
                c->function = i;
                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
                cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            }
            break;
        }
        case 4:
        case 0xb:
        case 0xd:
            for (j = 0; ; j++) {
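                /* Leaf 0xd (XSAVE) has at most 64 subleaves, one per
                 * possible XCR0 state-component bit. */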
                if (i == 0xd && j == 64) {
                    break;
                }
                c->function = i;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                c->index = j;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);

                if (i == 4 && c->eax == 0) {
                    break;
                }
                if (i == 0xb && !(c->ecx & 0xff00)) {
                    break;
                }
                if (i == 0xd && c->eax == 0) {
                    continue;
                }
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    fprintf(stderr, "cpuid_data is full, no space for "
                            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
                    abort();
                }
                c = &cpuid_data.entries[cpuid_i++];
            }
            break;
        default:
            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            break;
        }
    }

    if (limit >= 0x0a) {
        uint32_t ver;

        cpu_x86_cpuid(env, 0x0a, 0, &ver, &unused, &unused, &unused);
        if ((ver & 0xff) > 0) {
            has_msr_architectural_pmu = true;
            num_architectural_pmu_counters = (ver & 0xff00) >> 8;

            /* Shouldn't be more than 32, since that's the number of bits
             * available in EBX to tell us _which_ counters are available.
             * Play it safe.
             */
            if (num_architectural_pmu_counters > MAX_GP_COUNTERS) {
                num_architectural_pmu_counters = MAX_GP_COUNTERS;
            }
        }
    }

    cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);

    for (i = 0x80000000; i <= limit; i++) {
        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
            fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
            abort();
        }
        c = &cpuid_data.entries[cpuid_i++];

        c->function = i;
        c->flags = 0;
        cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
    }

    /* Call Centaur's CPUID instructions if they are supported. */
    if (env->cpuid_xlevel2 > 0) {
        cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);

        for (i = 0xC0000000; i <= limit; i++) {
            if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit);
                abort();
            }
            c = &cpuid_data.entries[cpuid_i++];

            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
        }
    }

    cpuid_data.cpuid.nent = cpuid_i;

    if (((env->cpuid_version >> 8) & 0xF) >= 6
        && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
           (CPUID_MCE | CPUID_MCA)
        && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
        uint64_t mcg_cap;
        int banks;
        int ret;

        ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
        if (ret < 0) {
            fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
            return ret;
        }

        if (banks > MCE_BANKS_DEF) {
            banks = MCE_BANKS_DEF;
        }
        mcg_cap &= MCE_CAP_DEF;
        mcg_cap |= banks;
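        /* Enable MCE reporting for this vCPU with the trimmed mask. */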
        ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &mcg_cap);
        if (ret < 0) {
            fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
            return ret;
        }

        env->mcg_cap = mcg_cap;
695 696
    }

697 698
    qemu_add_vm_change_state_handler(cpu_update_state, env);

699 700 701 702 703 704
    c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
    if (c) {
        has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
                                  !!(c->ecx & CPUID_EXT_SMX);
    }

    cpuid_data.cpuid.padding = 0;
    r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
    if (r) {
        return r;
    }

    r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL);
    if (r && env->tsc_khz) {
        r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz);
        if (r < 0) {
            fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
            return r;
        }
    }

    if (kvm_has_xsave()) {
        env->kvm_xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
    }

    return 0;
}

void kvm_arch_reset_vcpu(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->exception_injected = -1;
    env->interrupt_injected = -1;
    env->xcr0 = 1;
    if (kvm_irqchip_in_kernel()) {
        env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
                                          KVM_MP_STATE_UNINITIALIZED;
    } else {
        env->mp_state = KVM_MP_STATE_RUNNABLE;
    }
}

static int kvm_get_supported_msrs(KVMState *s)
{
    static int kvm_supported_msrs;
    int ret = 0;

    /* first time */
    if (kvm_supported_msrs == 0) {
        struct kvm_msr_list msr_list, *kvm_msr_list;

        kvm_supported_msrs = -1;

        /* Obtain MSR list from KVM.  These are the MSRs that we must
         * save/restore */
        msr_list.nmsrs = 0;
        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
        if (ret < 0 && ret != -E2BIG) {
            return ret;
        }
        /* Old kernel modules had a bug and could write beyond the provided
           memory. Allocate at least a safe amount of 1K. */
        kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
                                              msr_list.nmsrs *
                                              sizeof(msr_list.indices[0])));

        kvm_msr_list->nmsrs = msr_list.nmsrs;
        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
        if (ret >= 0) {
            int i;

            for (i = 0; i < kvm_msr_list->nmsrs; i++) {
                if (kvm_msr_list->indices[i] == MSR_STAR) {
                    has_msr_star = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_VM_HSAVE_PA) {
                    has_msr_hsave_pa = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_TSC_ADJUST) {
                    has_msr_tsc_adjust = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_IA32_TSCDEADLINE) {
                    has_msr_tsc_deadline = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_IA32_MISC_ENABLE) {
                    has_msr_misc_enable = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_IA32_BNDCFGS) {
                    has_msr_bndcfgs = true;
                    continue;
                }
            }
        }

        g_free(kvm_msr_list);
    }

    return ret;
}

int kvm_arch_init(KVMState *s)
{
    uint64_t identity_base = 0xfffbc000;
    uint64_t shadow_mem;
    int ret;
    struct utsname utsname;

    ret = kvm_get_supported_msrs(s);
    if (ret < 0) {
        return ret;
    }

    uname(&utsname);
    lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;

    /*
     * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
     * In order to use vm86 mode, an EPT identity map and a TSS are needed.
     * Since these must be part of guest physical memory, we need to allocate
     * them, both by setting their start addresses in the kernel and by
     * creating a corresponding e820 entry. We need 4 pages before the BIOS.
     *
     * Older KVM versions may not support setting the identity map base. In
     * that case we need to stick with the default, i.e. a 256K maximum BIOS
     * size.
     */
    if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
        /* Allows up to 16M BIOSes. */
        identity_base = 0xfeffc000;

        ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
        if (ret < 0) {
            return ret;
        }
    }

    /* Set TSS base one page after EPT identity map. */
    ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
    if (ret < 0) {
        return ret;
    }

    /* Tell fw_cfg to notify the BIOS to reserve the range. */
    ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
    if (ret < 0) {
        fprintf(stderr, "e820_add_entry() table is full\n");
        return ret;
    }
    qemu_register_reset(kvm_unpoison_all, NULL);

    shadow_mem = qemu_opt_get_size(qemu_get_machine_opts(),
                                   "kvm_shadow_mem", -1);
    if (shadow_mem != -1) {
        shadow_mem /= 4096;
        ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
        if (ret < 0) {
            return ret;
863 864
        }
    }
865
    return 0;
866
}
867

868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891
static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = 3;
    lhs->present = 1;
    lhs->dpl = 3;
    lhs->db = 0;
    lhs->s = 1;
    lhs->l = 0;
    lhs->g = 0;
    lhs->avl = 0;
    lhs->unusable = 0;
}

static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    unsigned flags = rhs->flags;
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
    lhs->present = (flags & DESC_P_MASK) != 0;
892
    lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
893 894 895 896 897 898
    lhs->db = (flags >> DESC_B_SHIFT) & 1;
    lhs->s = (flags & DESC_S_MASK) != 0;
    lhs->l = (flags >> DESC_L_SHIFT) & 1;
    lhs->g = (flags & DESC_G_MASK) != 0;
    lhs->avl = (flags & DESC_AVL_MASK) != 0;
    lhs->unusable = 0;
899
    lhs->padding = 0;
900 901 902 903 904 905 906
}

static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
907 908 909 910 911 912 913 914
    lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
                 (rhs->present * DESC_P_MASK) |
                 (rhs->dpl << DESC_DPL_SHIFT) |
                 (rhs->db << DESC_B_SHIFT) |
                 (rhs->s * DESC_S_MASK) |
                 (rhs->l << DESC_L_SHIFT) |
                 (rhs->g * DESC_G_MASK) |
                 (rhs->avl * DESC_AVL_MASK);
915 916 917 918
}

static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
{
919
    if (set) {
920
        *kvm_reg = *qemu_reg;
921
    } else {
922
        *qemu_reg = *kvm_reg;
923
    }
924 925
}

926
static int kvm_getput_regs(X86CPU *cpu, int set)
927
{
928
    CPUX86State *env = &cpu->env;
929 930 931 932
    struct kvm_regs regs;
    int ret = 0;

    if (!set) {
933
        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
934
        if (ret < 0) {
935
            return ret;
936
        }
937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960
    }

    kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
    kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
    kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
    kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
    kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
    kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
    kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
    kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
#ifdef TARGET_X86_64
    kvm_getput_reg(&regs.r8, &env->regs[8], set);
    kvm_getput_reg(&regs.r9, &env->regs[9], set);
    kvm_getput_reg(&regs.r10, &env->regs[10], set);
    kvm_getput_reg(&regs.r11, &env->regs[11], set);
    kvm_getput_reg(&regs.r12, &env->regs[12], set);
    kvm_getput_reg(&regs.r13, &env->regs[13], set);
    kvm_getput_reg(&regs.r14, &env->regs[14], set);
    kvm_getput_reg(&regs.r15, &env->regs[15], set);
#endif

    kvm_getput_reg(&regs.rflags, &env->eflags, set);
    kvm_getput_reg(&regs.rip, &env->eip, set);

    if (set) {
        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
    }

    return ret;
}

static int kvm_put_fpu(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_fpu fpu;
    int i;

    memset(&fpu, 0, sizeof fpu);
    fpu.fsw = env->fpus & ~(7 << 11);
    fpu.fsw |= (env->fpstt & 7) << 11;
    fpu.fcw = env->fpuc;
    fpu.last_opcode = env->fpop;
    fpu.last_ip = env->fpip;
    fpu.last_dp = env->fpdp;
    for (i = 0; i < 8; ++i) {
        fpu.ftwx |= (!env->fptags[i]) << i;
    }
    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
    memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
    fpu.mxcsr = env->mxcsr;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
}

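/* Offsets, in 32-bit words, into the 4K struct kvm_xsave.region buffer,
 * whose layout mirrors the hardware XSAVE area. */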
#define XSAVE_FCW_FSW     0
#define XSAVE_FTW_FOP     1
#define XSAVE_CWD_RIP     2
#define XSAVE_CWD_RDP     4
#define XSAVE_MXCSR       6
#define XSAVE_ST_SPACE    8
#define XSAVE_XMM_SPACE   40
#define XSAVE_XSTATE_BV   128
#define XSAVE_YMMH_SPACE  144
#define XSAVE_BNDREGS     240
#define XSAVE_BNDCSR      256

static int kvm_put_xsave(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_xsave* xsave = env->kvm_xsave_buf;
    uint16_t cwd, swd, twd;
    int i, r;

    if (!kvm_has_xsave()) {
        return kvm_put_fpu(cpu);
    }

    memset(xsave, 0, sizeof(struct kvm_xsave));
    twd = 0;
    swd = env->fpus & ~(7 << 11);
    swd |= (env->fpstt & 7) << 11;
    cwd = env->fpuc;
    for (i = 0; i < 8; ++i) {
        twd |= (!env->fptags[i]) << i;
    }
    xsave->region[XSAVE_FCW_FSW] = (uint32_t)(swd << 16) + cwd;
    xsave->region[XSAVE_FTW_FOP] = (uint32_t)(env->fpop << 16) + twd;
    memcpy(&xsave->region[XSAVE_CWD_RIP], &env->fpip, sizeof(env->fpip));
    memcpy(&xsave->region[XSAVE_CWD_RDP], &env->fpdp, sizeof(env->fpdp));
    memcpy(&xsave->region[XSAVE_ST_SPACE], env->fpregs,
            sizeof env->fpregs);
    memcpy(&xsave->region[XSAVE_XMM_SPACE], env->xmm_regs,
            sizeof env->xmm_regs);
    xsave->region[XSAVE_MXCSR] = env->mxcsr;
    *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV] = env->xstate_bv;
    memcpy(&xsave->region[XSAVE_YMMH_SPACE], env->ymmh_regs,
            sizeof env->ymmh_regs);
    memcpy(&xsave->region[XSAVE_BNDREGS], env->bnd_regs,
            sizeof env->bnd_regs);
    memcpy(&xsave->region[XSAVE_BNDCSR], &env->bndcs_regs,
            sizeof(env->bndcs_regs));
    r = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
    return r;
}

static int kvm_put_xcrs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_xcrs xcrs;

    if (!kvm_has_xcrs()) {
        return 0;
    }

    xcrs.nr_xcrs = 1;
    xcrs.flags = 0;
    xcrs.xcrs[0].xcr = 0;
    xcrs.xcrs[0].value = env->xcr0;
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
}

static int kvm_put_sregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_sregs sregs;

    memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
    if (env->interrupt_injected >= 0) {
        sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
                (uint64_t)1 << (env->interrupt_injected % 64);
    }

    if ((env->eflags & VM_MASK)) {
        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
    } else {
        set_seg(&sregs.cs, &env->segs[R_CS]);
        set_seg(&sregs.ds, &env->segs[R_DS]);
        set_seg(&sregs.es, &env->segs[R_ES]);
        set_seg(&sregs.fs, &env->segs[R_FS]);
        set_seg(&sregs.gs, &env->segs[R_GS]);
        set_seg(&sregs.ss, &env->segs[R_SS]);
    }

    set_seg(&sregs.tr, &env->tr);
    set_seg(&sregs.ldt, &env->ldt);

    sregs.idt.limit = env->idt.limit;
    sregs.idt.base = env->idt.base;
    memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
    sregs.gdt.limit = env->gdt.limit;
    sregs.gdt.base = env->gdt.base;
    memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);

    sregs.cr0 = env->cr[0];
    sregs.cr2 = env->cr[2];
    sregs.cr3 = env->cr[3];
    sregs.cr4 = env->cr[4];

    sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
    sregs.apic_base = cpu_get_apic_base(cpu->apic_state);

    sregs.efer = env->efer;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
}

static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
                              uint32_t index, uint64_t value)
{
    entry->index = index;
    entry->data = value;
}

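/* The TSC-deadline MSR gets its own KVM_SET_MSRS call so that it is
 * written after the main batch, in particular after MSR_IA32_TSC, which
 * the deadline value is evaluated against. */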
static int kvm_put_tscdeadline_msr(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[1];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;

    if (!has_msr_tsc_deadline) {
        return 0;
    }

    kvm_msr_entry_set(&msrs[0], MSR_IA32_TSCDEADLINE, env->tsc_deadline);

    msr_data.info.nmsrs = 1;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
}

/*
 * Provide a separate write service for the feature control MSR in order to
 * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
 * before writing any other state because forcibly leaving nested mode
 * invalidates the VCPU state.
 */
static int kvm_put_msr_feature_control(X86CPU *cpu)
{
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entry;
    } msr_data;

    kvm_msr_entry_set(&msr_data.entry, MSR_IA32_FEATURE_CONTROL,
                      cpu->env.msr_ia32_feature_control);
    msr_data.info.nmsrs = 1;
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
}

static int kvm_put_msrs(X86CPU *cpu, int level)
{
    CPUX86State *env = &cpu->env;
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[100];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;
    int n = 0, i;

    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
    kvm_msr_entry_set(&msrs[n++], MSR_PAT, env->pat);
    if (has_msr_star) {
        kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
    }
    if (has_msr_hsave_pa) {
        kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
    }
    if (has_msr_tsc_adjust) {
        kvm_msr_entry_set(&msrs[n++], MSR_TSC_ADJUST, env->tsc_adjust);
    }
    if (has_msr_misc_enable) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_MISC_ENABLE,
                          env->msr_ia32_misc_enable);
    }
    if (has_msr_bndcfgs) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
    }
#ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
        kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
        kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
        kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
    }
#endif
    /*
     * The following MSRs have side effects on the guest or are too heavy
     * for normal writeback. Limit them to reset or full state updates.
     */
    if (level >= KVM_PUT_RESET_STATE) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
                          env->system_time_msr);
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
        if (has_msr_async_pf_en) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_ASYNC_PF_EN,
                              env->async_pf_en_msr);
        }
        if (has_msr_pv_eoi_en) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_PV_EOI_EN,
                              env->pv_eoi_en_msr);
        }
        if (has_msr_kvm_steal_time) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_STEAL_TIME,
                              env->steal_time_msr);
        }
        if (has_msr_architectural_pmu) {
            /* Stop the counter.  */
            kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
            kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_GLOBAL_CTRL, 0);

            /* Set the counter values.  */
            for (i = 0; i < MAX_FIXED_COUNTERS; i++) {
                kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_FIXED_CTR0 + i,
                                  env->msr_fixed_counters[i]);
            }
            for (i = 0; i < num_architectural_pmu_counters; i++) {
                kvm_msr_entry_set(&msrs[n++], MSR_P6_PERFCTR0 + i,
                                  env->msr_gp_counters[i]);
                kvm_msr_entry_set(&msrs[n++], MSR_P6_EVNTSEL0 + i,
                                  env->msr_gp_evtsel[i]);
            }
            kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_GLOBAL_STATUS,
                              env->msr_global_status);
            kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_GLOBAL_OVF_CTRL,
                              env->msr_global_ovf_ctrl);

            /* Now start the PMU.  */
            kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_FIXED_CTR_CTRL,
                              env->msr_fixed_ctr_ctrl);
            kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_GLOBAL_CTRL,
                              env->msr_global_ctrl);
        }
        if (has_msr_hv_hypercall) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_GUEST_OS_ID,
                              env->msr_hv_guest_os_id);
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_HYPERCALL,
                              env->msr_hv_hypercall);
        }
        if (has_msr_hv_vapic) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_APIC_ASSIST_PAGE,
                              env->msr_hv_vapic);
        }
        if (has_msr_hv_tsc) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_REFERENCE_TSC,
                              env->msr_hv_tsc);
        }

        /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
         *       kvm_put_msr_feature_control. */
    }
    if (env->mcg_cap) {
        int i;

        kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
        kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
            kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
1265 1266
        }
    }
1267

1268 1269
    msr_data.info.nmsrs = n;

1270
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
1271 1272 1273 1274

}


1275
static int kvm_get_fpu(X86CPU *cpu)
1276
{
1277
    CPUX86State *env = &cpu->env;
1278 1279 1280
    struct kvm_fpu fpu;
    int i, ret;

1281
    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu);
1282
    if (ret < 0) {
1283
        return ret;
1284
    }
1285 1286 1287 1288

    env->fpstt = (fpu.fsw >> 11) & 7;
    env->fpus = fpu.fsw;
    env->fpuc = fpu.fcw;
1289 1290 1291
    env->fpop = fpu.last_opcode;
    env->fpip = fpu.last_ip;
    env->fpdp = fpu.last_dp;
1292 1293 1294
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((fpu.ftwx >> i) & 1);
    }
1295 1296 1297 1298 1299 1300 1301
    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
    memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
    env->mxcsr = fpu.mxcsr;

    return 0;
}

static int kvm_get_xsave(X86CPU *cpu)
{