/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
    
    
#include <linux/kvm_host.h>
#include "irq.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#include <asm/msr.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/pvclock.h>

    #define MAX_IO_MSRS 256
    
    #define CR0_RESERVED_BITS						\
    	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
    			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
    			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS						\
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
			  | X86_CR4_OSXSAVE				\
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
    
    #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
    
    #define KVM_MAX_MCE_BANKS 32
    #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
    
    
/* EFER defaults:
 * - enable syscall per default because it is emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
    #ifdef CONFIG_X86_64
    static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
    #else
    static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
    #endif
    
    #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
    #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
    
    static void update_cr8_intercept(struct kvm_vcpu *vcpu);
    
    static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
    				    struct kvm_cpuid_entry2 __user *entries);
    
    
    struct kvm_x86_ops *kvm_x86_ops;
    
    EXPORT_SYMBOL_GPL(kvm_x86_ops);
    
    int ignore_msrs = 0;
    module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
    
    
    #define KVM_NR_SHARED_MSRS 16
    
struct kvm_shared_msrs_global {
	int nr;
	u32 msrs[KVM_NR_SHARED_MSRS];
};

struct kvm_shared_msrs {
	struct user_return_notifier urn;
	bool registered;
	struct kvm_shared_msr_values {
		u64 host;
		u64 curr;
	} values[KVM_NR_SHARED_MSRS];
};
    
    static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
    static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
    
    
struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

    u64 __read_mostly host_xcr0;
    
    static inline u32 bit(int bitno)
    {
    	return 1 << (bitno & 31);
    }
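
/*
 * Shared ("user return") MSR handling: kvm_define_shared_msr() registers an
 * MSR slot at module init, kvm_set_shared_msr() writes a guest value and arms
 * a user-return notifier, and kvm_on_user_return() below restores the host
 * value lazily, on the first return to userspace rather than on every vmexit.
 */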
    
    
static void kvm_on_user_return(struct user_return_notifier *urn)
{
	unsigned slot;
	struct kvm_shared_msrs *locals
		= container_of(urn, struct kvm_shared_msrs, urn);
	struct kvm_shared_msr_values *values;

	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
		values = &locals->values[slot];
		if (values->host != values->curr) {
			wrmsrl(shared_msrs_global.msrs[slot], values->host);
			values->curr = values->host;
		}
	}
	locals->registered = false;
	user_return_notifier_unregister(urn);
}
    
    
static void shared_msr_update(unsigned slot, u32 msr)
{
	struct kvm_shared_msrs *smsr;
	u64 value;

	smsr = &__get_cpu_var(shared_msrs);
	/* only read; nobody should be modifying it at this time,
	 * so no locking is needed */
    	if (slot >= shared_msrs_global.nr) {
    		printk(KERN_ERR "kvm: invalid MSR slot!");
    		return;
    	}
    	rdmsrl_safe(msr, &value);
    	smsr->values[slot].host = value;
    	smsr->values[slot].curr = value;
    }
    
void kvm_define_shared_msr(unsigned slot, u32 msr)
{
	if (slot >= shared_msrs_global.nr)
		shared_msrs_global.nr = slot + 1;
	shared_msrs_global.msrs[slot] = msr;
	/* make sure the update to shared_msrs_global is visible before use */
	smp_wmb();
}
    EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
    
static void kvm_shared_msr_cpu_online(void)
{
	unsigned i;

	for (i = 0; i < shared_msrs_global.nr; ++i)
		shared_msr_update(i, shared_msrs_global.msrs[i]);
}

void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);

	if (((value ^ smsr->values[slot].curr) & mask) == 0)
		return;
	smsr->values[slot].curr = value;
	wrmsrl(shared_msrs_global.msrs[slot], value);
	if (!smsr->registered) {
		smsr->urn.on_user_return = kvm_on_user_return;
		user_return_notifier_register(&smsr->urn);
		smsr->registered = true;
	}
}
    EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
    
    
    static void drop_user_return_notifiers(void *ignore)
    {
    	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
    
    	if (smsr->registered)
    		kvm_on_user_return(&smsr->urn);
    }
    
    
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return vcpu->arch.apic_base;
	else
		return vcpu->arch.apic_base;
}
    EXPORT_SYMBOL_GPL(kvm_get_apic_base);
    
    void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
    {
    	/* TODO: reserve bits check */
    	if (irqchip_in_kernel(vcpu->kvm))
    		kvm_lapic_set_base(vcpu, data);
	else
		vcpu->arch.apic_base = data;
}
    EXPORT_SYMBOL_GPL(kvm_set_apic_base);
    
    
    #define EXCPT_BENIGN		0
    #define EXCPT_CONTRIBUTORY	1
    #define EXCPT_PF		2
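
/*
 * Exception classes used by the double-fault rules in
 * kvm_multiple_exception() below: two contributory exceptions, or a page
 * fault followed by anything other than a benign exception, merge into #DF;
 * a fault raised while a #DF is pending escalates to a triple fault and
 * shuts the guest down.
 */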
    
    static int exception_class(int vector)
    {
    	switch (vector) {
    	case PF_VECTOR:
    		return EXCPT_PF;
    	case DE_VECTOR:
    	case TS_VECTOR:
    	case NP_VECTOR:
    	case SS_VECTOR:
    	case GP_VECTOR:
    		return EXCPT_CONTRIBUTORY;
    	default:
    		break;
    	}
    	return EXCPT_BENIGN;
    }
    
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
		unsigned nr, bool has_error, u32 error_code,
		bool reinject)
{
	u32 prev_nr;
	int class1, class2;
    
    	if (!vcpu->arch.exception.pending) {
    	queue:
    		vcpu->arch.exception.pending = true;
    		vcpu->arch.exception.has_error_code = has_error;
    		vcpu->arch.exception.nr = nr;
		vcpu->arch.exception.error_code = error_code;
		vcpu->arch.exception.reinject = reinject;
		return;
    	}
    
	/* an exception is already pending; see how the two combine */
	prev_nr = vcpu->arch.exception.nr;
	if (prev_nr == DF_VECTOR) {
		/* triple fault -> shutdown */
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}
    	class1 = exception_class(prev_nr);
    	class2 = exception_class(nr);
    	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
    		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
    		/* generate double fault per SDM Table 5-5 */
    		vcpu->arch.exception.pending = true;
    		vcpu->arch.exception.has_error_code = true;
    		vcpu->arch.exception.nr = DF_VECTOR;
    		vcpu->arch.exception.error_code = 0;
	} else
		/* replace the previous exception with the new one in the
		   hope that instruction re-execution will regenerate the
		   lost exception */
		goto queue;
    }
    
    
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false);
}
    EXPORT_SYMBOL_GPL(kvm_queue_exception);
    
    
    void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
    {
    	kvm_multiple_exception(vcpu, nr, false, 0, true);
    }
    EXPORT_SYMBOL_GPL(kvm_requeue_exception);
    
    
void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
{
	unsigned error_code = vcpu->arch.fault.error_code;

	vcpu->arch.cr2 = vcpu->arch.fault.address;
	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}
    
    
    void kvm_propagate_fault(struct kvm_vcpu *vcpu)
    {
    	u32 nested, error;
    
    	error   = vcpu->arch.fault.error_code;
    	nested  = error &  PFERR_NESTED_MASK;
	error   = error & ~PFERR_NESTED_MASK;
	vcpu->arch.fault.error_code = error;

    	if (mmu_is_nested(vcpu) && !nested)
    		vcpu->arch.nested_mmu.inject_page_fault(vcpu);
    	else
    		vcpu->arch.mmu.inject_page_fault(vcpu);
    }
    
    
    void kvm_inject_nmi(struct kvm_vcpu *vcpu)
    {
    	vcpu->arch.nmi_pending = 1;
    }
    EXPORT_SYMBOL_GPL(kvm_inject_nmi);
    
    
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false);
}
    EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
    
    
    void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
    {
    	kvm_multiple_exception(vcpu, nr, true, error_code, true);
    }
    EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
    
    
/*
 * Check whether cpl <= required_cpl; if so, return true.  Otherwise queue
 * a #GP and return false.
 */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);
    
/*
 * This function reads from the physical memory of the currently running
 * guest.  Unlike kvm_read_guest_page, it can read either plain guest
 * physical memory or, through a nested MMU, the guest's guest physical
 * memory.
 */
    int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
    			    gfn_t ngfn, void *data, int offset, int len,
    			    u32 access)
    {
    	gfn_t real_gfn;
    	gpa_t ngpa;
    
    	ngpa     = gfn_to_gpa(ngfn);
    	real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
    	if (real_gfn == UNMAPPED_GVA)
    		return -EFAULT;
    
    	real_gfn = gpa_to_gfn(real_gfn);
    
    	return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
    }
    EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
    
    
    int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
    			       void *data, int offset, int len, u32 access)
    {
    	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
    				       data, offset, len, access);
    }
    
    
/*
 * Load the PAE pdptrs.  Return true if they are all valid.
 */
    
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
    	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
    	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
    	int i;
    	int ret;
    
    	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
    
    	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
    				      offset * sizeof(u64), sizeof(pdpte),
    				      PFERR_USER_MASK|PFERR_WRITE_MASK);
    
    	if (ret < 0) {
    		ret = 0;
    		goto out;
    	}
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if (is_present_gpte(pdpte[i]) &&
		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_dirty);
out:
	return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);
    
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
	bool changed = true;
	int offset;
	gfn_t gfn;
	int r;

	if (is_long_mode(vcpu) || !is_pae(vcpu))
		return false;

	if (!test_bit(VCPU_EXREG_PDPTR,
		      (unsigned long *)&vcpu->arch.regs_avail))
		return true;

	gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
	offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
				       PFERR_USER_MASK | PFERR_WRITE_MASK);
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
out:
	return changed;
}

int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	unsigned long old_cr0 = kvm_read_cr0(vcpu);
	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
				    X86_CR0_CD | X86_CR0_NW;

	cr0 |= X86_CR0_ET;

#ifdef CONFIG_X86_64
	if (cr0 & 0xffffffff00000000UL)
		return 1;
#endif

	cr0 &= ~CR0_RESERVED_BITS;

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
		return 1;

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
		return 1;

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->arch.efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu))
				return 1;
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l)
				return 1;
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
						 vcpu->arch.cr3))
			return 1;
	}

	kvm_x86_ops->set_cr0(vcpu, cr0);

	if ((cr0 ^ old_cr0) & update_bits)
		kvm_mmu_reset_context(vcpu);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

    int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
    {
    	u64 xcr0;
    
    	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
    	if (index != XCR_XFEATURE_ENABLED_MASK)
    		return 1;
    	xcr0 = xcr;
    	if (kvm_x86_ops->get_cpl(vcpu) != 0)
    		return 1;
    	if (!(xcr0 & XSTATE_FP))
    		return 1;
    	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
    		return 1;
    	if (xcr0 & ~host_xcr0)
    		return 1;
    	vcpu->arch.xcr0 = xcr0;
    	vcpu->guest_xcr0_loaded = 0;
    	return 0;
    }
    
    int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
    {
    	if (__kvm_set_xcr(vcpu, index, xcr)) {
    		kvm_inject_gp(vcpu, 0);
    		return 1;
    	}
    	return 0;
    }
    EXPORT_SYMBOL_GPL(kvm_set_xcr);
    
    static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
    {
    	struct kvm_cpuid_entry2 *best;
    
    	best = kvm_find_cpuid_entry(vcpu, 1, 0);
    	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
    }
    
    static void update_cpuid(struct kvm_vcpu *vcpu)
    {
    	struct kvm_cpuid_entry2 *best;
    
    	best = kvm_find_cpuid_entry(vcpu, 1, 0);
    	if (!best)
    		return;
    
    	/* Update OSXSAVE bit */
    	if (cpu_has_xsave && best->function == 0x1) {
    		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
    		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
    			best->ecx |= bit(X86_FEATURE_OSXSAVE);
    	}
    }
    
    
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);
	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;

	if (cr4 & CR4_RESERVED_BITS)
		return 1;

	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
		return 1;

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE))
			return 1;
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & pdptr_bits)
		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
		return 1;

	if (cr4 & X86_CR4_VMXE)
		return 1;

	kvm_x86_ops->set_cr4(vcpu, cr4);

	if ((cr4 ^ old_cr4) & pdptr_bits)
		kvm_mmu_reset_context(vcpu);

	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
		update_cpuid(vcpu);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
    
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
		kvm_mmu_sync_roots(vcpu);
		kvm_mmu_flush_tlb(vcpu);
		return 0;
	}

	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS)
			return 1;
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS)
				return 1;
			if (is_paging(vcpu) &&
			    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
				return 1;
		}
		/*
		 * We don't check reserved bits in nonpae mode, because
		 * this isn't enforced, and VMware depends on this.
		 */
	}

	/*
	 * Does the new cr3 value map to physical memory? (Note, we
	 * catch an invalid cr3 even in real-mode, because it would
	 * cause trouble later on when we turn on paging anyway.)
	 *
	 * A real CPU would silently accept an invalid cr3 and would
	 * attempt to use it - with largely undefined (and often hard
	 * to debug) behavior on the guest side.
	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		return 1;
	vcpu->arch.cr3 = cr3;
	vcpu->arch.mmu.new_cr3(vcpu);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
    
int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS)
		return 1;
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
	return 0;
}

void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (__kvm_set_cr8(vcpu, cr8))
		kvm_inject_gp(vcpu, 0);
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
    	switch (dr) {
    	case 0 ... 3:
    		vcpu->arch.db[dr] = val;
    		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
    			vcpu->arch.eff_db[dr] = val;
    		break;
	case 4:
		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1; /* #UD */
		/* fall through */
	case 6:
		if (val & 0xffffffff00000000ULL)
			return -1; /* #GP */
		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
		break;
	case 5:
		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1; /* #UD */
		/* fall through */
	default: /* 7 */
		if (val & 0xffffffff00000000ULL)
			return -1; /* #GP */
		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
    		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
    			kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
    			vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
    		}
    		break;
    	}
    
    	return 0;
    }
    
    
    int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
    {
    	int res;
    
    	res = __kvm_set_dr(vcpu, dr, val);
    	if (res > 0)
    		kvm_queue_exception(vcpu, UD_VECTOR);
    	else if (res < 0)
    		kvm_inject_gp(vcpu, 0);
    
    	return res;
    }
    
    EXPORT_SYMBOL_GPL(kvm_set_dr);
    
    
static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
	switch (dr) {
	case 0 ... 3:
		*val = vcpu->arch.db[dr];
		break;
	case 4:
		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1;
		/* fall through */
	case 6:
		*val = vcpu->arch.dr6;
		break;
	case 5:
		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1;
		/* fall through */
	default: /* 7 */
		*val = vcpu->arch.dr7;
		break;
	}

	return 0;
}
    
    
    int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
    {
    	if (_kvm_get_dr(vcpu, dr, val)) {
    		kvm_queue_exception(vcpu, UD_VECTOR);
    		return 1;
    	}
    	return 0;
    }
    
    EXPORT_SYMBOL_GPL(kvm_get_dr);
    
    
/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu.  The capability test skips MSRs that are
 * kvm-specific; those are put at the beginning of the list.
 */

#define KVM_SAVE_MSRS_BEGIN	7
static u32 msrs_to_save[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
	HV_X64_MSR_APIC_ASSIST_PAGE,
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
    
    static unsigned num_msrs_to_save;
    
static u32 emulated_msrs[] = {
	MSR_IA32_MISC_ENABLE,
	MSR_IA32_MCG_STATUS,
	MSR_IA32_MCG_CTL,
};

static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	u64 old_efer = vcpu->arch.efer;

	if (efer & efer_reserved_bits)
		return 1;

	if (is_paging(vcpu)
	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
		return 1;

	if (efer & EFER_FFXSR) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
			return 1;
	}

	if (efer & EFER_SVME) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
			return 1;
	}

	efer |= vcpu->arch.efer & EFER_LMA;

	kvm_x86_ops->set_efer(vcpu, efer);

	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
	kvm_mmu_reset_context(vcpu);

	/* Update reserved bits */
	if ((efer ^ old_efer) & EFER_NX)
		kvm_mmu_reset_context(vcpu);

	return 0;
}
    
    void kvm_enable_efer_bits(u64 mask)
    {
           efer_reserved_bits &= ~mask;
    }
    EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
    
    
    
/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
    int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
    {
    	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
    }
    
    
    /*
     * Adapt set_msr() to msr_io()'s calling convention
     */
    static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
    {
    	return kvm_set_msr(vcpu, index, *data);
    }
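
/*
 * kvm_write_wall_clock() below follows the pvclock seqlock-like protocol:
 * the version field is bumped to an odd value before the wall clock data is
 * written and to an even value afterwards, so the guest can detect and retry
 * a torn read.
 */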
    
    
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
	int version;
	int r;
	struct pvclock_wall_clock wc;
	struct timespec boot;

	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
	if (r)
		return;

	if (version & 1)
		++version;  /* first time write, random junk */

	++version;

	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));

	/*
	 * The guest calculates current wall clock time by adding
	 * system time (updated by kvm_write_guest_time below) to the
	 * wall clock specified here.  guest system time equals host
	 * system time for us, thus we must fill in host boot time here.
	 */
	getboottime(&boot);

	wc.sec = boot.tv_sec;
	wc.nsec = boot.tv_nsec;
	wc.version = version;

	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));

	version++;
	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
    
    
    static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
    {
    	uint32_t quotient, remainder;
    
    	/* Don't try to replace with do_div(), this one calculates
    	 * "(dividend << 32) / divisor" */
    	__asm__ ( "divl %4"
    		  : "=a" (quotient), "=d" (remainder)
    		  : "0" (0), "1" (dividend), "r" (divisor) );
    	return quotient;
    }
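
/*
 * kvm_set_time_scale() below picks tsc_shift and tsc_to_system_mul so that
 * the guest can convert TSC deltas to nanoseconds with only a shift and a
 * 32x32->64 multiply, roughly:
 *
 *	ns = ((delta << tsc_shift) * tsc_to_system_mul) >> 32
 *
 * (using a right shift instead when tsc_shift is negative), which matches
 * the scaling pvclock guests apply to tsc_timestamp deltas.
 */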
    
    static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
    {
    	uint64_t nsecs = 1000000000LL;
    	int32_t  shift = 0;
    	uint64_t tps64;
    	uint32_t tps32;
    
    	tps64 = tsc_khz * 1000LL;
    	while (tps64 > nsecs*2) {
    		tps64 >>= 1;
    		shift--;
    	}
    
    	tps32 = (uint32_t)tps64;
    	while (tps32 <= (uint32_t)nsecs) {
    		tps32 <<= 1;
    		shift++;
    	}
    
    	hv_clock->tsc_shift = shift;
    	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
    
	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
		 __func__, tsc_khz, hv_clock->tsc_shift,
		 hv_clock->tsc_to_system_mul);
}

    static inline u64 get_kernel_ns(void)
    {
    	struct timespec ts;
    
    	WARN_ON(preemptible());
    	ktime_get_ts(&ts);
    	monotonic_to_bootbased(&ts);
    	return timespec_to_ns(&ts);
    }
    
    
    static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
    
    
    static inline int kvm_tsc_changes_freq(void)
    {
    	int cpu = get_cpu();
    	int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
    		  cpufreq_quick_get(cpu) != 0;
    	put_cpu();
    	return ret;
    }
    
    
static inline u64 nsec_to_cycles(u64 nsec)
{
	u64 ret;

	WARN_ON(preemptible());
	if (kvm_tsc_changes_freq())
		printk_once(KERN_WARNING
		 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
	ret = nsec * __get_cpu_var(cpu_tsc_khz);
	do_div(ret, USEC_PER_SEC);
	return ret;
}

void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
{
	struct kvm *kvm = vcpu->kvm;
	u64 offset, ns, elapsed;
	unsigned long flags;
	s64 sdiff;

	spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
	offset = data - native_read_tsc();
	ns = get_kernel_ns();
	elapsed = ns - kvm->arch.last_tsc_nsec;
	sdiff = data - kvm->arch.last_tsc_write;
	if (sdiff < 0)
		sdiff = -sdiff;