/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
    
    
    #include <linux/kvm_host.h>
    
    #include "irq.h"
    
    #include "i8254.h"
    
    #include "tss.h"
    
    #include "kvm_cache_regs.h"
    
    #include <linux/clocksource.h>
    
    #include <linux/interrupt.h>
    
    #include <linux/kvm.h>
    #include <linux/fs.h>
    #include <linux/vmalloc.h>
    
    #include <linux/module.h>
    
    #include <linux/highmem.h>
    
    #include <linux/iommu.h>
    
    #include <linux/intel-iommu.h>
    
    #include <linux/cpufreq.h>
    
    #include <linux/user-return-notifier.h>
    
    #include <linux/srcu.h>
    
    #include <linux/perf_event.h>
    
    #include <linux/uaccess.h>
    
    #include <trace/events/kvm.h>
    
    #define CREATE_TRACE_POINTS
    #include "trace.h"
    
    #include <asm/msr.h>
    
    #include <asm/mtrr.h>
    
    #include <asm/mce.h>
    
    #include <asm/i387.h>
    
    #include <asm/xcr.h>
    
    #define MAX_IO_MSRS 256
    
    #define CR0_RESERVED_BITS						\
    	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
    			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
    			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
    #define CR4_RESERVED_BITS						\
    	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
    			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
			  | X86_CR4_OSXSAVE \
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
    
    #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
    
    
    #define KVM_MAX_MCE_BANKS 32
    #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
    
    
/* EFER defaults:
 * - enable syscall per default because it is emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
    #ifdef CONFIG_X86_64
    static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
    #else
    static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
    #endif
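
/*
 * With these defaults the 64-bit mask leaves EFER.SCE (bit 0), EFER.LME
 * (bit 8) and EFER.LMA (bit 10) writable, while the 32-bit mask allows
 * only EFER.SCE; every other bit is treated as reserved until it is
 * cleared from the mask via kvm_enable_efer_bits().
 */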
    
    #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
    #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
    
    static void update_cr8_intercept(struct kvm_vcpu *vcpu);
    
    static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
    				    struct kvm_cpuid_entry2 __user *entries);
    
    
    struct kvm_x86_ops *kvm_x86_ops;
    
    EXPORT_SYMBOL_GPL(kvm_x86_ops);
    
    int ignore_msrs = 0;
    module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
    
    
    #define KVM_NR_SHARED_MSRS 16
    
    struct kvm_shared_msrs_global {
    	int nr;
    
    	u32 msrs[KVM_NR_SHARED_MSRS];
    
    };
    
    struct kvm_shared_msrs {
    	struct user_return_notifier urn;
    	bool registered;
    
    	struct kvm_shared_msr_values {
    		u64 host;
    		u64 curr;
    	} values[KVM_NR_SHARED_MSRS];
    
    };
    
    static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
    static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
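
/*
 * Shared ("user return") MSRs hold values, such as the SYSCALL MSRs, that
 * differ between host and guest but do not need to be restored on every
 * VM exit.  kvm_set_shared_msr() writes the guest value and registers a
 * user-return notifier; kvm_on_user_return() puts the host values back
 * only when the CPU actually returns to host userspace.
 */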
    
    
    struct kvm_stats_debugfs_item debugfs_entries[] = {
    
    	{ "pf_fixed", VCPU_STAT(pf_fixed) },
    	{ "pf_guest", VCPU_STAT(pf_guest) },
    	{ "tlb_flush", VCPU_STAT(tlb_flush) },
    	{ "invlpg", VCPU_STAT(invlpg) },
    	{ "exits", VCPU_STAT(exits) },
    	{ "io_exits", VCPU_STAT(io_exits) },
    	{ "mmio_exits", VCPU_STAT(mmio_exits) },
    	{ "signal_exits", VCPU_STAT(signal_exits) },
    	{ "irq_window", VCPU_STAT(irq_window_exits) },
    
    	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
    
    	{ "halt_exits", VCPU_STAT(halt_exits) },
    	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
    
    	{ "hypercalls", VCPU_STAT(hypercalls) },
    
    	{ "request_irq", VCPU_STAT(request_irq_exits) },
    	{ "irq_exits", VCPU_STAT(irq_exits) },
    	{ "host_state_reload", VCPU_STAT(host_state_reload) },
    	{ "efer_reload", VCPU_STAT(efer_reload) },
    	{ "fpu_reload", VCPU_STAT(fpu_reload) },
    	{ "insn_emulation", VCPU_STAT(insn_emulation) },
    	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
    
    	{ "irq_injections", VCPU_STAT(irq_injections) },
    
    	{ "nmi_injections", VCPU_STAT(nmi_injections) },
    
    	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
    	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
    	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
    	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
    	{ "mmu_flooded", VM_STAT(mmu_flooded) },
    	{ "mmu_recycled", VM_STAT(mmu_recycled) },
    
    	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
    
    	{ "mmu_unsync", VM_STAT(mmu_unsync) },
    
    	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
    
    	{ "largepages", VM_STAT(lpages) },
    
    u64 __read_mostly host_xcr0;
    
    static inline u32 bit(int bitno)
    {
    	return 1 << (bitno & 31);
    }
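
/*
 * X86_FEATURE_* constants encode (cpuid word * 32 + bit); masking with 31
 * recovers the bit position within the 32-bit CPUID register, e.g.
 * bit(X86_FEATURE_XSAVE) is 1 << 26, the XSAVE bit in CPUID.1:ECX.
 */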
    
    
    static void kvm_on_user_return(struct user_return_notifier *urn)
    {
    	unsigned slot;
    	struct kvm_shared_msrs *locals
    		= container_of(urn, struct kvm_shared_msrs, urn);
    
    	struct kvm_shared_msr_values *values;
    
    
    	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
    
    		values = &locals->values[slot];
    		if (values->host != values->curr) {
    			wrmsrl(shared_msrs_global.msrs[slot], values->host);
    			values->curr = values->host;
    
    		}
    	}
    	locals->registered = false;
    	user_return_notifier_unregister(urn);
    }
    
    
static void shared_msr_update(unsigned slot, u32 msr)
{
	u64 value;
	struct kvm_shared_msrs *smsr;

	smsr = &__get_cpu_var(shared_msrs);
	/* only read, and nobody should modify it at this time,
	 * so we don't need a lock */
    	if (slot >= shared_msrs_global.nr) {
    		printk(KERN_ERR "kvm: invalid MSR slot!");
    		return;
    	}
    	rdmsrl_safe(msr, &value);
    	smsr->values[slot].host = value;
    	smsr->values[slot].curr = value;
    }
    
    void kvm_define_shared_msr(unsigned slot, u32 msr)
    {
    
    	if (slot >= shared_msrs_global.nr)
    		shared_msrs_global.nr = slot + 1;
    
    	shared_msrs_global.msrs[slot] = msr;
	/* make sure shared_msrs_global is updated before it is read */
    	smp_wmb();
    
    }
    EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
    
    static void kvm_shared_msr_cpu_online(void)
    {
    	unsigned i;
    
	for (i = 0; i < shared_msrs_global.nr; ++i)
		shared_msr_update(i, shared_msrs_global.msrs[i]);
}

void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
    	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
    
    
    	if (((value ^ smsr->values[slot].curr) & mask) == 0)
    
    		return;
    
    	smsr->values[slot].curr = value;
    	wrmsrl(shared_msrs_global.msrs[slot], value);
    
    	if (!smsr->registered) {
    		smsr->urn.on_user_return = kvm_on_user_return;
    		user_return_notifier_register(&smsr->urn);
    		smsr->registered = true;
    	}
    }
    EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
    
    
    static void drop_user_return_notifiers(void *ignore)
    {
    	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
    
    	if (smsr->registered)
    		kvm_on_user_return(&smsr->urn);
    }
    
    
    u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
    {
	if (irqchip_in_kernel(vcpu->kvm))
		return vcpu->arch.apic_base;
	else
		return vcpu->arch.apic_base;
    }
    EXPORT_SYMBOL_GPL(kvm_get_apic_base);
    
    void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
    {
    	/* TODO: reserve bits check */
    	if (irqchip_in_kernel(vcpu->kvm))
    		kvm_lapic_set_base(vcpu, data);
    	else
    
    		vcpu->arch.apic_base = data;
    
    }
    EXPORT_SYMBOL_GPL(kvm_set_apic_base);
    
    
    #define EXCPT_BENIGN		0
    #define EXCPT_CONTRIBUTORY	1
    #define EXCPT_PF		2
    
    static int exception_class(int vector)
    {
    	switch (vector) {
    	case PF_VECTOR:
    		return EXCPT_PF;
    	case DE_VECTOR:
    	case TS_VECTOR:
    	case NP_VECTOR:
    	case SS_VECTOR:
    	case GP_VECTOR:
    		return EXCPT_CONTRIBUTORY;
    	default:
    		break;
    	}
    	return EXCPT_BENIGN;
    }
    
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
		unsigned nr, bool has_error, u32 error_code,
		bool reinject)
{
    	u32 prev_nr;
    	int class1, class2;
    
    	if (!vcpu->arch.exception.pending) {
    	queue:
    		vcpu->arch.exception.pending = true;
    		vcpu->arch.exception.has_error_code = has_error;
    		vcpu->arch.exception.nr = nr;
    		vcpu->arch.exception.error_code = error_code;
    
    		vcpu->arch.exception.reinject = reinject;
    
    		return;
    	}
    
	/* an exception is already pending; see how the two combine */
    	prev_nr = vcpu->arch.exception.nr;
    	if (prev_nr == DF_VECTOR) {
    		/* triple fault -> shutdown */
    
    		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
    
    		return;
    	}
    	class1 = exception_class(prev_nr);
    	class2 = exception_class(nr);
    	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
    		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
    		/* generate double fault per SDM Table 5-5 */
    		vcpu->arch.exception.pending = true;
    		vcpu->arch.exception.has_error_code = true;
    		vcpu->arch.exception.nr = DF_VECTOR;
    		vcpu->arch.exception.error_code = 0;
    	} else
		/* replace the previous exception with the new one in the hope
		   that instruction re-execution will regenerate the lost
		   exception */
    		goto queue;
    }
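
/*
 * Summary of the combination rules applied above (SDM Table 5-5):
 *   benign       + anything     -> deliver the new exception
 *   contributory + contributory -> #DF
 *   #PF          + contributory -> #DF
 *   #PF          + #PF          -> #DF
 *   #DF          + anything     -> triple fault (VM shutdown request)
 */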
    
    
    void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
    {
    
    	kvm_multiple_exception(vcpu, nr, false, 0, false);
    
    }
    EXPORT_SYMBOL_GPL(kvm_queue_exception);
    
    
    void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
    {
    	kvm_multiple_exception(vcpu, nr, false, 0, true);
    }
    EXPORT_SYMBOL_GPL(kvm_requeue_exception);
    
    
    void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
    			   u32 error_code)
    {
    	++vcpu->stat.pf_guest;
    
    	vcpu->arch.cr2 = addr;
    
    	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
    }
    
    
    void kvm_inject_nmi(struct kvm_vcpu *vcpu)
    {
    	vcpu->arch.nmi_pending = 1;
    }
    EXPORT_SYMBOL_GPL(kvm_inject_nmi);
    
    
    void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
    {
    
    	kvm_multiple_exception(vcpu, nr, true, error_code, false);
    
    }
    EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
    
    
    void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
    {
    	kvm_multiple_exception(vcpu, nr, true, error_code, true);
    }
    EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
    
    
    /*
     * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
     * a #GP and return false.
     */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);
    
    /*
 * Load the pae pdptrs.  Return true if they are all valid.
     */
    int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
    {
    	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
    	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
    	int i;
    	int ret;
    
    	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
    
    
    	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
    				  offset * sizeof(u64), sizeof(pdpte));
    	if (ret < 0) {
    		ret = 0;
    		goto out;
    	}
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if (is_present_gpte(pdpte[i]) &&
		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_dirty);
out:

	return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);
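
/*
 * In PAE mode CR3 bits 31:5 point at a 32-byte-aligned table of four
 * 64-bit PDPTEs.  The offset computed in load_pdptrs(),
 * ((cr3 & 0xfff) >> 5) << 2, is that table's position within its page in
 * u64 entries, which is why it is scaled by sizeof(u64) for the read.
 */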
    
    static bool pdptrs_changed(struct kvm_vcpu *vcpu)
    {
    
    	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
    
    	bool changed = true;
    	int r;
    
    	if (is_long_mode(vcpu) || !is_pae(vcpu))
    		return false;
    
    
    	if (!test_bit(VCPU_EXREG_PDPTR,
    		      (unsigned long *)&vcpu->arch.regs_avail))
    		return true;
    
    
	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:

	return changed;
}

int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	unsigned long old_cr0 = kvm_read_cr0(vcpu);
	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
				    X86_CR0_CD | X86_CR0_NW;

    
    	cr0 |= X86_CR0_ET;
    
    
    #ifdef CONFIG_X86_64
    
    	if (cr0 & 0xffffffff00000000UL)
    		return 1;
    
    #endif
    
    	cr0 &= ~CR0_RESERVED_BITS;
    
    	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
    		return 1;
    
    	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
    		return 1;
    
    
	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->arch.efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu))
				return 1;
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l)
				return 1;
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
			return 1;
	}

	kvm_x86_ops->set_cr0(vcpu, cr0);

	if ((cr0 ^ old_cr0) & update_bits)
		kvm_mmu_reset_context(vcpu);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

    int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
    {
    	u64 xcr0;
    
    	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
    	if (index != XCR_XFEATURE_ENABLED_MASK)
    		return 1;
    	xcr0 = xcr;
    	if (kvm_x86_ops->get_cpl(vcpu) != 0)
    		return 1;
    	if (!(xcr0 & XSTATE_FP))
    		return 1;
    	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
    		return 1;
    	if (xcr0 & ~host_xcr0)
    		return 1;
    	vcpu->arch.xcr0 = xcr0;
    	vcpu->guest_xcr0_loaded = 0;
    	return 0;
    }
    
    int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
    {
    	if (__kvm_set_xcr(vcpu, index, xcr)) {
    		kvm_inject_gp(vcpu, 0);
    		return 1;
    	}
    	return 0;
    }
    EXPORT_SYMBOL_GPL(kvm_set_xcr);
    
    static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
    {
    	struct kvm_cpuid_entry2 *best;
    
    	best = kvm_find_cpuid_entry(vcpu, 1, 0);
    	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
    }
    
    static void update_cpuid(struct kvm_vcpu *vcpu)
    {
    	struct kvm_cpuid_entry2 *best;
    
    	best = kvm_find_cpuid_entry(vcpu, 1, 0);
    	if (!best)
    		return;
    
    	/* Update OSXSAVE bit */
    	if (cpu_has_xsave && best->function == 0x1) {
    		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
    		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
    			best->ecx |= bit(X86_FEATURE_OSXSAVE);
    	}
    }
    
    
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);
	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;

    	if (cr4 & CR4_RESERVED_BITS)
    		return 1;
    
	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
		return 1;

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE))
			return 1;
    
    	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
    		   && ((cr4 ^ old_cr4) & pdptr_bits)
    
    		   && !load_pdptrs(vcpu, vcpu->arch.cr3))
    		return 1;
    
	if (cr4 & X86_CR4_VMXE)
		return 1;

	kvm_x86_ops->set_cr4(vcpu, cr4);

	if ((cr4 ^ old_cr4) & pdptr_bits)
		kvm_mmu_reset_context(vcpu);

	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
		update_cpuid(vcpu);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
		kvm_mmu_sync_roots(vcpu);
		kvm_mmu_flush_tlb(vcpu);
		return 0;
	}

	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS)
			return 1;
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS)
				return 1;
    			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
    				return 1;
    
    		}
    		/*
    		 * We don't check reserved bits in nonpae mode, because
    		 * this isn't enforced, and VMware depends on this.
    		 */
    	}
    
    	/*
    	 * Does the new cr3 value map to physical memory? (Note, we
    	 * catch an invalid cr3 even in real-mode, because it would
    	 * cause trouble later on when we turn on paging anyway.)
    	 *
    	 * A real CPU would silently accept an invalid cr3 and would
    	 * attempt to use it - with largely undefined (and often hard
    	 * to debug) behavior on the guest side.
    	 */
    	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
    
    		return 1;
    	vcpu->arch.cr3 = cr3;
    	vcpu->arch.mmu.new_cr3(vcpu);
    	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS)
    		return 1;
    
    	if (irqchip_in_kernel(vcpu->kvm))
    		kvm_lapic_set_tpr(vcpu, cr8);
    	else
    
    		vcpu->arch.cr8 = cr8;
    
    	return 0;
    }
    
    void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
    {
    	if (__kvm_set_cr8(vcpu, cr8))
		kvm_inject_gp(vcpu, 0);
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
    	if (irqchip_in_kernel(vcpu->kvm))
    		return kvm_lapic_get_cr8(vcpu);
    	else
    
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

    static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
    
    {
    	switch (dr) {
    	case 0 ... 3:
    		vcpu->arch.db[dr] = val;
    		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
    			vcpu->arch.eff_db[dr] = val;
    		break;
    	case 4:
    
    		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1; /* #UD */
		/* fall through */
	case 6:
		if (val & 0xffffffff00000000ULL)
			return -1; /* #GP */
    
    		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
    		break;
    	case 5:
    
    		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
    			return 1; /* #UD */
    
    		/* fall through */
    	default: /* 7 */
    
    		if (val & 0xffffffff00000000ULL)
    			return -1; /* #GP */
    
    		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
    		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
    			kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
    			vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
    		}
    		break;
    	}
    
    	return 0;
    }
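
/*
 * DR4 and DR5 alias DR6 and DR7 when CR4.DE is clear; with CR4.DE set,
 * accessing them raises #UD.  __kvm_set_dr() therefore returns 1 for the
 * #UD case and -1 when a value would #GP, and the kvm_set_dr()/kvm_get_dr()
 * wrappers below convert those results into injected exceptions.
 */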
    
    
    int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
    {
    	int res;
    
    	res = __kvm_set_dr(vcpu, dr, val);
    	if (res > 0)
    		kvm_queue_exception(vcpu, UD_VECTOR);
    	else if (res < 0)
    		kvm_inject_gp(vcpu, 0);
    
    	return res;
    }
    
    EXPORT_SYMBOL_GPL(kvm_set_dr);
    
    
    static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
    
    {
    	switch (dr) {
    	case 0 ... 3:
    		*val = vcpu->arch.db[dr];
    		break;
    	case 4:
    
    		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
    
    			return 1;
    		/* fall through */
    	case 6:
    		*val = vcpu->arch.dr6;
    		break;
    	case 5:
    
    		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
    
    			return 1;
    		/* fall through */
    	default: /* 7 */
    		*val = vcpu->arch.dr7;
    		break;
    	}
    
    	return 0;
    }
    
    
    int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
    {
    	if (_kvm_get_dr(vcpu, dr, val)) {
    		kvm_queue_exception(vcpu, UD_VECTOR);
    		return 1;
    	}
    	return 0;
    }
    
    EXPORT_SYMBOL_GPL(kvm_get_dr);
    
    
    /*
     * List of msr numbers which we expose to userspace through KVM_GET_MSRS
     * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
     *
     * This list is modified at module load time to reflect the
    
     * capabilities of the host cpu. This capabilities test skips MSRs that are
 * kvm-specific. Those are put in the beginning of the list.
 */

#define KVM_SAVE_MSRS_BEGIN	7
    
    static u32 msrs_to_save[] = {
    
    	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
    
    	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
    
    	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
    
    	HV_X64_MSR_APIC_ASSIST_PAGE,
    
    	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
    	MSR_K6_STAR,
    #ifdef CONFIG_X86_64
    	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
    #endif
    
    	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
    
    };
    
    static unsigned num_msrs_to_save;
    
    static u32 emulated_msrs[] = {
    	MSR_IA32_MISC_ENABLE,
    };
    
    
static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	u64 old_efer = vcpu->arch.efer;

    
	if (efer & efer_reserved_bits)
		return 1;

	if (is_paging(vcpu)
	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
		return 1;
    
    	if (efer & EFER_FFXSR) {
    		struct kvm_cpuid_entry2 *feat;
    
    		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
    
    		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
			return 1;
	}

	if (efer & EFER_SVME) {
    		struct kvm_cpuid_entry2 *feat;
    
    		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
    
    		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
			return 1;
	}

	efer &= ~EFER_LMA;
	efer |= vcpu->arch.efer & EFER_LMA;
    
    	kvm_x86_ops->set_efer(vcpu, efer);
    
    
    	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
    	kvm_mmu_reset_context(vcpu);
    
    	/* Update reserved bits */
    	if ((efer ^ old_efer) & EFER_NX)
		kvm_mmu_reset_context(vcpu);

	return 0;
}

    void kvm_enable_efer_bits(u64 mask)
    {
           efer_reserved_bits &= ~mask;
    }
    EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
    
    
    
    /*
 * Writes msr value into the appropriate "register".
     * Returns 0 on success, non-0 otherwise.
     * Assumes vcpu_load() was already called.
     */
    int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
    {
    	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
    }
    
    
    /*
     * Adapt set_msr() to msr_io()'s calling convention
     */
    static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
    {
    	return kvm_set_msr(vcpu, index, *data);
    }
    
    
    static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
    {
    
    	int version;
    	int r;
    
	struct pvclock_wall_clock wc;
	struct timespec boot;

    	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
    	if (r)
    		return;
    
    	if (version & 1)
    		++version;  /* first time write, random junk */
    
    	++version;
    
    
    	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
    
    
    	/*
    	 * The guest calculates current wall clock time by adding
    	 * system time (updated by kvm_write_guest_time below) to the
    	 * wall clock specified here.  guest system time equals host
    	 * system time for us, thus we must fill in host boot time here.
	 */
	getboottime(&boot);

    	wc.sec = boot.tv_sec;
    	wc.nsec = boot.tv_nsec;
    	wc.version = version;
    
    
    	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
    
    	version++;
    	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
    }
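
/*
 * The version field works like a sequence counter: it is made odd before
 * the wall clock payload is written and bumped back to an even value
 * afterwards, so a guest that sees an odd or changing version knows to
 * retry its read.
 */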
    
    
    static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
    {
    	uint32_t quotient, remainder;
    
    	/* Don't try to replace with do_div(), this one calculates
    	 * "(dividend << 32) / divisor" */
    	__asm__ ( "divl %4"
    		  : "=a" (quotient), "=d" (remainder)
    		  : "0" (0), "1" (dividend), "r" (divisor) );
    	return quotient;
    }
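
/*
 * div_frac(dividend, divisor) computes (dividend << 32) / divisor, i.e. the
 * ratio as a 32-bit binary fraction; the caller keeps dividend < divisor so
 * the quotient fits.  kvm_set_time_scale() uses it to derive
 * tsc_to_system_mul, the nanoseconds-per-TSC-tick multiplier, after shifting
 * the TSC rate into range.
 */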
    
    static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
    {
    	uint64_t nsecs = 1000000000LL;
    	int32_t  shift = 0;
    	uint64_t tps64;
    	uint32_t tps32;
    
    	tps64 = tsc_khz * 1000LL;
    	while (tps64 > nsecs*2) {
    		tps64 >>= 1;
    		shift--;
    	}
    
    	tps32 = (uint32_t)tps64;
    	while (tps32 <= (uint32_t)nsecs) {
    		tps32 <<= 1;
    		shift++;
    	}
    
    	hv_clock->tsc_shift = shift;
    	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
    
	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
		 __func__, tsc_khz, hv_clock->tsc_shift,
		 hv_clock->tsc_to_system_mul);
}

static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
    
    
    static void kvm_write_guest_time(struct kvm_vcpu *v)
    {
    	struct timespec ts;
    	unsigned long flags;
    	struct kvm_vcpu_arch *vcpu = &v->arch;
	void *shared_kaddr;
	unsigned long this_tsc_khz;

	if (!vcpu->time_page)
		return;

	this_tsc_khz = get_cpu_var(cpu_tsc_khz);
	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
    		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
		vcpu->hv_clock_tsc_khz = this_tsc_khz;
	}
	put_cpu_var(cpu_tsc_khz);

	/* Keep irq disabled to prevent changes to the clock */
    	local_irq_save(flags);
    
	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
	ktime_get_ts(&ts);
	monotonic_to_bootbased(&ts);
    
    	local_irq_restore(flags);
    
    	/* With all the info we got, fill in the values */
    
	vcpu->hv_clock.system_time = ts.tv_nsec +
				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
    
    
    	/*
    	 * The interface expects us to write an even number signaling that the
    	 * update is finished. Since the guest won't see the intermediate
    
	 * state, we just increase by 2 at the end.
	 */
	vcpu->hv_clock.version += 2;
    
    
    	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
    
    	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
    
    	       sizeof(vcpu->hv_clock));
    
    
    	kunmap_atomic(shared_kaddr, KM_USER0);
    
    	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
    }
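
/*
 * Roughly, the guest turns these fields back into a time value as
 *   now = system_time + ((tsc - tsc_timestamp) scaled by tsc_shift)
 *                        * tsc_to_system_mul >> 32
 * where tsc_shift is applied as a left or right shift depending on its
 * sign, so the host only needs to refresh the record when the TSC rate
 * or the kvmclock offset changes.
 */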
    
    
    static int kvm_request_guest_time_update(struct kvm_vcpu *v)
    {
    	struct kvm_vcpu_arch *vcpu = &v->arch;
    
    	if (!vcpu->time_page)
    		return 0;
    
	kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
	return 1;
}

    static bool msr_mtrr_valid(unsigned msr)
    {
    	switch (msr) {
    	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
    	case MSR_MTRRfix64K_00000:
    	case MSR_MTRRfix16K_80000:
    	case MSR_MTRRfix16K_A0000:
    	case MSR_MTRRfix4K_C0000:
    	case MSR_MTRRfix4K_C8000:
    	case MSR_MTRRfix4K_D0000:
    	case MSR_MTRRfix4K_D8000:
    	case MSR_MTRRfix4K_E0000:
    	case MSR_MTRRfix4K_E8000:
    	case MSR_MTRRfix4K_F0000:
    	case MSR_MTRRfix4K_F8000:
    	case MSR_MTRRdefType:
    	case MSR_IA32_CR_PAT:
    		return true;
    	case 0x2f8:
    		return true;
    	}
    	return false;
    }
    
    
    static bool valid_pat_type(unsigned t)
    {
    	return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
    }
    
    static bool valid_mtrr_type(unsigned t)
    {
    	return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
    }
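
/*
 * Memory type encodings: 0=UC, 1=WC, 4=WT, 5=WP, 6=WB, 7=UC-.  UC- is only
 * valid in the PAT, not in the MTRRs, which is why the MTRR mask stops at
 * type 6.
 */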
    
    static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	int i;
    
    	if (!msr_mtrr_valid(msr))
    		return false;
    
    	if (msr == MSR_IA32_CR_PAT) {
    		for (i = 0; i < 8; i++)
    			if (!valid_pat_type((data >> (i * 8)) & 0xff))
    				return false;
    		return true;