/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
    
    
    #include <linux/kvm_host.h>
    
    #include "irq.h"
    
    #include "i8254.h"
    
    #include "tss.h"
    
    #include "kvm_cache_regs.h"
    
    #include <linux/clocksource.h>
    
    #include <linux/interrupt.h>
    
    #include <linux/kvm.h>
    #include <linux/fs.h>
    #include <linux/vmalloc.h>
    
    #include <linux/module.h>
    
    #include <linux/highmem.h>
    
    #include <linux/iommu.h>
    
    #include <linux/intel-iommu.h>
    
    #include <linux/cpufreq.h>
    
    #include <linux/user-return-notifier.h>
    
    #include <trace/events/kvm.h>
    #undef TRACE_INCLUDE_FILE
    
    #define CREATE_TRACE_POINTS
    #include "trace.h"
    
    #include <asm/uaccess.h>
    
    #include <asm/msr.h>
    
    #include <asm/mtrr.h>
    
    #include <asm/mce.h>
    
    #define MAX_IO_MSRS 256
    
    #define CR0_RESERVED_BITS						\
    	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
    			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
    			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
    #define CR4_RESERVED_BITS						\
    	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
    			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
    			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
    			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
    
    #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
    
    
    #define KVM_MAX_MCE_BANKS 32
    #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
    
    
/* EFER defaults:
 * - enable syscall by default because it is emulated by KVM
 * - enable LME and LMA by default on 64 bit KVM
 */
    #ifdef CONFIG_X86_64
    static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
    #else
    static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
    #endif
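
/*
 * Decoding the masks above (illustrative note): ~0xfffffffffffffafe
 * leaves only SCE (bit 0), LME (bit 8) and LMA (bit 10) writable by the
 * guest on 64-bit hosts, while the 32-bit mask permits SCE alone.
 * Additional bits (e.g. EFER.NX or EFER.SVME) are opened up at setup
 * time through kvm_enable_efer_bits() below.
 */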
    
    #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
    #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
    
    static void update_cr8_intercept(struct kvm_vcpu *vcpu);
    
    static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
    				    struct kvm_cpuid_entry2 __user *entries);
    
    
    struct kvm_x86_ops *kvm_x86_ops;
    
    EXPORT_SYMBOL_GPL(kvm_x86_ops);
    
    int ignore_msrs = 0;
    module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
    
    
    #define KVM_NR_SHARED_MSRS 16
    
    struct kvm_shared_msrs_global {
    	int nr;
    	struct kvm_shared_msr {
    		u32 msr;
    		u64 value;
    	} msrs[KVM_NR_SHARED_MSRS];
    };
    
    struct kvm_shared_msrs {
    	struct user_return_notifier urn;
    	bool registered;
    	u64 current_value[KVM_NR_SHARED_MSRS];
    };
    
    static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
    static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
    
    
struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

    static void kvm_on_user_return(struct user_return_notifier *urn)
    {
    	unsigned slot;
    	struct kvm_shared_msr *global;
    	struct kvm_shared_msrs *locals
    		= container_of(urn, struct kvm_shared_msrs, urn);
    
    	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
    		global = &shared_msrs_global.msrs[slot];
    		if (global->value != locals->current_value[slot]) {
    			wrmsrl(global->msr, global->value);
    			locals->current_value[slot] = global->value;
    		}
    	}
    	locals->registered = false;
    	user_return_notifier_unregister(urn);
    }
    
    void kvm_define_shared_msr(unsigned slot, u32 msr)
    {
    	int cpu;
    	u64 value;
    
    	if (slot >= shared_msrs_global.nr)
    		shared_msrs_global.nr = slot + 1;
    	shared_msrs_global.msrs[slot].msr = msr;
    	rdmsrl_safe(msr, &value);
    	shared_msrs_global.msrs[slot].value = value;
    	for_each_online_cpu(cpu)
    		per_cpu(shared_msrs, cpu).current_value[slot] = value;
    }
    EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
    
    static void kvm_shared_msr_cpu_online(void)
    {
    	unsigned i;
    	struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
    
    	for (i = 0; i < shared_msrs_global.nr; ++i)
    		locals->current_value[i] = shared_msrs_global.msrs[i].value;
    }
    
    
void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);

	if (((value ^ smsr->current_value[slot]) & mask) == 0)
		return;
    	smsr->current_value[slot] = value;
    	wrmsrl(shared_msrs_global.msrs[slot].msr, value);
    	if (!smsr->registered) {
    		smsr->urn.on_user_return = kvm_on_user_return;
    		user_return_notifier_register(&smsr->urn);
    		smsr->registered = true;
    	}
    }
    EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
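
/*
 * Illustrative usage (hypothetical slot assignment): a vendor module
 * would define its user-return MSRs once at init and then update them
 * cheaply on every guest entry; kvm_on_user_return() restores the host
 * values lazily, only when the CPU actually returns to userspace:
 *
 *	kvm_define_shared_msr(0, MSR_K6_STAR);
 *	...
 *	kvm_set_shared_msr(0, guest_star_value, -1ull);
 */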
    
    
    static void drop_user_return_notifiers(void *ignore)
    {
    	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
    
    	if (smsr->registered)
    		kvm_on_user_return(&smsr->urn);
    }
    
    
unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct desc_struct *d;
	unsigned long table_base;
	unsigned long v;

	if (selector == 0)
		return 0;

	kvm_get_gdt(&gdt);
	table_base = gdt.base;

	if (selector & 4) {           /* from ldt */
		u16 ldt_selector = kvm_read_ldt();

		table_base = segment_base(ldt_selector);
	}
	d = (struct desc_struct *)(table_base + (selector & ~7));
	v = get_desc_base(d);
#ifdef CONFIG_X86_64
	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);
    
    
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return vcpu->arch.apic_base;
	else
		return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
	/* TODO: reserve bits check */
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_base(vcpu, data);
	else
		vcpu->arch.apic_base = data;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
    
    
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = false;
	vcpu->arch.exception.nr = nr;
}
    EXPORT_SYMBOL_GPL(kvm_queue_exception);
    
    
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			   u32 error_code)
{
	++vcpu->stat.pf_guest;

	if (vcpu->arch.exception.pending) {
		switch(vcpu->arch.exception.nr) {
		case DF_VECTOR:
			/* triple fault -> shutdown */
			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
			return;
		case PF_VECTOR:
			vcpu->arch.exception.nr = DF_VECTOR;
			vcpu->arch.exception.error_code = 0;
			return;
		default:
			/* replace previous exception with a new one in a hope
			   that instruction re-execution will regenerate lost
			   exception */
			vcpu->arch.exception.pending = false;
			break;
		}
	}
	vcpu->arch.cr2 = addr;
	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}
    
    
    void kvm_inject_nmi(struct kvm_vcpu *vcpu)
    {
    	vcpu->arch.nmi_pending = 1;
    }
    EXPORT_SYMBOL_GPL(kvm_inject_nmi);
    
    
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	WARN_ON(vcpu->arch.exception.pending);
	vcpu->arch.exception.pending = true;
	vcpu->arch.exception.has_error_code = true;
	vcpu->arch.exception.nr = nr;
	vcpu->arch.exception.error_code = error_code;
}
    EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
    
    
    /*
     * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
     * a #GP and return false.
     */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);
    
/*
 * Load the pae pdptrs.  Return true if they are all valid.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];

	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
				  offset * sizeof(u64), sizeof(pdpte));
	if (ret < 0) {
		ret = 0;
		goto out;
	}
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if (is_present_gpte(pdpte[i]) &&
		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_dirty);
out:
	return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);
    
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
	bool changed = true;
	int r;

	if (is_long_mode(vcpu) || !is_pae(vcpu))
		return false;

	if (!test_bit(VCPU_EXREG_PDPTR,
		      (unsigned long *)&vcpu->arch.regs_avail))
		return true;

	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
	return changed;
}
    
void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (cr0 & CR0_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
		       cr0, vcpu->arch.cr0);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
		       "and a clear PE flag\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->arch.shadow_efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu)) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
			       "reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	}

	kvm_x86_ops->set_cr0(vcpu, cr0);
	vcpu->arch.cr0 = cr0;

	kvm_mmu_reset_context(vcpu);
	return;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = vcpu->arch.cr4;
	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;

	if (cr4 & CR4_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE)) {
			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
			       "in long mode\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & pdptr_bits)
		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (cr4 & X86_CR4_VMXE) {
		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}
	kvm_x86_ops->set_cr4(vcpu, cr4);
	vcpu->arch.cr4 = cr4;
	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
	kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
		kvm_mmu_sync_roots(vcpu);
		kvm_mmu_flush_tlb(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS) {
				printk(KERN_DEBUG
				       "set_cr3: #GP, reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
				       "reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
		}
		/*
		 * We don't check reserved bits in nonpae mode, because
		 * this isn't enforced, and VMware depends on this.
		 */
	}

	/*
	 * Does the new cr3 value map to physical memory? (Note, we
	 * catch an invalid cr3 even in real-mode, because it would
	 * cause trouble later on when we turn on paging anyway.)
	 *
	 * A real CPU would silently accept an invalid cr3 and would
	 * attempt to use it - with largely undefined (and often hard
	 * to debug) behavior on the guest side.
	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		kvm_inject_gp(vcpu, 0);
	else {
		vcpu->arch.cr3 = cr3;
		vcpu->arch.mmu.new_cr3(vcpu);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
		kvm_inject_gp(vcpu, 0);
		return;
	}
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

    static inline u32 bit(int bitno)
    {
    	return 1 << (bitno & 31);
    }
    
    
/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu. This capabilities test skips MSRs that are
 * kvm-specific. Those are put in the beginning of the list.
 */

#define KVM_SAVE_MSRS_BEGIN	2
static u32 msrs_to_save[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_K6_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
    
    static unsigned num_msrs_to_save;
    
    static u32 emulated_msrs[] = {
    	MSR_IA32_MISC_ENABLE,
    };
    
    
static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & efer_reserved_bits) {
		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
		       efer);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_paging(vcpu)
	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (efer & EFER_FFXSR) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
			printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	}

	if (efer & EFER_SVME) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
			printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	}

	kvm_x86_ops->set_efer(vcpu, efer);

	efer &= ~EFER_LMA;
	efer |= vcpu->arch.shadow_efer & EFER_LMA;

	vcpu->arch.shadow_efer = efer;

	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
	kvm_mmu_reset_context(vcpu);
}

    void kvm_enable_efer_bits(u64 mask)
    {
           efer_reserved_bits &= ~mask;
    }
    EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
    
    
    
    /*
 * Writes msr value into the appropriate "register".
     * Returns 0 on success, non-0 otherwise.
     * Assumes vcpu_load() was already called.
     */
    int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
    {
    	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
    }
    
    
    /*
     * Adapt set_msr() to msr_io()'s calling convention
     */
    static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
    {
    	return kvm_set_msr(vcpu, index, *data);
    }
    
    
    static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
    {
    	static int version;
    
    	struct pvclock_wall_clock wc;
    	struct timespec now, sys, boot;
    
    
    	if (!wall_clock)
    		return;
    
    	version++;
    
    	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
    
    
    	/*
    	 * The guest calculates current wall clock time by adding
    	 * system time (updated by kvm_write_guest_time below) to the
    	 * wall clock specified here.  guest system time equals host
    	 * system time for us, thus we must fill in host boot time here.
    	 */
    	now = current_kernel_time();
    	ktime_get_ts(&sys);
    	boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
    
    	wc.sec = boot.tv_sec;
    	wc.nsec = boot.tv_nsec;
    	wc.version = version;
    
    
    	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
    
    	version++;
    	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
    }
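
/*
 * Illustrative sketch of the guest side (not code from this file): the
 * guest derives wall clock time by adding its kvmclock system time to
 * the boot-time value written above, retrying while the version field
 * is odd (update in progress) or changes during the read:
 *
 *	do {
 *		version = wc->version;
 *		rmb();
 *		sec  = wc->sec;
 *		nsec = wc->nsec;
 *		rmb();
 *	} while ((version & 1) || (version != wc->version));
 *
 * Names follow struct pvclock_wall_clock; the system-time half comes
 * from the per-vcpu pvclock page filled in by kvm_write_guest_time().
 */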
    
    
    static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
    {
    	uint32_t quotient, remainder;
    
    	/* Don't try to replace with do_div(), this one calculates
    	 * "(dividend << 32) / divisor" */
    	__asm__ ( "divl %4"
    		  : "=a" (quotient), "=d" (remainder)
    		  : "0" (0), "1" (dividend), "r" (divisor) );
    	return quotient;
    }
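
/*
 * Worked example (illustrative): for a 2 GHz TSC, kvm_set_time_scale()
 * below leaves tsc_shift at 0 and computes
 *
 *	tsc_to_system_mul = div_frac(1000000000, 2000000000)
 *			  = (10^9 << 32) / (2 * 10^9) = 0x80000000,
 *
 * i.e. a 32.32 fixed-point factor of 0.5, since each TSC tick at 2 GHz
 * is half a nanosecond.
 */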
    
    static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
    {
    	uint64_t nsecs = 1000000000LL;
    	int32_t  shift = 0;
    	uint64_t tps64;
    	uint32_t tps32;
    
    	tps64 = tsc_khz * 1000LL;
    	while (tps64 > nsecs*2) {
    		tps64 >>= 1;
    		shift--;
    	}
    
    	tps32 = (uint32_t)tps64;
    	while (tps32 <= (uint32_t)nsecs) {
    		tps32 <<= 1;
    		shift++;
    	}
    
    	hv_clock->tsc_shift = shift;
    	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
    
    	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
    
    		 __func__, tsc_khz, hv_clock->tsc_shift,
    
    static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
    
    
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
	struct timespec ts;
	unsigned long flags;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	void *shared_kaddr;
	unsigned long this_tsc_khz;

	if (!vcpu->time_page)
		return;

	this_tsc_khz = get_cpu_var(cpu_tsc_khz);
	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
		vcpu->hv_clock_tsc_khz = this_tsc_khz;
	}
	put_cpu_var(cpu_tsc_khz);

	/* Keep irq disabled to prevent changes to the clock */
	local_irq_save(flags);
	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
	ktime_get_ts(&ts);
	local_irq_restore(flags);

	/* With all the info we got, fill in the values */

	vcpu->hv_clock.system_time = ts.tv_nsec +
				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;

	/*
	 * The interface expects us to write an even number signaling that the
	 * update is finished. Since the guest won't see the intermediate
	 * state, we just increase by 2 at the end.
	 */
	vcpu->hv_clock.version += 2;

	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);

	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
	       sizeof(vcpu->hv_clock));

	kunmap_atomic(shared_kaddr, KM_USER0);

	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
}
    
    
    static int kvm_request_guest_time_update(struct kvm_vcpu *v)
    {
    	struct kvm_vcpu_arch *vcpu = &v->arch;
    
    	if (!vcpu->time_page)
    		return 0;
    	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
    	return 1;
    }
    
    
    static bool msr_mtrr_valid(unsigned msr)
    {
    	switch (msr) {
    	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
    	case MSR_MTRRfix64K_00000:
    	case MSR_MTRRfix16K_80000:
    	case MSR_MTRRfix16K_A0000:
    	case MSR_MTRRfix4K_C0000:
    	case MSR_MTRRfix4K_C8000:
    	case MSR_MTRRfix4K_D0000:
    	case MSR_MTRRfix4K_D8000:
    	case MSR_MTRRfix4K_E0000:
    	case MSR_MTRRfix4K_E8000:
    	case MSR_MTRRfix4K_F0000:
    	case MSR_MTRRfix4K_F8000:
    	case MSR_MTRRdefType:
    	case MSR_IA32_CR_PAT:
    		return true;
    	case 0x2f8:
    		return true;
    	}
    	return false;
    }
    
    
    static bool valid_pat_type(unsigned t)
    {
    	return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
    }
    
    static bool valid_mtrr_type(unsigned t)
    {
    	return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
    }
    
    static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	int i;
    
    	if (!msr_mtrr_valid(msr))
    		return false;
    
    	if (msr == MSR_IA32_CR_PAT) {
    		for (i = 0; i < 8; i++)
    			if (!valid_pat_type((data >> (i * 8)) & 0xff))
    				return false;
    		return true;
    	} else if (msr == MSR_MTRRdefType) {
    		if (data & ~0xcff)
    			return false;
    		return valid_mtrr_type(data & 0xff);
    	} else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
    		for (i = 0; i < 8 ; i++)
    			if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
    				return false;
    		return true;
    	}
    
    	/* variable MTRRs */
    	return valid_mtrr_type(data & 0xff);
    }
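
/*
 * For example, the architectural power-on PAT value 0x0007040600070406
 * (WB, WT, UC-, UC repeated) passes the per-byte check above, since
 * every byte is one of the encodings accepted by valid_pat_type().
 */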
    
    
    Avi Kivity's avatar
    Avi Kivity committed
    static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    
    Sheng Yang's avatar
    Sheng Yang committed
    	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
    
    
    	if (!mtrr_valid(vcpu, msr, data))
    
    Avi Kivity's avatar
    Avi Kivity committed
    		return 1;
    
    
    Sheng Yang's avatar
    Sheng Yang committed
    	if (msr == MSR_MTRRdefType) {
    		vcpu->arch.mtrr_state.def_type = data;
    		vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
    	} else if (msr == MSR_MTRRfix64K_00000)
    		p[0] = data;
    	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
    		p[1 + msr - MSR_MTRRfix16K_80000] = data;
    	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
    		p[3 + msr - MSR_MTRRfix4K_C0000] = data;
    	else if (msr == MSR_IA32_CR_PAT)
    		vcpu->arch.pat = data;
    	else {	/* Variable MTRRs */
    		int idx, is_mtrr_mask;
    		u64 *pt;
    
    		idx = (msr - 0x200) / 2;
    		is_mtrr_mask = msr - 0x200 - 2 * idx;
    		if (!is_mtrr_mask)
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
    		else
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
    		*pt = data;
    	}
    
    	kvm_mmu_reset_context(vcpu);
    
    	return 0;
    }
    
static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	u64 mcg_cap = vcpu->arch.mcg_cap;
	unsigned bank_num = mcg_cap & 0xff;

	switch (msr) {
	case MSR_IA32_MCG_STATUS:
		vcpu->arch.mcg_status = data;
		break;
	case MSR_IA32_MCG_CTL:
		if (!(mcg_cap & MCG_CTL_P))
			return 1;
		if (data != 0 && data != ~(u64)0)
			return -1;
		vcpu->arch.mcg_ctl = data;
		break;
    	default:
    		if (msr >= MSR_IA32_MC0_CTL &&
    		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
    			u32 offset = msr - MSR_IA32_MC0_CTL;
    			/* only 0 or all 1s can be written to IA32_MCi_CTL */
    			if ((offset & 0x3) == 0 &&
    			    data != 0 && data != ~(u64)0)
    				return -1;
    			vcpu->arch.mce_banks[offset] = data;
    			break;
    		}
    		return 1;
    	}
    	return 0;
    }
    
    
    static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
    {
    	struct kvm *kvm = vcpu->kvm;
    	int lm = is_long_mode(vcpu);
    	u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
    		: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
    	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
    		: kvm->arch.xen_hvm_config.blob_size_32;
    	u32 page_num = data & ~PAGE_MASK;
    	u64 page_addr = data & PAGE_MASK;
    	u8 *page;
    	int r;
    
    	r = -E2BIG;
    	if (page_num >= blob_size)
    		goto out;
    	r = -ENOMEM;
    	page = kzalloc(PAGE_SIZE, GFP_KERNEL);
    	if (!page)
    		goto out;
    	r = -EFAULT;
    	if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
    		goto out_free;
    	if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
    		goto out_free;
    	r = 0;
    out_free:
    	kfree(page);
    out:
    	return r;
    }
    
    
    int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	switch (msr) {
    	case MSR_EFER:
    		set_efer(vcpu, data);
    		break;
    
	case MSR_K7_HWCR:
		data &= ~(u64)0x40;	/* ignore flush filter disable */
		if (data != 0) {
			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
				data);
			return 1;
		}
		break;
	case MSR_FAM10H_MMIO_CONF_BASE:
		if (data != 0) {
			pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
				"0x%llx\n", data);
			return 1;
		}
		break;
	case MSR_AMD64_NB_CFG:
		break;
    	case MSR_IA32_DEBUGCTLMSR:
    		if (!data) {
    			/* We support the non-activated case already */
    			break;
    		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
    			/* Values other than LBR and BTF are vendor-specific,
    			   thus reserved and should throw a #GP */
    			return 1;
    		}
    		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
    			__func__, data);
    		break;
    
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_UCODE_WRITE:
	case MSR_VM_HSAVE_PA:
	case MSR_AMD64_PATCH_LOADER: