/*
     * Kernel-based Virtual Machine driver for Linux
     *
     * derived from drivers/kvm/kvm_main.c
     *
     * Copyright (C) 2006 Qumranet, Inc.
    
     * Copyright (C) 2008 Qumranet, Inc.
     * Copyright IBM Corporation, 2008
    
     *
     * Authors:
     *   Avi Kivity   <avi@qumranet.com>
     *   Yaniv Kamay  <yaniv@qumranet.com>
    
     *   Amit Shah    <amit.shah@qumranet.com>
     *   Ben-Ami Yassour <benami@il.ibm.com>
    
     *
     * This work is licensed under the terms of the GNU GPL, version 2.  See
     * the COPYING file in the top-level directory.
     *
     */
    
    
    #include <linux/kvm_host.h>
    
    #include "irq.h"
    
    #include "i8254.h"
    
    #include "tss.h"
    
    #include "kvm_cache_regs.h"
    
    #include <linux/clocksource.h>
    
    #include <linux/interrupt.h>
    
    #include <linux/kvm.h>
    #include <linux/fs.h>
    #include <linux/vmalloc.h>
    
    #include <linux/module.h>
    
    #include <linux/highmem.h>
    
    #include <linux/iommu.h>
    
    #include <linux/intel-iommu.h>
    
    #include <linux/cpufreq.h>
    
    
    #include <asm/uaccess.h>
    
    #include <asm/msr.h>
    
    #include <asm/mtrr.h>
    
    #define MAX_IO_MSRS 256
    
    #define CR0_RESERVED_BITS						\
    	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
    			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
    			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
    #define CR4_RESERVED_BITS						\
    	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
    			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
    			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
    			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
    
    #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
    
/* EFER defaults:
 * - enable syscall by default because it is emulated by KVM
 * - enable LME and LMA by default on 64-bit KVM
 */
    #ifdef CONFIG_X86_64
    static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
    #else
    static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
    #endif
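
/*
 * A worked reading of the masks above (bit positions per the architectural
 * EFER layout): 0xfffffffffffffafe leaves bits 0 (SCE), 8 (LME) and
 * 10 (LMA) writable, matching the comment; the 32-bit build's mask
 * 0xfffffffffffffffe leaves only bit 0 (SCE) writable.  Further bits can
 * be made writable at run time by clearing them from the mask with
 * kvm_enable_efer_bits(), defined later in this file.
 */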
    
    #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
    #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
    
    static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
    				    struct kvm_cpuid_entry2 __user *entries);
    
    struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
    					      u32 function, u32 index);
    
    struct kvm_x86_ops *kvm_x86_ops;
    
    EXPORT_SYMBOL_GPL(kvm_x86_ops);
    
    struct kvm_stats_debugfs_item debugfs_entries[] = {
    
    	{ "pf_fixed", VCPU_STAT(pf_fixed) },
    	{ "pf_guest", VCPU_STAT(pf_guest) },
    	{ "tlb_flush", VCPU_STAT(tlb_flush) },
    	{ "invlpg", VCPU_STAT(invlpg) },
    	{ "exits", VCPU_STAT(exits) },
    	{ "io_exits", VCPU_STAT(io_exits) },
    	{ "mmio_exits", VCPU_STAT(mmio_exits) },
    	{ "signal_exits", VCPU_STAT(signal_exits) },
    	{ "irq_window", VCPU_STAT(irq_window_exits) },
    
    	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
    
    	{ "halt_exits", VCPU_STAT(halt_exits) },
    	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
    
    	{ "hypercalls", VCPU_STAT(hypercalls) },
    
    	{ "request_irq", VCPU_STAT(request_irq_exits) },
    
    	{ "request_nmi", VCPU_STAT(request_nmi_exits) },
    
    	{ "irq_exits", VCPU_STAT(irq_exits) },
    	{ "host_state_reload", VCPU_STAT(host_state_reload) },
    	{ "efer_reload", VCPU_STAT(efer_reload) },
    	{ "fpu_reload", VCPU_STAT(fpu_reload) },
    	{ "insn_emulation", VCPU_STAT(insn_emulation) },
    	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
    
    	{ "irq_injections", VCPU_STAT(irq_injections) },
    
    	{ "nmi_injections", VCPU_STAT(nmi_injections) },
    
    	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
    	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
    	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
    	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
    	{ "mmu_flooded", VM_STAT(mmu_flooded) },
    	{ "mmu_recycled", VM_STAT(mmu_recycled) },
    
    	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
    
    	{ "mmu_unsync", VM_STAT(mmu_unsync) },
    
    	{ "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
    
    	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
    
    	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

    unsigned long segment_base(u16 selector)
    {
    	struct descriptor_table gdt;
	struct desc_struct *d;
    	unsigned long table_base;
    	unsigned long v;
    
    	if (selector == 0)
    		return 0;
    
    	asm("sgdt %0" : "=m"(gdt));
    	table_base = gdt.base;
    
    	if (selector & 4) {           /* from ldt */
    		u16 ldt_selector;
    
    		asm("sldt %0" : "=g"(ldt_selector));
    		table_base = segment_base(ldt_selector);
    	}
    
    	d = (struct desc_struct *)(table_base + (selector & ~7));
    	v = d->base0 | ((unsigned long)d->base1 << 16) |
    		((unsigned long)d->base2 << 24);
    
    #ifdef CONFIG_X86_64
    
    	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
    		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
    
    #endif
    	return v;
    }
    EXPORT_SYMBOL_GPL(segment_base);
    
    
    u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
    {
    	if (irqchip_in_kernel(vcpu->kvm))
    
    		return vcpu->arch.apic_base;
	else
    		return vcpu->arch.apic_base;
    
    }
    EXPORT_SYMBOL_GPL(kvm_get_apic_base);
    
    void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
    {
    	/* TODO: reserve bits check */
    	if (irqchip_in_kernel(vcpu->kvm))
    		kvm_lapic_set_base(vcpu, data);
    	else
    
    		vcpu->arch.apic_base = data;
    
    }
    EXPORT_SYMBOL_GPL(kvm_set_apic_base);
    
    
    void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
    {
    
    	WARN_ON(vcpu->arch.exception.pending);
    	vcpu->arch.exception.pending = true;
    	vcpu->arch.exception.has_error_code = false;
    	vcpu->arch.exception.nr = nr;
    
    }
    EXPORT_SYMBOL_GPL(kvm_queue_exception);
    
    
    void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
    			   u32 error_code)
    {
    	++vcpu->stat.pf_guest;
    
    	if (vcpu->arch.exception.pending) {
    		if (vcpu->arch.exception.nr == PF_VECTOR) {
    			printk(KERN_DEBUG "kvm: inject_page_fault:"
    					" double fault 0x%lx\n", addr);
    			vcpu->arch.exception.nr = DF_VECTOR;
    			vcpu->arch.exception.error_code = 0;
    		} else if (vcpu->arch.exception.nr == DF_VECTOR) {
    			/* triple fault -> shutdown */
    			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
    		}
		return;
	}

    	vcpu->arch.cr2 = addr;
    
    	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
    }
    
    
    void kvm_inject_nmi(struct kvm_vcpu *vcpu)
    {
    	vcpu->arch.nmi_pending = 1;
    }
    EXPORT_SYMBOL_GPL(kvm_inject_nmi);
    
    
    void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
    {
    
    	WARN_ON(vcpu->arch.exception.pending);
    	vcpu->arch.exception.pending = true;
    	vcpu->arch.exception.has_error_code = true;
    	vcpu->arch.exception.nr = nr;
    	vcpu->arch.exception.error_code = error_code;
    
    }
    EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
    
    static void __queue_exception(struct kvm_vcpu *vcpu)
    {
    
    	kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
    				     vcpu->arch.exception.has_error_code,
    				     vcpu->arch.exception.error_code);
}

    /*
 * Load the pae pdptrs.  Return true if they are all valid.
     */
    int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
    {
    	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
    	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
    	int i;
    	int ret;
    
    	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
    
    
    	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
    				  offset * sizeof(u64), sizeof(pdpte));
    	if (ret < 0) {
    		ret = 0;
    		goto out;
    	}
    	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
    		if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
    			ret = 0;
    			goto out;
    		}
    	}
    	ret = 1;
    
    
    	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
out:

	return ret;
}
    EXPORT_SYMBOL_GPL(load_pdptrs);
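
/*
 * Illustrative note on the arithmetic above: in PAE mode CR3 points at a
 * 32-byte-aligned table of four 64-bit PDPTEs, so
 * ((cr3 & (PAGE_SIZE-1)) >> 5) << 2 is the index of the first PDPTE within
 * the page, counted in u64 entries.  The check
 * (pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull) rejects a present
 * PDPTE with any of bits 1-2, 5-8 or 36-63 set, i.e. the bits treated as
 * reserved here.
 */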
    
    static bool pdptrs_changed(struct kvm_vcpu *vcpu)
    {
    
    	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
    
    	bool changed = true;
    	int r;
    
    	if (is_long_mode(vcpu) || !is_pae(vcpu))
    		return false;
    
    
    	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:

	return changed;
}

    void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
    
    {
    	if (cr0 & CR0_RESERVED_BITS) {
    		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
    
    		       cr0, vcpu->arch.cr0);
		kvm_inject_gp(vcpu, 0);
    		return;
    	}
    
    	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
    		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		kvm_inject_gp(vcpu, 0);
    		return;
    	}
    
    	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
    		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
    		       "and a clear PE flag\n");
		kvm_inject_gp(vcpu, 0);
    		return;
    	}
    
    	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
    #ifdef CONFIG_X86_64
    
    		if ((vcpu->arch.shadow_efer & EFER_LME)) {
    
    			int cs_db, cs_l;
    
    			if (!is_pae(vcpu)) {
    				printk(KERN_DEBUG "set_cr0: #GP, start paging "
    				       "in long mode while PAE is disabled\n");
				kvm_inject_gp(vcpu, 0);
    				return;
    			}
    			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
    			if (cs_l) {
    				printk(KERN_DEBUG "set_cr0: #GP, start paging "
    				       "in long mode while CS.L == 1\n");
				kvm_inject_gp(vcpu, 0);
				return;

			}
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
    
    			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
    			       "reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}

	}

	kvm_x86_ops->set_cr0(vcpu, cr0);
    	vcpu->arch.cr0 = cr0;
    
    	kvm_mmu_sync_global(vcpu);
    
    	kvm_mmu_reset_context(vcpu);
    	return;
    }
EXPORT_SYMBOL_GPL(kvm_set_cr0);

    void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
    	kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
    
    	KVMTRACE_1D(LMSW, vcpu,
    		    (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
    		    handler);
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

    void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
    	unsigned long old_cr4 = vcpu->arch.cr4;
    	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
    
    
    	if (cr4 & CR4_RESERVED_BITS) {
    		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		kvm_inject_gp(vcpu, 0);
    		return;
    	}
    
    	if (is_long_mode(vcpu)) {
    		if (!(cr4 & X86_CR4_PAE)) {
    			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
    			       "in long mode\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
    	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
    		   && ((cr4 ^ old_cr4) & pdptr_bits)
    
    		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
    
    		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		kvm_inject_gp(vcpu, 0);
    		return;
    	}
    
    	if (cr4 & X86_CR4_VMXE) {
    		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		kvm_inject_gp(vcpu, 0);
    		return;
    	}
    	kvm_x86_ops->set_cr4(vcpu, cr4);
    
    	vcpu->arch.cr4 = cr4;
    
    	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
    
    	kvm_mmu_sync_global(vcpu);
	kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

    void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
    	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
    
    		kvm_mmu_sync_roots(vcpu);
		kvm_mmu_flush_tlb(vcpu);
		return;
	}

    	if (is_long_mode(vcpu)) {
    		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
    			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			kvm_inject_gp(vcpu, 0);
    			return;
    		}
    	} else {
    		if (is_pae(vcpu)) {
    			if (cr3 & CR3_PAE_RESERVED_BITS) {
    				printk(KERN_DEBUG
    				       "set_cr3: #GP, reserved bits\n");
				kvm_inject_gp(vcpu, 0);
    				return;
    			}
    			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
    				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
    				       "reserved bits\n");
				kvm_inject_gp(vcpu, 0);
    				return;
    			}
    		}
    		/*
    		 * We don't check reserved bits in nonpae mode, because
    		 * this isn't enforced, and VMware depends on this.
    		 */
    	}
    
    	/*
    	 * Does the new cr3 value map to physical memory? (Note, we
    	 * catch an invalid cr3 even in real-mode, because it would
    	 * cause trouble later on when we turn on paging anyway.)
    	 *
    	 * A real CPU would silently accept an invalid cr3 and would
    	 * attempt to use it - with largely undefined (and often hard
    	 * to debug) behavior on the guest side.
    	 */
    	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		kvm_inject_gp(vcpu, 0);
	else {
    		vcpu->arch.cr3 = cr3;
    		vcpu->arch.mmu.new_cr3(vcpu);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

    void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
    
    {
    	if (cr8 & CR8_RESERVED_BITS) {
    		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
		kvm_inject_gp(vcpu, 0);
    		return;
    	}
    	if (irqchip_in_kernel(vcpu->kvm))
    		kvm_lapic_set_tpr(vcpu, cr8);
    	else
    
    		vcpu->arch.cr8 = cr8;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

    unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
    
    {
    	if (irqchip_in_kernel(vcpu->kvm))
    		return kvm_lapic_get_cr8(vcpu);
    	else
    
    		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

    static inline u32 bit(int bitno)
    {
    	return 1 << (bitno & 31);
    }
    
    
    /*
     * List of msr numbers which we expose to userspace through KVM_GET_MSRS
     * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
     *
     * This list is modified at module load time to reflect the
     * capabilities of the host cpu.
     */
    static u32 msrs_to_save[] = {
    	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
    	MSR_K6_STAR,
    #ifdef CONFIG_X86_64
    	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
    #endif
    
    	MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
    
    	MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
    
    };
    
    static unsigned num_msrs_to_save;
    
    static u32 emulated_msrs[] = {
    	MSR_IA32_MISC_ENABLE,
    };
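
/*
 * As the comment above notes, msrs_to_save is trimmed at module load time.
 * A minimal sketch of the idea (the real filtering helper appears further
 * down in this file): probe each MSR on the host and drop the ones the
 * hardware faults on.
 *
 *	unsigned int i, j;
 *	u32 dummy[2];
 *
 *	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
 *		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
 *			continue;
 *		if (j < i)
 *			msrs_to_save[j] = msrs_to_save[i];
 *		j++;
 *	}
 *	num_msrs_to_save = j;
 */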
    
    
    static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
    {
    
    	if (efer & efer_reserved_bits) {
    
    		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
    		       efer);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_paging(vcpu)
    	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
    
    		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

    	if (efer & EFER_FFXSR) {
    		struct kvm_cpuid_entry2 *feat;
    
    		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
    		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
    			printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
    			kvm_inject_gp(vcpu, 0);
    			return;
    		}
    	}
    
    
    	if (efer & EFER_SVME) {
    		struct kvm_cpuid_entry2 *feat;
    
    		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
    		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
    			printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
    			kvm_inject_gp(vcpu, 0);
    			return;
    		}
    	}
    
    
    	kvm_x86_ops->set_efer(vcpu, efer);
    
    	efer &= ~EFER_LMA;
    
    	efer |= vcpu->arch.shadow_efer & EFER_LMA;
    
    	vcpu->arch.shadow_efer = efer;
}

    void kvm_enable_efer_bits(u64 mask)
    {
           efer_reserved_bits &= ~mask;
    }
    EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
    
    
    
    /*
 * Writes msr value into the appropriate "register".
     * Returns 0 on success, non-0 otherwise.
     * Assumes vcpu_load() was already called.
     */
    int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
    {
    	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
    }
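
/*
 * Illustrative in-kernel usage (the vcpu must already have been loaded
 * with vcpu_load(), as noted above): write an MSR and read it back.
 *
 *	u64 val;
 *
 *	if (!kvm_set_msr(vcpu, MSR_IA32_MISC_ENABLE, 1))
 *		kvm_get_msr(vcpu, MSR_IA32_MISC_ENABLE, &val);
 */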
    
    
    /*
     * Adapt set_msr() to msr_io()'s calling convention
     */
    static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
    {
    	return kvm_set_msr(vcpu, index, *data);
    }
    
    
    static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
    {
    	static int version;
    
    	struct pvclock_wall_clock wc;
    	struct timespec now, sys, boot;
    
    
    	if (!wall_clock)
    		return;
    
    	version++;
    
    	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
    
    
    	/*
    	 * The guest calculates current wall clock time by adding
    	 * system time (updated by kvm_write_guest_time below) to the
    	 * wall clock specified here.  guest system time equals host
    	 * system time for us, thus we must fill in host boot time here.
    	 */
    	now = current_kernel_time();
    	ktime_get_ts(&sys);
    	boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
    
    	wc.sec = boot.tv_sec;
    	wc.nsec = boot.tv_nsec;
    	wc.version = version;
    
    
    	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
    
    	version++;
    	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
    }
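
/*
 * The version field written above acts like a sequence lock: it is odd
 * while the wall clock is being updated and even once the update is
 * complete.  An illustrative sketch of the guest-side read loop (not code
 * from this driver):
 *
 *	do {
 *		version = wc->version;
 *		rmb();
 *		sec = wc->sec;
 *		nsec = wc->nsec;
 *		rmb();
 *	} while ((wc->version & 1) || (wc->version != version));
 */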
    
    
    static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
    {
    	uint32_t quotient, remainder;
    
	/* Don't try to replace with do_div(); this one calculates
    	 * "(dividend << 32) / divisor" */
    	__asm__ ( "divl %4"
    		  : "=a" (quotient), "=d" (remainder)
    		  : "0" (0), "1" (dividend), "r" (divisor) );
    	return quotient;
    }
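
/*
 * In other words, div_frac() returns dividend/divisor as a 0.32 fixed-point
 * fraction; the caller must keep dividend < divisor or the divide will
 * overflow.  For example, div_frac(1000000000, 2000000000) == 0x80000000,
 * which represents 0.5.
 */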
    
    static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
    {
    	uint64_t nsecs = 1000000000LL;
    	int32_t  shift = 0;
    	uint64_t tps64;
    	uint32_t tps32;
    
    	tps64 = tsc_khz * 1000LL;
    	while (tps64 > nsecs*2) {
    		tps64 >>= 1;
    		shift--;
    	}
    
    	tps32 = (uint32_t)tps64;
    	while (tps32 <= (uint32_t)nsecs) {
    		tps32 <<= 1;
    		shift++;
    	}
    
    	hv_clock->tsc_shift = shift;
    	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
    
    	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
    
    		 __func__, tsc_khz, hv_clock->tsc_shift,
		 hv_clock->tsc_to_system_mul);
}

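/*
 * Worked example with a hypothetical 1 GHz TSC (tsc_khz = 1000000): tps64
 * starts at 10^9, which is not above nsecs*2, so the first loop does
 * nothing; the second loop doubles tps32 once to 2*10^9 and leaves
 * shift = 1.  The multiplier becomes div_frac(10^9, 2*10^9) = 0x80000000,
 * i.e. 0.5 in 0.32 fixed point, so a guest converting TSC deltas to
 * nanoseconds as ((delta << 1) * 0.5) gets exactly delta ns, as expected
 * for a 1 GHz clock.
 */
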
    static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
    
    
    static void kvm_write_guest_time(struct kvm_vcpu *v)
    {
    	struct timespec ts;
    	unsigned long flags;
    	struct kvm_vcpu_arch *vcpu = &v->arch;
    	void *shared_kaddr;
    
    	if ((!vcpu->time_page))
    		return;
    
    
    	if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
    		kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
    		vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
	}

    	/* Keep irq disabled to prevent changes to the clock */
    	local_irq_save(flags);
    	kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
    			  &vcpu->hv_clock.tsc_timestamp);
    	ktime_get_ts(&ts);
    	local_irq_restore(flags);
    
    	/* With all the info we got, fill in the values */
    
    	vcpu->hv_clock.system_time = ts.tv_nsec +
    				     (NSEC_PER_SEC * (u64)ts.tv_sec);
    	/*
    	 * The interface expects us to write an even number signaling that the
    	 * update is finished. Since the guest won't see the intermediate
    
    	 * state, we just increase by 2 at the end.
	 */
    	vcpu->hv_clock.version += 2;
    
    
    	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
    
    	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
    
    	       sizeof(vcpu->hv_clock));
    
    
    	kunmap_atomic(shared_kaddr, KM_USER0);
    
    	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
    }
    
    
    static int kvm_request_guest_time_update(struct kvm_vcpu *v)
    {
    	struct kvm_vcpu_arch *vcpu = &v->arch;
    
    	if (!vcpu->time_page)
    		return 0;
    	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
    	return 1;
    }
    
    
    static bool msr_mtrr_valid(unsigned msr)
    {
    	switch (msr) {
    	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
    	case MSR_MTRRfix64K_00000:
    	case MSR_MTRRfix16K_80000:
    	case MSR_MTRRfix16K_A0000:
    	case MSR_MTRRfix4K_C0000:
    	case MSR_MTRRfix4K_C8000:
    	case MSR_MTRRfix4K_D0000:
    	case MSR_MTRRfix4K_D8000:
    	case MSR_MTRRfix4K_E0000:
    	case MSR_MTRRfix4K_E8000:
    	case MSR_MTRRfix4K_F0000:
    	case MSR_MTRRfix4K_F8000:
    	case MSR_MTRRdefType:
    	case MSR_IA32_CR_PAT:
    		return true;
    	case 0x2f8:
    		return true;
    	}
    	return false;
    }
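
/*
 * Layout note for the 0x200 range accepted above and decoded below: the
 * variable-range MTRRs come in base/mask pairs, MTRRphysBase0 at 0x200,
 * MTRRphysMask0 at 0x201, MTRRphysBase1 at 0x202, and so on.  Hence
 * idx = (msr - 0x200) / 2 selects the pair and msr - 0x200 - 2 * idx
 * distinguishes the mask register from the base register in
 * set_msr_mtrr() and get_msr_mtrr().
 */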
    
    static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    
    	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
    
    
    	if (!msr_mtrr_valid(msr))
    		return 1;
    
    
    	if (msr == MSR_MTRRdefType) {
    		vcpu->arch.mtrr_state.def_type = data;
    		vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
    	} else if (msr == MSR_MTRRfix64K_00000)
    		p[0] = data;
    	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
    		p[1 + msr - MSR_MTRRfix16K_80000] = data;
    	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
    		p[3 + msr - MSR_MTRRfix4K_C0000] = data;
    	else if (msr == MSR_IA32_CR_PAT)
    		vcpu->arch.pat = data;
    	else {	/* Variable MTRRs */
    		int idx, is_mtrr_mask;
    		u64 *pt;
    
    		idx = (msr - 0x200) / 2;
    		is_mtrr_mask = msr - 0x200 - 2 * idx;
    		if (!is_mtrr_mask)
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
    		else
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
    		*pt = data;
    	}
    
    	kvm_mmu_reset_context(vcpu);
    
    	return 0;
    }
    
    
    int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	switch (msr) {
    	case MSR_EFER:
    		set_efer(vcpu, data);
    		break;
    	case MSR_IA32_MC0_STATUS:
    		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
			__func__, data);
    		break;
    	case MSR_IA32_MCG_STATUS:
    		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
			__func__, data);
		break;
    	case MSR_IA32_MCG_CTL:
    		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
			__func__, data);
		break;
    	case MSR_IA32_DEBUGCTLMSR:
    		if (!data) {
    			/* We support the non-activated case already */
    			break;
    		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
    			/* Values other than LBR and BTF are vendor-specific,
    			   thus reserved and should throw a #GP */
    			return 1;
    		}
    		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
    			__func__, data);
    		break;
    
    	case MSR_IA32_UCODE_REV:
    	case MSR_IA32_UCODE_WRITE:
    
    	case MSR_VM_HSAVE_PA:
		break;
    	case 0x200 ... 0x2ff:
    		return set_msr_mtrr(vcpu, msr, data);
    
    	case MSR_IA32_APICBASE:
    		kvm_set_apic_base(vcpu, data);
    		break;
    	case MSR_IA32_MISC_ENABLE:
    
    		vcpu->arch.ia32_misc_enable_msr = data;
		break;
    	case MSR_KVM_WALL_CLOCK:
    		vcpu->kvm->arch.wall_clock = data;
    		kvm_write_wall_clock(vcpu->kvm, data);
    		break;
    	case MSR_KVM_SYSTEM_TIME: {
    		if (vcpu->arch.time_page) {
    			kvm_release_page_dirty(vcpu->arch.time_page);
    			vcpu->arch.time_page = NULL;
    		}
    
    		vcpu->arch.time = data;
    
    		/* we verify if the enable bit is set... */
    		if (!(data & 1))
    			break;
    
    		/* ...but clean it before doing the actual write */
    		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
    
    		vcpu->arch.time_page =
    				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
    
    		if (is_error_page(vcpu->arch.time_page)) {
    			kvm_release_page_clean(vcpu->arch.time_page);
    			vcpu->arch.time_page = NULL;
    		}
    
    
    		kvm_request_guest_time_update(vcpu);
		break;
	}
	default:
    		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
    
    		return 1;
    	}
    	return 0;
    }
    EXPORT_SYMBOL_GPL(kvm_set_msr_common);
    
    
    /*
     * Reads an msr value (of 'msr_index') into 'pdata'.
     * Returns 0 on success, non-0 otherwise.
     * Assumes vcpu_load() was already called.
     */
    int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
    {
    	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
    }
    
    
    static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
    {
    
    	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
    
    
    	if (!msr_mtrr_valid(msr))
    		return 1;
    
    
    	if (msr == MSR_MTRRdefType)
    		*pdata = vcpu->arch.mtrr_state.def_type +
    			 (vcpu->arch.mtrr_state.enabled << 10);
    	else if (msr == MSR_MTRRfix64K_00000)
    		*pdata = p[0];
    	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
    		*pdata = p[1 + msr - MSR_MTRRfix16K_80000];
    	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
    		*pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
    	else if (msr == MSR_IA32_CR_PAT)
    		*pdata = vcpu->arch.pat;
    	else {	/* Variable MTRRs */
    		int idx, is_mtrr_mask;
    		u64 *pt;
    
    		idx = (msr - 0x200) / 2;
    		is_mtrr_mask = msr - 0x200 - 2 * idx;
    		if (!is_mtrr_mask)
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
    		else
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
    		*pdata = *pt;
    	}
    
    
    	return 0;
    }
    
    
    int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
    {
    	u64 data;
    
    	switch (msr) {
    	case 0xc0010010: /* SYSCFG */
    	case 0xc0010015: /* HWCR */
    	case MSR_IA32_PLATFORM_ID:
    	case MSR_IA32_P5_MC_ADDR:
    	case MSR_IA32_P5_MC_TYPE:
    	case MSR_IA32_MC0_CTL:
    	case MSR_IA32_MCG_STATUS:
    	case MSR_IA32_MCG_CAP:
    
    	case MSR_IA32_MCG_CTL:
    
    	case MSR_IA32_MC0_MISC:
    	case MSR_IA32_MC0_MISC+4:
    	case MSR_IA32_MC0_MISC+8:
    	case MSR_IA32_MC0_MISC+12:
    	case MSR_IA32_MC0_MISC+16:
    
    	case MSR_IA32_MC0_MISC+20:
    
    	case MSR_IA32_UCODE_REV:
    	case MSR_IA32_EBL_CR_POWERON:
    
    	case MSR_IA32_DEBUGCTLMSR:
    	case MSR_IA32_LASTBRANCHFROMIP:
    	case MSR_IA32_LASTBRANCHTOIP:
    	case MSR_IA32_LASTINTFROMIP:
    	case MSR_IA32_LASTINTTOIP:
    
    	case MSR_VM_HSAVE_PA:
    
    	case MSR_P6_EVNTSEL0:
    	case MSR_P6_EVNTSEL1:
		data = 0;
		break;
    	case MSR_MTRRcap:
    		data = 0x500 | KVM_NR_VAR_MTRR;
    		break;
    	case 0x200 ... 0x2ff:
    		return get_msr_mtrr(vcpu, msr, pdata);
    
    	case 0xcd: /* fsb frequency */
    		data = 3;
    		break;
    	case MSR_IA32_APICBASE:
    		data = kvm_get_apic_base(vcpu);
    		break;
    	case MSR_IA32_MISC_ENABLE:
    
    		data = vcpu->arch.ia32_misc_enable_msr;
		break;
    	case MSR_IA32_PERF_STATUS:
    		/* TSC increment by tick */
    		data = 1000ULL;
    		/* CPU multiplier */
    		data |= (((uint64_t)4ULL) << 40);
    		break;
	case MSR_EFER:
    		data = vcpu->arch.shadow_efer;
		break;
    	case MSR_KVM_WALL_CLOCK:
    		data = vcpu->kvm->arch.wall_clock;
    		break;
    	case MSR_KVM_SYSTEM_TIME:
    		data = vcpu->arch.time;
    		break;
    
    	default:
    		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
    		return 1;
    	}
    	*pdata = data;
    	return 0;
    }
    EXPORT_SYMBOL_GPL(kvm_get_msr_common);
    
    
    /*
     * Read or write a bunch of msrs. All parameters are kernel addresses.
     *
     * @return number of msrs set successfully.
     */
    static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
    		    struct kvm_msr_entry *entries,
    		    int (*do_msr)(struct kvm_vcpu *vcpu,
    				  unsigned index, u64 *data))
    {
    	int i;
    
    	vcpu_load(vcpu);
    
    
    	down_read(&vcpu->kvm->slots_lock);
    
    	for (i = 0; i < msrs->nmsrs; ++i)
    		if (do_msr(vcpu, entries[i].index, &entries[i].data))
    			break;
    
    	up_read(&vcpu->kvm->slots_lock);
    
    
    	vcpu_put(vcpu);
    
    	return i;
    }
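
/*
 * For reference, the user-space side of this path (illustrative; the
 * structures are those of the KVM_GET_MSRS/KVM_SET_MSRS ABI) passes a
 * struct kvm_msrs header followed by nmsrs kvm_msr_entry records:
 *
 *	struct {
 *		struct kvm_msrs header;
 *		struct kvm_msr_entry entries[1];
 *	} m = {
 *		.header.nmsrs = 1,
 *		.entries[0].index = MSR_IA32_MISC_ENABLE,
 *	};
 *
 *	ioctl(vcpu_fd, KVM_GET_MSRS, &m);
 *
 * The ioctl return value is the number of MSRs processed, mirroring
 * __msr_io()'s return above.  msr_io() below copies the header and entries
 * in from user memory and hands them to __msr_io().
 */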
    
    /*
     * Read or write a bunch of msrs. Parameters are user addresses.
     *
     * @return number of msrs set successfully.
     */
    static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
    		  int (*do_msr)(struct kvm_vcpu *vcpu,
    				unsigned index, u64 *data),
    		  int writeback)
    {
    	struct kvm_msrs msrs;
    	struct kvm_msr_entry *entries;
    	int r, n;
    	unsigned size;
    
    	r = -EFAULT;
    	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
    		goto out;
    
    	r = -E2BIG;
    	if (msrs.nmsrs >= MAX_IO_MSRS)
    		goto out;
    
    	r = -ENOMEM;
    	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
    	entries = vmalloc(size);
    	if (!entries)
    		goto out;
    
    	r = -EFAULT;
    	if (copy_from_user(entries, user_msrs->entries, size))
    		goto out_free;
    
    	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
    	if (r < 0)
    		goto out_free;