/*
     * Kernel-based Virtual Machine driver for Linux
     *
     * derived from drivers/kvm/kvm_main.c
     *
     * Copyright (C) 2006 Qumranet, Inc.
     *
     * Authors:
     *   Avi Kivity   <avi@qumranet.com>
     *   Yaniv Kamay  <yaniv@qumranet.com>
     *
     * This work is licensed under the terms of the GNU GPL, version 2.  See
     * the COPYING file in the top-level directory.
     *
     */
    
    
    #include "kvm.h"
    
    #include "x86_emulate.h"
    
    #include "segment_descriptor.h"
    
    #include "irq.h"
    
    #include <linux/kvm.h>
    #include <linux/fs.h>
    #include <linux/vmalloc.h>
    
    #include <linux/module.h>
    
    
    #include <asm/uaccess.h>
    
    #include <asm/msr.h>
    
    #define MAX_IO_MSRS 256
    
    #define CR0_RESERVED_BITS						\
    	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
    			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
    			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
    #define CR4_RESERVED_BITS						\
    	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
    			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
    			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
    			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
    
    #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
    
    #define EFER_RESERVED_BITS 0xfffffffffffff2fe
    
    #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
    #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
    
    struct kvm_x86_ops *kvm_x86_ops;
    
    
    struct kvm_stats_debugfs_item debugfs_entries[] = {
    
    	{ "pf_fixed", VCPU_STAT(pf_fixed) },
    	{ "pf_guest", VCPU_STAT(pf_guest) },
    	{ "tlb_flush", VCPU_STAT(tlb_flush) },
    	{ "invlpg", VCPU_STAT(invlpg) },
    	{ "exits", VCPU_STAT(exits) },
    	{ "io_exits", VCPU_STAT(io_exits) },
    	{ "mmio_exits", VCPU_STAT(mmio_exits) },
    	{ "signal_exits", VCPU_STAT(signal_exits) },
    	{ "irq_window", VCPU_STAT(irq_window_exits) },
    	{ "halt_exits", VCPU_STAT(halt_exits) },
    	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
    	{ "request_irq", VCPU_STAT(request_irq_exits) },
    	{ "irq_exits", VCPU_STAT(irq_exits) },
    	{ "host_state_reload", VCPU_STAT(host_state_reload) },
    	{ "efer_reload", VCPU_STAT(efer_reload) },
    	{ "fpu_reload", VCPU_STAT(fpu_reload) },
    	{ "insn_emulation", VCPU_STAT(insn_emulation) },
    	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
    
    	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
    	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
    	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
    	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
    	{ "mmu_flooded", VM_STAT(mmu_flooded) },
    	{ "mmu_recycled", VM_STAT(mmu_recycled) },
    
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ NULL }
};

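/*
 * Decode the linear base address from the host GDT/LDT descriptor
 * selected by 'selector'.
 */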
    unsigned long segment_base(u16 selector)
    {
    	struct descriptor_table gdt;
    	struct segment_descriptor *d;
    	unsigned long table_base;
    	unsigned long v;
    
    	if (selector == 0)
    		return 0;
    
    	asm("sgdt %0" : "=m"(gdt));
    	table_base = gdt.base;
    
    	if (selector & 4) {           /* from ldt */
    		u16 ldt_selector;
    
    		asm("sldt %0" : "=g"(ldt_selector));
    		table_base = segment_base(ldt_selector);
    	}
    	d = (struct segment_descriptor *)(table_base + (selector & ~7));
    	v = d->base_low | ((unsigned long)d->base_mid << 16) |
    		((unsigned long)d->base_high << 24);
    #ifdef CONFIG_X86_64
    	if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
    		v |= ((unsigned long) \
    		      ((struct segment_descriptor_64 *)d)->base_higher) << 32;
    #endif
    	return v;
    }
    EXPORT_SYMBOL_GPL(segment_base);
    
    
    u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
    {
    	if (irqchip_in_kernel(vcpu->kvm))
    		return vcpu->apic_base;
    	else
    		return vcpu->apic_base;
    }
    EXPORT_SYMBOL_GPL(kvm_get_apic_base);
    
    void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
    {
    	/* TODO: reserve bits check */
    	if (irqchip_in_kernel(vcpu->kvm))
    		kvm_lapic_set_base(vcpu, data);
    	else
    		vcpu->apic_base = data;
    }
    EXPORT_SYMBOL_GPL(kvm_set_apic_base);
    
    
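/* Queue an exception without an error code for injection into the guest. */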
    void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
    {
    	WARN_ON(vcpu->exception.pending);
    	vcpu->exception.pending = true;
    	vcpu->exception.has_error_code = false;
    	vcpu->exception.nr = nr;
    }
    EXPORT_SYMBOL_GPL(kvm_queue_exception);
    
    
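/*
 * Report a page fault to the guest: if a #PF is already pending, promote
 * it to a double fault; otherwise record the faulting address in cr2 and
 * queue #PF with the given error code.
 */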
    void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
    			   u32 error_code)
    {
    	++vcpu->stat.pf_guest;
    	if (vcpu->exception.pending && vcpu->exception.nr == PF_VECTOR) {
    		printk(KERN_DEBUG "kvm: inject_page_fault:"
    		       " double fault 0x%lx\n", addr);
    		vcpu->exception.nr = DF_VECTOR;
    		vcpu->exception.error_code = 0;
    		return;
    	}
    	vcpu->cr2 = addr;
    	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
    }
    
    
    void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
    {
    	WARN_ON(vcpu->exception.pending);
    	vcpu->exception.pending = true;
    	vcpu->exception.has_error_code = true;
    	vcpu->exception.nr = nr;
    	vcpu->exception.error_code = error_code;
    }
    EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
    
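/* Hand the pending exception to the vendor backend for injection. */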
    static void __queue_exception(struct kvm_vcpu *vcpu)
    {
    	kvm_x86_ops->queue_exception(vcpu, vcpu->exception.nr,
    				     vcpu->exception.has_error_code,
    				     vcpu->exception.error_code);
    }
    
    
    /*
 * Load the pae pdptrs.  Return true if they are all valid.
     */
    int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
    {
    	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
    	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
    	int i;
    	int ret;
    	u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
    
    	mutex_lock(&vcpu->kvm->lock);
    	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
    				  offset * sizeof(u64), sizeof(pdpte));
    	if (ret < 0) {
    		ret = 0;
    		goto out;
    	}
    	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
    		if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
    			ret = 0;
    			goto out;
    		}
    	}
    	ret = 1;
    
    	memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
    out:
    	mutex_unlock(&vcpu->kvm->lock);
    
    	return ret;
    }
    
    
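/*
 * Check whether the guest's PDPTEs differ from the values cached in
 * vcpu->pdptrs.  Only relevant for PAE paging outside long mode; a failed
 * read is treated as "changed".
 */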
    static bool pdptrs_changed(struct kvm_vcpu *vcpu)
    {
    	u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
    	bool changed = true;
    	int r;
    
    	if (is_long_mode(vcpu) || !is_pae(vcpu))
    		return false;
    
    	mutex_lock(&vcpu->kvm->lock);
    	r = kvm_read_guest(vcpu->kvm, vcpu->cr3 & ~31u, pdpte, sizeof(pdpte));
    	if (r < 0)
    		goto out;
    	changed = memcmp(pdpte, vcpu->pdptrs, sizeof(pdpte)) != 0;
    out:
    	mutex_unlock(&vcpu->kvm->lock);
    
    	return changed;
    }
    
    
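/*
 * Emulate a guest write to CR0.  Reserved bits and invalid bit
 * combinations raise #GP; enabling paging is validated against long-mode
 * and PAE constraints before the value is passed to the vendor backend
 * and the MMU context is rebuilt.
 */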
    void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
    {
    	if (cr0 & CR0_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
		       cr0, vcpu->cr0);
		kvm_inject_gp(vcpu, 0);
		return;
    	}
    
    	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		kvm_inject_gp(vcpu, 0);
		return;
    	}
    
    	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
		       "and a clear PE flag\n");
		kvm_inject_gp(vcpu, 0);
		return;
    	}
    
    	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
    #ifdef CONFIG_X86_64
    		if ((vcpu->shadow_efer & EFER_LME)) {
    			int cs_db, cs_l;
    
    			if (!is_pae(vcpu)) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				kvm_inject_gp(vcpu, 0);
				return;
    			}
    			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
    			if (cs_l) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
    		} else
    #endif
    		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
			       "reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
    		}
    
    	}
    
    	kvm_x86_ops->set_cr0(vcpu, cr0);
    	vcpu->cr0 = cr0;
    
    	mutex_lock(&vcpu->kvm->lock);
    	kvm_mmu_reset_context(vcpu);
    	mutex_unlock(&vcpu->kvm->lock);
    	return;
    }
    EXPORT_SYMBOL_GPL(set_cr0);
    
    void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
    {
    	set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
    }
    EXPORT_SYMBOL_GPL(lmsw);
    
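/*
 * Emulate a guest write to CR4.  Reserved bits, clearing PAE while in
 * long mode, and setting VMXE are all rejected; otherwise the vendor
 * backend is updated and the MMU context is rebuilt.
 */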
    void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
    {
    	if (cr4 & CR4_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
    	}
    
    	if (is_long_mode(vcpu)) {
    		if (!(cr4 & X86_CR4_PAE)) {
			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
			       "in long mode\n");
			kvm_inject_gp(vcpu, 0);
			return;
    		}
    	} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
    		   && !load_pdptrs(vcpu, vcpu->cr3)) {
		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
    	}
    
    	if (cr4 & X86_CR4_VMXE) {
		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		kvm_inject_gp(vcpu, 0);
		return;
    	}
    	kvm_x86_ops->set_cr4(vcpu, cr4);
    	vcpu->cr4 = cr4;
    	mutex_lock(&vcpu->kvm->lock);
    	kvm_mmu_reset_context(vcpu);
    	mutex_unlock(&vcpu->kvm->lock);
    }
    EXPORT_SYMBOL_GPL(set_cr4);
    
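/*
 * Emulate a guest write to CR3.  Rewriting the current value with
 * unchanged PDPTEs only flushes the TLB; otherwise the new value is
 * validated and the MMU is switched to it.
 */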
    void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
    {
    
    	if (cr3 == vcpu->cr3 && !pdptrs_changed(vcpu)) {
    		kvm_mmu_flush_tlb(vcpu);
    		return;
    	}
    
    
    	if (is_long_mode(vcpu)) {
    		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
    		}
    	} else {
    		if (is_pae(vcpu)) {
    			if (cr3 & CR3_PAE_RESERVED_BITS) {
				printk(KERN_DEBUG
				       "set_cr3: #GP, reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
    			}
    			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
				       "reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
    			}
    		}
    		/*
    		 * We don't check reserved bits in nonpae mode, because
    		 * this isn't enforced, and VMware depends on this.
    		 */
    	}
    
    	mutex_lock(&vcpu->kvm->lock);
    	/*
    	 * Does the new cr3 value map to physical memory? (Note, we
    	 * catch an invalid cr3 even in real-mode, because it would
    	 * cause trouble later on when we turn on paging anyway.)
    	 *
    	 * A real CPU would silently accept an invalid cr3 and would
    	 * attempt to use it - with largely undefined (and often hard
    	 * to debug) behavior on the guest side.
    	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		kvm_inject_gp(vcpu, 0);
	else {
    		vcpu->cr3 = cr3;
    		vcpu->mmu.new_cr3(vcpu);
    	}
    	mutex_unlock(&vcpu->kvm->lock);
    }
    EXPORT_SYMBOL_GPL(set_cr3);
    
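/*
 * CR8 mirrors the local APIC TPR: accesses go to the in-kernel APIC when
 * one is present, otherwise to the cached vcpu->cr8.
 */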
    void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
    {
    	if (cr8 & CR8_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
		kvm_inject_gp(vcpu, 0);
		return;
    	}
    	if (irqchip_in_kernel(vcpu->kvm))
    		kvm_lapic_set_tpr(vcpu, cr8);
    	else
    		vcpu->cr8 = cr8;
    }
    EXPORT_SYMBOL_GPL(set_cr8);
    
    unsigned long get_cr8(struct kvm_vcpu *vcpu)
    {
    	if (irqchip_in_kernel(vcpu->kvm))
    		return kvm_lapic_get_cr8(vcpu);
    	else
    		return vcpu->cr8;
    }
    EXPORT_SYMBOL_GPL(get_cr8);
    
    
    /*
     * List of msr numbers which we expose to userspace through KVM_GET_MSRS
     * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
     *
     * This list is modified at module load time to reflect the
     * capabilities of the host cpu.
     */
    static u32 msrs_to_save[] = {
    	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
    	MSR_K6_STAR,
    #ifdef CONFIG_X86_64
    	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
    #endif
    	MSR_IA32_TIME_STAMP_COUNTER,
    };
    
    static unsigned num_msrs_to_save;
    
    static u32 emulated_msrs[] = {
    	MSR_IA32_MISC_ENABLE,
    };
    
    
    #ifdef CONFIG_X86_64
    
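/*
 * Emulate a guest write to the EFER MSR.  Reserved bits and toggling
 * EFER.LME while paging is enabled are rejected; EFER.LMA is carried over
 * from the shadowed value, since only the hardware transitions it.
 */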
    static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
    {
    	if (efer & EFER_RESERVED_BITS) {
		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
		       efer);
		kvm_inject_gp(vcpu, 0);
		return;
    	}
    
    	if (is_paging(vcpu)
    	    && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		kvm_inject_gp(vcpu, 0);
		return;
    	}
    
    	kvm_x86_ops->set_efer(vcpu, efer);
    
    	efer &= ~EFER_LMA;
    	efer |= vcpu->shadow_efer & EFER_LMA;
    
    	vcpu->shadow_efer = efer;
    }
    
    #endif
    
    /*
 * Writes msr value into the appropriate "register".
     * Returns 0 on success, non-0 otherwise.
     * Assumes vcpu_load() was already called.
     */
    int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
    {
    	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
    }
    
    
    /*
     * Adapt set_msr() to msr_io()'s calling convention
     */
    static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
    {
    	return kvm_set_msr(vcpu, index, *data);
    }
    
    
    
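/*
 * Handle MSR writes that are common to all vendor backends.  Returns 0 if
 * the MSR was handled, 1 if it is unknown.
 */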
    int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	switch (msr) {
    #ifdef CONFIG_X86_64
    	case MSR_EFER:
    		set_efer(vcpu, data);
    		break;
    #endif
    	case MSR_IA32_MC0_STATUS:
    		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
    		       __FUNCTION__, data);
    		break;
    	case MSR_IA32_MCG_STATUS:
    		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
    			__FUNCTION__, data);
    		break;
    	case MSR_IA32_UCODE_REV:
    	case MSR_IA32_UCODE_WRITE:
    	case 0x200 ... 0x2ff: /* MTRRs */
    		break;
    	case MSR_IA32_APICBASE:
    		kvm_set_apic_base(vcpu, data);
    		break;
    	case MSR_IA32_MISC_ENABLE:
    		vcpu->ia32_misc_enable_msr = data;
    		break;
    	default:
    		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
    		return 1;
    	}
    	return 0;
    }
    EXPORT_SYMBOL_GPL(kvm_set_msr_common);
    
    
    /*
     * Reads an msr value (of 'msr_index') into 'pdata'.
     * Returns 0 on success, non-0 otherwise.
     * Assumes vcpu_load() was already called.
     */
    int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
    {
    	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
    }
    
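/*
 * Handle MSR reads common to all vendor backends.  Many MSRs simply read
 * as zero; unknown MSRs are reported and return 1.
 */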
    int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
    {
    	u64 data;
    
    	switch (msr) {
    	case 0xc0010010: /* SYSCFG */
    	case 0xc0010015: /* HWCR */
    	case MSR_IA32_PLATFORM_ID:
    	case MSR_IA32_P5_MC_ADDR:
    	case MSR_IA32_P5_MC_TYPE:
    	case MSR_IA32_MC0_CTL:
    	case MSR_IA32_MCG_STATUS:
    	case MSR_IA32_MCG_CAP:
    	case MSR_IA32_MC0_MISC:
    	case MSR_IA32_MC0_MISC+4:
    	case MSR_IA32_MC0_MISC+8:
    	case MSR_IA32_MC0_MISC+12:
    	case MSR_IA32_MC0_MISC+16:
    	case MSR_IA32_UCODE_REV:
    	case MSR_IA32_PERF_STATUS:
    	case MSR_IA32_EBL_CR_POWERON:
    		/* MTRR registers */
    	case 0xfe:
    	case 0x200 ... 0x2ff:
    		data = 0;
    		break;
    	case 0xcd: /* fsb frequency */
    		data = 3;
    		break;
    	case MSR_IA32_APICBASE:
    		data = kvm_get_apic_base(vcpu);
    		break;
    	case MSR_IA32_MISC_ENABLE:
    		data = vcpu->ia32_misc_enable_msr;
    		break;
    #ifdef CONFIG_X86_64
    	case MSR_EFER:
    		data = vcpu->shadow_efer;
    		break;
    #endif
    	default:
    		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
    		return 1;
    	}
    	*pdata = data;
    	return 0;
    }
    EXPORT_SYMBOL_GPL(kvm_get_msr_common);
    
    
    /*
     * Read or write a bunch of msrs. All parameters are kernel addresses.
     *
     * @return number of msrs set successfully.
     */
    static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
    		    struct kvm_msr_entry *entries,
    		    int (*do_msr)(struct kvm_vcpu *vcpu,
    				  unsigned index, u64 *data))
    {
    	int i;
    
    	vcpu_load(vcpu);
    
    	for (i = 0; i < msrs->nmsrs; ++i)
    		if (do_msr(vcpu, entries[i].index, &entries[i].data))
    			break;
    
    	vcpu_put(vcpu);
    
    	return i;
    }
    
    /*
     * Read or write a bunch of msrs. Parameters are user addresses.
     *
     * @return number of msrs set successfully.
     */
    static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
    		  int (*do_msr)(struct kvm_vcpu *vcpu,
    				unsigned index, u64 *data),
    		  int writeback)
    {
    	struct kvm_msrs msrs;
    	struct kvm_msr_entry *entries;
    	int r, n;
    	unsigned size;
    
    	r = -EFAULT;
    	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
    		goto out;
    
    	r = -E2BIG;
    	if (msrs.nmsrs >= MAX_IO_MSRS)
    		goto out;
    
    	r = -ENOMEM;
    	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
    	entries = vmalloc(size);
    	if (!entries)
    		goto out;
    
    	r = -EFAULT;
    	if (copy_from_user(entries, user_msrs->entries, size))
    		goto out_free;
    
    	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
    	if (r < 0)
    		goto out_free;
    
    	r = -EFAULT;
    	if (writeback && copy_to_user(user_msrs->entries, entries, size))
    		goto out_free;
    
    	r = n;
    
    out_free:
    	vfree(entries);
    out:
    	return r;
    }
    
    
    /*
     * Make sure that a cpu that is being hot-unplugged does not have any vcpus
     * cached on it.
     */
    void decache_vcpus_on_cpu(int cpu)
    {
    	struct kvm *vm;
    	struct kvm_vcpu *vcpu;
    	int i;
    
    	spin_lock(&kvm_lock);
    	list_for_each_entry(vm, &vm_list, vm_list)
    		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
    			vcpu = vm->vcpus[i];
    			if (!vcpu)
    				continue;
    			/*
    			 * If the vcpu is locked, then it is running on some
    			 * other cpu and therefore it is not cached on the
    			 * cpu in question.
    			 *
    			 * If it's not locked, check the last cpu it executed
    			 * on.
    			 */
    			if (mutex_trylock(&vcpu->mutex)) {
    				if (vcpu->cpu == cpu) {
    					kvm_x86_ops->vcpu_decache(vcpu);
    					vcpu->cpu = -1;
    				}
    				mutex_unlock(&vcpu->mutex);
    			}
    		}
    	spin_unlock(&kvm_lock);
    }
    
    
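/* Report which optional KVM capabilities (KVM_CAP_*) this kernel supports. */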
    int kvm_dev_ioctl_check_extension(long ext)
    {
    	int r;
    
    	switch (ext) {
    	case KVM_CAP_IRQCHIP:
    	case KVM_CAP_HLT:
    	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
    	case KVM_CAP_USER_MEMORY:
    	case KVM_CAP_SET_TSS_ADDR:
    
	case KVM_CAP_EXT_CPUID:
		r = 1;
		break;
	default:
		r = 0;
		break;
	}
	return r;
}

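/* Arch-specific ioctls on the /dev/kvm device; currently only KVM_GET_MSR_INDEX_LIST. */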
    long kvm_arch_dev_ioctl(struct file *filp,
    			unsigned int ioctl, unsigned long arg)
    {
    	void __user *argp = (void __user *)arg;
    	long r;
    
    	switch (ioctl) {
    	case KVM_GET_MSR_INDEX_LIST: {
    		struct kvm_msr_list __user *user_msr_list = argp;
    		struct kvm_msr_list msr_list;
    		unsigned n;
    
    		r = -EFAULT;
    		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
    			goto out;
    		n = msr_list.nmsrs;
    		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
    		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
    			goto out;
    		r = -E2BIG;
    		if (n < num_msrs_to_save)
    			goto out;
    		r = -EFAULT;
    		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
    				 num_msrs_to_save * sizeof(u32)))
    			goto out;
    		if (copy_to_user(user_msr_list->indices
    				 + num_msrs_to_save * sizeof(u32),
    				 &emulated_msrs,
    				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
    			goto out;
    		r = 0;
    		break;
    	}
    	default:
    		r = -EINVAL;
    	}
    out:
    	return r;
    }
    
    
    void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
    {
    	kvm_x86_ops->vcpu_load(vcpu, cpu);
    }
    
    void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
    {
	kvm_x86_ops->vcpu_put(vcpu);
}

static int is_efer_nx(void)
{
    	u64 efer;
    
    	rdmsrl(MSR_EFER, efer);
    
    	return efer & EFER_NX;
    }
    
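/*
 * If the host does not have EFER.NX enabled, strip the NX bit (bit 20 of
 * CPUID leaf 0x80000001 EDX) from the guest's cpuid table.
 */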
    static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
    {
    	int i;
    	struct kvm_cpuid_entry2 *e, *entry;
    
    
    	entry = NULL;
    	for (i = 0; i < vcpu->cpuid_nent; ++i) {
    		e = &vcpu->cpuid_entries[i];
    		if (e->function == 0x80000001) {
    			entry = e;
    			break;
    		}
    	}
    
    	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
    
    		entry->edx &= ~(1 << 20);
    		printk(KERN_INFO "kvm: guest NX capability removed\n");
    	}
    }
    
    
/*
 * Legacy path: an old userspace process fills a new kernel module with
 * old-format cpuid entries, which are converted to struct kvm_cpuid_entry2.
 */
    
    static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
    				    struct kvm_cpuid *cpuid,
				    struct kvm_cpuid_entry __user *entries)
{
    	int r, i;
    	struct kvm_cpuid_entry *cpuid_entries;
    
    	r = -E2BIG;
    	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
    		goto out;
    	r = -ENOMEM;
    	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
    	if (!cpuid_entries)
    		goto out;
    	r = -EFAULT;
    	if (copy_from_user(cpuid_entries, entries,
    			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
    		goto out_free;
    	for (i = 0; i < cpuid->nent; i++) {
    		vcpu->cpuid_entries[i].function = cpuid_entries[i].function;
    		vcpu->cpuid_entries[i].eax = cpuid_entries[i].eax;
    		vcpu->cpuid_entries[i].ebx = cpuid_entries[i].ebx;
    		vcpu->cpuid_entries[i].ecx = cpuid_entries[i].ecx;
    		vcpu->cpuid_entries[i].edx = cpuid_entries[i].edx;
    		vcpu->cpuid_entries[i].index = 0;
    		vcpu->cpuid_entries[i].flags = 0;
    		vcpu->cpuid_entries[i].padding[0] = 0;
    		vcpu->cpuid_entries[i].padding[1] = 0;
    		vcpu->cpuid_entries[i].padding[2] = 0;
    	}
    	vcpu->cpuid_nent = cpuid->nent;
    	cpuid_fix_nx_cap(vcpu);
    	r = 0;
    
    out_free:
    	vfree(cpuid_entries);
    out:
    	return r;
    }
    
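/* Install a cpuid table supplied by userspace directly in the kvm_cpuid_entry2 format. */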
    static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
    				    struct kvm_cpuid2 *cpuid,
				    struct kvm_cpuid_entry2 __user *entries)
{
    	int r;
    
    	r = -E2BIG;
    	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
    		goto out;
    	r = -EFAULT;
	if (copy_from_user(&vcpu->cpuid_entries, entries,
			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
    	vcpu->cpuid_nent = cpuid->nent;
    	return 0;
    
    out:
    	return r;
    }
    
    
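/*
 * Copy the vcpu's cpuid table back to userspace; if the buffer is too
 * small, report the required number of entries in cpuid->nent.
 */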
    static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
    				    struct kvm_cpuid2 *cpuid,
    				    struct kvm_cpuid_entry2 __user *entries)
    {
    	int r;
    
    	r = -E2BIG;
    	if (cpuid->nent < vcpu->cpuid_nent)
    		goto out;
    	r = -EFAULT;
    	if (copy_to_user(entries, &vcpu->cpuid_entries,
    			   vcpu->cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
    		goto out;
    	return 0;
    
    out:
    	cpuid->nent = vcpu->cpuid_nent;
    	return r;
    }
    
    static inline u32 bit(int bitno)
    {
    	return 1 << (bitno & 31);
    }
    
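/* Fill a single cpuid entry by executing CPUID on the host for (function, index). */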
    static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
    			  u32 index)
    {
    	entry->function = function;
    	entry->index = index;
    	cpuid_count(entry->function, entry->index,
    		&entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
    	entry->flags = 0;
    }
    
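/*
 * Build the cpuid entries KVM exposes for 'function': mask feature bits
 * down to what KVM can virtualize and expand stateful or indexed leaves
 * (2, 4, 0xb) into multiple entries.
 */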
    static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
    			 u32 index, int *nent, int maxnent)
    {
    	const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
    		bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
    		bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
    		bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
    		bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
    		bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
    		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
    		bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
    		bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
    		bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
    	const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
    		bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
    		bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
    		bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
    		bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
    		bit(X86_FEATURE_PGE) |
    		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
    		bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
    		bit(X86_FEATURE_SYSCALL) |
    		(bit(X86_FEATURE_NX) && is_efer_nx()) |
    #ifdef CONFIG_X86_64
    		bit(X86_FEATURE_LM) |
    #endif
    		bit(X86_FEATURE_MMXEXT) |
    		bit(X86_FEATURE_3DNOWEXT) |
    		bit(X86_FEATURE_3DNOW);
    	const u32 kvm_supported_word3_x86_features =
    		bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
    	const u32 kvm_supported_word6_x86_features =
    		bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
    
    	/* all func 2 cpuid_count() should be called on the same cpu */
    	get_cpu();
    	do_cpuid_1_ent(entry, function, index);
    	++*nent;
    
    	switch (function) {
    	case 0:
    		entry->eax = min(entry->eax, (u32)0xb);
    		break;
    	case 1:
    		entry->edx &= kvm_supported_word0_x86_features;
    		entry->ecx &= kvm_supported_word3_x86_features;
    		break;
    	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
    	 * may return different values. This forces us to get_cpu() before
    	 * issuing the first command, and also to emulate this annoying behavior
    	 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
    	case 2: {
    		int t, times = entry->eax & 0xff;
    
    		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
    		for (t = 1; t < times && *nent < maxnent; ++t) {
    			do_cpuid_1_ent(&entry[t], function, 0);
    			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
    			++*nent;
    		}
    		break;
    	}
    	/* function 4 and 0xb have additional index. */
    	case 4: {
    		int index, cache_type;
    
    		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
    		/* read more entries until cache_type is zero */
    		for (index = 1; *nent < maxnent; ++index) {
    			cache_type = entry[index - 1].eax & 0x1f;
    			if (!cache_type)
    				break;
    			do_cpuid_1_ent(&entry[index], function, index);
    			entry[index].flags |=
    			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
    			++*nent;
    		}
    		break;
    	}
    	case 0xb: {
    		int index, level_type;
    
    		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
    		/* read more entries until level_type is zero */
    		for (index = 1; *nent < maxnent; ++index) {
    			level_type = entry[index - 1].ecx & 0xff;
    			if (!level_type)
    				break;
    			do_cpuid_1_ent(&entry[index], function, index);
    			entry[index].flags |=
    			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
    			++*nent;
    		}
    		break;
    	}
    	case 0x80000000:
    		entry->eax = min(entry->eax, 0x8000001a);
    		break;
    	case 0x80000001:
    		entry->edx &= kvm_supported_word1_x86_features;
    		entry->ecx &= kvm_supported_word6_x86_features;
    		break;
    	}
    	put_cpu();
    }
    
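/*
 * Enumerate the cpuid leaves KVM can expose on this host into a
 * caller-supplied buffer of kvm_cpuid_entry2 entries.
 */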
    static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
    				    struct kvm_cpuid2 *cpuid,
    				    struct kvm_cpuid_entry2 __user *entries)
    {
    	struct kvm_cpuid_entry2 *cpuid_entries;
    	int limit, nent = 0, r = -E2BIG;
    	u32 func;
    
    	if (cpuid->nent < 1)
    		goto out;
    	r = -ENOMEM;
    	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
    	if (!cpuid_entries)
    		goto out;
    
    	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
    	limit = cpuid_entries[0].eax;
    	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
    		do_cpuid_ent(&cpuid_entries[nent], func, 0,
    				&nent, cpuid->nent);
    	r = -E2BIG;
    	if (nent >= cpuid->nent)
    		goto out_free;