    		if (msr >= MSR_IA32_MC0_CTL &&
    		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
    			u32 offset = msr - MSR_IA32_MC0_CTL;
    
    			/* only 0 or all 1s can be written to IA32_MCi_CTL
    			 * some Linux kernels though clear bit 10 in bank 4 to
    			 * workaround a BIOS/GART TBL issue on AMD K8s, ignore
    			 * this to avoid an uncatched #GP in the guest
    			 */
    
			if ((offset & 0x3) == 0 &&
			    data != 0 && (data | (1 << 10)) != ~(u64)0)
				return -1;
    			vcpu->arch.mce_banks[offset] = data;
    			break;
    		}
    		return 1;
    	}
    	return 0;
    }
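
/*
 * Illustrative values for the IA32_MCi_CTL check above -- a reference
 * sketch derived from the code, not part of the original file:
 *
 *	wrmsr(MSR_IA32_MC0_CTL, 0)			accepted, bank disabled
 *	wrmsr(MSR_IA32_MC0_CTL, ~0ULL)			accepted, all enabled
 *	wrmsr(MSR_IA32_MC0_CTL, ~0ULL & ~(1ULL << 10))	accepted, K8 GART quirk
 *	wrmsr(MSR_IA32_MC0_CTL, 0x1)			rejected, guest gets #GP
 */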
    
    
    static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
    {
    	struct kvm *kvm = vcpu->kvm;
    	int lm = is_long_mode(vcpu);
    	u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
    		: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
    	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
    		: kvm->arch.xen_hvm_config.blob_size_32;
    	u32 page_num = data & ~PAGE_MASK;
    	u64 page_addr = data & PAGE_MASK;
    	u8 *page;
    	int r;
    
    	r = -E2BIG;
    	if (page_num >= blob_size)
    		goto out;
    	r = -ENOMEM;
    	page = kzalloc(PAGE_SIZE, GFP_KERNEL);
    	if (!page)
    		goto out;
    	r = -EFAULT;
    	if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
    		goto out_free;
    	if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
    		goto out_free;
    	r = 0;
    out_free:
    	kfree(page);
    out:
    	return r;
    }
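
/*
 * For context, a minimal userspace sketch (not part of this file) of how
 * the Xen HVM config MSR handled above is registered, with hypothetical
 * blob buffers blob32/blob64 holding one 4K hypercall page per entry:
 *
 *	struct kvm_xen_hvm_config cfg = {
 *		.msr          = 0x40000000,
 *		.blob_addr_32 = (__u64)(unsigned long)blob32,
 *		.blob_size_32 = nr_pages32,
 *		.blob_addr_64 = (__u64)(unsigned long)blob64,
 *		.blob_size_64 = nr_pages64,
 *	};
 *	ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);
 *
 * A guest wrmsr to cfg.msr then lands in xen_hvm_config() with the
 * destination page address in the high bits of 'data' and the blob page
 * number in the low bits.
 */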
    
    
    static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
    {
    	return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
    }
    
    static bool kvm_hv_msr_partition_wide(u32 msr)
    {
    	bool r = false;
    	switch (msr) {
    	case HV_X64_MSR_GUEST_OS_ID:
    	case HV_X64_MSR_HYPERCALL:
    		r = true;
    		break;
    	}
    
    	return r;
    }
    
    static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	struct kvm *kvm = vcpu->kvm;
    
    	switch (msr) {
    	case HV_X64_MSR_GUEST_OS_ID:
    		kvm->arch.hv_guest_os_id = data;
    		/* setting guest os id to zero disables hypercall page */
    		if (!kvm->arch.hv_guest_os_id)
    			kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
    		break;
    	case HV_X64_MSR_HYPERCALL: {
    		u64 gfn;
    		unsigned long addr;
    		u8 instructions[4];
    
    		/* if guest os id is not set hypercall should remain disabled */
    		if (!kvm->arch.hv_guest_os_id)
    			break;
    		if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
    			kvm->arch.hv_hypercall = data;
    			break;
    		}
    		gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
    		addr = gfn_to_hva(kvm, gfn);
    		if (kvm_is_error_hva(addr))
    			return 1;
    		kvm_x86_ops->patch_hypercall(vcpu, instructions);
    		((unsigned char *)instructions)[3] = 0xc3; /* ret */
    		if (copy_to_user((void __user *)addr, instructions, 4))
    			return 1;
    		kvm->arch.hv_hypercall = data;
    		break;
    	}
    	default:
    		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
    			  "data 0x%llx\n", msr, data);
    		return 1;
    	}
    	return 0;
    }
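
/*
 * The hypercall page installed above ends up containing, roughly (a
 * sketch; the exact bytes are vendor specific and come from
 * patch_hypercall()):
 *
 *	0f 01 c1	vmcall		(Intel; 0f 01 d9 vmmcall on AMD)
 *	c3		ret
 *
 * patch_hypercall() emits the 3-byte VMCALL/VMMCALL and the code above
 * appends the ret (0xc3) by hand.
 */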
    
    static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    
    	switch (msr) {
    	case HV_X64_MSR_APIC_ASSIST_PAGE: {
    		unsigned long addr;
    
    		if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
    			vcpu->arch.hv_vapic = data;
    			break;
    		}
    		addr = gfn_to_hva(vcpu->kvm, data >>
    				  HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
    		if (kvm_is_error_hva(addr))
    			return 1;
    		if (clear_user((void __user *)addr, PAGE_SIZE))
    			return 1;
    		vcpu->arch.hv_vapic = data;
    		break;
    	}
    	case HV_X64_MSR_EOI:
    		return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
    	case HV_X64_MSR_ICR:
    		return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
    	case HV_X64_MSR_TPR:
    		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
    	default:
    		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
    			  "data 0x%llx\n", msr, data);
    		return 1;
    	}
    
	return 0;
}

    int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	switch (msr) {
	case MSR_EFER:
		return set_efer(vcpu, data);
    	case MSR_K7_HWCR:
    		data &= ~(u64)0x40;	/* ignore flush filter disable */
    
    		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
    
    		if (data != 0) {
    			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
    				data);
    			return 1;
		}
		break;
    	case MSR_FAM10H_MMIO_CONF_BASE:
    		if (data != 0) {
    			pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
    				"0x%llx\n", data);
    			return 1;
		}
		break;
	case MSR_AMD64_NB_CFG:
		break;
    	case MSR_IA32_DEBUGCTLMSR:
    		if (!data) {
    			/* We support the non-activated case already */
    			break;
    		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
    			/* Values other than LBR and BTF are vendor-specific,
    			   thus reserved and should throw a #GP */
    			return 1;
    		}
    		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
    			__func__, data);
    		break;
    
    	case MSR_IA32_UCODE_REV:
	case MSR_IA32_UCODE_WRITE:
	case MSR_VM_HSAVE_PA:
	case MSR_AMD64_PATCH_LOADER:
		break;
	case 0x200 ... 0x2ff:
    		return set_msr_mtrr(vcpu, msr, data);
    
    	case MSR_IA32_APICBASE:
    		kvm_set_apic_base(vcpu, data);
    		break;
    
    	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
    		return kvm_x2apic_msr_write(vcpu, msr, data);
    
	case MSR_IA32_MISC_ENABLE:
		vcpu->arch.ia32_misc_enable_msr = data;
		break;
    	case MSR_KVM_WALL_CLOCK:
    		vcpu->kvm->arch.wall_clock = data;
    		kvm_write_wall_clock(vcpu->kvm, data);
    		break;
    	case MSR_KVM_SYSTEM_TIME: {
    		if (vcpu->arch.time_page) {
    			kvm_release_page_dirty(vcpu->arch.time_page);
    			vcpu->arch.time_page = NULL;
    		}
    
    		vcpu->arch.time = data;
    
    		/* we verify if the enable bit is set... */
    		if (!(data & 1))
    			break;
    
    		/* ...but clean it before doing the actual write */
    		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
    
    		vcpu->arch.time_page =
    				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
    
    		if (is_error_page(vcpu->arch.time_page)) {
    			kvm_release_page_clean(vcpu->arch.time_page);
    			vcpu->arch.time_page = NULL;
		}

		kvm_request_guest_time_update(vcpu);
		break;
	}
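
	/*
	 * Layout of the MSR_KVM_SYSTEM_TIME value handled above, as implied
	 * by the code (a reference sketch, not in the original file):
	 *
	 *	bit  0	   enable bit
	 *	bits 1-11  offset of the pvclock structure within the page
	 *	bits 12-63 guest page frame of the time page
	 *
	 * e.g. data = gpa | 1 registers an enabled time structure at the
	 * 4-byte-aligned guest physical address gpa.
	 */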
    	case MSR_IA32_MCG_CTL:
    	case MSR_IA32_MCG_STATUS:
    	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
    		return set_msr_mce(vcpu, msr, data);
    
    
    	/* Performance counters are not protected by a CPUID bit,
    	 * so we should check all of them in the generic path for the sake of
    	 * cross vendor migration.
    	 * Writing a zero into the event select MSRs disables them,
    	 * which we perfectly emulate ;-). Any other value should be at least
    	 * reported, some guests depend on them.
    	 */
    	case MSR_P6_EVNTSEL0:
    	case MSR_P6_EVNTSEL1:
    	case MSR_K7_EVNTSEL0:
    	case MSR_K7_EVNTSEL1:
    	case MSR_K7_EVNTSEL2:
    	case MSR_K7_EVNTSEL3:
    		if (data != 0)
    			pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
    				"0x%x data 0x%llx\n", msr, data);
    		break;
    	/* at least RHEL 4 unconditionally writes to the perfctr registers,
    	 * so we ignore writes to make it happy.
    	 */
    	case MSR_P6_PERFCTR0:
    	case MSR_P6_PERFCTR1:
    	case MSR_K7_PERFCTR0:
    	case MSR_K7_PERFCTR1:
    	case MSR_K7_PERFCTR2:
    	case MSR_K7_PERFCTR3:
    		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
    			"0x%x data 0x%llx\n", msr, data);
    		break;
    
    	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
    		if (kvm_hv_msr_partition_wide(msr)) {
    			int r;
    			mutex_lock(&vcpu->kvm->lock);
    			r = set_msr_hyperv_pw(vcpu, msr, data);
    			mutex_unlock(&vcpu->kvm->lock);
    			return r;
    		} else
    			return set_msr_hyperv(vcpu, msr, data);
		break;
	default:
    		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
    			return xen_hvm_config(vcpu, data);
    
    		if (!ignore_msrs) {
    			pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
    				msr, data);
    			return 1;
    		} else {
    			pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
    				msr, data);
    			break;
    		}
    
    	}
    	return 0;
    }
    EXPORT_SYMBOL_GPL(kvm_set_msr_common);
    
    
    /*
     * Reads an msr value (of 'msr_index') into 'pdata'.
     * Returns 0 on success, non-0 otherwise.
     * Assumes vcpu_load() was already called.
     */
    int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
    {
    	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
    }
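
/*
 * Typical in-kernel usage, a sketch (not from this file):
 *
 *	u64 efer;
 *
 *	if (kvm_get_msr(vcpu, MSR_EFER, &efer))
 *		... inject #GP, as a real rdmsr of a bad index would ...
 */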
    
    
static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;

    	if (!msr_mtrr_valid(msr))
		return 1;

    	if (msr == MSR_MTRRdefType)
    		*pdata = vcpu->arch.mtrr_state.def_type +
    			 (vcpu->arch.mtrr_state.enabled << 10);
    	else if (msr == MSR_MTRRfix64K_00000)
    		*pdata = p[0];
    	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
    		*pdata = p[1 + msr - MSR_MTRRfix16K_80000];
    	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
    		*pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
    	else if (msr == MSR_IA32_CR_PAT)
    		*pdata = vcpu->arch.pat;
    	else {	/* Variable MTRRs */
    		int idx, is_mtrr_mask;
    		u64 *pt;
    
    		idx = (msr - 0x200) / 2;
    		is_mtrr_mask = msr - 0x200 - 2 * idx;
    		if (!is_mtrr_mask)
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
    		else
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
    		*pdata = *pt;
	}

    	return 0;
    }
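
/*
 * Quick reference for the variable-range decoding above (standard x86,
 * not in the original file): the MSRs come in base/mask pairs from 0x200,
 *
 *	0x200 MTRRphysBase0	0x201 MTRRphysMask0
 *	0x202 MTRRphysBase1	0x203 MTRRphysMask1
 *	...
 *
 * so idx = (msr - 0x200) / 2 selects the range and the low bit picks
 * base vs. mask.
 */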
    
    
static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
	u64 data;
	u64 mcg_cap = vcpu->arch.mcg_cap;
	unsigned bank_num = mcg_cap & 0xff;
    
    
    	switch (msr) {
    	case MSR_IA32_P5_MC_ADDR:
	case MSR_IA32_P5_MC_TYPE:
		data = 0;
		break;
	case MSR_IA32_MCG_CAP:
		data = vcpu->arch.mcg_cap;
    		break;
    
	case MSR_IA32_MCG_CTL:
    		if (!(mcg_cap & MCG_CTL_P))
    			return 1;
    		data = vcpu->arch.mcg_ctl;
    		break;
    	case MSR_IA32_MCG_STATUS:
    		data = vcpu->arch.mcg_status;
    		break;
    	default:
    		if (msr >= MSR_IA32_MC0_CTL &&
    		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
    			u32 offset = msr - MSR_IA32_MC0_CTL;
    			data = vcpu->arch.mce_banks[offset];
    			break;
    		}
    		return 1;
    	}
    	*pdata = data;
    	return 0;
    }
    
    
    static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
    {
    	u64 data = 0;
    	struct kvm *kvm = vcpu->kvm;
    
    	switch (msr) {
    	case HV_X64_MSR_GUEST_OS_ID:
    		data = kvm->arch.hv_guest_os_id;
    		break;
    	case HV_X64_MSR_HYPERCALL:
    		data = kvm->arch.hv_hypercall;
    		break;
    	default:
    		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
    		return 1;
    	}
    
    	*pdata = data;
    	return 0;
    }
    
    static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
    {
    	u64 data = 0;
    
    	switch (msr) {
    	case HV_X64_MSR_VP_INDEX: {
    		int r;
    		struct kvm_vcpu *v;
    		kvm_for_each_vcpu(r, v, vcpu->kvm)
    			if (v == vcpu)
    				data = r;
    		break;
    	}
    
    	case HV_X64_MSR_EOI:
    		return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
    	case HV_X64_MSR_ICR:
    		return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
    	case HV_X64_MSR_TPR:
    		return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
    
    	default:
    		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
    		return 1;
    	}
    	*pdata = data;
    	return 0;
    }
    
    
    int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
    {
    	u64 data;
    
    	switch (msr) {
	case MSR_IA32_PLATFORM_ID:
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_EBL_CR_POWERON:
	case MSR_IA32_DEBUGCTLMSR:
	case MSR_IA32_LASTBRANCHFROMIP:
	case MSR_IA32_LASTBRANCHTOIP:
	case MSR_IA32_LASTINTFROMIP:
	case MSR_IA32_LASTINTTOIP:
	case MSR_K8_SYSCFG:
	case MSR_K7_HWCR:
	case MSR_VM_HSAVE_PA:
	case MSR_P6_PERFCTR0:
	case MSR_P6_PERFCTR1:
	case MSR_P6_EVNTSEL0:
	case MSR_P6_EVNTSEL1:
	case MSR_K7_EVNTSEL0:
	case MSR_K7_PERFCTR0:
	case MSR_K8_INT_PENDING_MSG:
	case MSR_AMD64_NB_CFG:
	case MSR_FAM10H_MMIO_CONF_BASE:
		data = 0;
		break;
    	case MSR_MTRRcap:
    		data = 0x500 | KVM_NR_VAR_MTRR;
    		break;
    	case 0x200 ... 0x2ff:
    		return get_msr_mtrr(vcpu, msr, pdata);
    
    	case 0xcd: /* fsb frequency */
    		data = 3;
    		break;
    	case MSR_IA32_APICBASE:
    		data = kvm_get_apic_base(vcpu);
    		break;
    
    	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
    		return kvm_x2apic_msr_read(vcpu, msr, pdata);
    		break;
    
	case MSR_IA32_MISC_ENABLE:
		data = vcpu->arch.ia32_misc_enable_msr;
		break;
    	case MSR_IA32_PERF_STATUS:
    		/* TSC increment by tick */
    		data = 1000ULL;
    		/* CPU multiplier */
    		data |= (((uint64_t)4ULL) << 40);
		break;
	case MSR_EFER:
		data = vcpu->arch.efer;
		break;
    	case MSR_KVM_WALL_CLOCK:
    		data = vcpu->kvm->arch.wall_clock;
    		break;
    	case MSR_KVM_SYSTEM_TIME:
    		data = vcpu->arch.time;
    		break;
    
    	case MSR_IA32_P5_MC_ADDR:
    	case MSR_IA32_P5_MC_TYPE:
    	case MSR_IA32_MCG_CAP:
    	case MSR_IA32_MCG_CTL:
    	case MSR_IA32_MCG_STATUS:
    	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
    		return get_msr_mce(vcpu, msr, pdata);
    
    	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
    		if (kvm_hv_msr_partition_wide(msr)) {
    			int r;
    			mutex_lock(&vcpu->kvm->lock);
    			r = get_msr_hyperv_pw(vcpu, msr, pdata);
    			mutex_unlock(&vcpu->kvm->lock);
    			return r;
    		} else
    			return get_msr_hyperv(vcpu, msr, pdata);
		break;
	default:
    		if (!ignore_msrs) {
    			pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
    			return 1;
    		} else {
    			pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
    			data = 0;
    		}
    		break;
    
    	}
    	*pdata = data;
    	return 0;
    }
    EXPORT_SYMBOL_GPL(kvm_get_msr_common);
    
    
    /*
     * Read or write a bunch of msrs. All parameters are kernel addresses.
     *
     * @return number of msrs set successfully.
     */
    static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
    		    struct kvm_msr_entry *entries,
    		    int (*do_msr)(struct kvm_vcpu *vcpu,
    				  unsigned index, u64 *data))
{
	int i, idx;

	vcpu_load(vcpu);

	idx = srcu_read_lock(&vcpu->kvm->srcu);
    
    	for (i = 0; i < msrs->nmsrs; ++i)
    		if (do_msr(vcpu, entries[i].index, &entries[i].data))
    			break;
    
    	srcu_read_unlock(&vcpu->kvm->srcu, idx);
    
    
    	vcpu_put(vcpu);
    
    	return i;
    }
    
    /*
     * Read or write a bunch of msrs. Parameters are user addresses.
     *
     * @return number of msrs set successfully.
     */
    static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
    		  int (*do_msr)(struct kvm_vcpu *vcpu,
    				unsigned index, u64 *data),
    		  int writeback)
    {
    	struct kvm_msrs msrs;
    	struct kvm_msr_entry *entries;
    	int r, n;
    	unsigned size;
    
    	r = -EFAULT;
    	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
    		goto out;
    
    	r = -E2BIG;
    	if (msrs.nmsrs >= MAX_IO_MSRS)
    		goto out;
    
    	r = -ENOMEM;
    	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
    	entries = vmalloc(size);
    	if (!entries)
    		goto out;
    
    	r = -EFAULT;
    	if (copy_from_user(entries, user_msrs->entries, size))
    		goto out_free;
    
    	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
    	if (r < 0)
    		goto out_free;
    
    	r = -EFAULT;
    	if (writeback && copy_to_user(user_msrs->entries, entries, size))
    		goto out_free;
    
    	r = n;
    
    out_free:
    	vfree(entries);
    out:
    	return r;
    }
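
/*
 * Userspace reaches msr_io() through the KVM_GET_MSRS/KVM_SET_MSRS vcpu
 * ioctls. A minimal sketch (not part of this file) reading one MSR:
 *
 *	struct {
 *		struct kvm_msrs hdr;
 *		struct kvm_msr_entry entry;
 *	} req = {
 *		.hdr.nmsrs   = 1,
 *		.entry.index = MSR_EFER,
 *	};
 *
 *	if (ioctl(vcpu_fd, KVM_GET_MSRS, &req) == 1)
 *		printf("EFER = 0x%llx\n", req.entry.data);
 *
 * The ioctl's return value is the number of MSRs processed, matching
 * what __msr_io() returns.
 */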
    
    
    int kvm_dev_ioctl_check_extension(long ext)
    {
    	int r;
    
    	switch (ext) {
    	case KVM_CAP_IRQCHIP:
    	case KVM_CAP_HLT:
    	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
    	case KVM_CAP_SET_TSS_ADDR:
    
	case KVM_CAP_EXT_CPUID:
	case KVM_CAP_CLOCKSOURCE:
	case KVM_CAP_PIT:
	case KVM_CAP_NOP_IO_DELAY:
	case KVM_CAP_MP_STATE:
	case KVM_CAP_SYNC_MMU:
	case KVM_CAP_REINJECT_CONTROL:
	case KVM_CAP_IRQ_INJECT_STATUS:
	case KVM_CAP_ASSIGN_DEV_IRQ:
	case KVM_CAP_IRQFD:
	case KVM_CAP_IOEVENTFD:
	case KVM_CAP_PIT2:
	case KVM_CAP_PIT_STATE2:
	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
	case KVM_CAP_XEN_HVM:
	case KVM_CAP_ADJUST_CLOCK:
	case KVM_CAP_VCPU_EVENTS:
	case KVM_CAP_HYPERV:
	case KVM_CAP_HYPERV_VAPIC:
	case KVM_CAP_HYPERV_SPIN:
	case KVM_CAP_PCI_SEGMENT:
	case KVM_CAP_X86_ROBUST_SINGLESTEP:
		r = 1;
		break;
	case KVM_CAP_COALESCED_MMIO:
    		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
    		break;
    
    	case KVM_CAP_VAPIC:
    		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
    		break;
    
    	case KVM_CAP_NR_VCPUS:
    		r = KVM_MAX_VCPUS;
    		break;
    
    	case KVM_CAP_NR_MEMSLOTS:
    		r = KVM_MEMORY_SLOTS;
    		break;
    
	case KVM_CAP_PV_MMU:	/* obsolete */
		r = 0;
		break;
    	case KVM_CAP_IOMMU:
    
		r = iommu_found();
		break;
    	case KVM_CAP_MCE:
    		r = KVM_MAX_MCE_BANKS;
		break;
	default:
		r = 0;
		break;
	}
	return r;
}

    long kvm_arch_dev_ioctl(struct file *filp,
    			unsigned int ioctl, unsigned long arg)
    {
    	void __user *argp = (void __user *)arg;
    	long r;
    
    	switch (ioctl) {
    	case KVM_GET_MSR_INDEX_LIST: {
    		struct kvm_msr_list __user *user_msr_list = argp;
    		struct kvm_msr_list msr_list;
    		unsigned n;
    
    		r = -EFAULT;
    		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
    			goto out;
    		n = msr_list.nmsrs;
    		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
    		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
    			goto out;
		r = -E2BIG;
		if (n < msr_list.nmsrs)
			goto out;
    		r = -EFAULT;
    		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
    				 num_msrs_to_save * sizeof(u32)))
			goto out;
		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
				 &emulated_msrs,
    				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
    			goto out;
    		r = 0;
    		break;
    	}
    
    	case KVM_GET_SUPPORTED_CPUID: {
    		struct kvm_cpuid2 __user *cpuid_arg = argp;
    		struct kvm_cpuid2 cpuid;
    
    		r = -EFAULT;
    		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
    			goto out;
		r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
						      cpuid_arg->entries);
    		if (r)
    			goto out;
    
    		r = -EFAULT;
    		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
    			goto out;
    		r = 0;
    		break;
    	}
    
    	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
    		u64 mce_cap;
    
    		mce_cap = KVM_MCE_CAP_SUPPORTED;
    		r = -EFAULT;
    		if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
    			goto out;
    		r = 0;
    		break;
	}
	default:
		r = -EINVAL;
	}
out:
	return r;
}

    void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
    {
    	kvm_x86_ops->vcpu_load(vcpu, cpu);
    
    	if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
    		unsigned long khz = cpufreq_quick_get(cpu);
    		if (!khz)
    			khz = tsc_khz;
    		per_cpu(cpu_tsc_khz, cpu) = khz;
    	}
    
    	kvm_request_guest_time_update(vcpu);
    
    }
    
    void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
    {
    
	kvm_x86_ops->vcpu_put(vcpu);
}

static int is_efer_nx(void)
{
	unsigned long long efer = 0;
    
    	rdmsrl_safe(MSR_EFER, &efer);
    
    	return efer & EFER_NX;
    }
    
    static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
    {
    	int i;
    	struct kvm_cpuid_entry2 *e, *entry;
    
    
    	entry = NULL;
    
    	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
    		e = &vcpu->arch.cpuid_entries[i];
    
    		if (e->function == 0x80000001) {
    			entry = e;
    			break;
    		}
    	}
    
    	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
    
    		entry->edx &= ~(1 << 20);
    		printk(KERN_INFO "kvm: guest NX capability removed\n");
    	}
    }
    
    
/* when an old userspace process fills a new kernel module */
static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
				    struct kvm_cpuid *cpuid,
				    struct kvm_cpuid_entry __user *entries)
{
    	int r, i;
    	struct kvm_cpuid_entry *cpuid_entries;
    
    	r = -E2BIG;
    	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
    		goto out;
    	r = -ENOMEM;
    	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
    	if (!cpuid_entries)
    		goto out;
    	r = -EFAULT;
    	if (copy_from_user(cpuid_entries, entries,
    			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
    		goto out_free;
    	for (i = 0; i < cpuid->nent; i++) {
    
    		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
    		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
    		vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
    		vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
    		vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
    		vcpu->arch.cpuid_entries[i].index = 0;
    		vcpu->arch.cpuid_entries[i].flags = 0;
    		vcpu->arch.cpuid_entries[i].padding[0] = 0;
    		vcpu->arch.cpuid_entries[i].padding[1] = 0;
    		vcpu->arch.cpuid_entries[i].padding[2] = 0;
    	}
    	vcpu->arch.cpuid_nent = cpuid->nent;
    
    	cpuid_fix_nx_cap(vcpu);
    	r = 0;
    
    	kvm_apic_set_version(vcpu);
    
    	kvm_x86_ops->cpuid_update(vcpu);
    
    
    out_free:
    	vfree(cpuid_entries);
    out:
    	return r;
    }
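
/*
 * Userspace side, a sketch (not part of this file): the legacy
 * KVM_SET_CPUID ioctl passes struct kvm_cpuid_entry without index/flags,
 * which the loop above widens into struct kvm_cpuid_entry2:
 *
 *	struct {
 *		struct kvm_cpuid hdr;
 *		struct kvm_cpuid_entry ent[1];
 *	} c = {
 *		.hdr.nent        = 1,
 *		.ent[0].function = 0x80000001,
 *	};
 *	ioctl(vcpu_fd, KVM_SET_CPUID, &c);
 *
 * Newer userspace uses KVM_SET_CPUID2 with kvm_cpuid_entry2 directly and
 * skips the conversion.
 */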
    
static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
				     struct kvm_cpuid2 *cpuid,
				     struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		goto out;
	r = -EFAULT;
	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	vcpu->arch.cpuid_nent = cpuid->nent;
	kvm_apic_set_version(vcpu);
	kvm_x86_ops->cpuid_update(vcpu);
	return 0;

out:
	return r;
}

static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
				     struct kvm_cpuid2 *cpuid,
				     struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent < vcpu->arch.cpuid_nent)
		goto out;
	r = -EFAULT;
	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	return 0;

out:
	cpuid->nent = vcpu->arch.cpuid_nent;
	return r;
}
    
static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
			   u32 index)
{
	entry->function = function;
	entry->index = index;
	cpuid_count(entry->function, entry->index,
		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
	entry->flags = 0;
}

#define F(x) bit(X86_FEATURE_##x)

    static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
    			 u32 index, int *nent, int maxnent)
    {
    
	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
#ifdef CONFIG_X86_64
	unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
				? F(GBPAGES) : 0;
	unsigned f_lm = F(LM);
#else
	unsigned f_gbpages = 0;
	unsigned f_lm = 0;
#endif
	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
    
    
    	/* cpuid 1.edx */
    	const u32 kvm_supported_word0_x86_features =
    		F(FPU) | F(VME) | F(DE) | F(PSE) |
    		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
    		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
    		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
    		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
    		0 /* Reserved, DS, ACPI */ | F(MMX) |
    		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
    		0 /* HTT, TM, Reserved, PBE */;
    	/* cpuid 0x80000001.edx */
    	const u32 kvm_supported_word1_x86_features =
    		F(FPU) | F(VME) | F(DE) | F(PSE) |
    		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
    		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
    		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
    		F(PAT) | F(PSE36) | 0 /* Reserved */ |
		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
		F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
    	/* cpuid 1.ecx */
	const u32 kvm_supported_word4_x86_features =
		F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
		0 /* DS-CPL, VMX, SMX, EST */ |
		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
		0 /* Reserved, DCA */ | F(XMM4_1) |
		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
		0 /* Reserved, XSAVE, OSXSAVE */;
    
	/* cpuid 0x80000001.ecx */
	const u32 kvm_supported_word6_x86_features =
    		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
    		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
    		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
    		0 /* SKINIT */ | 0 /* WDT */;
    
	/* all calls to cpuid_count() should be made on the same cpu */
	get_cpu();
    	do_cpuid_1_ent(entry, function, index);
    	++*nent;
    
    	switch (function) {
    	case 0:
    		entry->eax = min(entry->eax, (u32)0xb);
    		break;
    	case 1:
    		entry->edx &= kvm_supported_word0_x86_features;
    
    		entry->ecx &= kvm_supported_word4_x86_features;
    
    		/* we support x2apic emulation even if host does not support
    		 * it since we emulate x2apic in software */
    		entry->ecx |= F(X2APIC);
    
    		break;
    	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
    	 * may return different values. This forces us to get_cpu() before
    	 * issuing the first command, and also to emulate this annoying behavior
    	 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
    	case 2: {
    		int t, times = entry->eax & 0xff;
    
    		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
    
    		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
    
    		for (t = 1; t < times && *nent < maxnent; ++t) {
    			do_cpuid_1_ent(&entry[t], function, 0);
    			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
    			++*nent;
    		}
    		break;
    	}
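
	/*
	 * Worked example for the stateful case above (a sketch, not in the
	 * original): the low byte of CPUID(2).EAX says how many times the
	 * guest must execute leaf 2 to collect all descriptors. If that
	 * byte is 2, the loop emits one extra entry, and every entry
	 * carries KVM_CPUID_FLAG_STATEFUL_FUNC so kvm_emulate_cpuid() can
	 * rotate through them on successive CPUID(2) executions.
	 */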
    	/* function 4 and 0xb have additional index. */
    	case 4: {
    
    		int i, cache_type;
    
    
    		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
    		/* read more entries until cache_type is zero */
    
    		for (i = 1; *nent < maxnent; ++i) {
    			cache_type = entry[i - 1].eax & 0x1f;
    
    			if (!cache_type)
    				break;
    
    			do_cpuid_1_ent(&entry[i], function, i);
			entry[i].flags |=
			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
    			++*nent;
    		}
    		break;
    	}
    	case 0xb: {
    
    		int i, level_type;
    
    
    		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
    		/* read more entries until level_type is zero */
    
    		for (i = 1; *nent < maxnent; ++i) {
    
    			level_type = entry[i - 1].ecx & 0xff00;
    
    			if (!level_type)
    				break;
    
    			do_cpuid_1_ent(&entry[i], function, i);
			entry[i].flags |=
			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
    			++*nent;
    		}
    		break;
    	}
    	case 0x80000000:
    		entry->eax = min(entry->eax, 0x8000001a);
    		break;
    	case 0x80000001:
    		entry->edx &= kvm_supported_word1_x86_features;
    		entry->ecx &= kvm_supported_word6_x86_features;
    		break;
    	}
    
    
	kvm_x86_ops->set_supported_cpuid(function, entry);

	put_cpu();
}

#undef F

static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
					     struct kvm_cpuid_entry2 __user *entries)
    {
    	struct kvm_cpuid_entry2 *cpuid_entries;
    	int limit, nent = 0, r = -E2BIG;
    	u32 func;
    
    	if (cpuid->nent < 1)
    		goto out;
    
    	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
    		cpuid->nent = KVM_MAX_CPUID_ENTRIES;
    
    	r = -ENOMEM;
    	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
    	if (!cpuid_entries)
    		goto out;