Linux kernel & device driver programming

Cross-Referenced Linux and Device Driver Code

[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]
Version: [ 2.6.11.8 ] [ 2.6.25 ] [ 2.6.25.8 ] [ 2.6.31.13 ] Architecture: [ i386 ]
  1 /*
  2  * Kernel-based Virtual Machine driver for Linux
  3  *
  4  * derived from drivers/kvm/kvm_main.c
  5  *
  6  * Copyright (C) 2006 Qumranet, Inc.
  7  * Copyright (C) 2008 Qumranet, Inc.
  8  * Copyright IBM Corporation, 2008
  9  *
 10  * Authors:
 11  *   Avi Kivity   <avi@qumranet.com>
 12  *   Yaniv Kamay  <yaniv@qumranet.com>
 13  *   Amit Shah    <amit.shah@qumranet.com>
 14  *   Ben-Ami Yassour <benami@il.ibm.com>
 15  *
 16  * This work is licensed under the terms of the GNU GPL, version 2.  See
 17  * the COPYING file in the top-level directory.
 18  *
 19  */
 20 
 21 #include <linux/kvm_host.h>
 22 #include "irq.h"
 23 #include "mmu.h"
 24 #include "i8254.h"
 25 #include "tss.h"
 26 #include "kvm_cache_regs.h"
 27 #include "x86.h"
 28 
 29 #include <linux/clocksource.h>
 30 #include <linux/interrupt.h>
 31 #include <linux/kvm.h>
 32 #include <linux/fs.h>
 33 #include <linux/vmalloc.h>
 34 #include <linux/module.h>
 35 #include <linux/mman.h>
 36 #include <linux/highmem.h>
 37 #include <linux/iommu.h>
 38 #include <linux/intel-iommu.h>
 39 #include <linux/cpufreq.h>
 40 
 41 #include <asm/uaccess.h>
 42 #include <asm/msr.h>
 43 #include <asm/desc.h>
 44 #include <asm/mtrr.h>
 45 
 46 #define MAX_IO_MSRS 256
 47 #define CR0_RESERVED_BITS                                               \
 48         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
 49                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
 50                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
 51 #define CR4_RESERVED_BITS                                               \
 52         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 53                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
 54                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
 55                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 56 
 57 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 58 /* EFER defaults:
 59  * - enable syscall per default because its emulated by KVM
 60  * - enable LME and LMA per default on 64 bit KVM
 61  */
 62 #ifdef CONFIG_X86_64
 63 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
 64 #else
 65 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
 66 #endif
 67 
 68 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 69 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 70 
 71 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 72                                     struct kvm_cpuid_entry2 __user *entries);
 73 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 74                                               u32 function, u32 index);
 75 
 76 struct kvm_x86_ops *kvm_x86_ops;
 77 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 78 
 79 struct kvm_stats_debugfs_item debugfs_entries[] = {
 80         { "pf_fixed", VCPU_STAT(pf_fixed) },
 81         { "pf_guest", VCPU_STAT(pf_guest) },
 82         { "tlb_flush", VCPU_STAT(tlb_flush) },
 83         { "invlpg", VCPU_STAT(invlpg) },
 84         { "exits", VCPU_STAT(exits) },
 85         { "io_exits", VCPU_STAT(io_exits) },
 86         { "mmio_exits", VCPU_STAT(mmio_exits) },
 87         { "signal_exits", VCPU_STAT(signal_exits) },
 88         { "irq_window", VCPU_STAT(irq_window_exits) },
 89         { "nmi_window", VCPU_STAT(nmi_window_exits) },
 90         { "halt_exits", VCPU_STAT(halt_exits) },
 91         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
 92         { "hypercalls", VCPU_STAT(hypercalls) },
 93         { "request_irq", VCPU_STAT(request_irq_exits) },
 94         { "irq_exits", VCPU_STAT(irq_exits) },
 95         { "host_state_reload", VCPU_STAT(host_state_reload) },
 96         { "efer_reload", VCPU_STAT(efer_reload) },
 97         { "fpu_reload", VCPU_STAT(fpu_reload) },
 98         { "insn_emulation", VCPU_STAT(insn_emulation) },
 99         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
100         { "irq_injections", VCPU_STAT(irq_injections) },
101         { "nmi_injections", VCPU_STAT(nmi_injections) },
102         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
103         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
104         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
105         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
106         { "mmu_flooded", VM_STAT(mmu_flooded) },
107         { "mmu_recycled", VM_STAT(mmu_recycled) },
108         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
109         { "mmu_unsync", VM_STAT(mmu_unsync) },
110         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
111         { "largepages", VM_STAT(lpages) },
112         { NULL }
113 };
114 
115 unsigned long segment_base(u16 selector)
116 {
117         struct descriptor_table gdt;
118         struct desc_struct *d;
119         unsigned long table_base;
120         unsigned long v;
121 
122         if (selector == 0)
123                 return 0;
124 
125         asm("sgdt %0" : "=m"(gdt));
126         table_base = gdt.base;
127 
128         if (selector & 4) {           /* from ldt */
129                 u16 ldt_selector;
130 
131                 asm("sldt %0" : "=g"(ldt_selector));
132                 table_base = segment_base(ldt_selector);
133         }
134         d = (struct desc_struct *)(table_base + (selector & ~7));
135         v = d->base0 | ((unsigned long)d->base1 << 16) |
136                 ((unsigned long)d->base2 << 24);
137 #ifdef CONFIG_X86_64
138         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
139                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
140 #endif
141         return v;
142 }
143 EXPORT_SYMBOL_GPL(segment_base);
144 
145 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
146 {
147         if (irqchip_in_kernel(vcpu->kvm))
148                 return vcpu->arch.apic_base;
149         else
150                 return vcpu->arch.apic_base;
151 }
152 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
153 
154 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
155 {
156         /* TODO: reserve bits check */
157         if (irqchip_in_kernel(vcpu->kvm))
158                 kvm_lapic_set_base(vcpu, data);
159         else
160                 vcpu->arch.apic_base = data;
161 }
162 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
163 
164 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
165 {
166         WARN_ON(vcpu->arch.exception.pending);
167         vcpu->arch.exception.pending = true;
168         vcpu->arch.exception.has_error_code = false;
169         vcpu->arch.exception.nr = nr;
170 }
171 EXPORT_SYMBOL_GPL(kvm_queue_exception);
172 
173 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
174                            u32 error_code)
175 {
176         ++vcpu->stat.pf_guest;
177 
178         if (vcpu->arch.exception.pending) {
179                 if (vcpu->arch.exception.nr == PF_VECTOR) {
180                         printk(KERN_DEBUG "kvm: inject_page_fault:"
181                                         " double fault 0x%lx\n", addr);
182                         vcpu->arch.exception.nr = DF_VECTOR;
183                         vcpu->arch.exception.error_code = 0;
184                 } else if (vcpu->arch.exception.nr == DF_VECTOR) {
185                         /* triple fault -> shutdown */
186                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
187                 }
188                 return;
189         }
190         vcpu->arch.cr2 = addr;
191         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
192 }
193 
194 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
195 {
196         vcpu->arch.nmi_pending = 1;
197 }
198 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
199 
200 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
201 {
202         WARN_ON(vcpu->arch.exception.pending);
203         vcpu->arch.exception.pending = true;
204         vcpu->arch.exception.has_error_code = true;
205         vcpu->arch.exception.nr = nr;
206         vcpu->arch.exception.error_code = error_code;
207 }
208 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
209 
210 static void __queue_exception(struct kvm_vcpu *vcpu)
211 {
212         kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
213                                      vcpu->arch.exception.has_error_code,
214                                      vcpu->arch.exception.error_code);
215 }
216 
217 /*
218  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
219  * a #GP and return false.
220  */
221 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
222 {
223         if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
224                 return true;
225         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
226         return false;
227 }
228 EXPORT_SYMBOL_GPL(kvm_require_cpl);
229 
230 /*
231  * Load the pae pdptrs.  Return true is they are all valid.
232  */
233 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
234 {
235         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
236         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
237         int i;
238         int ret;
239         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
240 
241         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
242                                   offset * sizeof(u64), sizeof(pdpte));
243         if (ret < 0) {
244                 ret = 0;
245                 goto out;
246         }
247         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
248                 if (is_present_pte(pdpte[i]) &&
249                     (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
250                         ret = 0;
251                         goto out;
252                 }
253         }
254         ret = 1;
255 
256         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
257 out:
258 
259         return ret;
260 }
261 EXPORT_SYMBOL_GPL(load_pdptrs);
262 
263 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
264 {
265         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
266         bool changed = true;
267         int r;
268 
269         if (is_long_mode(vcpu) || !is_pae(vcpu))
270                 return false;
271 
272         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
273         if (r < 0)
274                 goto out;
275         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
276 out:
277 
278         return changed;
279 }
280 
281 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
282 {
283         if (cr0 & CR0_RESERVED_BITS) {
284                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
285                        cr0, vcpu->arch.cr0);
286                 kvm_inject_gp(vcpu, 0);
287                 return;
288         }
289 
290         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
291                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
292                 kvm_inject_gp(vcpu, 0);
293                 return;
294         }
295 
296         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
297                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
298                        "and a clear PE flag\n");
299                 kvm_inject_gp(vcpu, 0);
300                 return;
301         }
302 
303         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
304 #ifdef CONFIG_X86_64
305                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
306                         int cs_db, cs_l;
307 
308                         if (!is_pae(vcpu)) {
309                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
310                                        "in long mode while PAE is disabled\n");
311                                 kvm_inject_gp(vcpu, 0);
312                                 return;
313                         }
314                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
315                         if (cs_l) {
316                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
317                                        "in long mode while CS.L == 1\n");
318                                 kvm_inject_gp(vcpu, 0);
319                                 return;
320 
321                         }
322                 } else
323 #endif
324                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
325                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
326                                "reserved bits\n");
327                         kvm_inject_gp(vcpu, 0);
328                         return;
329                 }
330 
331         }
332 
333         kvm_x86_ops->set_cr0(vcpu, cr0);
334         vcpu->arch.cr0 = cr0;
335 
336         kvm_mmu_reset_context(vcpu);
337         return;
338 }
339 EXPORT_SYMBOL_GPL(kvm_set_cr0);
340 
341 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
342 {
343         kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
344         KVMTRACE_1D(LMSW, vcpu,
345                     (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
346                     handler);
347 }
348 EXPORT_SYMBOL_GPL(kvm_lmsw);
349 
350 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
351 {
352         unsigned long old_cr4 = vcpu->arch.cr4;
353         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
354 
355         if (cr4 & CR4_RESERVED_BITS) {
356                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
357                 kvm_inject_gp(vcpu, 0);
358                 return;
359         }
360 
361         if (is_long_mode(vcpu)) {
362                 if (!(cr4 & X86_CR4_PAE)) {
363                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
364                                "in long mode\n");
365                         kvm_inject_gp(vcpu, 0);
366                         return;
367                 }
368         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
369                    && ((cr4 ^ old_cr4) & pdptr_bits)
370                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
371                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
372                 kvm_inject_gp(vcpu, 0);
373                 return;
374         }
375 
376         if (cr4 & X86_CR4_VMXE) {
377                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
378                 kvm_inject_gp(vcpu, 0);
379                 return;
380         }
381         kvm_x86_ops->set_cr4(vcpu, cr4);
382         vcpu->arch.cr4 = cr4;
383         vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
384         kvm_mmu_reset_context(vcpu);
385 }
386 EXPORT_SYMBOL_GPL(kvm_set_cr4);
387 
388 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
389 {
390         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
391                 kvm_mmu_sync_roots(vcpu);
392                 kvm_mmu_flush_tlb(vcpu);
393                 return;
394         }
395 
396         if (is_long_mode(vcpu)) {
397                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
398                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
399                         kvm_inject_gp(vcpu, 0);
400                         return;
401                 }
402         } else {
403                 if (is_pae(vcpu)) {
404                         if (cr3 & CR3_PAE_RESERVED_BITS) {
405                                 printk(KERN_DEBUG
406                                        "set_cr3: #GP, reserved bits\n");
407                                 kvm_inject_gp(vcpu, 0);
408                                 return;
409                         }
410                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
411                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
412                                        "reserved bits\n");
413                                 kvm_inject_gp(vcpu, 0);
414                                 return;
415                         }
416                 }
417                 /*
418                  * We don't check reserved bits in nonpae mode, because
419                  * this isn't enforced, and VMware depends on this.
420                  */
421         }
422 
423         /*
424          * Does the new cr3 value map to physical memory? (Note, we
425          * catch an invalid cr3 even in real-mode, because it would
426          * cause trouble later on when we turn on paging anyway.)
427          *
428          * A real CPU would silently accept an invalid cr3 and would
429          * attempt to use it - with largely undefined (and often hard
430          * to debug) behavior on the guest side.
431          */
432         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
433                 kvm_inject_gp(vcpu, 0);
434         else {
435                 vcpu->arch.cr3 = cr3;
436                 vcpu->arch.mmu.new_cr3(vcpu);
437         }
438 }
439 EXPORT_SYMBOL_GPL(kvm_set_cr3);
440 
441 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
442 {
443         if (cr8 & CR8_RESERVED_BITS) {
444                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
445                 kvm_inject_gp(vcpu, 0);
446                 return;
447         }
448         if (irqchip_in_kernel(vcpu->kvm))
449                 kvm_lapic_set_tpr(vcpu, cr8);
450         else
451                 vcpu->arch.cr8 = cr8;
452 }
453 EXPORT_SYMBOL_GPL(kvm_set_cr8);
454 
455 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
456 {
457         if (irqchip_in_kernel(vcpu->kvm))
458                 return kvm_lapic_get_cr8(vcpu);
459         else
460                 return vcpu->arch.cr8;
461 }
462 EXPORT_SYMBOL_GPL(kvm_get_cr8);
463 
464 static inline u32 bit(int bitno)
465 {
466         return 1 << (bitno & 31);
467 }
468 
469 /*
470  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
471  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
472  *
473  * This list is modified at module load time to reflect the
474  * capabilities of the host cpu.
475  */
476 static u32 msrs_to_save[] = {
477         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
478         MSR_K6_STAR,
479 #ifdef CONFIG_X86_64
480         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
481 #endif
482         MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
483         MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
484 };
485 
486 static unsigned num_msrs_to_save;
487 
488 static u32 emulated_msrs[] = {
489         MSR_IA32_MISC_ENABLE,
490 };
491 
492 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
493 {
494         if (efer & efer_reserved_bits) {
495                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
496                        efer);
497                 kvm_inject_gp(vcpu, 0);
498                 return;
499         }
500 
501         if (is_paging(vcpu)
502             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
503                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
504                 kvm_inject_gp(vcpu, 0);
505                 return;
506         }
507 
508         if (efer & EFER_FFXSR) {
509                 struct kvm_cpuid_entry2 *feat;
510 
511                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
512                 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
513                         printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
514                         kvm_inject_gp(vcpu, 0);
515                         return;
516                 }
517         }
518 
519         if (efer & EFER_SVME) {
520                 struct kvm_cpuid_entry2 *feat;
521 
522                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
523                 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
524                         printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
525                         kvm_inject_gp(vcpu, 0);
526                         return;
527                 }
528         }
529 
530         kvm_x86_ops->set_efer(vcpu, efer);
531 
532         efer &= ~EFER_LMA;
533         efer |= vcpu->arch.shadow_efer & EFER_LMA;
534 
535         vcpu->arch.shadow_efer = efer;
536 
537         vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
538         kvm_mmu_reset_context(vcpu);
539 }
540 
541 void kvm_enable_efer_bits(u64 mask)
542 {
543        efer_reserved_bits &= ~mask;
544 }
545 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
546 
547 
548 /*
549  * Writes msr value into into the appropriate "register".
550  * Returns 0 on success, non-0 otherwise.
551  * Assumes vcpu_load() was already called.
552  */
553 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
554 {
555         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
556 }
557 
558 /*
559  * Adapt set_msr() to msr_io()'s calling convention
560  */
561 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
562 {
563         return kvm_set_msr(vcpu, index, *data);
564 }
565 
566 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
567 {
568         static int version;
569         struct pvclock_wall_clock wc;
570         struct timespec now, sys, boot;
571 
572         if (!wall_clock)
573                 return;
574 
575         version++;
576 
577         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
578 
579         /*
580          * The guest calculates current wall clock time by adding
581          * system time (updated by kvm_write_guest_time below) to the
582          * wall clock specified here.  guest system time equals host
583          * system time for us, thus we must fill in host boot time here.
584          */
585         now = current_kernel_time();
586         ktime_get_ts(&sys);
587         boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
588 
589         wc.sec = boot.tv_sec;
590         wc.nsec = boot.tv_nsec;
591         wc.version = version;
592 
593         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
594 
595         version++;
596         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
597 }
598 
/*
 * Compute the 32-bit quotient of (@dividend << 32) / @divisor.
 *
 * The 64-bit dividend is formed in EDX:EAX with EDX = @dividend and
 * EAX = 0, then divided by @divisor with DIVL.  The caller must ensure
 * @dividend < @divisor: otherwise the quotient does not fit in 32 bits
 * and the instruction faults.
 */
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
	uint32_t quotient, remainder;

	/* Don't try to replace with do_div(), this one calculates
	 * "(dividend << 32) / divisor" */
	__asm__ ( "divl %4"
		  : "=a" (quotient), "=d" (remainder)
		  /* "0" and "1" tie the inputs to the EAX and EDX outputs. */
		  : "0" (0), "1" (dividend), "r" (divisor) );
	return quotient;
}
610 
611 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
612 {
613         uint64_t nsecs = 1000000000LL;
614         int32_t  shift = 0;
615         uint64_t tps64;
616         uint32_t tps32;
617 
618         tps64 = tsc_khz * 1000LL;
619         while (tps64 > nsecs*2) {
620                 tps64 >>= 1;
621                 shift--;
622         }
623 
624         tps32 = (uint32_t)tps64;
625         while (tps32 <= (uint32_t)nsecs) {
626                 tps32 <<= 1;
627                 shift++;
628         }
629 
630         hv_clock->tsc_shift = shift;
631         hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
632 
633         pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
634                  __func__, tsc_khz, hv_clock->tsc_shift,
635                  hv_clock->tsc_to_system_mul);
636 }
637 
638 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
639 
640 static void kvm_write_guest_time(struct kvm_vcpu *v)
641 {
642         struct timespec ts;
643         unsigned long flags;
644         struct kvm_vcpu_arch *vcpu = &v->arch;
645         void *shared_kaddr;
646         unsigned long this_tsc_khz;
647 
648         if ((!vcpu->time_page))
649                 return;
650 
651         this_tsc_khz = get_cpu_var(cpu_tsc_khz);
652         if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
653                 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
654                 vcpu->hv_clock_tsc_khz = this_tsc_khz;
655         }
656         put_cpu_var(cpu_tsc_khz);
657 
658         /* Keep irq disabled to prevent changes to the clock */
659         local_irq_save(flags);
660         kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
661                           &vcpu->hv_clock.tsc_timestamp);
662         ktime_get_ts(&ts);
663         local_irq_restore(flags);
664 
665         /* With all the info we got, fill in the values */
666 
667         vcpu->hv_clock.system_time = ts.tv_nsec +
668                                      (NSEC_PER_SEC * (u64)ts.tv_sec);
669         /*
670          * The interface expects us to write an even number signaling that the
671          * update is finished. Since the guest won't see the intermediate
672          * state, we just increase by 2 at the end.
673          */
674         vcpu->hv_clock.version += 2;
675 
676         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
677 
678         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
679                sizeof(vcpu->hv_clock));
680 
681         kunmap_atomic(shared_kaddr, KM_USER0);
682 
683         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
684 }
685 
686 static int kvm_request_guest_time_update(struct kvm_vcpu *v)
687 {
688         struct kvm_vcpu_arch *vcpu = &v->arch;
689 
690         if (!vcpu->time_page)
691                 return 0;
692         set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
693         return 1;
694 }
695 
696 static bool msr_mtrr_valid(unsigned msr)
697 {
698         switch (msr) {
699         case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
700         case MSR_MTRRfix64K_00000:
701         case MSR_MTRRfix16K_80000:
702         case MSR_MTRRfix16K_A0000:
703         case MSR_MTRRfix4K_C0000:
704         case MSR_MTRRfix4K_C8000:
705         case MSR_MTRRfix4K_D0000:
706         case MSR_MTRRfix4K_D8000:
707         case MSR_MTRRfix4K_E0000:
708         case MSR_MTRRfix4K_E8000:
709         case MSR_MTRRfix4K_F0000:
710         case MSR_MTRRfix4K_F8000:
711         case MSR_MTRRdefType:
712         case MSR_IA32_CR_PAT:
713                 return true;
714         case 0x2f8:
715                 return true;
716         }
717         return false;
718 }
719 
/* Return true when @t is an accepted PAT memory type: 0, 1, 4, 5, 6 or 7. */
static bool valid_pat_type(unsigned t)
{
	if (t >= 8)
		return false;

	/* Bitmap 0xf3 has one bit set per accepted type. */
	return ((1 << t) & 0xf3) != 0;
}
724 
/* Return true when @t is an accepted MTRR memory type: 0, 1, 4, 5 or 6. */
static bool valid_mtrr_type(unsigned t)
{
	if (t >= 8)
		return false;

	/* Bitmap 0x73 has one bit set per accepted type. */
	return ((1 << t) & 0x73) != 0;
}
729 
730 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
731 {
732         int i;
733 
734         if (!msr_mtrr_valid(msr))
735                 return false;
736 
737         if (msr == MSR_IA32_CR_PAT) {
738                 for (i = 0; i < 8; i++)
739                         if (!valid_pat_type((data >> (i * 8)) & 0xff))
740                                 return false;
741                 return true;
742         } else if (msr == MSR_MTRRdefType) {
743                 if (data & ~0xcff)
744                         return false;
745                 return valid_mtrr_type(data & 0xff);
746         } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
747                 for (i = 0; i < 8 ; i++)
748                         if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
749                                 return false;
750                 return true;
751         }
752 
753         /* variable MTRRs */
754         return valid_mtrr_type(data & 0xff);
755 }
756 
757 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
758 {
759         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
760 
761         if (!mtrr_valid(vcpu, msr, data))
762                 return 1;
763 
764         if (msr == MSR_MTRRdefType) {
765                 vcpu->arch.mtrr_state.def_type = data;
766                 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
767         } else if (msr == MSR_MTRRfix64K_00000)
768                 p[0] = data;
769         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
770                 p[1 + msr - MSR_MTRRfix16K_80000] = data;
771         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
772                 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
773         else if (msr == MSR_IA32_CR_PAT)
774                 vcpu->arch.pat = data;
775         else {  /* Variable MTRRs */
776                 int idx, is_mtrr_mask;
777                 u64 *pt;
778 
779                 idx = (msr - 0x200) / 2;
780                 is_mtrr_mask = msr - 0x200 - 2 * idx;
781                 if (!is_mtrr_mask)
782                         pt =
783                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
784                 else
785                         pt =
786                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
787                 *pt = data;
788         }
789 
790         kvm_mmu_reset_context(vcpu);
791         return 0;
792 }
793 
794 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
795 {
796         switch (msr) {
797         case MSR_EFER:
798                 set_efer(vcpu, data);
799                 break;
800         case MSR_IA32_MC0_STATUS:
801                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
802                        __func__, data);
803                 break;
804         case MSR_IA32_MCG_STATUS:
805                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
806                         __func__, data);
807                 break;
808         case MSR_IA32_MCG_CTL:
809                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
810                         __func__, data);
811                 break;
812         case MSR_IA32_DEBUGCTLMSR:
813                 if (!data) {
814                         /* We support the non-activated case already */
815                         break;
816                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
817                         /* Values other than LBR and BTF are vendor-specific,
818                            thus reserved and should throw a #GP */
819                         return 1;
820                 }
821                 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
822                         __func__, data);
823                 break;
824         case MSR_IA32_UCODE_REV:
825         case MSR_IA32_UCODE_WRITE:
826         case MSR_VM_HSAVE_PA:
827                 break;
828         case 0x200 ... 0x2ff:
829                 return set_msr_mtrr(vcpu, msr, data);
830         case MSR_IA32_APICBASE:
831                 kvm_set_apic_base(vcpu, data);
832                 break;
833         case MSR_IA32_MISC_ENABLE:
834                 vcpu->arch.ia32_misc_enable_msr = data;
835                 break;
836         case MSR_KVM_WALL_CLOCK:
837                 vcpu->kvm->arch.wall_clock = data;
838                 kvm_write_wall_clock(vcpu->kvm, data);
839                 break;
840         case MSR_KVM_SYSTEM_TIME: {
841                 if (vcpu->arch.time_page) {
842                         kvm_release_page_dirty(vcpu->arch.time_page);
843                         vcpu->arch.time_page = NULL;
844                 }
845 
846                 vcpu->arch.time = data;
847 
848                 /* we verify if the enable bit is set... */
849                 if (!(data & 1))
850                         break;
851 
852                 /* ...but clean it before doing the actual write */
853                 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
854 
855                 vcpu->arch.time_page =
856                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
857 
858                 if (is_error_page(vcpu->arch.time_page)) {
859                         kvm_release_page_clean(vcpu->arch.time_page);
860                         vcpu->arch.time_page = NULL;
861                 }
862 
863                 kvm_request_guest_time_update(vcpu);
864                 break;
865         }
866         default:
867                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
868                 return 1;
869         }
870         return 0;
871 }
872 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
873 
874 
875 /*
876  * Reads an msr value (of 'msr_index') into 'pdata'.
877  * Returns 0 on success, non-0 otherwise.
878  * Assumes vcpu_load() was already called.
879  */
880 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
881 {
882         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
883 }
884 
885 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
886 {
887         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
888 
889         if (!msr_mtrr_valid(msr))
890                 return 1;
891 
892         if (msr == MSR_MTRRdefType)
893                 *pdata = vcpu->arch.mtrr_state.def_type +
894                          (vcpu->arch.mtrr_state.enabled << 10);
895         else if (msr == MSR_MTRRfix64K_00000)
896                 *pdata = p[0];
897         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
898                 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
899         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
900                 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
901         else if (msr == MSR_IA32_CR_PAT)
902                 *pdata = vcpu->arch.pat;
903         else {  /* Variable MTRRs */
904                 int idx, is_mtrr_mask;
905                 u64 *pt;
906 
907                 idx = (msr - 0x200) / 2;
908                 is_mtrr_mask = msr - 0x200 - 2 * idx;
909                 if (!is_mtrr_mask)
910                         pt =
911                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
912                 else
913                         pt =
914                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
915                 *pdata = *pt;
916         }
917 
918         return 0;
919 }
920 
921 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
922 {
923         u64 data;
924 
925         switch (msr) {
926         case 0xc0010010: /* SYSCFG */
927         case 0xc0010015: /* HWCR */
928         case MSR_IA32_PLATFORM_ID:
929         case MSR_IA32_P5_MC_ADDR:
930         case MSR_IA32_P5_MC_TYPE:
931         case MSR_IA32_MC0_CTL:
932         case MSR_IA32_MCG_STATUS:
933         case MSR_IA32_MCG_CAP:
934         case MSR_IA32_MCG_CTL:
935         case MSR_IA32_MC0_MISC:
936         case MSR_IA32_MC0_MISC+4:
937         case MSR_IA32_MC0_MISC+8:
938         case MSR_IA32_MC0_MISC+12:
939         case MSR_IA32_MC0_MISC+16:
940         case MSR_IA32_MC0_MISC+20:
941         case MSR_IA32_UCODE_REV:
942         case MSR_IA32_EBL_CR_POWERON:
943         case MSR_IA32_DEBUGCTLMSR:
944         case MSR_IA32_LASTBRANCHFROMIP:
945         case MSR_IA32_LASTBRANCHTOIP:
946         case MSR_IA32_LASTINTFROMIP:
947         case MSR_IA32_LASTINTTOIP:
948         case MSR_VM_HSAVE_PA:
949         case MSR_P6_EVNTSEL0:
950         case MSR_P6_EVNTSEL1:
951         case MSR_K7_EVNTSEL0:
952         case MSR_K8_INT_PENDING_MSG:
953                 data = 0;
954                 break;
955         case MSR_MTRRcap:
956                 data = 0x500 | KVM_NR_VAR_MTRR;
957                 break;
958         case 0x200 ... 0x2ff:
959                 return get_msr_mtrr(vcpu, msr, pdata);
960         case 0xcd: /* fsb frequency */
961                 data = 3;
962                 break;
963         case MSR_IA32_APICBASE:
964                 data = kvm_get_apic_base(vcpu);
965                 break;
966         case MSR_IA32_MISC_ENABLE:
967                 data = vcpu->arch.ia32_misc_enable_msr;
968                 break;
969         case MSR_IA32_PERF_STATUS:
970                 /* TSC increment by tick */
971                 data = 1000ULL;
972                 /* CPU multiplier */
973                 data |= (((uint64_t)4ULL) << 40);
974                 break;
975         case MSR_EFER:
976                 data = vcpu->arch.shadow_efer;
977                 break;
978         case MSR_KVM_WALL_CLOCK:
979                 data = vcpu->kvm->arch.wall_clock;
980                 break;
981         case MSR_KVM_SYSTEM_TIME:
982                 data = vcpu->arch.time;
983                 break;
984         default:
985                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
986                 return 1;
987         }
988         *pdata = data;
989         return 0;
990 }
991 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
992 
993 /*
994  * Read or write a bunch of msrs. All parameters are kernel addresses.
995  *
996  * @return number of msrs set successfully.
997  */
998 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
999                     struct kvm_msr_entry *entries,
1000                     int (*do_msr)(struct kvm_vcpu *vcpu,
1001                                   unsigned index, u64 *data))
1002 {
1003         int i;
1004 
1005         vcpu_load(vcpu);
1006 
1007         down_read(&vcpu->kvm->slots_lock);
1008         for (i = 0; i < msrs->nmsrs; ++i)
1009                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1010                         break;
1011         up_read(&vcpu->kvm->slots_lock);
1012 
1013         vcpu_put(vcpu);
1014 
1015         return i;
1016 }
1017 
1018 /*
1019  * Read or write a bunch of msrs. Parameters are user addresses.
1020  *
1021  * @return number of msrs set successfully.
1022  */
1023 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
1024                   int (*do_msr)(struct kvm_vcpu *vcpu,
1025                                 unsigned index, u64 *data),
1026                   int writeback)
1027 {
1028         struct kvm_msrs msrs;
1029         struct kvm_msr_entry *entries;
1030         int r, n;
1031         unsigned size;
1032 
1033         r = -EFAULT;
1034         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1035                 goto out;
1036 
1037         r = -E2BIG;
1038         if (msrs.nmsrs >= MAX_IO_MSRS)
1039                 goto out;
1040 
1041         r = -ENOMEM;
1042         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1043         entries = vmalloc(size);
1044         if (!entries)
1045                 goto out;
1046 
1047         r = -EFAULT;
1048         if (copy_from_user(entries, user_msrs->entries, size))
1049                 goto out_free;
1050 
1051         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
1052         if (r < 0)
1053                 goto out_free;
1054 
1055         r = -EFAULT;
1056         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1057                 goto out_free;
1058 
1059         r = n;
1060 
1061 out_free:
1062         vfree(entries);
1063 out:
1064         return r;
1065 }
1066 
1067 int kvm_dev_ioctl_check_extension(long ext)
1068 {
1069         int r;
1070 
1071         switch (ext) {
1072         case KVM_CAP_IRQCHIP:
1073         case KVM_CAP_HLT:
1074         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1075         case KVM_CAP_SET_TSS_ADDR:
1076         case KVM_CAP_EXT_CPUID:
1077         case KVM_CAP_CLOCKSOURCE:
1078         case KVM_CAP_PIT:
1079         case KVM_CAP_NOP_IO_DELAY:
1080         case KVM_CAP_MP_STATE:
1081         case KVM_CAP_SYNC_MMU:
1082         case KVM_CAP_REINJECT_CONTROL:
1083         case KVM_CAP_IRQ_INJECT_STATUS:
1084         case KVM_CAP_ASSIGN_DEV_IRQ:
1085                 r = 1;
1086                 break;
1087         case KVM_CAP_COALESCED_MMIO:
1088                 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
1089                 break;
1090         case KVM_CAP_VAPIC:
1091                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
1092                 break;
1093         case KVM_CAP_NR_VCPUS:
1094                 r = KVM_MAX_VCPUS;
1095                 break;
1096         case KVM_CAP_NR_MEMSLOTS:
1097                 r = KVM_MEMORY_SLOTS;
1098                 break;
1099         case KVM_CAP_PV_MMU:
1100                 r = !tdp_enabled;
1101                 break;
1102         case KVM_CAP_IOMMU:
1103                 r = iommu_found();
1104                 break;
1105         default:
1106                 r = 0;
1107                 break;
1108         }
1109         return r;
1110 
1111 }
1112 
1113 long kvm_arch_dev_ioctl(struct file *filp,
1114                         unsigned int ioctl, unsigned long arg)
1115 {
1116         void __user *argp = (void __user *)arg;
1117         long r;
1118 
1119         switch (ioctl) {
1120         case KVM_GET_MSR_INDEX_LIST: {
1121                 struct kvm_msr_list __user *user_msr_list = argp;
1122                 struct kvm_msr_list msr_list;
1123                 unsigned n;
1124 
1125                 r = -EFAULT;
1126                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1127                         goto out;
1128                 n = msr_list.nmsrs;
1129                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1130                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1131                         goto out;
1132                 r = -E2BIG;
1133                 if (n < msr_list.nmsrs)
1134                         goto out;
1135                 r = -EFAULT;
1136                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1137                                  num_msrs_to_save * sizeof(u32)))
1138                         goto out;
1139                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
1140                                  &emulated_msrs,
1141                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1142                         goto out;
1143                 r = 0;
1144                 break;
1145         }
1146         case KVM_GET_SUPPORTED_CPUID: {
1147                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1148                 struct kvm_cpuid2 cpuid;
1149 
1150                 r = -EFAULT;
1151                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1152                         goto out;
1153                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1154                                                       cpuid_arg->entries);
1155                 if (r)
1156                         goto out;
1157 
1158                 r = -EFAULT;
1159                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1160                         goto out;
1161                 r = 0;
1162                 break;
1163         }
1164         default:
1165                 r = -EINVAL;
1166         }
1167 out:
1168         return r;
1169 }
1170 
1171 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1172 {
1173         kvm_x86_ops->vcpu_load(vcpu, cpu);
1174         kvm_request_guest_time_update(vcpu);
1175 }
1176 
1177 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1178 {
1179         kvm_x86_ops->vcpu_put(vcpu);
1180         kvm_put_guest_fpu(vcpu);
1181 }
1182 
1183 static int is_efer_nx(void)
1184 {
1185         unsigned long long efer = 0;
1186 
1187         rdmsrl_safe(MSR_EFER, &efer);
1188         return efer & EFER_NX;
1189 }
1190 
1191 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1192 {
1193         int i;
1194         struct kvm_cpuid_entry2 *e, *entry;
1195 
1196         entry = NULL;
1197         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1198                 e = &vcpu->arch.cpuid_entries[i];
1199                 if (e->function == 0x80000001) {
1200                         entry = e;
1201                         break;
1202                 }
1203         }
1204         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1205                 entry->edx &= ~(1 << 20);
1206                 printk(KERN_INFO "kvm: guest NX capability removed\n");
1207         }
1208 }
1209 
1210 /* when an old userspace process fills a new kernel module */
1211 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1212                                     struct kvm_cpuid *cpuid,
1213                                     struct kvm_cpuid_entry __user *entries)
1214 {
1215         int r, i;
1216         struct kvm_cpuid_entry *cpuid_entries;
1217 
1218         r = -E2BIG;
1219         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1220                 goto out;
1221         r = -ENOMEM;
1222         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1223         if (!cpuid_entries)
1224                 goto out;
1225         r = -EFAULT;
1226         if (copy_from_user(cpuid_entries, entries,
1227                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1228                 goto out_free;
1229         for (i = 0; i < cpuid->nent; i++) {
1230                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1231                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1232                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1233                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1234                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1235                 vcpu->arch.cpuid_entries[i].index = 0;
1236                 vcpu->arch.cpuid_entries[i].flags = 0;
1237                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
1238                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
1239                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
1240         }
1241         vcpu->arch.cpuid_nent = cpuid->nent;
1242         cpuid_fix_nx_cap(vcpu);
1243         r = 0;
1244 
1245 out_free:
1246         vfree(cpuid_entries);
1247 out:
1248         return r;
1249 }
1250 
1251 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1252                                      struct kvm_cpuid2 *cpuid,
1253                                      struct kvm_cpuid_entry2 __user *entries)
1254 {
1255         int r;
1256 
1257         r = -E2BIG;
1258         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1259                 goto out;
1260         r = -EFAULT;
1261         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1262                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1263                 goto out;
1264         vcpu->arch.cpuid_nent = cpuid->nent;
1265         return 0;
1266 
1267 out:
1268         return r;
1269 }
1270 
1271 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1272                                      struct kvm_cpuid2 *cpuid,
1273                                      struct kvm_cpuid_entry2 __user *entries)
1274 {
1275         int r;
1276 
1277         r = -E2BIG;
1278         if (cpuid->nent < vcpu->arch.cpuid_nent)
1279                 goto out;
1280         r = -EFAULT;
1281         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1282                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1283                 goto out;
1284         return 0;
1285 
1286 out:
1287         cpuid->nent = vcpu->arch.cpuid_nent;
1288         return r;
1289 }
1290 
1291 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1292                            u32 index)
1293 {
1294         entry->function = function;
1295         entry->index = index;
1296         cpuid_count(entry->function, entry->index,
1297                     &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1298         entry->flags = 0;
1299 }
1300 
1301 #define F(x) bit(X86_FEATURE_##x)
1302 
1303 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1304                          u32 index, int *nent, int maxnent)
1305 {
1306         unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1307 #ifdef CONFIG_X86_64
1308         unsigned f_lm = F(LM);
1309 #else
1310         unsigned f_lm = 0;
1311 #endif
1312 
1313         /* cpuid 1.edx */
1314         const u32 kvm_supported_word0_x86_features =
1315                 F(FPU) | F(VME) | F(DE) | F(PSE) |
1316                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1317                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1318                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1319                 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1320                 0 /* Reserved, DS, ACPI */ | F(MMX) |
1321                 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1322                 0 /* HTT, TM, Reserved, PBE */;
1323         /* cpuid 0x80000001.edx */
1324         const u32 kvm_supported_word1_x86_features =
1325                 F(FPU) | F(VME) | F(DE) | F(PSE) |
1326                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1327                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1328                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1329                 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1330                 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1331                 F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
1332                 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1333         /* cpuid 1.ecx */
1334         const u32 kvm_supported_word4_x86_features =
1335                 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1336                 0 /* DS-CPL, VMX, SMX, EST */ |
1337                 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1338                 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1339                 0 /* Reserved, DCA */ | F(XMM4_1) |
1340                 F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
1341                 0 /* Reserved, XSAVE, OSXSAVE */;
1342         /* cpuid 0x80000001.ecx */
1343         const u32 kvm_supported_word6_x86_features =
1344                 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1345                 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1346                 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1347                 0 /* SKINIT */ | 0 /* WDT */;
1348 
1349         /* all calls to cpuid_count() should be made on the same cpu */
1350         get_cpu();
1351         do_cpuid_1_ent(entry, function, index);
1352         ++*nent;
1353 
1354         switch (function) {
1355         case 0:
1356                 entry->eax = min(entry->eax, (u32)0xb);
1357                 break;
1358         case 1:
1359                 entry->edx &= kvm_supported_word0_x86_features;
1360                 entry->ecx &= kvm_supported_word4_x86_features;
1361                 break;
1362         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1363          * may return different values. This forces us to get_cpu() before
1364          * issuing the first command, and also to emulate this annoying behavior
1365          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1366         case 2: {
1367                 int t, times = entry->eax & 0xff;
1368 
1369                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1370                 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1371                 for (t = 1; t < times && *nent < maxnent; ++t) {
1372                         do_cpuid_1_ent(&entry[t], function, 0);
1373                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1374                         ++*nent;
1375                 }
1376                 break;
1377         }
1378         /* function 4 and 0xb have additional index. */
1379         case 4: {
1380                 int i, cache_type;
1381 
1382                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1383                 /* read more entries until cache_type is zero */
1384                 for (i = 1; *nent < maxnent; ++i) {
1385                         cache_type = entry[i - 1].eax & 0x1f;
1386                         if (!cache_type)
1387                                 break;
1388                         do_cpuid_1_ent(&entry[i], function, i);
1389                         entry[i].flags |=
1390                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1391                         ++*nent;
1392                 }
1393                 break;
1394         }
1395         case 0xb: {
1396                 int i, level_type;
1397 
1398                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1399                 /* read more entries until level_type is zero */
1400                 for (i = 1; *nent < maxnent; ++i) {
1401                         level_type = entry[i - 1].ecx & 0xff00;
1402                         if (!level_type)
1403                                 break;
1404                         do_cpuid_1_ent(&entry[i], function, i);
1405                         entry[i].flags |=
1406                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1407                         ++*nent;
1408                 }
1409                 break;
1410         }
1411         case 0x80000000:
1412                 entry->eax = min(entry->eax, 0x8000001a);
1413                 break;
1414         case 0x80000001:
1415                 entry->edx &= kvm_supported_word1_x86_features;
1416                 entry->ecx &= kvm_supported_word6_x86_features;
1417                 break;
1418         }
1419         put_cpu();
1420 }
1421 
1422 #undef F
1423 
1424 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1425                                      struct kvm_cpuid_entry2 __user *entries)
1426 {
1427         struct kvm_cpuid_entry2 *cpuid_entries;
1428         int limit, nent = 0, r = -E2BIG;
1429         u32 func;
1430 
1431         if (cpuid->nent < 1)
1432                 goto out;
1433         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1434                 cpuid->nent = KVM_MAX_CPUID_ENTRIES;
1435         r = -ENOMEM;
1436         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1437         if (!cpuid_entries)
1438                 goto out;
1439 
1440         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1441         limit = cpuid_entries[0].eax;
1442         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1443                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1444                              &nent, cpuid->nent);
1445         r = -E2BIG;
1446         if (nent >= cpuid->nent)
1447                 goto out_free;
1448 
1449         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1450         limit = cpuid_entries[nent - 1].eax;
1451         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1452                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1453                              &nent, cpuid->nent);
1454         r = -E2BIG;
1455         if (nent >= cpuid->nent)
1456                 goto out_free;
1457 
1458         r = -EFAULT;
1459         if (copy_to_user(entries, cpuid_entries,
1460                          nent * sizeof(struct kvm_cpuid_entry2)))
1461                 goto out_free;
1462         cpuid->nent = nent;
1463         r = 0;
1464 
1465 out_free:
1466         vfree(cpuid_entries);
1467 out:
1468         return r;
1469 }
1470 
1471 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1472                                     struct kvm_lapic_state *s)
1473 {
1474         vcpu_load(vcpu);
1475         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1476         vcpu_put(vcpu);
1477 
1478         return 0;
1479 }
1480 
1481 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1482                                     struct kvm_lapic_state *s)
1483 {
1484         vcpu_load(vcpu);
1485         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1486         kvm_apic_post_state_restore(vcpu);
1487         vcpu_put(vcpu);
1488 
1489         return 0;
1490 }
1491 
1492 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1493                                     struct kvm_interrupt *irq)
1494 {
1495         if (irq->irq < 0 || irq->irq >= 256)
1496                 return -EINVAL;
1497         if (irqchip_in_kernel(vcpu->kvm))
1498                 return -ENXIO;
1499         vcpu_load(vcpu);
1500 
1501         kvm_queue_interrupt(vcpu, irq->irq, false);
1502 
1503         vcpu_put(vcpu);
1504 
1505         return 0;
1506 }
1507 
/*
 * Inject an NMI into 'vcpu' via kvm_inject_nmi(), with the vcpu loaded
 * for the duration of the call.  Always returns 0.
 */
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
        vcpu_load(vcpu);

        kvm_inject_nmi(vcpu);

        vcpu_put(vcpu);
        return 0;
}
1516 
1517 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1518                                            struct kvm_tpr_access_ctl *tac)
1519 {
1520         if (tac->flags)
1521                 return -EINVAL;
1522         vcpu->arch.tpr_access_reporting = !!tac->enabled;
1523         return 0;
1524 }
1525 
1526 long kvm_arch_vcpu_ioctl(struct file *filp,
1527                          unsigned int ioctl, unsigned long arg)
1528 {
1529         struct kvm_vcpu *vcpu = filp->private_data;
1530         void __user *argp = (void __user *)arg;
1531         int r;
1532         struct kvm_lapic_state *lapic = NULL;
1533 
1534         switch (ioctl) {
1535         case KVM_GET_LAPIC: {
1536                 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1537 
1538                 r = -ENOMEM;
1539                 if (!lapic)
1540                         goto out;
1541                 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1542                 if (r)
1543                         goto out;
1544                 r = -EFAULT;
1545                 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1546                         goto out;
1547                 r = 0;
1548                 break;
1549         }
1550         case KVM_SET_LAPIC: {
1551                 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1552                 r = -ENOMEM;
1553                 if (!lapic)
1554                         goto out;
1555                 r = -EFAULT;
1556                 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1557                         goto out;
1558                 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1559                 if (r)
1560                         goto out;
1561                 r = 0;
1562                 break;
1563         }
1564         case KVM_INTERRUPT: {
1565                 struct kvm_interrupt irq;
1566 
1567                 r = -EFAULT;
1568                 if (copy_from_user(&irq, argp, sizeof irq))
1569                         goto out;
1570                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1571                 if (r)
1572                         goto out;
1573                 r = 0;
1574                 break;
1575         }
1576         case KVM_NMI: {
1577                 r = kvm_vcpu_ioctl_nmi(vcpu);
1578                 if (r)
1579                         goto out;
1580                 r = 0;
1581                 break;
1582         }
1583         case KVM_SET_CPUID: {
1584                 struct kvm_cpuid __user *cpuid_arg = argp;
1585                 struct kvm_cpuid cpuid;
1586 
1587                 r = -EFAULT;
1588                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1589                         goto out;
1590                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1591                 if (r)
1592                         goto out;
1593                 break;
1594         }
1595         case KVM_SET_CPUID2: {
1596                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1597                 struct kvm_cpuid2 cpuid;
1598 
1599                 r = -EFAULT;
1600                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1601                         goto out;
1602                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1603                                               cpuid_arg->entries);
1604                 if (r)
1605                         goto out;
1606                 break;
1607         }
1608         case KVM_GET_CPUID2: {
1609                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1610                 struct kvm_cpuid2 cpuid;
1611 
1612                 r = -EFAULT;
1613                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1614                         goto out;
1615                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1616                                               cpuid_arg->entries);
1617                 if (r)
1618                         goto out;
1619                 r = -EFAULT;
1620                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1621                         goto out;
1622                 r = 0;
1623                 break;
1624         }
1625         case KVM_GET_MSRS:
1626                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1627                 break;
1628         case KVM_SET_MSRS:
1629                 r = msr_io(vcpu, argp, do_set_msr, 0);
1630                 break;
1631         case KVM_TPR_ACCESS_REPORTING: {
1632                 struct kvm_tpr_access_ctl tac;
1633 
1634                 r = -EFAULT;
1635                 if (copy_from_user(&tac, argp, sizeof tac))
1636                         goto out;
1637                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1638                 if (r)
1639                         goto out;
1640                 r = -EFAULT;
1641                 if (copy_to_user(argp, &tac, sizeof tac))
1642                         goto out;
1643                 r = 0;
1644                 break;
1645         };
1646         case KVM_SET_VAPIC_ADDR: {
1647                 struct kvm_vapic_addr va;
1648 
1649                 r = -EINVAL;
1650                 if (!irqchip_in_kernel(vcpu->kvm))
1651                         goto out;
1652                 r = -EFAULT;
1653                 if (copy_from_user(&va, argp, sizeof va))
1654                         goto out;
1655                 r = 0;
1656                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1657                 break;
1658         }
1659         default:
1660                 r = -EINVAL;
1661         }
1662 out:
1663         kfree(lapic);
1664         return r;
1665 }
1666 
1667 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1668 {
1669         int ret;
1670 
1671         if (addr > (unsigned int)(-3 * PAGE_SIZE))
1672                 return -1;
1673         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1674         return ret;
1675 }
1676 
1677 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1678                                           u32 kvm_nr_mmu_pages)
1679 {
1680         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1681                 return -EINVAL;
1682 
1683         down_write(&kvm->slots_lock);
1684         spin_lock(&kvm->mmu_lock);
1685 
1686         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1687         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1688 
1689         spin_unlock(&kvm->mmu_lock);
1690         up_write(&kvm->slots_lock);
1691         return 0;
1692 }
1693 
1694 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1695 {
1696         return kvm->arch.n_alloc_mmu_pages;
1697 }
1698 
1699 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1700 {
1701         int i;
1702         struct kvm_mem_alias *alias;
1703 
1704         for (i = 0; i < kvm->arch.naliases; ++i) {
1705                 alias = &kvm->arch.aliases[i];
1706                 if (gfn >= alias->base_gfn
1707                     && gfn < alias->base_gfn + alias->npages)
1708                         return alias->target_gfn + gfn - alias->base_gfn;
1709         }
1710         return gfn;
1711 }
1712 
1713 /*
1714  * Set a new alias region.  Aliases map a portion of physical memory into
1715  * another portion.  This is useful for memory windows, for example the PC
1716  * VGA region.
1717  */
1718 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1719                                          struct kvm_memory_alias *alias)
1720 {
1721         int r, n;
1722         struct kvm_mem_alias *p;
1723 
1724         r = -EINVAL;
1725         /* General sanity checks */
1726         if (alias->memory_size & (PAGE_SIZE - 1))
1727                 goto out;
1728         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1729                 goto out;
1730         if (alias->slot >= KVM_ALIAS_SLOTS)
1731                 goto out;
1732         if (alias->guest_phys_addr + alias->memory_size
1733             < alias->guest_phys_addr)
1734                 goto out;
1735         if (alias->target_phys_addr + alias->memory_size
1736             < alias->target_phys_addr)
1737                 goto out;
1738 
1739         down_write(&kvm->slots_lock);
1740         spin_lock(&kvm->mmu_lock);
1741 
1742         p = &kvm->arch.aliases[alias->slot];
1743         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1744         p->npages = alias->memory_size >> PAGE_SHIFT;
1745         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1746 
1747         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1748                 if (kvm->arch.aliases[n - 1].npages)
1749                         break;
1750         kvm->arch.naliases = n;
1751 
1752         spin_unlock(&kvm->mmu_lock);
1753         kvm_mmu_zap_all(kvm);
1754 
1755         up_write(&kvm->slots_lock);
1756 
1757         return 0;
1758 
1759 out:
1760         return r;
1761 }
1762 
1763 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1764 {
1765         int r;
1766 
1767         r = 0;
1768         switch (chip->chip_id) {
1769         case KVM_IRQCHIP_PIC_MASTER:
1770                 memcpy(&chip->chip.pic,
1771                         &pic_irqchip(kvm)->pics[0],
1772                         sizeof(struct kvm_pic_state));
1773                 break;
1774         case KVM_IRQCHIP_PIC_SLAVE:
1775                 memcpy(&chip->chip.pic,
1776                         &pic_irqchip(kvm)->pics[1],
1777                         sizeof(struct kvm_pic_state));
1778                 break;
1779         case KVM_IRQCHIP_IOAPIC:
1780                 memcpy(&chip->chip.ioapic,
1781                         ioapic_irqchip(kvm),
1782                         sizeof(struct kvm_ioapic_state));
1783                 break;
1784         default:
1785                 r = -EINVAL;
1786                 break;
1787         }
1788         return r;
1789 }
1790 
1791 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1792 {
1793         int r;
1794 
1795         r = 0;
1796         switch (chip->chip_id) {
1797         case KVM_IRQCHIP_PIC_MASTER:
1798                 memcpy(&pic_irqchip(kvm)->pics[0],
1799                         &chip->chip.pic,
1800                         sizeof(struct kvm_pic_state));
1801                 break;
1802         case KVM_IRQCHIP_PIC_SLAVE:
1803                 memcpy(&pic_irqchip(kvm)->pics[1],
1804                         &chip->chip.pic,
1805                         sizeof(struct kvm_pic_state));
1806                 break;
1807         case KVM_IRQCHIP_IOAPIC:
1808                 memcpy(ioapic_irqchip(kvm),
1809                         &chip->chip.ioapic,
1810                         sizeof(struct kvm_ioapic_state));
1811                 break;
1812         default:
1813                 r = -EINVAL;
1814                 break;
1815         }
1816         kvm_pic_update_irq(pic_irqchip(kvm));
1817         return r;
1818 }
1819 
1820 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1821 {
1822         int r = 0;
1823 
1824         memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
1825         return r;
1826 }
1827 
1828 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1829 {
1830         int r = 0;
1831 
1832         memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1833         kvm_pit_load_count(kvm, 0, ps->channels[0].count);
1834         return r;
1835 }
1836 
1837 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
1838                                  struct kvm_reinject_control *control)
1839 {
1840         if (!kvm->arch.vpit)
1841                 return -ENXIO;
1842         kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
1843         return 0;
1844 }
1845 
1846 /*
1847  * Get (and clear) the dirty memory log for a memory slot.
1848  */
1849 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1850                                       struct kvm_dirty_log *log)
1851 {
1852         int r;
1853         int n;
1854         struct kvm_memory_slot *memslot;
1855         int is_dirty = 0;
1856 
1857         down_write(&kvm->slots_lock);
1858 
1859         r = kvm_get_dirty_log(kvm, log, &is_dirty);
1860         if (r)
1861                 goto out;
1862 
1863         /* If nothing is dirty, don't bother messing with page tables. */
1864         if (is_dirty) {
1865                 spin_lock(&kvm->mmu_lock);
1866                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1867                 spin_unlock(&kvm->mmu_lock);
1868                 kvm_flush_remote_tlbs(kvm);
1869                 memslot = &kvm->memslots[log->slot];
1870                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1871                 memset(memslot->dirty_bitmap, 0, n);
1872         }
1873         r = 0;
1874 out:
1875         up_write(&kvm->slots_lock);
1876         return r;
1877 }
1878 
1879 long kvm_arch_vm_ioctl(struct file *filp,
1880                        unsigned int ioctl, unsigned long arg)
1881 {
1882         struct kvm *kvm = filp->private_data;
1883         void __user *argp = (void __user *)arg;
1884         int r = -EINVAL;
1885         /*
1886          * This union makes it completely explicit to gcc-3.x
1887          * that these two variables' stack usage should be
1888          * combined, not added together.
1889          */
1890         union {
1891                 struct kvm_pit_state ps;
1892                 struct kvm_memory_alias alias;
1893         } u;
1894 
1895         switch (ioctl) {
1896         case KVM_SET_TSS_ADDR:
1897                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1898                 if (r < 0)
1899                         goto out;
1900                 break;
1901         case KVM_SET_MEMORY_REGION: {
1902                 struct kvm_memory_region kvm_mem;
1903                 struct kvm_userspace_memory_region kvm_userspace_mem;
1904 
1905                 r = -EFAULT;
1906                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1907                         goto out;
1908                 kvm_userspace_mem.slot = kvm_mem.slot;
1909                 kvm_userspace_mem.flags = kvm_mem.flags;
1910                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1911                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1912                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1913                 if (r)
1914                         goto out;
1915                 break;
1916         }
1917         case KVM_SET_NR_MMU_PAGES:
1918                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1919                 if (r)
1920                         goto out;
1921                 break;
1922         case KVM_GET_NR_MMU_PAGES:
1923                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1924                 break;
1925         case KVM_SET_MEMORY_ALIAS:
1926                 r = -EFAULT;
1927                 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1928                         goto out;
1929                 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1930                 if (r)
1931                         goto out;
1932                 break;
1933         case KVM_CREATE_IRQCHIP:
1934                 r = -ENOMEM;
1935                 kvm->arch.vpic = kvm_create_pic(kvm);
1936                 if (kvm->arch.vpic) {
1937                         r = kvm_ioapic_init(kvm);
1938                         if (r) {
1939                                 kfree(kvm->arch.vpic);
1940                                 kvm->arch.vpic = NULL;
1941                                 goto out;
1942                         }
1943                 } else
1944                         goto out;
1945                 r = kvm_setup_default_irq_routing(kvm);
1946                 if (r) {
1947                         kfree(kvm->arch.vpic);
1948                         kfree(kvm->arch.vioapic);
1949                         goto out;
1950                 }
1951                 break;
1952         case KVM_CREATE_PIT:
1953                 mutex_lock(&kvm->lock);
1954                 r = -EEXIST;
1955                 if (kvm->arch.vpit)
1956                         goto create_pit_unlock;
1957                 r = -ENOMEM;
1958                 kvm->arch.vpit = kvm_create_pit(kvm);
1959                 if (kvm->arch.vpit)
1960                         r = 0;
1961         create_pit_unlock:
1962                 mutex_unlock(&kvm->lock);
1963                 break;
1964         case KVM_IRQ_LINE_STATUS:
1965         case KVM_IRQ_LINE: {
1966                 struct kvm_irq_level irq_event;
1967 
1968                 r = -EFAULT;
1969                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1970                         goto out;
1971                 if (irqchip_in_kernel(kvm)) {
1972                         __s32 status;
1973                         mutex_lock(&kvm->lock);
1974                         status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
1975                                         irq_event.irq, irq_event.level);
1976                         mutex_unlock(&kvm->lock);
1977                         if (ioctl == KVM_IRQ_LINE_STATUS) {
1978                                 irq_event.status = status;
1979                                 if (copy_to_user(argp, &irq_event,
1980                                                         sizeof irq_event))
1981                                         goto out;
1982                         }
1983                         r = 0;
1984                 }
1985                 break;
1986         }
1987         case KVM_GET_IRQCHIP: {
1988                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1989                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1990 
1991                 r = -ENOMEM;
1992                 if (!chip)
1993                         goto out;
1994                 r = -EFAULT;
1995                 if (copy_from_user(chip, argp, sizeof *chip))
1996                         goto get_irqchip_out;
1997                 r = -ENXIO;
1998                 if (!irqchip_in_kernel(kvm))
1999                         goto get_irqchip_out;
2000                 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
2001                 if (r)
2002                         goto get_irqchip_out;
2003                 r = -EFAULT;
2004                 if (copy_to_user(argp, chip, sizeof *chip))
2005                         goto get_irqchip_out;
2006                 r = 0;
2007         get_irqchip_out:
2008                 kfree(chip);
2009                 if (r)
2010                         goto out;
2011                 break;
2012         }
2013         case KVM_SET_IRQCHIP: {
2014                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
2015                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
2016 
2017                 r = -ENOMEM;
2018                 if (!chip)
2019                         goto out;
2020                 r = -EFAULT;
2021                 if (copy_from_user(chip, argp, sizeof *chip))
2022                         goto set_irqchip_out;
2023                 r = -ENXIO;
2024                 if (!irqchip_in_kernel(kvm))
2025                         goto set_irqchip_out;
2026                 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
2027                 if (r)
2028                         goto set_irqchip_out;
2029                 r = 0;
2030         set_irqchip_out:
2031                 kfree(chip);
2032                 if (r)
2033                         goto out;
2034                 break;
2035         }
2036         case KVM_GET_PIT: {
2037                 r = -EFAULT;
2038                 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
2039                         goto out;
2040                 r = -ENXIO;
2041                 if (!kvm->arch.vpit)
2042                         goto out;
2043                 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
2044                 if (r)
2045                         goto out;
2046                 r = -EFAULT;
2047                 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
2048                         goto out;
2049                 r = 0;
2050                 break;
2051         }
2052         case KVM_SET_PIT: {
2053                 r = -EFAULT;
2054                 if (copy_from_user(&u.ps, argp, sizeof u.ps))
2055                         goto out;
2056                 r = -ENXIO;
2057                 if (!kvm->arch.vpit)
2058                         goto out;
2059                 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
2060                 if (r)
2061                         goto out;
2062                 r = 0;
2063                 break;
2064         }
2065         case KVM_REINJECT_CONTROL: {
2066                 struct kvm_reinject_control control;
2067                 r =  -EFAULT;
2068                 if (copy_from_user(&control, argp, sizeof(control)))
2069                         goto out;
2070                 r = kvm_vm_ioctl_reinject(kvm, &control);
2071                 if (r)
2072                         goto out;
2073                 r = 0;
2074                 break;
2075         }
2076         default:
2077                 ;
2078         }
2079 out:
2080         return r;
2081 }
2082 
2083 static void kvm_init_msr_list(void)
2084 {
2085         u32 dummy[2];
2086         unsigned i, j;
2087 
2088         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2089                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2090                         continue;
2091                 if (j < i)
2092                         msrs_to_save[j] = msrs_to_save[i];
2093                 j++;
2094         }
2095         num_msrs_to_save = j;
2096 }
2097 
2098 /*
2099  * Only apic need an MMIO device hook, so shortcut now..
2100  */
2101 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
2102                                                 gpa_t addr, int len,
2103                                                 int is_write)
2104 {
2105         struct kvm_io_device *dev;
2106 
2107         if (vcpu->arch.apic) {
2108                 dev = &vcpu->arch.apic->dev;
2109                 if (dev->in_range(dev, addr, len, is_write))
2110                         return dev;
2111         }
2112         return NULL;
2113 }
2114 
2115 
2116 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
2117                                                 gpa_t addr, int len,
2118                                                 int is_write)
2119 {
2120         struct kvm_io_device *dev;
2121 
2122         dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
2123         if (dev == NULL)
2124                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
2125                                           is_write);
2126         return dev;
2127 }
2128 
2129 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
2130                                struct kvm_vcpu *vcpu)
2131 {
2132         void *data = val;
2133         int r = X86EMUL_CONTINUE;
2134 
2135         while (bytes) {
2136                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2137                 unsigned offset = addr & (PAGE_SIZE-1);
2138                 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2139                 int ret;
2140 
2141                 if (gpa == UNMAPPED_GVA) {
2142                         r = X86EMUL_PROPAGATE_FAULT;
2143                         goto out;
2144                 }
2145                 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
2146                 if (ret < 0) {
2147                         r = X86EMUL_UNHANDLEABLE;
2148                         goto out;
2149                 }
2150 
2151                 bytes -= toread;
2152                 data += toread;
2153                 addr += toread;
2154         }
2155 out:
2156         return r;
2157 }
2158 
2159 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2160                                 struct kvm_vcpu *vcpu)
2161 {
2162         void *data = val;
2163         int r = X86EMUL_CONTINUE;
2164 
2165         while (bytes) {
2166                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2167                 unsigned offset = addr & (PAGE_SIZE-1);
2168                 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2169                 int ret;
2170 
2171                 if (gpa == UNMAPPED_GVA) {
2172                         r = X86EMUL_PROPAGATE_FAULT;
2173                         goto out;
2174                 }
2175                 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
2176                 if (ret < 0) {
2177                         r = X86EMUL_UNHANDLEABLE;
2178                         goto out;
2179                 }
2180 
2181                 bytes -= towrite;
2182                 data += towrite;
2183                 addr += towrite;
2184         }
2185 out:
2186         return r;
2187 }
2188 
2189 
2190 static int emulator_read_emulated(unsigned long addr,
2191                                   void *val,
2192                                   unsigned int bytes,
2193                                   struct kvm_vcpu *vcpu)
2194 {
2195         struct kvm_io_device *mmio_dev;
2196         gpa_t                 gpa;
2197 
2198         if (vcpu->mmio_read_completed) {
2199                 memcpy(val, vcpu->mmio_data, bytes);
2200                 vcpu->mmio_read_completed = 0;
2201                 return X86EMUL_CONTINUE;
2202         }
2203 
2204         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2205 
2206         /* For APIC access vmexit */
2207         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2208                 goto mmio;
2209 
2210         if (kvm_read_guest_virt(addr, val, bytes, vcpu)
2211                                 == X86EMUL_CONTINUE)
2212                 return X86EMUL_CONTINUE;
2213         if (gpa == UNMAPPED_GVA)
2214                 return X86EMUL_PROPAGATE_FAULT;
2215 
2216 mmio:
2217         /*
2218          * Is this MMIO handled locally?
2219          */
2220         mutex_lock(&vcpu->kvm->lock);
2221         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
2222         if (mmio_dev) {
2223                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
2224                 mutex_unlock(&vcpu->kvm->lock);
2225                 return X86EMUL_CONTINUE;
2226         }
2227         mutex_unlock(&vcpu->kvm->lock);
2228 
2229         vcpu->mmio_needed = 1;
2230         vcpu->mmio_phys_addr = gpa;
2231         vcpu->mmio_size = bytes;
2232         vcpu->mmio_is_write = 0;
2233 
2234         return X86EMUL_UNHANDLEABLE;
2235 }
2236 
2237 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2238                           const void *val, int bytes)
2239 {
2240         int ret;
2241 
2242         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
2243         if (ret < 0)
2244                 return 0;
2245         kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
2246         return 1;
2247 }
2248 
2249 static int emulator_write_emulated_onepage(unsigned long addr,
2250                                            const void *val,
2251                                            unsigned int bytes,
2252                                            struct kvm_vcpu *vcpu)
2253 {
2254         struct kvm_io_device *mmio_dev;
2255         gpa_t                 gpa;
2256 
2257         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2258 
2259         if (gpa == UNMAPPED_GVA) {
2260                 kvm_inject_page_fault(vcpu, addr, 2);
2261                 return X86EMUL_PROPAGATE_FAULT;
2262         }
2263 
2264         /* For APIC access vmexit */
2265         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2266                 goto mmio;
2267 
2268         if (emulator_write_phys(vcpu, gpa, val, bytes))
2269                 return X86EMUL_CONTINUE;
2270 
2271 mmio:
2272         /*
2273          * Is this MMIO handled locally?
2274          */
2275         mutex_lock(&vcpu->kvm->lock);
2276         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
2277         if (mmio_dev) {
2278                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
2279                 mutex_unlock(&vcpu->kvm->lock);
2280                 return X86EMUL_CONTINUE;
2281         }
2282         mutex_unlock(&vcpu->kvm->lock);
2283 
2284         vcpu->mmio_needed = 1;
2285         vcpu->mmio_phys_addr = gpa;
2286         vcpu->mmio_size = bytes;
2287         vcpu->mmio_is_write = 1;
2288         memcpy(vcpu->mmio_data, val, bytes);
2289 
2290         return X86EMUL_CONTINUE;
2291 }
2292 
2293 int emulator_write_emulated(unsigned long addr,
2294                                    const void *val,
2295                                    unsigned int bytes,
2296                                    struct kvm_vcpu *vcpu)
2297 {
2298         /* Crossing a page boundary? */
2299         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2300                 int rc, now;
2301 
2302                 now = -addr & ~PAGE_MASK;
2303                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2304                 if (rc != X86EMUL_CONTINUE)
2305                         return rc;
2306                 addr += now;
2307                 val += now;
2308                 bytes -= now;
2309         }
2310         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2311 }
2312 EXPORT_SYMBOL_GPL(emulator_write_emulated);
2313 
2314 static int emulator_cmpxchg_emulated(unsigned long addr,
2315                                      const void *old,
2316                                      const void *new,
2317                                      unsigned int bytes,
2318                                      struct kvm_vcpu *vcpu)
2319 {
2320         static int reported;
2321 
2322         if (!reported) {
2323                 reported = 1;
2324                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
2325         }
2326 #ifndef CONFIG_X86_64
2327         /* guests cmpxchg8b have to be emulated atomically */
2328         if (bytes == 8) {
2329                 gpa_t gpa;
2330                 struct page *page;
2331                 char *kaddr;
2332                 u64 val;
2333 
2334                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2335 
2336                 if (gpa == UNMAPPED_GVA ||
2337                    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2338                         goto emul_write;
2339 
2340                 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
2341                         goto emul_write;
2342 
2343                 val = *(u64 *)new;
2344 
2345                 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2346 
2347                 kaddr = kmap_atomic(page, KM_USER0);
2348                 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
2349                 kunmap_atomic(kaddr, KM_USER0);
2350                 kvm_release_page_dirty(page);
2351         }
2352 emul_write:
2353 #endif
2354 
2355         return emulator_write_emulated(addr, new, bytes, vcpu);
2356 }
2357 
2358 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2359 {
2360         return kvm_x86_ops->get_segment_base(vcpu, seg);
2361 }
2362 
2363 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2364 {
2365         kvm_mmu_invlpg(vcpu, address);
2366         return X86EMUL_CONTINUE;
2367 }
2368 
2369 int emulate_clts(struct kvm_vcpu *vcpu)
2370 {
2371         KVMTRACE_0D(CLTS, vcpu, handler);
2372         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2373         return X86EMUL_CONTINUE;
2374 }
2375 
2376 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2377 {
2378         struct kvm_vcpu *vcpu = ctxt->vcpu;
2379 
2380         switch (dr) {
2381         case 0 ... 3:
2382                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2383                 return X86EMUL_CONTINUE;
2384         default:
2385                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2386                 return X86EMUL_UNHANDLEABLE;
2387         }
2388 }
2389 
2390 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2391 {
2392         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2393         int exception;
2394 
2395         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2396         if (exception) {
2397                 /* FIXME: better handling */
2398                 return X86EMUL_UNHANDLEABLE;
2399         }
2400         return X86EMUL_CONTINUE;
2401 }
2402 
2403 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2404 {
2405         u8 opcodes[4];
2406         unsigned long rip = kvm_rip_read(vcpu);
2407         unsigned long rip_linear;
2408 
2409         if (!printk_ratelimit())
2410                 return;
2411 
2412         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2413 
2414         kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
2415 
2416         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2417                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2418 }
2419 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2420 
2421 static struct x86_emulate_ops emulate_ops = {
2422         .read_std            = kvm_read_guest_virt,
2423         .read_emulated       = emulator_read_emulated,
2424         .write_emulated      = emulator_write_emulated,
2425         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
2426 };
2427 
2428 static void cache_all_regs(struct kvm_vcpu *vcpu)
2429 {
2430         kvm_register_read(vcpu, VCPU_REGS_RAX);
2431         kvm_register_read(vcpu, VCPU_REGS_RSP);
2432         kvm_register_read(vcpu, VCPU_REGS_RIP);
2433         vcpu->arch.regs_dirty = ~0;
2434 }
2435 
2436 int emulate_instruction(struct kvm_vcpu *vcpu,
2437                         struct kvm_run *run,
2438                         unsigned long cr2,
2439                         u16 error_code,
2440                         int emulation_type)
2441 {
2442         int r, shadow_mask;
2443         struct decode_cache *c;
2444 
2445         kvm_clear_exception_queue(vcpu);
2446         vcpu->arch.mmio_fault_cr2 = cr2;
2447         /*
2448          * TODO: fix x86_emulate.c to use guest_read/write_register
2449          * instead of direct ->regs accesses, can save hundred cycles
2450          * on Intel for instructions that don't read/change RSP, for
2451          * for example.
2452          */
2453         cache_all_regs(vcpu);
2454 
2455         vcpu->mmio_is_write = 0;
2456         vcpu->arch.pio.string = 0;
2457 
2458         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
2459                 int cs_db, cs_l;
2460                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2461 
2462                 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2463                 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
2464                 vcpu->arch.emulate_ctxt.mode =
2465                         (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2466                         ? X86EMUL_MODE_REAL : cs_l
2467                         ? X86EMUL_MODE_PROT64 : cs_db
2468                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2469 
2470                 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2471 
2472                 /* Reject the instructions other than VMCALL/VMMCALL when
2473                  * try to emulate invalid opcode */
2474                 c = &vcpu->arch.emulate_ctxt.decode;
2475                 if ((emulation_type & EMULTYPE_TRAP_UD) &&
2476                     (!(c->twobyte && c->b == 0x01 &&
2477                       (c->modrm_reg == 0 || c->modrm_reg == 3) &&
2478                        c->modrm_mod == 3 && c->modrm_rm == 1)))
2479                         return EMULATE_FAIL;
2480 
2481                 ++vcpu->stat.insn_emulation;
2482                 if (r)  {
2483                         ++vcpu->stat.insn_emulation_fail;
2484                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2485                                 return EMULATE_DONE;
2486                         return EMULATE_FAIL;
2487                 }
2488         }
2489 
2490         if (emulation_type & EMULTYPE_SKIP) {
2491                 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
2492                 return EMULATE_DONE;
2493         }
2494 
2495         r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2496         shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
2497 
2498         if (r == 0)
2499                 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
2500 
2501         if (vcpu->arch.pio.string)
2502                 return EMULATE_DO_MMIO;
2503 
2504         if ((r || vcpu->mmio_is_write) && run) {
2505                 run->exit_reason = KVM_EXIT_MMIO;
2506                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2507                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2508                 run->mmio.len = vcpu->mmio_size;
2509                 run->mmio.is_write = vcpu->mmio_is_write;
2510         }
2511 
2512         if (r) {
2513                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2514                         return EMULATE_DONE;
2515                 if (!vcpu->mmio_needed) {
2516                         kvm_report_emulation_failure(vcpu, "mmio");
2517                         return EMULATE_FAIL;
2518                 }
2519                 return EMULATE_DO_MMIO;
2520         }
2521 
2522         kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2523 
2524         if (vcpu->mmio_is_write) {
2525                 vcpu->mmio_needed = 0;
2526                 return EMULATE_DO_MMIO;
2527         }
2528 
2529         return EMULATE_DONE;
2530 }
2531 EXPORT_SYMBOL_GPL(emulate_instruction);
2532 
2533 static int pio_copy_data(struct kvm_vcpu *vcpu)
2534 {
2535         void *p = vcpu->arch.pio_data;
2536         gva_t q = vcpu->arch.pio.guest_gva;
2537         unsigned bytes;
2538         int ret;
2539 
2540         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2541         if (vcpu->arch.pio.in)
2542                 ret = kvm_write_guest_virt(q, p, bytes, vcpu);
2543         else
2544                 ret = kvm_read_guest_virt(q, p, bytes, vcpu);
2545         return ret;
2546 }
2547 
2548 int complete_pio(struct kvm_vcpu *vcpu)
2549 {
2550         struct kvm_pio_request *io = &vcpu->arch.pio;
2551         long delta;
2552         int r;
2553         unsigned long val;
2554 
2555         if (!io->string) {
2556                 if (io->in) {
2557                         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2558                         memcpy(&val, vcpu->arch.pio_data, io->size);
2559                         kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2560                 }
2561         } else {
2562                 if (io->in) {
2563                         r = pio_copy_data(vcpu);
2564                         if (r)
2565                                 return r;
2566                 }
2567 
2568                 delta = 1;
2569                 if (io->rep) {
2570                         delta *= io->cur_count;
2571                         /*
2572                          * The size of the register should really depend on
2573                          * current address size.
2574                          */
2575                         val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2576                         val -= delta;
2577                         kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2578                 }
2579                 if (io->down)
2580                         delta = -delta;
2581                 delta *= io->size;
2582                 if (io->in) {
2583                         val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2584                         val += delta;
2585                         kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2586                 } else {
2587                         val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2588                         val += delta;
2589                         kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2590                 }
2591         }
2592 
2593         io->count -= io->cur_count;
2594         io->cur_count = 0;
2595 
2596         return 0;
2597 }
2598 
2599 static void kernel_pio(struct kvm_io_device *pio_dev,
2600                        struct kvm_vcpu *vcpu,
2601                        void *pd)
2602 {
2603         /* TODO: String I/O for in kernel device */
2604 
2605         mutex_lock(&vcpu->kvm->lock);
2606         if (vcpu->arch.pio.in)
2607                 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2608                                   vcpu->arch.pio.size,
2609                                   pd);
2610         else
2611                 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2612                                    vcpu->arch.pio.size,
2613                                    pd);
2614         mutex_unlock(&vcpu->kvm->lock);
2615 }
2616 
2617 static void pio_string_write(struct kvm_io_device *pio_dev,
2618                              struct kvm_vcpu *vcpu)
2619 {
2620         struct kvm_pio_request *io = &vcpu->arch.pio;
2621         void *pd = vcpu->arch.pio_data;
2622         int i;
2623 
2624         mutex_lock(&vcpu->kvm->lock);
2625         for (i = 0; i < io->cur_count; i++) {
2626                 kvm_iodevice_write(pio_dev, io->port,
2627                                    io->size,
2628                                    pd);
2629                 pd += io->size;
2630         }
2631         mutex_unlock(&vcpu->kvm->lock);
2632 }
2633 
2634 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2635                                                gpa_t addr, int len,
2636                                                int is_write)
2637 {
2638         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2639 }
2640 
2641 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2642                   int size, unsigned port)
2643 {
2644         struct kvm_io_device *pio_dev;
2645         unsigned long val;
2646 
2647         vcpu->run->exit_reason = KVM_EXIT_IO;
2648         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2649         vcpu->run->io.size = vcpu->arch.pio.size = size;
2650         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2651         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2652         vcpu->run->io.port = vcpu->arch.pio.port = port;
2653         vcpu->arch.pio.in = in;
2654         vcpu->arch.pio.string = 0;
2655         vcpu->arch.pio.down = 0;
2656         vcpu->arch.pio.rep = 0;
2657 
2658         if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2659                 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2660                             handler);
2661         else
2662                 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2663                             handler);
2664 
2665         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2666         memcpy(vcpu->arch.pio_data, &val, 4);
2667 
2668         pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2669         if (pio_dev) {
2670                 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2671                 complete_pio(vcpu);
2672                 return 1;
2673         }
2674         return 0;
2675 }
2676 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2677 
2678 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2679                   int size, unsigned long count, int down,
2680                   gva_t address, int rep, unsigned port)
2681 {
2682         unsigned now, in_page;
2683         int ret = 0;
2684         struct kvm_io_device *pio_dev;
2685 
2686         vcpu->run->exit_reason = KVM_EXIT_IO;
2687         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2688         vcpu->run->io.size = vcpu->arch.pio.size = size;
2689         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2690         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2691         vcpu->run->io.port = vcpu->arch.pio.port = port;
2692         vcpu->arch.pio.in = in;
2693         vcpu->arch.pio.string = 1;
2694         vcpu->arch.pio.down = down;
2695         vcpu->arch.pio.rep = rep;
2696 
2697         if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2698                 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2699                             handler);
2700         else
2701                 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2702                             handler);
2703 
2704         if (!count) {
2705                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2706                 return 1;
2707         }
2708 
2709         if (!down)
2710                 in_page = PAGE_SIZE - offset_in_page(address);
2711         else
2712                 in_page = offset_in_page(address) + size;
2713         now = min(count, (unsigned long)in_page / size);
2714         if (!now)
2715                 now = 1;
2716         if (down) {
2717                 /*
2718                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2719                  */
2720                 pr_unimpl(vcpu, "guest string pio down\n");
2721                 kvm_inject_gp(vcpu, 0);
2722                 return 1;
2723         }
2724         vcpu->run->io.count = now;
2725         vcpu->arch.pio.cur_count = now;
2726 
2727         if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2728                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2729 
2730         vcpu->arch.pio.guest_gva = address;
2731 
2732         pio_dev = vcpu_find_pio_dev(vcpu, port,
2733                                     vcpu->arch.pio.cur_count,
2734                                     !vcpu->arch.pio.in);
2735         if (!vcpu->arch.pio.in) {
2736                 /* string PIO write */
2737                 ret = pio_copy_data(vcpu);
2738                 if (ret == X86EMUL_PROPAGATE_FAULT) {
2739                         kvm_inject_gp(vcpu, 0);
2740                         return 1;
2741                 }
2742                 if (ret == 0 && pio_dev) {
2743                         pio_string_write(pio_dev, vcpu);
2744                         complete_pio(vcpu);
2745                         if (vcpu->arch.pio.count == 0)
2746                                 ret = 1;
2747                 }
2748         } else if (pio_dev)
2749                 pr_unimpl(vcpu, "no string pio read support yet, "
2750                        "port %x size %d count %ld\n",
2751                         port, size, count);
2752 
2753         return ret;
2754 }
2755 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2756 
/*
 * Deliberately empty callback.  kvmclock_cpufreq_notifier() passes it to
 * smp_call_function_single(); the function itself does no work.
 */
static void bounce_off(void *info)
{
        /* nothing */
}
2761 
/*
 * Reference points for cpufreq_scale() in kvmclock_cpufreq_notifier():
 * ref_freq is latched from freq->old on the first notification, and
 * tsc_khz_ref is set from tsc_khz in kvm_arch_init() when the CPU lacks
 * X86_FEATURE_CONSTANT_TSC.
 */
static unsigned int  ref_freq;
static unsigned long tsc_khz_ref;
2764 
2765 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
2766                                      void *data)
2767 {
2768         struct cpufreq_freqs *freq = data;
2769         struct kvm *kvm;
2770         struct kvm_vcpu *vcpu;
2771         int i, send_ipi = 0;
2772 
2773         if (!ref_freq)
2774                 ref_freq = freq->old;
2775 
2776         if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
2777                 return 0;
2778         if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
2779                 return 0;
2780         per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
2781 
2782         spin_lock(&kvm_lock);
2783         list_for_each_entry(kvm, &vm_list, vm_list) {
2784                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2785                         vcpu = kvm->vcpus[i];
2786                         if (!vcpu)
2787                                 continue;
2788                         if (vcpu->cpu != freq->cpu)
2789                                 continue;
2790                         if (!kvm_request_guest_time_update(vcpu))
2791                                 continue;
2792                         if (vcpu->cpu != smp_processor_id())
2793                                 send_ipi++;
2794                 }
2795         }
2796         spin_unlock(&kvm_lock);
2797 
2798         if (freq->old < freq->new && send_ipi) {
2799                 /*
2800                  * We upscale the frequency.  Must make the guest
2801                  * doesn't see old kvmclock values while running with
2802                  * the new frequency, otherwise we risk the guest sees
2803                  * time go backwards.
2804                  *
2805                  * In case we update the frequency for another cpu
2806                  * (which might be in guest context) send an interrupt
2807                  * to kick the cpu out of guest context.  Next time
2808                  * guest context is entered kvmclock will be updated,
2809                  * so the guest will not see stale values.
2810                  */
2811                 smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
2812         }
2813         return 0;
2814 }
2815 
/*
 * Notifier block registered with cpufreq in kvm_arch_init() and removed
 * again in kvm_arch_exit().
 */
static struct notifier_block kvmclock_cpufreq_notifier_block = {
        .notifier_call  = kvmclock_cpufreq_notifier
};
2819 
2820 int kvm_arch_init(void *opaque)
2821 {
2822         int r, cpu;
2823         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2824 
2825         if (kvm_x86_ops) {
2826                 printk(KERN_ERR "kvm: already loaded the other module\n");
2827                 r = -EEXIST;
2828                 goto out;
2829         }
2830 
2831         if (!ops->cpu_has_kvm_support()) {
2832                 printk(KERN_ERR "kvm: no hardware support\n");
2833                 r = -EOPNOTSUPP;
2834                 goto out;
2835         }
2836         if (ops->disabled_by_bios()) {
2837                 printk(KERN_ERR "kvm: disabled by bios\n");
2838                 r = -EOPNOTSUPP;
2839                 goto out;
2840         }
2841 
2842         r = kvm_mmu_module_init();
2843         if (r)
2844                 goto out;
2845 
2846         kvm_init_msr_list();
2847 
2848         kvm_x86_ops = ops;
2849         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2850         kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2851         kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2852                         PT_DIRTY_MASK, PT64_NX_MASK, 0);
2853 
2854         for_each_possible_cpu(cpu)
2855                 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
2856         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2857                 tsc_khz_ref = tsc_khz;
2858                 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
2859                                           CPUFREQ_TRANSITION_NOTIFIER);
2860         }
2861 
2862         return 0;
2863 
2864 out:
2865         return r;
2866 }
2867 
2868 void kvm_arch_exit(void)
2869 {
2870         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
2871                 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
2872                                             CPUFREQ_TRANSITION_NOTIFIER);
2873         kvm_x86_ops = NULL;
2874         kvm_mmu_module_exit();
2875 }
2876 
2877 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2878 {
2879         ++vcpu->stat.halt_exits;
2880         KVMTRACE_0D(HLT, vcpu, handler);
2881         if (irqchip_in_kernel(vcpu->kvm)) {
2882                 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2883                 return 1;
2884         } else {
2885                 vcpu->run->exit_reason = KVM_EXIT_HLT;
2886                 return 0;
2887         }
2888 }
2889 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2890 
2891 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
2892                            unsigned long a1)
2893 {
2894         if (is_long_mode(vcpu))
2895                 return a0;
2896         else
2897                 return a0 | ((gpa_t)a1 << 32);
2898 }
2899 
2900 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2901 {
2902         unsigned long nr, a0, a1, a2, a3, ret;
2903         int r = 1;
2904 
2905         nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
2906         a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
2907         a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
2908         a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2909         a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2910 
2911         KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2912 
2913         if (!is_long_mode(vcpu)) {
2914                 nr &= 0xFFFFFFFF;
2915                 a0 &= 0xFFFFFFFF;
2916                 a1 &= 0xFFFFFFFF;
2917                 a2 &= 0xFFFFFFFF;
2918                 a3 &= 0xFFFFFFFF;
2919         }
2920 
2921         if (kvm_x86_ops->get_cpl(vcpu) != 0) {
2922                 ret = -KVM_EPERM;
2923                 goto out;
2924         }
2925 
2926         switch (nr) {
2927         case KVM_HC_VAPIC_POLL_IRQ:
2928                 ret = 0;
2929                 break;
2930         case KVM_HC_MMU_OP:
2931                 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
2932                 break;
2933         default:
2934                 ret = -KVM_ENOSYS;
2935                 break;
2936         }
2937 out:
2938         kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2939         ++vcpu->stat.hypercalls;
2940         return r;
2941 }
2942 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2943 
2944 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2945 {
2946         char instruction[3];
2947         int ret = 0;
2948         unsigned long rip = kvm_rip_read(vcpu);
2949 
2950 
2951         /*
2952          * Blow out the MMU to ensure that no other VCPU has an active mapping
2953          * to ensure that the updated hypercall appears atomically across all
2954          * VCPUs.
2955          */
2956         kvm_mmu_zap_all(vcpu->kvm);
2957 
2958         kvm_x86_ops->patch_hypercall(vcpu, instruction);
2959         if (emulator_write_emulated(rip, instruction, 3, vcpu)
2960             != X86EMUL_CONTINUE)
2961                 ret = -EFAULT;
2962 
2963         return ret;
2964 }
2965 
/*
 * Combine the upper 32 bits of curr_cr with new_val as the lower 32 bits.
 */
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
{
        const u64 low_mask = 0xffffffffULL;

        return (curr_cr & ~low_mask) | new_val;
}
2970 
/*
 * Build a descriptor_table from the given limit and base and pass it to
 * the vendor set_gdt hook.
 */
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
        struct descriptor_table dt = { limit, base };

        kvm_x86_ops->set_gdt(vcpu, &dt);
}
2977 
/*
 * Build a descriptor_table from the given limit and base and pass it to
 * the vendor set_idt hook.
 */
void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
        struct descriptor_table dt = { limit, base };

        kvm_x86_ops->set_idt(vcpu, &dt);
}
2984 
/*
 * Apply msw through kvm_lmsw() and then store the vcpu's current rflags,
 * as reported by the vendor get_rflags hook, in *rflags.
 */
void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
                   unsigned long *rflags)
{
        kvm_lmsw(vcpu, msw);
        *rflags = kvm_x86_ops->get_rflags(vcpu);
}
2991 
2992 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2993 {
2994         unsigned long value;
2995 
2996         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2997         switch (cr) {
2998         case 0:
2999                 value = vcpu->arch.cr0;
3000                 break;
3001         case 2:
3002                 value = vcpu->arch.cr2;
3003                 break;
3004         case 3:
3005                 value = vcpu->arch.cr3;
3006                 break;
3007         case 4:
3008                 value = vcpu->arch.cr4;
3009                 break;
3010         case 8:
3011                 value = kvm_get_cr8(vcpu);
3012                 break;
3013         default:
3014                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3015                 return 0;
3016         }
3017         KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
3018                     (u32)((u64)value >> 32), handler);
3019 
3020         return value;
3021 }
3022 
3023 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3024                      unsigned long *rflags)
3025 {
3026         KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
3027                     (u32)((u64)val >> 32), handler);
3028 
3029         switch (cr) {
3030         case 0:
3031                 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3032                 *rflags = kvm_x86_ops->get_rflags(vcpu);
3033                 break;
3034         case 2:
3035                 vcpu->arch.cr2 = val;
3036                 break;
3037         case 3:
3038                 kvm_set_cr3(vcpu, val);
3039                 break;
3040         case 4:
3041                 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
3042                 break;
3043         case 8:
3044                 kvm_set_cr8(vcpu, val & 0xfUL);
3045                 break;
3046         default:
3047                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3048         }
3049 }
3050 
3051 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
3052 {
3053         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
3054         int j, nent = vcpu->arch.cpuid_nent;
3055 
3056         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
3057         /* when no next entry is found, the current entry[i] is reselected */
3058         for (j = i + 1; ; j = (j + 1) % nent) {
3059                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
3060                 if (ej->function == e->function) {
3061                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
3062                         return j;
3063                 }
3064         }
3065         return 0; /* silence gcc, even though control never reaches here */
3066 }
3067 
3068 /* find an entry with matching function, matching index (if needed), and that
3069  * should be read next (if it's stateful) */
3070 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
3071         u32 function, u32 index)
3072 {
3073         if (e->function != function)
3074                 return 0;
3075         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
3076                 return 0;
3077         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
3078             !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
3079                 return 0;
3080         return 1;
3081 }
3082 
3083 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3084                                               u32 function, u32 index)
3085 {
3086         int i;
3087         struct kvm_cpuid_entry2 *best = NULL;
3088 
3089         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
3090                 struct kvm_cpuid_entry2 *e;
3091 
3092                 e = &vcpu->arch.cpuid_entries[i];
3093                 if (is_matching_cpuid_entry(e, function, index)) {
3094                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
3095                                 move_to_next_stateful_cpuid_entry(vcpu, i);
3096                         best = e;
3097                         break;
3098                 }
3099                 /*
3100                  * Both basic or both extended?
3101                  */
3102                 if (((e->function ^ function) & 0x80000000) == 0)
3103                         if (!best || e->function > best->function)
3104                                 best = e;
3105         }
3106         return best;
3107 }
3108 
3109 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3110 {
3111         struct kvm_cpuid_entry2 *best;
3112 
3113         best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3114         if (best)
3115                 return best->eax & 0xff;
3116         return 36;
3117 }
3118 
3119 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3120 {
3121         u32 function, index;
3122         struct kvm_cpuid_entry2 *best;
3123 
3124         function = kvm_register_read(vcpu, VCPU_REGS_RAX);
3125         index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3126         kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
3127         kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
3128         kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
3129         kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
3130         best = kvm_find_cpuid_entry(vcpu, function, index);
3131         if (best) {
3132                 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
3133                 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
3134                 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
3135                 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
3136         }
3137         kvm_x86_ops->skip_emulated_instruction(vcpu);
3138         KVMTRACE_5D(CPUID, vcpu, function,
3139                     (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
3140                     (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
3141                     (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
3142                     (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
3143 }
3144 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3145 
3146 /*
3147  * Check if userspace requested an interrupt window, and that the
3148  * interrupt window is open.
3149  *
3150  * No need to exit to userspace if we already have an interrupt queued.
3151  */
3152 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
3153                                           struct kvm_run *kvm_run)
3154 {
3155         return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3156                 kvm_run->request_interrupt_window &&
3157                 kvm_arch_interrupt_allowed(vcpu));
3158 }
3159 
3160 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
3161                               struct kvm_run *kvm_run)
3162 {
3163         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3164         kvm_run->cr8 = kvm_get_cr8(vcpu);
3165         kvm_run->apic_base = kvm_get_apic_base(vcpu);
3166         if (irqchip_in_kernel(vcpu->kvm))
3167                 kvm_run->ready_for_interrupt_injection = 1;
3168         else
3169                 kvm_run->ready_for_interrupt_injection =
3170                         kvm_arch_interrupt_allowed(vcpu) &&
3171                         !kvm_cpu_has_interrupt(vcpu) &&
3172                         !kvm_event_needs_reinjection(vcpu);
3173 }
3174 
3175 static void vapic_enter(struct kvm_vcpu *vcpu)
3176 {
3177         struct kvm_lapic *apic = vcpu->arch.apic;
3178         struct page *page;
3179 
3180         if (!apic || !apic->vapic_addr)
3181                 return;
3182 
3183         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3184 
3185         vcpu->arch.apic->vapic_page = page;
3186 }
3187 
3188 static void vapic_exit(struct kvm_vcpu *vcpu)
3189 {
3190         struct kvm_lapic *apic = vcpu->arch.apic;
3191 
3192         if (!apic || !apic->vapic_addr)
3193                 return;
3194 
3195         down_read(&vcpu->kvm->slots_lock);
3196         kvm_release_page_dirty(apic->vapic_page);
3197         mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3198         up_read(&vcpu->kvm->slots_lock);
3199 }
3200 
3201 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3202 {
3203         int max_irr, tpr;
3204 
3205         if (!kvm_x86_ops->update_cr8_intercept)
3206                 return;
3207 
3208         if (!vcpu->arch.apic)
3209                 return;
3210 
3211         if (!vcpu->arch.apic->vapic_addr)
3212                 max_irr = kvm_lapic_find_highest_irr(vcpu);
3213         else
3214                 max_irr = -1;
3215 
3216         if (max_irr != -1)
3217                 max_irr >>= 4;
3218 
3219         tpr = kvm_lapic_get_cr8(vcpu);
3220 
3221         kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3222 }
3223 
/*
 * Pick at most one event to deliver on the next guest entry.
 *
 * Events that were already injected (an NMI, then an interrupt) are
 * re-issued first.  Otherwise a pending NMI is considered before a
 * pending interrupt, and each is injected only if the corresponding
 * vendor "allowed" hook says so.  kvm_run is not used here.
 */
static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
        /* While single-stepping, clear the interrupt shadow first. */
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                kvm_x86_ops->set_interrupt_shadow(vcpu, 0);

        /* try to reinject previous events if any */
        if (vcpu->arch.nmi_injected) {
                kvm_x86_ops->set_nmi(vcpu);
                return;
        }

        if (vcpu->arch.interrupt.pending) {
                kvm_x86_ops->set_irq(vcpu);
                return;
        }

        /* try to inject new event if pending */
        if (vcpu->arch.nmi_pending) {
                if (kvm_x86_ops->nmi_allowed(vcpu)) {
                        /* Move the NMI from "pending" to "injected". */
                        vcpu->arch.nmi_pending = false;
                        vcpu->arch.nmi_injected = true;
                        kvm_x86_ops->set_nmi(vcpu);
                }
        } else if (kvm_cpu_has_interrupt(vcpu)) {
                if (kvm_x86_ops->interrupt_allowed(vcpu)) {
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
                                            false);
                        kvm_x86_ops->set_irq(vcpu);
                }
        }
}
3255 
/*
 * Run the guest once: service pending vcpu requests, inject events,
 * enter the guest through the vendor run hook and process the exit.
 *
 * The early "goto out" paths leave kvm->slots_lock untouched; on the
 * full path it is dropped (up_read) before entering the guest and taken
 * again (down_read) after the exit.
 *
 * Returns:
 *   a kvm_mmu_reload() error,
 *   0 with kvm_run->exit_reason set for TPR-access reports and triple
 *     faults,
 *   1 when entry was abandoned because of pending requests, a needed
 *     reschedule or a pending signal,
 *   otherwise the result of the vendor handle_exit hook.
 */
static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
        int r;
        /* Userspace wants an interrupt window and there is no in-kernel irqchip. */
        bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
                kvm_run->request_interrupt_window;

        if (vcpu->requests)
                if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
                        kvm_mmu_unload(vcpu);

        r = kvm_mmu_reload(vcpu);
        if (unlikely(r))
                goto out;

        /* Service the remaining request bits before committing to entry. */
        if (vcpu->requests) {
                if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
                        __kvm_migrate_timers(vcpu);
                if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
                        kvm_write_guest_time(vcpu);
                if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
                        kvm_mmu_sync_roots(vcpu);
                if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
                        kvm_x86_ops->tlb_flush(vcpu);
                if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
                                       &vcpu->requests)) {
                        kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
                        r = 0;
                        goto out;
                }
                if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
                        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
                        r = 0;
                        goto out;
                }
        }

        preempt_disable();

        kvm_x86_ops->prepare_guest_switch(vcpu);
        kvm_load_guest_fpu(vcpu);

        local_irq_disable();

        clear_bit(KVM_REQ_KICK, &vcpu->requests);
        smp_mb__after_clear_bit();

        /*
         * Re-check with interrupts off: anything that showed up since the
         * request handling above aborts this entry attempt.
         */
        if (vcpu->requests || need_resched() || signal_pending(current)) {
                local_irq_enable();
                preempt_enable();
                r = 1;
                goto out;
        }

        /* A pending exception takes precedence over NMI/IRQ injection. */
        if (vcpu->arch.exception.pending)
                __queue_exception(vcpu);
        else
                inject_pending_irq(vcpu, kvm_run);

        /* enable NMI/IRQ window open exits if needed */
        if (vcpu->arch.nmi_pending)
                kvm_x86_ops->enable_nmi_window(vcpu);
        else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
                kvm_x86_ops->enable_irq_window(vcpu);

        if (kvm_lapic_enabled(vcpu)) {
                update_cr8_intercept(vcpu);
                kvm_lapic_sync_to_vapic(vcpu);
        }

        up_read(&vcpu->kvm->slots_lock);

        kvm_guest_enter();

        /*
         * Save host DR6/DR7 unconditionally; the host breakpoint address
         * registers are saved and replaced by eff_db[] only when
         * switch_db_regs is set.
         */
        get_debugreg(vcpu->arch.host_dr6, 6);
        get_debugreg(vcpu->arch.host_dr7, 7);
        if (unlikely(vcpu->arch.switch_db_regs)) {
                get_debugreg(vcpu->arch.host_db[0], 0);
                get_debugreg(vcpu->arch.host_db[1], 1);
                get_debugreg(vcpu->arch.host_db[2], 2);
                get_debugreg(vcpu->arch.host_db[3], 3);

                set_debugreg(0, 7);
                set_debugreg(vcpu->arch.eff_db[0], 0);
                set_debugreg(vcpu->arch.eff_db[1], 1);
                set_debugreg(vcpu->arch.eff_db[2], 2);
                set_debugreg(vcpu->arch.eff_db[3], 3);
        }

        KVMTRACE_0D(VMENTRY, vcpu, entryexit);
        kvm_x86_ops->run(vcpu, kvm_run);

        /* Put the host debug registers back, mirroring the save above. */
        if (unlikely(vcpu->arch.switch_db_regs)) {
                set_debugreg(0, 7);
                set_debugreg(vcpu->arch.host_db[0], 0);
                set_debugreg(vcpu->arch.host_db[1], 1);
                set_debugreg(vcpu->arch.host_db[2], 2);
                set_debugreg(vcpu->arch.host_db[3], 3);
        }
        set_debugreg(vcpu->arch.host_dr6, 6);
        set_debugreg(vcpu->arch.host_dr7, 7);

        set_bit(KVM_REQ_KICK, &vcpu->requests);
        local_irq_enable();

        ++vcpu->stat.exits;

        /*
         * We must have an instruction between local_irq_enable() and
         * kvm_guest_exit(), so the timer interrupt isn't delayed by
         * the interrupt shadow.  The stat.exits increment will do nicely.
         * But we need to prevent reordering, hence this barrier():
         */
        barrier();

        kvm_guest_exit();

        preempt_enable();

        down_read(&vcpu->kvm->slots_lock);

        /*
         * Profile KVM exit RIPs:
         */
        if (unlikely(prof_on == KVM_PROFILING)) {
                unsigned long rip = kvm_rip_read(vcpu);
                profile_hit(KVM_PROFILING, (void *)rip);
        }


        kvm_lapic_sync_from_vapic(vcpu);

        r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
out:
        return r;
}
3391 
3392 
3393 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3394 {
3395         int r;
3396 
3397         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3398                 pr_debug("vcpu %d received sipi with vector # %x\n",
3399                          vcpu->vcpu_id, vcpu->arch.sipi_vector);
3400                 kvm_lapic_reset(vcpu);
3401                 r = kvm_arch_vcpu_reset(vcpu);
3402                 if (r)
3403                         return r;
3404                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3405         }
3406 
3407         down_read(&vcpu->kvm->slots_lock);
3408         vapic_enter(vcpu);
3409 
3410         r = 1;
3411         while (r > 0) {
3412                 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3413                         r = vcpu_enter_guest(vcpu, kvm_run);
3414                 else {
3415                         up_read(&vcpu->kvm->slots_lock);
3416                         kvm_vcpu_block(vcpu);
3417                         down_read(&vcpu->kvm->slots_lock);
3418                         if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3419                         {
3420                                 switch(vcpu->arch.mp_state) {
3421                                 case KVM_MP_STATE_HALTED:
3422                                         vcpu->arch.mp_state =
3423                                                 KVM_MP_STATE_RUNNABLE;
3424                                 case KVM_MP_STATE_RUNNABLE:
3425                                         break;
3426                                 case KVM_MP_STATE_SIPI_RECEIVED:
3427                                 default:
3428                                         r = -EINTR;
3429                                         break;
3430                                 }
3431                         }
3432                 }
3433 
3434                 if (r <= 0)
3435                         break;
3436 
3437                 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3438                 if (kvm_cpu_has_pending_timer(vcpu))
3439                         kvm_inject_pending_timer_irqs(vcpu);
3440 
3441                 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3442                         r = -EINTR;
3443                         kvm_run->exit_reason = KVM_EXIT_INTR;
3444                         ++vcpu->stat.request_irq_exits;
3445                 }
3446                 if (signal_pending(current)) {
3447                         r = -EINTR;
3448                         kvm_run->exit_reason = KVM_EXIT_INTR;
3449                         ++vcpu->stat.signal_exits;
3450                 }
3451                 if (need_resched()) {
3452                         up_read(&vcpu->kvm->slots_lock);
3453                         kvm_resched(vcpu);
3454                         down_read(&vcpu->kvm->slots_lock);
3455                 }
3456         }
3457 
3458         up_read(&vcpu->kvm->slots_lock);
3459         post_kvm_run_save(vcpu, kvm_run);
3460 
3461         vapic_exit(vcpu);
3462 
3463         return r;
3464 }
3465 
3466 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3467 {
3468         int r;
3469         sigset_t sigsaved;
3470 
3471         vcpu_load(vcpu);
3472 
3473         if (vcpu->sigset_active)
3474                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3475 
3476         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
3477                 kvm_vcpu_block(vcpu);
3478                 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
3479                 r = -EAGAIN;
3480                 goto out;
3481         }
3482 
3483         /* re-sync apic's tpr */
3484         if (!irqchip_in_kernel(vcpu->kvm))
3485                 kvm_set_cr8(vcpu, kvm_run->cr8);
3486 
3487         if (vcpu->arch.pio.cur_count) {
3488                 r = complete_pio(vcpu);
3489                 if (r)
3490                         goto out;
3491         }
3492 #if CONFIG_HAS_IOMEM
3493         if (vcpu->mmio_needed) {
3494                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3495                 vcpu->mmio_read_completed = 1;
3496                 vcpu->mmio_needed = 0;
3497 
3498                 down_read(&vcpu->kvm->slots_lock);
3499                 r = emulate_instruction(vcpu, kvm_run,
3500                                         vcpu->arch.mmio_fault_cr2, 0,
3501                                         EMULTYPE_NO_DECODE);
3502                 up_read(&vcpu->kvm->slots_lock);
3503                 if (r == EMULATE_DO_MMIO) {
3504                         /*
3505                          * Read-modify-write.  Back to userspace.
3506                          */
3507                         r = 0;
3508                         goto out;
3509                 }
3510         }
3511 #endif
3512         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3513                 kvm_register_write(vcpu, VCPU_REGS_RAX,
3514                                      kvm_run->hypercall.ret);
3515 
3516         r = __vcpu_run(vcpu, kvm_run);
3517 
3518 out:
3519         if (vcpu->sigset_active)
3520                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3521 
3522         vcpu_put(vcpu);
3523         return r;
3524 }
3525 
3526 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3527 {
3528         vcpu_load(vcpu);
3529 
3530         regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3531         regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3532         regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3533         regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3534         regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3535         regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3536         regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3537         regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3538 #ifdef CONFIG_X86_64
3539         regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3540         regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3541         regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3542         regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3543         regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3544         regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3545         regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3546         regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3547 #endif
3548 
3549         regs->rip = kvm_rip_read(vcpu);
3550         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3551 
3552         /*
3553          * Don't leak debug flags in case they were set for guest debugging
3554          */
3555         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3556                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3557 
3558         vcpu_put(vcpu);
3559 
3560         return 0;
3561 }
3562 
3563 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3564 {
3565         vcpu_load(vcpu);
3566 
3567         kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3568         kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3569         kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3570         kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3571         kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3572         kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3573         kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3574         kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3575 #ifdef CONFIG_X86_64
3576         kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3577         kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3578         kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3579         kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3580         kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3581         kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3582         kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3583         kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3584 
3585 #endif
3586 
3587         kvm_rip_write(vcpu, regs->rip);
3588         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3589 
3590 
3591         vcpu->arch.exception.pending = false;
3592 
3593         vcpu_put(vcpu);
3594 
3595         return 0;
3596 }
3597 
3598 void kvm_get_segment(struct kvm_vcpu *vcpu,
3599                      struct kvm_segment *var, int seg)
3600 {
3601         kvm_x86_ops->get_segment(vcpu, var, seg);
3602 }
3603 
3604 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3605 {
3606         struct kvm_segment cs;
3607 
3608         kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3609         *db = cs.db;
3610         *l = cs.l;
3611 }
3612 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
3613 
3614 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3615                                   struct kvm_sregs *sregs)
3616 {
3617         struct descriptor_table dt;
3618 
3619         vcpu_load(vcpu);
3620 
3621         kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3622         kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3623         kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3624         kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3625         kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3626         kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3627 
3628         kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3629         kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3630 
3631         kvm_x86_ops->get_idt(vcpu, &dt);
3632         sregs->idt.limit = dt.limit;
3633         sregs->idt.base = dt.base;
3634         kvm_x86_ops->get_gdt(vcpu, &dt);
3635         sregs->gdt.limit = dt.limit;
3636         sregs->gdt.base = dt.base;
3637 
3638         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3639         sregs->cr0 = vcpu->arch.cr0;
3640         sregs->cr2 = vcpu->arch.cr2;
3641         sregs->cr3 = vcpu->arch.cr3;
3642         sregs->cr4 = vcpu->arch.cr4;
3643         sregs->cr8 = kvm_get_cr8(vcpu);
3644         sregs->efer = vcpu->arch.shadow_efer;
3645         sregs->apic_base = kvm_get_apic_base(vcpu);
3646 
3647         memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
3648 
3649         if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
3650                 set_bit(vcpu->arch.interrupt.nr,
3651                         (unsigned long *)sregs->interrupt_bitmap);
3652 
3653         vcpu_put(vcpu);
3654 
3655         return 0;
3656 }
3657 
3658 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
3659                                     struct kvm_mp_state *mp_state)
3660 {
3661         vcpu_load(vcpu);
3662         mp_state->mp_state = vcpu->arch.mp_state;
3663         vcpu_put(vcpu);
3664         return 0;
3665 }
3666 
3667 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3668                                     struct kvm_mp_state *mp_state)
3669 {
3670         vcpu_load(vcpu);
3671         vcpu->arch.mp_state = mp_state->mp_state;
3672         vcpu_put(vcpu);
3673         return 0;
3674 }
3675 
3676 static void kvm_set_segment(struct kvm_vcpu *vcpu,
3677                         struct kvm_segment *var, int seg)
3678 {
3679         kvm_x86_ops->set_segment(vcpu, var, seg);
3680 }
3681 
3682 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3683                                    struct kvm_segment *kvm_desct)
3684 {
3685         kvm_desct->base = seg_desc->base0;
3686         kvm_desct->base |= seg_desc->base1 << 16;
3687         kvm_desct->base |= seg_desc->base2 << 24;
3688         kvm_desct->limit = seg_desc->limit0;
3689         kvm_desct->limit |= seg_desc->limit << 16;
3690         if (seg_desc->g) {
3691                 kvm_desct->limit <<= 12;
3692                 kvm_desct->limit |= 0xfff;
3693         }
3694         kvm_desct->selector = selector;
3695         kvm_desct->type = seg_desc->type;
3696         kvm_desct->present = seg_desc->p;
3697         kvm_desct->dpl = seg_desc->dpl;
3698         kvm_desct->db = seg_desc->d;
3699         kvm_desct->s = seg_desc->s;
3700         kvm_desct->l = seg_desc->l;
3701         kvm_desct->g = seg_desc->g;
3702         kvm_desct->avl = seg_desc->avl;
3703         if (!selector)
3704                 kvm_desct->unusable = 1;
3705         else
3706                 kvm_desct->unusable = 0;
3707         kvm_desct->padding = 0;
3708 }
3709 
3710 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3711                                           u16 selector,
3712                                           struct descriptor_table *dtable)
3713 {
3714         if (selector & 1 << 2) {
3715                 struct kvm_segment kvm_seg;
3716 
3717                 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3718 
3719                 if (kvm_seg.unusable)
3720                         dtable->limit = 0;
3721                 else
3722                         dtable->limit = kvm_seg.limit;
3723                 dtable->base = kvm_seg.base;
3724         }
3725         else
3726                 kvm_x86_ops->get_gdt(vcpu, dtable);
3727 }
3728 
3729 /* allowed just for 8 bytes segments */
3730 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3731                                          struct desc_struct *seg_desc)
3732 {
3733         gpa_t gpa;
3734         struct descriptor_table dtable;
3735         u16 index = selector >> 3;
3736 
3737         get_segment_descriptor_dtable(vcpu, selector, &dtable);
3738 
3739         if (dtable.limit < index * 8 + 7) {
3740                 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3741                 return 1;
3742         }
3743         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3744         gpa += index * 8;
3745         return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3746 }
3747 
3748 /* allowed just for 8 bytes segments */
3749 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3750                                          struct desc_struct *seg_desc)
3751 {
3752         gpa_t gpa;
3753         struct descriptor_table dtable;
3754         u16 index = selector >> 3;
3755 
3756         get_segment_descriptor_dtable(vcpu, selector, &dtable);
3757 
3758         if (dtable.limit < index * 8 + 7)
3759                 return 1;
3760         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3761         gpa += index * 8;
3762         return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3763 }
3764 
3765 static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
3766                              struct desc_struct *seg_desc)
3767 {
3768         u32 base_addr;
3769 
3770         base_addr = seg_desc->base0;
3771         base_addr |= (seg_desc->base1 << 16);
3772         base_addr |= (seg_desc->base2 << 24);
3773 
3774         return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3775 }
3776 
3777 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3778 {
3779         struct kvm_segment kvm_seg;
3780 
3781         kvm_get_segment(vcpu, &kvm_seg, seg);
3782         return kvm_seg.selector;
3783 }
3784 
3785 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3786                                                 u16 selector,
3787                                                 struct kvm_segment *kvm_seg)
3788 {
3789         struct desc_struct seg_desc;
3790 
3791         if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
3792                 return 1;
3793         seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
3794         return 0;
3795 }
3796 
3797 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
3798 {
3799         struct kvm_segment segvar = {
3800                 .base = selector << 4,
3801                 .limit = 0xffff,
3802                 .selector = selector,
3803                 .type = 3,
3804                 .present = 1,
3805                 .dpl = 3,
3806                 .db = 0,
3807                 .s = 1,
3808                 .l = 0,
3809                 .g = 0,
3810                 .avl = 0,
3811                 .unusable = 0,
3812         };
3813         kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3814         return 0;
3815 }
3816 
3817 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3818                                 int type_bits, int seg)
3819 {
3820         struct kvm_segment kvm_seg;
3821 
3822         if (!(vcpu->arch.cr0 & X86_CR0_PE))
3823                 return kvm_load_realmode_segment(vcpu, selector, seg);
3824         if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3825                 return 1;
3826         kvm_seg.type |= type_bits;
3827 
3828         if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
3829             seg != VCPU_SREG_LDTR)
3830                 if (!kvm_seg.s)
3831                         kvm_seg.unusable = 1;
3832 
3833         kvm_set_segment(vcpu, &kvm_seg, seg);
3834         return 0;
3835 }
3836 
3837 static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3838                                 struct tss_segment_32 *tss)
3839 {
3840         tss->cr3 = vcpu->arch.cr3;
3841         tss->eip = kvm_rip_read(vcpu);
3842         tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3843         tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3844         tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3845         tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3846         tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3847         tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3848         tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3849         tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3850         tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3851         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3852         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3853         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3854         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3855         tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3856         tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3857         tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3858 }
3859 
3860 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3861                                   struct tss_segment_32 *tss)
3862 {
3863         kvm_set_cr3(vcpu, tss->cr3);
3864 
3865         kvm_rip_write(vcpu, tss->eip);
3866         kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3867 
3868         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
3869         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
3870         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
3871         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
3872         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
3873         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
3874         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
3875         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
3876 
3877         if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3878                 return 1;
3879 
3880         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3881                 return 1;
3882 
3883         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3884                 return 1;
3885 
3886         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3887                 return 1;
3888 
3889         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3890                 return 1;
3891 
3892         if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3893                 return 1;
3894 
3895         if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3896                 return 1;
3897         return 0;
3898 }
3899 
3900 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3901                                 struct tss_segment_16 *tss)
3902 {
3903         tss->ip = kvm_rip_read(vcpu);
3904         tss->flag = kvm_x86_ops->get_rflags(vcpu);
3905         tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3906         tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3907         tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3908         tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3909         tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3910         tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3911         tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
3912         tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
3913 
3914         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3915         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3916         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3917         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3918         tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3919         tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3920 }
3921 
3922 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3923                                  struct tss_segment_16 *tss)
3924 {
3925         kvm_rip_write(vcpu, tss->ip);
3926         kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3927         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
3928         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
3929         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
3930         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
3931         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
3932         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
3933         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
3934         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
3935 
3936         if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3937                 return 1;
3938 
3939         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3940                 return 1;
3941 
3942         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3943                 return 1;
3944 
3945         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3946                 return 1;
3947 
3948         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3949                 return 1;
3950         return 0;
3951 }
3952 
3953 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3954                               u16 old_tss_sel, u32 old_tss_base,
3955                               struct desc_struct *nseg_desc)
3956 {
3957         struct tss_segment_16 tss_segment_16;
3958         int ret = 0;
3959 
3960         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3961                            sizeof tss_segment_16))
3962                 goto out;
3963 
3964         save_state_to_tss16(vcpu, &tss_segment_16);
3965 
3966         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3967                             sizeof tss_segment_16))
3968                 goto out;
3969 
3970         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3971                            &tss_segment_16, sizeof tss_segment_16))
3972                 goto out;
3973 
3974         if (old_tss_sel != 0xffff) {
3975                 tss_segment_16.prev_task_link = old_tss_sel;
3976 
3977                 if (kvm_write_guest(vcpu->kvm,
3978                                     get_tss_base_addr(vcpu, nseg_desc),
3979                                     &tss_segment_16.prev_task_link,
3980                                     sizeof tss_segment_16.prev_task_link))
3981                         goto out;
3982         }
3983 
3984         if (load_state_from_tss16(vcpu, &tss_segment_16))
3985                 goto out;
3986 
3987         ret = 1;
3988 out:
3989         return ret;
3990 }
3991 
3992 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3993                        u16 old_tss_sel, u32 old_tss_base,
3994                        struct desc_struct *nseg_desc)
3995 {
3996         struct tss_segment_32 tss_segment_32;
3997         int ret = 0;
3998 
3999         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
4000                            sizeof tss_segment_32))
4001                 goto out;
4002 
4003         save_state_to_tss32(vcpu, &tss_segment_32);
4004 
4005         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
4006                             sizeof tss_segment_32))
4007                 goto out;
4008 
4009         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
4010                            &tss_segment_32, sizeof tss_segment_32))
4011                 goto out;
4012 
4013         if (old_tss_sel != 0xffff) {
4014                 tss_segment_32.prev_task_link = old_tss_sel;
4015 
4016                 if (kvm_write_guest(vcpu->kvm,
4017                                     get_tss_base_addr(vcpu, nseg_desc),
4018                                     &tss_segment_32.prev_task_link,
4019                                     sizeof tss_segment_32.prev_task_link))
4020                         goto out;
4021         }
4022 
4023         if (load_state_from_tss32(vcpu, &tss_segment_32))
4024                 goto out;
4025 
4026         ret = 1;
4027 out:
4028         return ret;
4029 }
4030 
4031 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4032 {
4033         struct kvm_segment tr_seg;
4034         struct desc_struct cseg_desc;
4035         struct desc_struct nseg_desc;
4036         int ret = 0;
4037         u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
4038         u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
4039 
4040         old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
4041 
4042         /* FIXME: Handle errors. Failure to read either TSS or their
4043          * descriptors should generate a pagefault.
4044          */
4045         if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
4046                 goto out;
4047 
4048         if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
4049                 goto out;
4050 
4051         if (reason != TASK_SWITCH_IRET) {
4052                 int cpl;
4053 
4054                 cpl = kvm_x86_ops->get_cpl(vcpu);
4055                 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
4056                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4057                         return 1;
4058                 }
4059         }
4060 
4061         if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
4062                 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
4063                 return 1;
4064         }
4065 
4066         if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
4067                 cseg_desc.type &= ~(1 << 1); //clear the B flag
4068                 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
4069         }
4070 
4071         if (reason == TASK_SWITCH_IRET) {
4072                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
4073                 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4074         }
4075 
4076         /* set back link to prev task only if NT bit is set in eflags
4077            note that old_tss_sel is not used afetr this point */
4078         if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4079                 old_tss_sel = 0xffff;
4080 
4081         /* set back link to prev task only if NT bit is set in eflags
4082            note that old_tss_sel is not used afetr this point */
4083         if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4084                 old_tss_sel = 0xffff;
4085 
4086         if (nseg_desc.type & 8)
4087                 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4088                                          old_tss_base, &nseg_desc);
4089         else
4090                 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
4091                                          old_tss_base, &nseg_desc);
4092 
4093         if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4094                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
4095                 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4096         }
4097 
4098         if (reason != TASK_SWITCH_IRET) {
4099                 nseg_desc.type |= (1 << 1);
4100                 save_guest_segment_descriptor(vcpu, tss_selector,
4101                                               &nseg_desc);
4102         }
4103 
4104         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
4105         seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
4106         tr_seg.type = 11;
4107         kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
4108 out:
4109         return ret;
4110 }
4111 EXPORT_SYMBOL_GPL(kvm_task_switch);
4112 
4113 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4114                                   struct kvm_sregs *sregs)
4115 {
4116         int mmu_reset_needed = 0;
4117         int pending_vec, max_bits;
4118         struct descriptor_table dt;
4119 
4120         vcpu_load(vcpu);
4121 
4122         dt.limit = sregs->idt.limit;
4123         dt.base = sregs->idt.base;
4124         kvm_x86_ops->set_idt(vcpu, &dt);
4125         dt.limit = sregs->gdt.limit;
4126         dt.base = sregs->gdt.base;
4127         kvm_x86_ops->set_gdt(vcpu, &dt);
4128 
4129         vcpu->arch.cr2 = sregs->cr2;
4130         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
4131         vcpu->arch.cr3 = sregs->cr3;
4132 
4133         kvm_set_cr8(vcpu, sregs->cr8);
4134 
4135         mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
4136         kvm_x86_ops->set_efer(vcpu, sregs->efer);
4137         kvm_set_apic_base(vcpu, sregs->apic_base);
4138 
4139         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
4140 
4141         mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4142         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4143         vcpu->arch.cr0 = sregs->cr0;
4144 
4145         mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4146         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4147         if (!is_long_mode(vcpu) && is_pae(vcpu))
4148                 load_pdptrs(vcpu, vcpu->arch.cr3);
4149 
4150         if (mmu_reset_needed)
4151                 kvm_mmu_reset_context(vcpu);
4152 
4153         max_bits = (sizeof sregs->interrupt_bitmap) << 3;
4154         pending_vec = find_first_bit(
4155                 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
4156         if (pending_vec < max_bits) {
4157                 kvm_queue_interrupt(vcpu, pending_vec, false);
4158                 pr_debug("Set back pending irq %d\n", pending_vec);
4159                 if (irqchip_in_kernel(vcpu->kvm))
4160                         kvm_pic_clear_isr_ack(vcpu->kvm);
4161         }
4162 
4163         kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4164         kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4165         kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4166         kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4167         kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4168         kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4169 
4170         kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4171         kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4172 
4173         /* Older userspace won't unhalt the vcpu on reset. */
4174         if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
4175             sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4176             !(vcpu->arch.cr0 & X86_CR0_PE))
4177                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4178 
4179         vcpu_put(vcpu);
4180 
4181         return 0;
4182 }
4183 
4184 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4185                                         struct kvm_guest_debug *dbg)
4186 {
4187         int i, r;
4188 
4189         vcpu_load(vcpu);
4190 
4191         if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
4192             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
4193                 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4194                         vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4195                 vcpu->arch.switch_db_regs =
4196                         (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
4197         } else {
4198                 for (i = 0; i < KVM_NR_DB_REGS; i++)
4199                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
4200                 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4201         }
4202 
4203         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
4204 
4205         if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4206                 kvm_queue_exception(vcpu, DB_VECTOR);
4207         else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4208                 kvm_queue_exception(vcpu, BP_VECTOR);
4209 
4210         vcpu_put(vcpu);
4211 
4212         return r;
4213 }
4214 
4215 /*
4216  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
4217  * we have asm/x86/processor.h
4218  */
4219 struct fxsave {
4220         u16     cwd;
4221         u16     swd;
4222         u16     twd;
4223         u16     fop;
4224         u64     rip;
4225         u64     rdp;
4226         u32     mxcsr;
4227         u32     mxcsr_mask;
4228         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
4229 #ifdef CONFIG_X86_64
4230         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
4231 #else
4232         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
4233 #endif
4234 };
4235 
4236 /*
4237  * Translate a guest virtual address to a guest physical address.
4238  */
4239 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4240                                     struct kvm_translation *tr)
4241 {
4242         unsigned long vaddr = tr->linear_address;
4243         gpa_t gpa;
4244 
4245         vcpu_load(vcpu);
4246         down_read(&vcpu->kvm->slots_lock);
4247         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
4248         up_read(&vcpu->kvm->slots_lock);
4249         tr->physical_address = gpa;
4250         tr->valid = gpa != UNMAPPED_GVA;
4251         tr->writeable = 1;
4252         tr->usermode = 0;
4253         vcpu_put(vcpu);
4254 
4255         return 0;
4256 }
4257 
4258 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4259 {
4260         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4261 
4262         vcpu_load(vcpu);
4263 
4264         memcpy(fpu->fpr, fxsave->st_space, 128);
4265         fpu->fcw = fxsave->cwd;
4266         fpu->fsw = fxsave->swd;
4267         fpu->ftwx = fxsave->twd;
4268         fpu->last_opcode = fxsave->fop;
4269         fpu->last_ip = fxsave->rip;
4270         fpu->last_dp = fxsave->rdp;
4271         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
4272 
4273         vcpu_put(vcpu);
4274 
4275         return 0;
4276 }
4277 
4278 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4279 {
4280         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4281 
4282         vcpu_load(vcpu);
4283 
4284         memcpy(fxsave->st_space, fpu->fpr, 128);
4285         fxsave->cwd = fpu->fcw;
4286         fxsave->swd = fpu->fsw;
4287         fxsave->twd = fpu->ftwx;
4288         fxsave->fop = fpu->last_opcode;
4289         fxsave->rip = fpu->last_ip;
4290         fxsave->rdp = fpu->last_dp;
4291         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
4292 
4293         vcpu_put(vcpu);
4294 
4295         return 0;
4296 }
4297 
4298 void fx_init(struct kvm_vcpu *vcpu)
4299 {
4300         unsigned after_mxcsr_mask;
4301 
4302         /*
4303          * Touch the fpu the first time in non atomic context as if
4304          * this is the first fpu instruction the exception handler
4305          * will fire before the instruction returns and it'll have to
4306          * allocate ram with GFP_KERNEL.
4307          */
4308         if (!used_math())
4309                 kvm_fx_save(&vcpu->arch.host_fx_image);
4310 
4311         /* Initialize guest FPU by resetting ours and saving into guest's */
4312         preempt_disable();
4313         kvm_fx_save(&vcpu->arch.host_fx_image);
4314         kvm_fx_finit();
4315         kvm_fx_save(&vcpu->arch.guest_fx_image);
4316         kvm_fx_restore(&vcpu->arch.host_fx_image);
4317         preempt_enable();
4318 
4319         vcpu->arch.cr0 |= X86_CR0_ET;
4320         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
4321         vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
4322         memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
4323                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
4324 }
4325 EXPORT_SYMBOL_GPL(fx_init);
4326 
4327 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4328 {
4329         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
4330                 return;
4331 
4332         vcpu->guest_fpu_loaded = 1;
4333         kvm_fx_save(&vcpu->arch.host_fx_image);
4334         kvm_fx_restore(&vcpu->arch.guest_fx_image);
4335 }
4336 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4337 
4338 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4339 {
4340         if (!vcpu->guest_fpu_loaded)
4341                 return;
4342 
4343         vcpu->guest_fpu_loaded = 0;
4344         kvm_fx_save(&vcpu->arch.guest_fx_image);
4345         kvm_fx_restore(&vcpu->arch.host_fx_image);
4346         ++vcpu->stat.fpu_reload;
4347 }
4348 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4349 
4350 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4351 {
4352         if (vcpu->arch.time_page) {
4353                 kvm_release_page_dirty(vcpu->arch.time_page);
4354                 vcpu->arch.time_page = NULL;
4355         }
4356 
4357         kvm_x86_ops->vcpu_free(vcpu);
4358 }
4359 
4360 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
4361                                                 unsigned int id)
4362 {
4363         return kvm_x86_ops->vcpu_create(kvm, id);
4364 }
4365 
4366 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4367 {
4368         int r;
4369 
4370         /* We do fxsave: this must be aligned. */
4371         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4372 
4373         vcpu->arch.mtrr_state.have_fixed = 1;
4374         vcpu_load(vcpu);
4375         r = kvm_arch_vcpu_reset(vcpu);
4376         if (r == 0)
4377                 r = kvm_mmu_setup(vcpu);
4378         vcpu_put(vcpu);
4379         if (r < 0)
4380                 goto free_vcpu;
4381 
4382         return 0;
4383 free_vcpu:
4384         kvm_x86_ops->vcpu_free(vcpu);
4385         return r;
4386 }
4387 
4388 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
4389 {
4390         vcpu_load(vcpu);
4391         kvm_mmu_unload(vcpu);
4392         vcpu_put(vcpu);
4393 
4394         kvm_x86_ops->vcpu_free(vcpu);
4395 }
4396 
4397 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4398 {
4399         vcpu->arch.nmi_pending = false;
4400         vcpu->arch.nmi_injected = false;
4401 
4402         vcpu->arch.switch_db_regs = 0;
4403         memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
4404         vcpu->arch.dr6 = DR6_FIXED_1;
4405         vcpu->arch.dr7 = DR7_FIXED_1;
4406 
4407         return kvm_x86_ops->vcpu_reset(vcpu);
4408 }
4409 
4410 void kvm_arch_hardware_enable(void *garbage)
4411 {
4412         kvm_x86_ops->hardware_enable(garbage);
4413 }
4414 
4415 void kvm_arch_hardware_disable(void *garbage)
4416 {
4417         kvm_x86_ops->hardware_disable(garbage);
4418 }
4419 
4420 int kvm_arch_hardware_setup(void)
4421 {
4422         return kvm_x86_ops->hardware_setup();
4423 }
4424 
4425 void kvm_arch_hardware_unsetup(void)
4426 {
4427         kvm_x86_ops->hardware_unsetup();
4428 }
4429 
4430 void kvm_arch_check_processor_compat(void *rtn)
4431 {
4432         kvm_x86_ops->check_processor_compatibility(rtn);
4433 }
4434 
4435 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4436 {
4437         struct page *page;
4438         struct kvm *kvm;
4439         int r;
4440 
4441         BUG_ON(vcpu->kvm == NULL);
4442         kvm = vcpu->kvm;
4443 
4444         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4445         if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
4446                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4447         else
4448                 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
4449 
4450         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
4451         if (!page) {
4452                 r = -ENOMEM;
4453                 goto fail;
4454         }
4455         vcpu->arch.pio_data = page_address(page);
4456 
4457         r = kvm_mmu_create(vcpu);
4458         if (r < 0)
4459                 goto fail_free_pio_data;
4460 
4461         if (irqchip_in_kernel(kvm)) {
4462                 r = kvm_create_lapic(vcpu);
4463                 if (r < 0)
4464                         goto fail_mmu_destroy;
4465         }
4466 
4467         return 0;
4468 
4469 fail_mmu_destroy:
4470         kvm_mmu_destroy(vcpu);
4471 fail_free_pio_data:
4472         free_page((unsigned long)vcpu->arch.pio_data);
4473 fail:
4474         return r;
4475 }
4476 
4477 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4478 {
4479         kvm_free_lapic(vcpu);
4480         down_read(&vcpu->kvm->slots_lock);
4481         kvm_mmu_destroy(vcpu);
4482         up_read(&vcpu->kvm->slots_lock);
4483         free_page((unsigned long)vcpu->arch.pio_data);
4484 }
4485 
4486 struct  kvm *kvm_arch_create_vm(void)
4487 {
4488         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
4489 
4490         if (!kvm)
4491                 return ERR_PTR(-ENOMEM);
4492 
4493         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4494         INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4495 
4496         /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
4497         set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
4498 
4499         rdtscll(kvm->arch.vm_init_tsc);
4500 
4501         return kvm;
4502 }
4503 
/* Unload one vcpu's MMU, bracketing the call with vcpu_load()/vcpu_put(). */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);

	kvm_mmu_unload(vcpu);

	vcpu_put(vcpu);
}
4510 
4511 static void kvm_free_vcpus(struct kvm *kvm)
4512 {
4513         unsigned int i;
4514 
4515         /*
4516          * Unpin any mmu pages first.
4517          */
4518         for (i = 0; i < KVM_MAX_VCPUS; ++i)
4519                 if (kvm->vcpus[i])
4520                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
4521         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
4522                 if (kvm->vcpus[i]) {
4523                         kvm_arch_vcpu_free(kvm->vcpus[i]);
4524                         kvm->vcpus[i] = NULL;
4525                 }
4526         }
4527 
4528 }
4529 
/* Arch sync-events hook: on x86 this frees all devices assigned to @kvm. */
void kvm_arch_sync_events(struct kvm *kvm)
{
	kvm_free_all_assigned_devices(kvm);
}
4534 
4535 void kvm_arch_destroy_vm(struct kvm *kvm)
4536 {
4537         kvm_iommu_unmap_guest(kvm);
4538         kvm_free_pit(kvm);
4539         kfree(kvm->arch.vpic);
4540         kfree(kvm->arch.vioapic);
4541         kvm_free_vcpus(kvm);
4542         kvm_free_physmem(kvm);
4543         if (kvm->arch.apic_access_page)
4544                 put_page(kvm->arch.apic_access_page);
4545         if (kvm->arch.ept_identity_pagetable)
4546                 put_page(kvm->arch.ept_identity_pagetable);
4547         kfree(kvm);
4548 }
4549 
4550 int kvm_arch_set_memory_region(struct kvm *kvm,
4551                                 struct kvm_userspace_memory_region *mem,
4552                                 struct kvm_memory_slot old,
4553                                 int user_alloc)
4554 {
4555         int npages = mem->memory_size >> PAGE_SHIFT;
4556         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4557 
4558         /*To keep backward compatibility with older userspace,
4559          *x86 needs to hanlde !user_alloc case.
4560          */
4561         if (!user_alloc) {
4562                 if (npages && !old.rmap) {
4563                         unsigned long userspace_addr;
4564 
4565                         down_write(&current->mm->mmap_sem);
4566                         userspace_addr = do_mmap(NULL, 0,
4567                                                  npages * PAGE_SIZE,
4568                                                  PROT_READ | PROT_WRITE,
4569                                                  MAP_PRIVATE | MAP_ANONYMOUS,
4570                                                  0);
4571                         up_write(&current->mm->mmap_sem);
4572 
4573                         if (IS_ERR((void *)userspace_addr))
4574                                 return PTR_ERR((void *)userspace_addr);
4575 
4576                         /* set userspace_addr atomically for kvm_hva_to_rmapp */
4577                         spin_lock(&kvm->mmu_lock);
4578                         memslot->userspace_addr = userspace_addr;
4579                         spin_unlock(&kvm->mmu_lock);
4580                 } else {
4581                         if (!old.user_alloc && old.rmap) {
4582                                 int ret;
4583 
4584                                 down_write(&current->mm->mmap_sem);
4585                                 ret = do_munmap(current->mm, old.userspace_addr,
4586                                                 old.npages * PAGE_SIZE);
4587                                 up_write(&current->mm->mmap_sem);
4588                                 if (ret < 0)
4589                                         printk(KERN_WARNING
4590                                        "kvm_vm_ioctl_set_memory_region: "
4591                                        "failed to munmap memory\n");
4592                         }
4593                 }
4594         }
4595 
4596         spin_lock(&kvm->mmu_lock);
4597         if (!kvm->arch.n_requested_mmu_pages) {
4598                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4599                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4600         }
4601 
4602         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4603         spin_unlock(&kvm->mmu_lock);
4604         kvm_flush_remote_tlbs(kvm);
4605 
4606         return 0;
4607 }
4608 
/*
 * Flush shadow state for @kvm: calls kvm_mmu_zap_all() and then
 * kvm_reload_remote_mmus().
 */
void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);

	kvm_reload_remote_mmus(kvm);
}
4614 
4615 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4616 {
4617         return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4618                || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4619                || vcpu->arch.nmi_pending;
4620 }
4621 
4622 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4623 {
4624         int me;
4625         int cpu = vcpu->cpu;
4626 
4627         if (waitqueue_active(&vcpu->wq)) {
4628                 wake_up_interruptible(&vcpu->wq);
4629                 ++vcpu->stat.halt_wakeup;
4630         }
4631 
4632         me = get_cpu();
4633         if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
4634                 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
4635                         smp_send_reschedule(cpu);
4636         put_cpu();
4637 }
4638 
4639 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4640 {
4641         return kvm_x86_ops->interrupt_allowed(vcpu);
4642 }
4643 
  This page was automatically generated by the LXR engine.