KVM
x86.c
Go to the documentation of this file.
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * derived from drivers/kvm/kvm_main.c
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright (C) 2008 Qumranet, Inc.
9  * Copyright IBM Corporation, 2008
10  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11  *
12  * Authors:
13  * Avi Kivity <avi@qumranet.com>
14  * Yaniv Kamay <yaniv@qumranet.com>
15  * Amit Shah <amit.shah@qumranet.com>
16  * Ben-Ami Yassour <benami@il.ibm.com>
17  */
18 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
19 
20 #include <linux/kvm_host.h>
21 #include "irq.h"
22 #include "ioapic.h"
23 #include "mmu.h"
24 #include "i8254.h"
25 #include "tss.h"
26 #include "kvm_cache_regs.h"
27 #include "kvm_emulate.h"
28 #include "mmu/page_track.h"
29 #include "x86.h"
30 #include "cpuid.h"
31 #include "pmu.h"
32 #include "hyperv.h"
33 #include "lapic.h"
34 #include "xen.h"
35 #include "smm.h"
36 
37 #include <linux/clocksource.h>
38 #include <linux/interrupt.h>
39 #include <linux/kvm.h>
40 #include <linux/fs.h>
41 #include <linux/vmalloc.h>
42 #include <linux/export.h>
43 #include <linux/moduleparam.h>
44 #include <linux/mman.h>
45 #include <linux/highmem.h>
46 #include <linux/iommu.h>
47 #include <linux/cpufreq.h>
48 #include <linux/user-return-notifier.h>
49 #include <linux/srcu.h>
50 #include <linux/slab.h>
51 #include <linux/perf_event.h>
52 #include <linux/uaccess.h>
53 #include <linux/hash.h>
54 #include <linux/pci.h>
55 #include <linux/timekeeper_internal.h>
56 #include <linux/pvclock_gtod.h>
57 #include <linux/kvm_irqfd.h>
58 #include <linux/irqbypass.h>
59 #include <linux/sched/stat.h>
60 #include <linux/sched/isolation.h>
61 #include <linux/mem_encrypt.h>
62 #include <linux/entry-kvm.h>
63 #include <linux/suspend.h>
64 #include <linux/smp.h>
65 
66 #include <trace/events/ipi.h>
67 #include <trace/events/kvm.h>
68 
69 #include <asm/debugreg.h>
70 #include <asm/msr.h>
71 #include <asm/desc.h>
72 #include <asm/mce.h>
73 #include <asm/pkru.h>
74 #include <linux/kernel_stat.h>
75 #include <asm/fpu/api.h>
76 #include <asm/fpu/xcr.h>
77 #include <asm/fpu/xstate.h>
78 #include <asm/pvclock.h>
79 #include <asm/div64.h>
80 #include <asm/irq_remapping.h>
81 #include <asm/mshyperv.h>
82 #include <asm/hypervisor.h>
83 #include <asm/tlbflush.h>
84 #include <asm/intel_pt.h>
85 #include <asm/emulate_prefix.h>
86 #include <asm/sgx.h>
87 #include <clocksource/hyperv_timer.h>
88 
89 #define CREATE_TRACE_POINTS
90 #include "trace.h"
91 
92 #define MAX_IO_MSRS 256
93 #define KVM_MAX_MCE_BANKS 32
94 
96  .supported_mce_cap = MCG_CTL_P | MCG_SER_P,
97 };
99 
100 #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
101 
102 #define emul_to_vcpu(ctxt) \
103  ((struct kvm_vcpu *)(ctxt)->vcpu)
104 
105 /* EFER defaults:
106  * - enable syscall per default because its emulated by KVM
107  * - enable LME and LMA per default on 64 bit KVM
108  */
109 #ifdef CONFIG_X86_64
110 static
111 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
112 #else
113 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
114 #endif
115 
116 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
117 
118 #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
119 
120 #define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
121 
122 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
123  KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
124 
125 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
126 static void process_nmi(struct kvm_vcpu *vcpu);
127 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
128 static void store_regs(struct kvm_vcpu *vcpu);
129 static int sync_regs(struct kvm_vcpu *vcpu);
130 static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
131 
132 static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
133 static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
134 
135 static DEFINE_MUTEX(vendor_module_lock);
136 struct kvm_x86_ops kvm_x86_ops __read_mostly;
137 
138 #define KVM_X86_OP(func) \
139  DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \
140  *(((struct kvm_x86_ops *)0)->func));
141 #define KVM_X86_OP_OPTIONAL KVM_X86_OP
142 #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
143 #include <asm/kvm-x86-ops.h>
144 EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
145 EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
146 
147 static bool __read_mostly ignore_msrs = 0;
149 
153 
154 unsigned int min_timer_period_us = 200;
156 
159 
160 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
163 
164 /*
165  * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
166  * adaptive tuning starting from default advancement of 1000ns. '0' disables
167  * advancement entirely. Any other value is used as-is and disables adaptive
168  * tuning, i.e. allows privileged userspace to set an exact advancement time.
169  */
172 
173 static bool __read_mostly vector_hashing = true;
175 
179 
180 /*
181  * Flags to manipulate forced emulation behavior (any non-zero value will
182  * enable forced emulation).
183  */
184 #define KVM_FEP_CLEAR_RFLAGS_RF BIT(1)
187 
190 
191 /* Enable/disable PMU virtualization */
195 
198 
199 /* Enable/disable SMT_RSB bug mitigation */
202 
203 /*
204  * Restoring the host value for MSRs that are only consumed when running in
205  * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
206  * returns to userspace, i.e. the kernel can run with the guest's value.
207  */
208 #define KVM_MAX_NR_USER_RETURN_MSRS 16
209 
211  struct user_return_notifier urn;
214  u64 host;
215  u64 curr;
217 };
218 
222 static struct kvm_user_return_msrs __percpu *user_return_msrs;
223 
224 #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
225  | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
226  | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
227  | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
228 
231 
234 
237 
240 
243 
244 const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
245  KVM_GENERIC_VM_STATS(),
246  STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
247  STATS_DESC_COUNTER(VM, mmu_pte_write),
248  STATS_DESC_COUNTER(VM, mmu_pde_zapped),
249  STATS_DESC_COUNTER(VM, mmu_flooded),
250  STATS_DESC_COUNTER(VM, mmu_recycled),
251  STATS_DESC_COUNTER(VM, mmu_cache_miss),
252  STATS_DESC_ICOUNTER(VM, mmu_unsync),
253  STATS_DESC_ICOUNTER(VM, pages_4k),
254  STATS_DESC_ICOUNTER(VM, pages_2m),
255  STATS_DESC_ICOUNTER(VM, pages_1g),
256  STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
257  STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
258  STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
259 };
260 
261 const struct kvm_stats_header kvm_vm_stats_header = {
262  .name_size = KVM_STATS_NAME_SIZE,
263  .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
264  .id_offset = sizeof(struct kvm_stats_header),
265  .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
266  .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
267  sizeof(kvm_vm_stats_desc),
268 };
269 
270 const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
271  KVM_GENERIC_VCPU_STATS(),
272  STATS_DESC_COUNTER(VCPU, pf_taken),
273  STATS_DESC_COUNTER(VCPU, pf_fixed),
274  STATS_DESC_COUNTER(VCPU, pf_emulate),
275  STATS_DESC_COUNTER(VCPU, pf_spurious),
276  STATS_DESC_COUNTER(VCPU, pf_fast),
277  STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created),
278  STATS_DESC_COUNTER(VCPU, pf_guest),
279  STATS_DESC_COUNTER(VCPU, tlb_flush),
280  STATS_DESC_COUNTER(VCPU, invlpg),
281  STATS_DESC_COUNTER(VCPU, exits),
282  STATS_DESC_COUNTER(VCPU, io_exits),
283  STATS_DESC_COUNTER(VCPU, mmio_exits),
284  STATS_DESC_COUNTER(VCPU, signal_exits),
285  STATS_DESC_COUNTER(VCPU, irq_window_exits),
286  STATS_DESC_COUNTER(VCPU, nmi_window_exits),
287  STATS_DESC_COUNTER(VCPU, l1d_flush),
288  STATS_DESC_COUNTER(VCPU, halt_exits),
289  STATS_DESC_COUNTER(VCPU, request_irq_exits),
290  STATS_DESC_COUNTER(VCPU, irq_exits),
291  STATS_DESC_COUNTER(VCPU, host_state_reload),
292  STATS_DESC_COUNTER(VCPU, fpu_reload),
293  STATS_DESC_COUNTER(VCPU, insn_emulation),
294  STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
295  STATS_DESC_COUNTER(VCPU, hypercalls),
296  STATS_DESC_COUNTER(VCPU, irq_injections),
297  STATS_DESC_COUNTER(VCPU, nmi_injections),
298  STATS_DESC_COUNTER(VCPU, req_event),
299  STATS_DESC_COUNTER(VCPU, nested_run),
300  STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
301  STATS_DESC_COUNTER(VCPU, directed_yield_successful),
302  STATS_DESC_COUNTER(VCPU, preemption_reported),
303  STATS_DESC_COUNTER(VCPU, preemption_other),
304  STATS_DESC_IBOOLEAN(VCPU, guest_mode),
305  STATS_DESC_COUNTER(VCPU, notify_window_exits),
306 };
307 
308 const struct kvm_stats_header kvm_vcpu_stats_header = {
309  .name_size = KVM_STATS_NAME_SIZE,
310  .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
311  .id_offset = sizeof(struct kvm_stats_header),
312  .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
313  .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
314  sizeof(kvm_vcpu_stats_desc),
315 };
316 
318 
319 static struct kmem_cache *x86_emulator_cache;
320 
321 /*
322  * When called, it means the previous get/set msr reached an invalid msr.
323  * Return true if we want to ignore/silent this failed msr access.
324  */
325 static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
326 {
327  const char *op = write ? "wrmsr" : "rdmsr";
328 
329  if (ignore_msrs) {
331  kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
332  op, msr, data);
333  /* Mask the error */
334  return true;
335  } else {
336  kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
337  op, msr, data);
338  return false;
339  }
340 }
341 
342 static struct kmem_cache *kvm_alloc_emulator_cache(void)
343 {
344  unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
345  unsigned int size = sizeof(struct x86_emulate_ctxt);
346 
347  return kmem_cache_create_usercopy("x86_emulator", size,
348  __alignof__(struct x86_emulate_ctxt),
349  SLAB_ACCOUNT, useroffset,
350  size - useroffset, NULL);
351 }
352 
353 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
354 
355 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
356 {
357  int i;
358  for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
359  vcpu->arch.apf.gfns[i] = ~0;
360 }
361 
362 static void kvm_on_user_return(struct user_return_notifier *urn)
363 {
364  unsigned slot;
365  struct kvm_user_return_msrs *msrs
366  = container_of(urn, struct kvm_user_return_msrs, urn);
367  struct kvm_user_return_msr_values *values;
368  unsigned long flags;
369 
370  /*
371  * Disabling irqs at this point since the following code could be
372  * interrupted and executed through kvm_arch_hardware_disable()
373  */
374  local_irq_save(flags);
375  if (msrs->registered) {
376  msrs->registered = false;
377  user_return_notifier_unregister(urn);
378  }
379  local_irq_restore(flags);
380  for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
381  values = &msrs->values[slot];
382  if (values->host != values->curr) {
383  wrmsrl(kvm_uret_msrs_list[slot], values->host);
384  values->curr = values->host;
385  }
386  }
387 }
388 
389 static int kvm_probe_user_return_msr(u32 msr)
390 {
391  u64 val;
392  int ret;
393 
394  preempt_disable();
395  ret = rdmsrl_safe(msr, &val);
396  if (ret)
397  goto out;
398  ret = wrmsrl_safe(msr, val);
399 out:
400  preempt_enable();
401  return ret;
402 }
403 
405 {
407 
408  if (kvm_probe_user_return_msr(msr))
409  return -1;
410 
412  return kvm_nr_uret_msrs++;
413 }
415 
417 {
418  int i;
419 
420  for (i = 0; i < kvm_nr_uret_msrs; ++i) {
421  if (kvm_uret_msrs_list[i] == msr)
422  return i;
423  }
424  return -1;
425 }
427 
429 {
430  unsigned int cpu = smp_processor_id();
431  struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
432  u64 value;
433  int i;
434 
435  for (i = 0; i < kvm_nr_uret_msrs; ++i) {
436  rdmsrl_safe(kvm_uret_msrs_list[i], &value);
437  msrs->values[i].host = value;
438  msrs->values[i].curr = value;
439  }
440 }
441 
442 int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
443 {
444  unsigned int cpu = smp_processor_id();
445  struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
446  int err;
447 
448  value = (value & mask) | (msrs->values[slot].host & ~mask);
449  if (value == msrs->values[slot].curr)
450  return 0;
451  err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
452  if (err)
453  return 1;
454 
455  msrs->values[slot].curr = value;
456  if (!msrs->registered) {
457  msrs->urn.on_user_return = kvm_on_user_return;
458  user_return_notifier_register(&msrs->urn);
459  msrs->registered = true;
460  }
461  return 0;
462 }
464 
465 static void drop_user_return_notifiers(void)
466 {
467  unsigned int cpu = smp_processor_id();
468  struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
469 
470  if (msrs->registered)
471  kvm_on_user_return(&msrs->urn);
472 }
473 
474 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
475 {
476  return vcpu->arch.apic_base;
477 }
478 
479 enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
480 {
481  return kvm_apic_mode(kvm_get_apic_base(vcpu));
482 }
484 
485 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
486 {
487  enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
488  enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
489  u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
490  (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
491 
492  if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
493  return 1;
494  if (!msr_info->host_initiated) {
495  if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
496  return 1;
497  if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
498  return 1;
499  }
500 
501  kvm_lapic_set_base(vcpu, msr_info->data);
502  kvm_recalculate_apic_map(vcpu->kvm);
503  return 0;
504 }
505 
506 /*
507  * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
508  *
509  * Hardware virtualization extension instructions may fault if a reboot turns
510  * off virtualization while processes are running. Usually after catching the
511  * fault we just panic; during reboot instead the instruction is ignored.
512  */
513 noinstr void kvm_spurious_fault(void)
514 {
515  /* Fault while not rebooting. We want the trace. */
516  BUG_ON(!kvm_rebooting);
517 }
519 
520 #define EXCPT_BENIGN 0
521 #define EXCPT_CONTRIBUTORY 1
522 #define EXCPT_PF 2
523 
524 static int exception_class(int vector)
525 {
526  switch (vector) {
527  case PF_VECTOR:
528  return EXCPT_PF;
529  case DE_VECTOR:
530  case TS_VECTOR:
531  case NP_VECTOR:
532  case SS_VECTOR:
533  case GP_VECTOR:
534  return EXCPT_CONTRIBUTORY;
535  default:
536  break;
537  }
538  return EXCPT_BENIGN;
539 }
540 
541 #define EXCPT_FAULT 0
542 #define EXCPT_TRAP 1
543 #define EXCPT_ABORT 2
544 #define EXCPT_INTERRUPT 3
545 #define EXCPT_DB 4
546 
547 static int exception_type(int vector)
548 {
549  unsigned int mask;
550 
551  if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
552  return EXCPT_INTERRUPT;
553 
554  mask = 1 << vector;
555 
556  /*
557  * #DBs can be trap-like or fault-like, the caller must check other CPU
558  * state, e.g. DR6, to determine whether a #DB is a trap or fault.
559  */
560  if (mask & (1 << DB_VECTOR))
561  return EXCPT_DB;
562 
563  if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
564  return EXCPT_TRAP;
565 
566  if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
567  return EXCPT_ABORT;
568 
569  /* Reserved exceptions will result in fault */
570  return EXCPT_FAULT;
571 }
572 
573 void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
574  struct kvm_queued_exception *ex)
575 {
576  if (!ex->has_payload)
577  return;
578 
579  switch (ex->vector) {
580  case DB_VECTOR:
581  /*
582  * "Certain debug exceptions may clear bit 0-3. The
583  * remaining contents of the DR6 register are never
584  * cleared by the processor".
585  */
586  vcpu->arch.dr6 &= ~DR_TRAP_BITS;
587  /*
588  * In order to reflect the #DB exception payload in guest
589  * dr6, three components need to be considered: active low
590  * bit, FIXED_1 bits and active high bits (e.g. DR6_BD,
591  * DR6_BS and DR6_BT)
592  * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits.
593  * In the target guest dr6:
594  * FIXED_1 bits should always be set.
595  * Active low bits should be cleared if 1-setting in payload.
596  * Active high bits should be set if 1-setting in payload.
597  *
598  * Note, the payload is compatible with the pending debug
599  * exceptions/exit qualification under VMX, that active_low bits
600  * are active high in payload.
601  * So they need to be flipped for DR6.
602  */
603  vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
604  vcpu->arch.dr6 |= ex->payload;
605  vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
606 
607  /*
608  * The #DB payload is defined as compatible with the 'pending
609  * debug exceptions' field under VMX, not DR6. While bit 12 is
610  * defined in the 'pending debug exceptions' field (enabled
611  * breakpoint), it is reserved and must be zero in DR6.
612  */
613  vcpu->arch.dr6 &= ~BIT(12);
614  break;
615  case PF_VECTOR:
616  vcpu->arch.cr2 = ex->payload;
617  break;
618  }
619 
620  ex->has_payload = false;
621  ex->payload = 0;
622 }
624 
625 static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
626  bool has_error_code, u32 error_code,
627  bool has_payload, unsigned long payload)
628 {
629  struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
630 
631  ex->vector = vector;
632  ex->injected = false;
633  ex->pending = true;
634  ex->has_error_code = has_error_code;
635  ex->error_code = error_code;
636  ex->has_payload = has_payload;
637  ex->payload = payload;
638 }
639 
640 /* Forcibly leave the nested mode in cases like a vCPU reset */
641 static void kvm_leave_nested(struct kvm_vcpu *vcpu)
642 {
643  kvm_x86_ops.nested_ops->leave_nested(vcpu);
644 }
645 
646 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
647  unsigned nr, bool has_error, u32 error_code,
648  bool has_payload, unsigned long payload, bool reinject)
649 {
650  u32 prev_nr;
651  int class1, class2;
652 
653  kvm_make_request(KVM_REQ_EVENT, vcpu);
654 
655  /*
656  * If the exception is destined for L2 and isn't being reinjected,
657  * morph it to a VM-Exit if L1 wants to intercept the exception. A
658  * previously injected exception is not checked because it was checked
659  * when it was original queued, and re-checking is incorrect if _L1_
660  * injected the exception, in which case it's exempt from interception.
661  */
662  if (!reinject && is_guest_mode(vcpu) &&
663  kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
664  kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
665  has_payload, payload);
666  return;
667  }
668 
669  if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
670  queue:
671  if (reinject) {
672  /*
673  * On VM-Entry, an exception can be pending if and only
674  * if event injection was blocked by nested_run_pending.
675  * In that case, however, vcpu_enter_guest() requests an
676  * immediate exit, and the guest shouldn't proceed far
677  * enough to need reinjection.
678  */
679  WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
680  vcpu->arch.exception.injected = true;
681  if (WARN_ON_ONCE(has_payload)) {
682  /*
683  * A reinjected event has already
684  * delivered its payload.
685  */
686  has_payload = false;
687  payload = 0;
688  }
689  } else {
690  vcpu->arch.exception.pending = true;
691  vcpu->arch.exception.injected = false;
692  }
693  vcpu->arch.exception.has_error_code = has_error;
694  vcpu->arch.exception.vector = nr;
695  vcpu->arch.exception.error_code = error_code;
696  vcpu->arch.exception.has_payload = has_payload;
697  vcpu->arch.exception.payload = payload;
698  if (!is_guest_mode(vcpu))
700  &vcpu->arch.exception);
701  return;
702  }
703 
704  /* to check exception */
705  prev_nr = vcpu->arch.exception.vector;
706  if (prev_nr == DF_VECTOR) {
707  /* triple fault -> shutdown */
708  kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
709  return;
710  }
711  class1 = exception_class(prev_nr);
712  class2 = exception_class(nr);
713  if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
714  (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
715  /*
716  * Synthesize #DF. Clear the previously injected or pending
717  * exception so as not to incorrectly trigger shutdown.
718  */
719  vcpu->arch.exception.injected = false;
720  vcpu->arch.exception.pending = false;
721 
722  kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
723  } else {
724  /* replace previous exception with a new one in a hope
725  that instruction re-execution will regenerate lost
726  exception */
727  goto queue;
728  }
729 }
730 
731 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
732 {
733  kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
734 }
736 
737 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
738 {
739  kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
740 }
742 
743 void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
744  unsigned long payload)
745 {
746  kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
747 }
749 
750 static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
751  u32 error_code, unsigned long payload)
752 {
753  kvm_multiple_exception(vcpu, nr, true, error_code,
754  true, payload, false);
755 }
756 
757 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
758 {
759  if (err)
760  kvm_inject_gp(vcpu, 0);
761  else
762  return kvm_skip_emulated_instruction(vcpu);
763 
764  return 1;
765 }
767 
768 static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
769 {
770  if (err) {
771  kvm_inject_gp(vcpu, 0);
772  return 1;
773  }
774 
775  return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
776  EMULTYPE_COMPLETE_USER_EXIT);
777 }
778 
779 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
780 {
781  ++vcpu->stat.pf_guest;
782 
783  /*
784  * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
785  * whether or not L1 wants to intercept "regular" #PF.
786  */
787  if (is_guest_mode(vcpu) && fault->async_page_fault)
788  kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
789  true, fault->error_code,
790  true, fault->address);
791  else
792  kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
793  fault->address);
794 }
795 
796 void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
797  struct x86_exception *fault)
798 {
799  struct kvm_mmu *fault_mmu;
800  WARN_ON_ONCE(fault->vector != PF_VECTOR);
801 
802  fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
803  vcpu->arch.walk_mmu;
804 
805  /*
806  * Invalidate the TLB entry for the faulting address, if it exists,
807  * else the access will fault indefinitely (and to emulate hardware).
808  */
809  if ((fault->error_code & PFERR_PRESENT_MASK) &&
810  !(fault->error_code & PFERR_RSVD_MASK))
811  kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
812  KVM_MMU_ROOT_CURRENT);
813 
814  fault_mmu->inject_page_fault(vcpu, fault);
815 }
817 
818 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
819 {
820  atomic_inc(&vcpu->arch.nmi_queued);
821  kvm_make_request(KVM_REQ_NMI, vcpu);
822 }
823 
824 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
825 {
826  kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
827 }
829 
830 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
831 {
832  kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
833 }
835 
836 /*
837  * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
838  * a #GP and return false.
839  */
840 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
841 {
842  if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
843  return true;
844  kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
845  return false;
846 }
847 
848 bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
849 {
850  if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE))
851  return true;
852 
853  kvm_queue_exception(vcpu, UD_VECTOR);
854  return false;
855 }
857 
858 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
859 {
860  return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
861 }
862 
863 /*
864  * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
865  */
866 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
867 {
868  struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
869  gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
870  gpa_t real_gpa;
871  int i;
872  int ret;
873  u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
874 
875  /*
876  * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
877  * to an L1 GPA.
878  */
879  real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
880  PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
881  if (real_gpa == INVALID_GPA)
882  return 0;
883 
884  /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
885  ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
886  cr3 & GENMASK(11, 5), sizeof(pdpte));
887  if (ret < 0)
888  return 0;
889 
890  for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
891  if ((pdpte[i] & PT_PRESENT_MASK) &&
892  (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
893  return 0;
894  }
895  }
896 
897  /*
898  * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
899  * Shadow page roots need to be reconstructed instead.
900  */
901  if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
902  kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
903 
904  memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
905  kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
906  kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
907  vcpu->arch.pdptrs_from_userspace = false;
908 
909  return 1;
910 }
912 
913 static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
914 {
915 #ifdef CONFIG_X86_64
916  if (cr0 & 0xffffffff00000000UL)
917  return false;
918 #endif
919 
920  if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
921  return false;
922 
923  if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
924  return false;
925 
926  return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0);
927 }
928 
929 void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
930 {
931  /*
932  * CR0.WP is incorporated into the MMU role, but only for non-nested,
933  * indirect shadow MMUs. If paging is disabled, no updates are needed
934  * as there are no permission bits to emulate. If TDP is enabled, the
935  * MMU's metadata needs to be updated, e.g. so that emulating guest
936  * translations does the right thing, but there's no need to unload the
937  * root as CR0.WP doesn't affect SPTEs.
938  */
939  if ((cr0 ^ old_cr0) == X86_CR0_WP) {
940  if (!(cr0 & X86_CR0_PG))
941  return;
942 
943  if (tdp_enabled) {
944  kvm_init_mmu(vcpu);
945  return;
946  }
947  }
948 
949  if ((cr0 ^ old_cr0) & X86_CR0_PG) {
952 
953  /*
954  * Clearing CR0.PG is defined to flush the TLB from the guest's
955  * perspective.
956  */
957  if (!(cr0 & X86_CR0_PG))
958  kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
959  }
960 
961  if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
962  kvm_mmu_reset_context(vcpu);
963 
964  if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
965  kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
966  !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
967  kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
968 }
970 
971 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
972 {
973  unsigned long old_cr0 = kvm_read_cr0(vcpu);
974 
975  if (!kvm_is_valid_cr0(vcpu, cr0))
976  return 1;
977 
978  cr0 |= X86_CR0_ET;
979 
980  /* Write to CR0 reserved bits are ignored, even on Intel. */
981  cr0 &= ~CR0_RESERVED_BITS;
982 
983 #ifdef CONFIG_X86_64
984  if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
985  (cr0 & X86_CR0_PG)) {
986  int cs_db, cs_l;
987 
988  if (!is_pae(vcpu))
989  return 1;
990  static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
991  if (cs_l)
992  return 1;
993  }
994 #endif
995  if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
996  is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
997  !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
998  return 1;
999 
1000  if (!(cr0 & X86_CR0_PG) &&
1001  (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
1002  return 1;
1003 
1004  static_call(kvm_x86_set_cr0)(vcpu, cr0);
1005 
1006  kvm_post_set_cr0(vcpu, old_cr0, cr0);
1007 
1008  return 0;
1009 }
1011 
1012 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
1013 {
1014  (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
1015 }
1017 
1018 void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
1019 {
1020  if (vcpu->arch.guest_state_protected)
1021  return;
1022 
1023  if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
1024 
1025  if (vcpu->arch.xcr0 != host_xcr0)
1026  xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
1027 
1028  if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
1029  vcpu->arch.ia32_xss != host_xss)
1030  wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
1031  }
1032 
1033  if (cpu_feature_enabled(X86_FEATURE_PKU) &&
1034  vcpu->arch.pkru != vcpu->arch.host_pkru &&
1035  ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
1036  kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
1037  write_pkru(vcpu->arch.pkru);
1038 }
1040 
1041 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
1042 {
1043  if (vcpu->arch.guest_state_protected)
1044  return;
1045 
1046  if (cpu_feature_enabled(X86_FEATURE_PKU) &&
1047  ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
1048  kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
1049  vcpu->arch.pkru = rdpkru();
1050  if (vcpu->arch.pkru != vcpu->arch.host_pkru)
1051  write_pkru(vcpu->arch.host_pkru);
1052  }
1053 
1054  if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
1055 
1056  if (vcpu->arch.xcr0 != host_xcr0)
1057  xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
1058 
1059  if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
1060  vcpu->arch.ia32_xss != host_xss)
1061  wrmsrl(MSR_IA32_XSS, host_xss);
1062  }
1063 
1064 }
1066 
1067 #ifdef CONFIG_X86_64
1068 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
1069 {
1070  return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
1071 }
1072 #endif
1073 
1074 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
1075 {
1076  u64 xcr0 = xcr;
1077  u64 old_xcr0 = vcpu->arch.xcr0;
1078  u64 valid_bits;
1079 
1080  /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
1081  if (index != XCR_XFEATURE_ENABLED_MASK)
1082  return 1;
1083  if (!(xcr0 & XFEATURE_MASK_FP))
1084  return 1;
1085  if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
1086  return 1;
1087 
1088  /*
1089  * Do not allow the guest to set bits that we do not support
1090  * saving. However, xcr0 bit 0 is always set, even if the
1091  * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
1092  */
1093  valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
1094  if (xcr0 & ~valid_bits)
1095  return 1;
1096 
1097  if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
1098  (!(xcr0 & XFEATURE_MASK_BNDCSR)))
1099  return 1;
1100 
1101  if (xcr0 & XFEATURE_MASK_AVX512) {
1102  if (!(xcr0 & XFEATURE_MASK_YMM))
1103  return 1;
1104  if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
1105  return 1;
1106  }
1107 
1108  if ((xcr0 & XFEATURE_MASK_XTILE) &&
1109  ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE))
1110  return 1;
1111 
1112  vcpu->arch.xcr0 = xcr0;
1113 
1114  if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
1116  return 0;
1117 }
1118 
1119 int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
1120 {
1121  /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
1122  if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
1123  __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
1124  kvm_inject_gp(vcpu, 0);
1125  return 1;
1126  }
1127 
1128  return kvm_skip_emulated_instruction(vcpu);
1129 }
1131 
1132 bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1133 {
1134  if (cr4 & cr4_reserved_bits)
1135  return false;
1136 
1137  if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
1138  return false;
1139 
1140  return true;
1141 }
1143 
1144 static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1145 {
1146  return __kvm_is_valid_cr4(vcpu, cr4) &&
1147  static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
1148 }
1149 
1150 void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
1151 {
1152  if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
1153  kvm_mmu_reset_context(vcpu);
1154 
1155  /*
1156  * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
1157  * according to the SDM; however, stale prev_roots could be reused
1158  * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
1159  * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
1160  * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
1161  * so fall through.
1162  */
1163  if (!tdp_enabled &&
1164  (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
1165  kvm_mmu_unload(vcpu);
1166 
1167  /*
1168  * The TLB has to be flushed for all PCIDs if any of the following
1169  * (architecturally required) changes happen:
1170  * - CR4.PCIDE is changed from 1 to 0
1171  * - CR4.PGE is toggled
1172  *
1173  * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
1174  */
1175  if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
1176  (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
1177  kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1178 
1179  /*
1180  * The TLB has to be flushed for the current PCID if any of the
1181  * following (architecturally required) changes happen:
1182  * - CR4.SMEP is changed from 0 to 1
1183  * - CR4.PAE is toggled
1184  */
1185  else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
1186  ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
1187  kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1188 
1189 }
1191 
1192 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1193 {
1194  unsigned long old_cr4 = kvm_read_cr4(vcpu);
1195 
1196  if (!kvm_is_valid_cr4(vcpu, cr4))
1197  return 1;
1198 
1199  if (is_long_mode(vcpu)) {
1200  if (!(cr4 & X86_CR4_PAE))
1201  return 1;
1202  if ((cr4 ^ old_cr4) & X86_CR4_LA57)
1203  return 1;
1204  } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
1205  && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
1206  && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
1207  return 1;
1208 
1209  if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
1210  /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
1211  if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1212  return 1;
1213  }
1214 
1215  static_call(kvm_x86_set_cr4)(vcpu, cr4);
1216 
1217  kvm_post_set_cr4(vcpu, old_cr4, cr4);
1218 
1219  return 0;
1220 }
1222 
1223 static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
1224 {
1225  struct kvm_mmu *mmu = vcpu->arch.mmu;
1226  unsigned long roots_to_free = 0;
1227  int i;
1228 
1229  /*
1230  * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
1231  * this is reachable when running EPT=1 and unrestricted_guest=0, and
1232  * also via the emulator. KVM's TDP page tables are not in the scope of
1233  * the invalidation, but the guest's TLB entries need to be flushed as
1234  * the CPU may have cached entries in its TLB for the target PCID.
1235  */
1236  if (unlikely(tdp_enabled)) {
1237  kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1238  return;
1239  }
1240 
1241  /*
1242  * If neither the current CR3 nor any of the prev_roots use the given
1243  * PCID, then nothing needs to be done here because a resync will
1244  * happen anyway before switching to any other CR3.
1245  */
1246  if (kvm_get_active_pcid(vcpu) == pcid) {
1247  kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1248  kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1249  }
1250 
1251  /*
1252  * If PCID is disabled, there is no need to free prev_roots even if the
1253  * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
1254  * with PCIDE=0.
1255  */
1256  if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))
1257  return;
1258 
1259  for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
1260  if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
1261  roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
1262 
1263  kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
1264 }
1265 
1266 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1267 {
1268  bool skip_tlb_flush = false;
1269  unsigned long pcid = 0;
1270 #ifdef CONFIG_X86_64
1271  if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) {
1272  skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
1273  cr3 &= ~X86_CR3_PCID_NOFLUSH;
1274  pcid = cr3 & X86_CR3_PCID_MASK;
1275  }
1276 #endif
1277 
1278  /* PDPTRs are always reloaded for PAE paging. */
1279  if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
1280  goto handle_tlb_flush;
1281 
1282  /*
1283  * Do not condition the GPA check on long mode, this helper is used to
1284  * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
1285  * the current vCPU mode is accurate.
1286  */
1287  if (!kvm_vcpu_is_legal_cr3(vcpu, cr3))
1288  return 1;
1289 
1290  if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
1291  return 1;
1292 
1293  if (cr3 != kvm_read_cr3(vcpu))
1294  kvm_mmu_new_pgd(vcpu, cr3);
1295 
1296  vcpu->arch.cr3 = cr3;
1297  kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
1298  /* Do not call post_set_cr3, we do not get here for confidential guests. */
1299 
1300 handle_tlb_flush:
1301  /*
1302  * A load of CR3 that flushes the TLB flushes only the current PCID,
1303  * even if PCID is disabled, in which case PCID=0 is flushed. It's a
1304  * moot point in the end because _disabling_ PCID will flush all PCIDs,
1305  * and it's impossible to use a non-zero PCID when PCID is disabled,
1306  * i.e. only PCID=0 can be relevant.
1307  */
1308  if (!skip_tlb_flush)
1309  kvm_invalidate_pcid(vcpu, pcid);
1310 
1311  return 0;
1312 }
1314 
1315 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
1316 {
1317  if (cr8 & CR8_RESERVED_BITS)
1318  return 1;
1319  if (lapic_in_kernel(vcpu))
1320  kvm_lapic_set_tpr(vcpu, cr8);
1321  else
1322  vcpu->arch.cr8 = cr8;
1323  return 0;
1324 }
1326 
1327 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
1328 {
1329  if (lapic_in_kernel(vcpu))
1330  return kvm_lapic_get_cr8(vcpu);
1331  else
1332  return vcpu->arch.cr8;
1333 }
1335 
1336 static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
1337 {
1338  int i;
1339 
1340  if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1341  for (i = 0; i < KVM_NR_DB_REGS; i++)
1342  vcpu->arch.eff_db[i] = vcpu->arch.db[i];
1343  }
1344 }
1345 
1346 void kvm_update_dr7(struct kvm_vcpu *vcpu)
1347 {
1348  unsigned long dr7;
1349 
1350  if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1351  dr7 = vcpu->arch.guest_debug_dr7;
1352  else
1353  dr7 = vcpu->arch.dr7;
1354  static_call(kvm_x86_set_dr7)(vcpu, dr7);
1355  vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
1356  if (dr7 & DR7_BP_EN_MASK)
1357  vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
1358 }
1360 
1361 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
1362 {
1363  u64 fixed = DR6_FIXED_1;
1364 
1365  if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
1366  fixed |= DR6_RTM;
1367 
1368  if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
1369  fixed |= DR6_BUS_LOCK;
1370  return fixed;
1371 }
1372 
1373 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
1374 {
1375  size_t size = ARRAY_SIZE(vcpu->arch.db);
1376 
1377  switch (dr) {
1378  case 0 ... 3:
1379  vcpu->arch.db[array_index_nospec(dr, size)] = val;
1380  if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1381  vcpu->arch.eff_db[dr] = val;
1382  break;
1383  case 4:
1384  case 6:
1385  if (!kvm_dr6_valid(val))
1386  return 1; /* #GP */
1387  vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1388  break;
1389  case 5:
1390  default: /* 7 */
1391  if (!kvm_dr7_valid(val))
1392  return 1; /* #GP */
1393  vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
1394  kvm_update_dr7(vcpu);
1395  break;
1396  }
1397 
1398  return 0;
1399 }
1401 
1402 void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
1403 {
1404  size_t size = ARRAY_SIZE(vcpu->arch.db);
1405 
1406  switch (dr) {
1407  case 0 ... 3:
1408  *val = vcpu->arch.db[array_index_nospec(dr, size)];
1409  break;
1410  case 4:
1411  case 6:
1412  *val = vcpu->arch.dr6;
1413  break;
1414  case 5:
1415  default: /* 7 */
1416  *val = vcpu->arch.dr7;
1417  break;
1418  }
1419 }
1421 
1422 int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
1423 {
1424  u32 ecx = kvm_rcx_read(vcpu);
1425  u64 data;
1426 
1427  if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
1428  kvm_inject_gp(vcpu, 0);
1429  return 1;
1430  }
1431 
1432  kvm_rax_write(vcpu, (u32)data);
1433  kvm_rdx_write(vcpu, data >> 32);
1434  return kvm_skip_emulated_instruction(vcpu);
1435 }
1437 
1438 /*
1439  * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track
1440  * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
1441  * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that
1442  * require host support, i.e. should be probed via RDMSR. emulated_msrs holds
1443  * MSRs that KVM emulates without strictly requiring host support.
1444  * msr_based_features holds MSRs that enumerate features, i.e. are effectively
1445  * CPUID leafs. Note, msr_based_features isn't mutually exclusive with
1446  * msrs_to_save and emulated_msrs.
1447  */
1448 
1449 static const u32 msrs_to_save_base[] = {
1450  MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1451  MSR_STAR,
1452 #ifdef CONFIG_X86_64
1453  MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1454 #endif
1455  MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1456  MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1457  MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
1458  MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1459  MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1460  MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1461  MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1462  MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1463  MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1464  MSR_IA32_UMWAIT_CONTROL,
1465 
1466  MSR_IA32_XFD, MSR_IA32_XFD_ERR,
1467 };
1468 
1469 static const u32 msrs_to_save_pmu[] = {
1470  MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
1471  MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
1472  MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
1473  MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
1474  MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
1475 
1476  /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */
1477  MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
1478  MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
1479  MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
1480  MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
1481  MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
1482  MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
1483  MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
1484  MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
1485 
1486  MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
1487  MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
1488 
1489  /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */
1490  MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
1491  MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
1492  MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
1493  MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
1494 
1495  MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
1496  MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
1497  MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
1498 };
1499 
1500 static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
1501  ARRAY_SIZE(msrs_to_save_pmu)];
1502 static unsigned num_msrs_to_save;
1503 
1504 static const u32 emulated_msrs_all[] = {
1505  MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1506  MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1507 
1508 #ifdef CONFIG_KVM_HYPERV
1509  HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1510  HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1511  HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1512  HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1513  HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1514  HV_X64_MSR_RESET,
1515  HV_X64_MSR_VP_INDEX,
1516  HV_X64_MSR_VP_RUNTIME,
1517  HV_X64_MSR_SCONTROL,
1518  HV_X64_MSR_STIMER0_CONFIG,
1519  HV_X64_MSR_VP_ASSIST_PAGE,
1520  HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1521  HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
1522  HV_X64_MSR_SYNDBG_OPTIONS,
1523  HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
1524  HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
1525  HV_X64_MSR_SYNDBG_PENDING_BUFFER,
1526 #endif
1527 
1528  MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1529  MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
1530 
1531  MSR_IA32_TSC_ADJUST,
1532  MSR_IA32_TSC_DEADLINE,
1533  MSR_IA32_ARCH_CAPABILITIES,
1534  MSR_IA32_PERF_CAPABILITIES,
1535  MSR_IA32_MISC_ENABLE,
1536  MSR_IA32_MCG_STATUS,
1537  MSR_IA32_MCG_CTL,
1538  MSR_IA32_MCG_EXT_CTL,
1539  MSR_IA32_SMBASE,
1540  MSR_SMI_COUNT,
1541  MSR_PLATFORM_INFO,
1542  MSR_MISC_FEATURES_ENABLES,
1543  MSR_AMD64_VIRT_SPEC_CTRL,
1544  MSR_AMD64_TSC_RATIO,
1545  MSR_IA32_POWER_CTL,
1546  MSR_IA32_UCODE_REV,
1547 
1548  /*
1549  * KVM always supports the "true" VMX control MSRs, even if the host
1550  * does not. The VMX MSRs as a whole are considered "emulated" as KVM
1551  * doesn't strictly require them to exist in the host (ignoring that
1552  * KVM would refuse to load in the first place if the core set of MSRs
1553  * aren't supported).
1554  */
1555  MSR_IA32_VMX_BASIC,
1556  MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1557  MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1558  MSR_IA32_VMX_TRUE_EXIT_CTLS,
1559  MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1560  MSR_IA32_VMX_MISC,
1561  MSR_IA32_VMX_CR0_FIXED0,
1562  MSR_IA32_VMX_CR4_FIXED0,
1563  MSR_IA32_VMX_VMCS_ENUM,
1564  MSR_IA32_VMX_PROCBASED_CTLS2,
1565  MSR_IA32_VMX_EPT_VPID_CAP,
1566  MSR_IA32_VMX_VMFUNC,
1567 
1568  MSR_K7_HWCR,
1569  MSR_KVM_POLL_CONTROL,
1570 };
1571 
1572 static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
1573 static unsigned num_emulated_msrs;
1574 
1575 /*
1576  * List of MSRs that control the existence of MSR-based features, i.e. MSRs
1577  * that are effectively CPUID leafs. VMX MSRs are also included in the set of
1578  * feature MSRs, but are handled separately to allow expedited lookups.
1579  */
1581  MSR_AMD64_DE_CFG,
1582  MSR_IA32_UCODE_REV,
1583  MSR_IA32_ARCH_CAPABILITIES,
1584  MSR_IA32_PERF_CAPABILITIES,
1585 };
1586 
1589 static unsigned int num_msr_based_features;
1590 
1591 /*
1592  * All feature MSRs except uCode revID, which tracks the currently loaded uCode
1593  * patch, are immutable once the vCPU model is defined.
1594  */
1595 static bool kvm_is_immutable_feature_msr(u32 msr)
1596 {
1597  int i;
1598 
1600  return true;
1601 
1602  for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
1603  if (msr == msr_based_features_all_except_vmx[i])
1604  return msr != MSR_IA32_UCODE_REV;
1605  }
1606 
1607  return false;
1608 }
1609 
1610 /*
1611  * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
1612  * does not yet virtualize. These include:
1613  * 10 - MISC_PACKAGE_CTRLS
1614  * 11 - ENERGY_FILTERING_CTL
1615  * 12 - DOITM
1616  * 18 - FB_CLEAR_CTRL
1617  * 21 - XAPIC_DISABLE_STATUS
1618  * 23 - OVERCLOCKING_STATUS
1619  */
1620 
1621 #define KVM_SUPPORTED_ARCH_CAP \
1622  (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
1623  ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
1624  ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
1625  ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
1626  ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
1627  ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO)
1628 
1630 {
1632 
1633  /*
1634  * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
1635  * the nested hypervisor runs with NX huge pages. If it is not,
1636  * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
1637  * L1 guests, so it need not worry about its own (L2) guests.
1638  */
1639  data |= ARCH_CAP_PSCHANGE_MC_NO;
1640 
1641  /*
1642  * If we're doing cache flushes (either "always" or "cond")
1643  * we will do one whenever the guest does a vmlaunch/vmresume.
1644  * If an outer hypervisor is doing the cache flush for us
1645  * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that
1646  * capability to the guest too, and if EPT is disabled we're not
1647  * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
1648  * require a nested hypervisor to do a flush of its own.
1649  */
1650  if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
1651  data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
1652 
1653  if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
1654  data |= ARCH_CAP_RDCL_NO;
1655  if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
1656  data |= ARCH_CAP_SSB_NO;
1657  if (!boot_cpu_has_bug(X86_BUG_MDS))
1658  data |= ARCH_CAP_MDS_NO;
1659  if (!boot_cpu_has_bug(X86_BUG_RFDS))
1660  data |= ARCH_CAP_RFDS_NO;
1661 
1662  if (!boot_cpu_has(X86_FEATURE_RTM)) {
1663  /*
1664  * If RTM=0 because the kernel has disabled TSX, the host might
1665  * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
1666  * and therefore knows that there cannot be TAA) but keep
1667  * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
1668  * and we want to allow migrating those guests to tsx=off hosts.
1669  */
1670  data &= ~ARCH_CAP_TAA_NO;
1671  } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
1672  data |= ARCH_CAP_TAA_NO;
1673  } else {
1674  /*
1675  * Nothing to do here; we emulate TSX_CTRL if present on the
1676  * host so the guest can choose between disabling TSX or
1677  * using VERW to clear CPU buffers.
1678  */
1679  }
1680 
1681  if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
1682  data |= ARCH_CAP_GDS_NO;
1683 
1684  return data;
1685 }
1686 
1687 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1688 {
1689  switch (msr->index) {
1690  case MSR_IA32_ARCH_CAPABILITIES:
1691  msr->data = kvm_get_arch_capabilities();
1692  break;
1693  case MSR_IA32_PERF_CAPABILITIES:
1694  msr->data = kvm_caps.supported_perf_cap;
1695  break;
1696  case MSR_IA32_UCODE_REV:
1697  rdmsrl_safe(msr->index, &msr->data);
1698  break;
1699  default:
1700  return static_call(kvm_x86_get_msr_feature)(msr);
1701  }
1702  return 0;
1703 }
1704 
1705 static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1706 {
1707  struct kvm_msr_entry msr;
1708  int r;
1709 
1710  /* Unconditionally clear the output for simplicity */
1711  msr.data = 0;
1712  msr.index = index;
1713  r = kvm_get_msr_feature(&msr);
1714 
1715  if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false))
1716  r = 0;
1717 
1718  *data = msr.data;
1719 
1720  return r;
1721 }
1722 
1723 static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1724 {
1725  if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS))
1726  return false;
1727 
1728  if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1729  return false;
1730 
1731  if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1732  return false;
1733 
1734  if (efer & (EFER_LME | EFER_LMA) &&
1735  !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1736  return false;
1737 
1738  if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
1739  return false;
1740 
1741  return true;
1742 
1743 }
1744 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1745 {
1746  if (efer & efer_reserved_bits)
1747  return false;
1748 
1749  return __kvm_valid_efer(vcpu, efer);
1750 }
1752 
1753 static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1754 {
1755  u64 old_efer = vcpu->arch.efer;
1756  u64 efer = msr_info->data;
1757  int r;
1758 
1759  if (efer & efer_reserved_bits)
1760  return 1;
1761 
1762  if (!msr_info->host_initiated) {
1763  if (!__kvm_valid_efer(vcpu, efer))
1764  return 1;
1765 
1766  if (is_paging(vcpu) &&
1767  (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1768  return 1;
1769  }
1770 
1771  efer &= ~EFER_LMA;
1772  efer |= vcpu->arch.efer & EFER_LMA;
1773 
1774  r = static_call(kvm_x86_set_efer)(vcpu, efer);
1775  if (r) {
1776  WARN_ON(r > 0);
1777  return r;
1778  }
1779 
1780  if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
1781  kvm_mmu_reset_context(vcpu);
1782 
1783  if (!static_cpu_has(X86_FEATURE_XSAVES) &&
1784  (efer & EFER_SVME))
1786 
1787  return 0;
1788 }
1789 
1790 void kvm_enable_efer_bits(u64 mask)
1791 {
1792  efer_reserved_bits &= ~mask;
1793 }
1795 
1796 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
1797 {
1798  struct kvm_x86_msr_filter *msr_filter;
1799  struct msr_bitmap_range *ranges;
1800  struct kvm *kvm = vcpu->kvm;
1801  bool allowed;
1802  int idx;
1803  u32 i;
1804 
1805  /* x2APIC MSRs do not support filtering. */
1806  if (index >= 0x800 && index <= 0x8ff)
1807  return true;
1808 
1809  idx = srcu_read_lock(&kvm->srcu);
1810 
1811  msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1812  if (!msr_filter) {
1813  allowed = true;
1814  goto out;
1815  }
1816 
1817  allowed = msr_filter->default_allow;
1818  ranges = msr_filter->ranges;
1819 
1820  for (i = 0; i < msr_filter->count; i++) {
1821  u32 start = ranges[i].base;
1822  u32 end = start + ranges[i].nmsrs;
1823  u32 flags = ranges[i].flags;
1824  unsigned long *bitmap = ranges[i].bitmap;
1825 
1826  if ((index >= start) && (index < end) && (flags & type)) {
1827  allowed = test_bit(index - start, bitmap);
1828  break;
1829  }
1830  }
1831 
1832 out:
1833  srcu_read_unlock(&kvm->srcu, idx);
1834 
1835  return allowed;
1836 }
1838 
1839 /*
1840  * Write @data into the MSR specified by @index. Select MSR specific fault
1841  * checks are bypassed if @host_initiated is %true.
1842  * Returns 0 on success, non-0 otherwise.
1843  * Assumes vcpu_load() was already called.
1844  */
1845 static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
1846  bool host_initiated)
1847 {
1848  struct msr_data msr;
1849 
1850  switch (index) {
1851  case MSR_FS_BASE:
1852  case MSR_GS_BASE:
1853  case MSR_KERNEL_GS_BASE:
1854  case MSR_CSTAR:
1855  case MSR_LSTAR:
1856  if (is_noncanonical_address(data, vcpu))
1857  return 1;
1858  break;
1859  case MSR_IA32_SYSENTER_EIP:
1860  case MSR_IA32_SYSENTER_ESP:
1861  /*
1862  * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1863  * non-canonical address is written on Intel but not on
1864  * AMD (which ignores the top 32-bits, because it does
1865  * not implement 64-bit SYSENTER).
1866  *
1867  * 64-bit code should hence be able to write a non-canonical
1868  * value on AMD. Making the address canonical ensures that
1869  * vmentry does not fail on Intel after writing a non-canonical
1870  * value, and that something deterministic happens if the guest
1871  * invokes 64-bit SYSENTER.
1872  */
1873  data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
1874  break;
1875  case MSR_TSC_AUX:
1876  if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
1877  return 1;
1878 
1879  if (!host_initiated &&
1880  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
1881  !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
1882  return 1;
1883 
1884  /*
1885  * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
1886  * incomplete and conflicting architectural behavior. Current
1887  * AMD CPUs completely ignore bits 63:32, i.e. they aren't
1888  * reserved and always read as zeros. Enforce Intel's reserved
1889  * bits check if and only if the guest CPU is Intel, and clear
1890  * the bits in all other cases. This ensures cross-vendor
1891  * migration will provide consistent behavior for the guest.
1892  */
1893  if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
1894  return 1;
1895 
1896  data = (u32)data;
1897  break;
1898  }
1899 
1900  msr.data = data;
1901  msr.index = index;
1902  msr.host_initiated = host_initiated;
1903 
1904  return static_call(kvm_x86_set_msr)(vcpu, &msr);
1905 }
1906 
1907 static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
1908  u32 index, u64 data, bool host_initiated)
1909 {
1910  int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
1911 
1912  if (ret == KVM_MSR_RET_INVALID)
1913  if (kvm_msr_ignored_check(index, data, true))
1914  ret = 0;
1915 
1916  return ret;
1917 }
1918 
1919 /*
1920  * Read the MSR specified by @index into @data. Select MSR specific fault
1921  * checks are bypassed if @host_initiated is %true.
1922  * Returns 0 on success, non-0 otherwise.
1923  * Assumes vcpu_load() was already called.
1924  */
1925 int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
1926  bool host_initiated)
1927 {
1928  struct msr_data msr;
1929  int ret;
1930 
1931  switch (index) {
1932  case MSR_TSC_AUX:
1933  if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
1934  return 1;
1935 
1936  if (!host_initiated &&
1937  !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
1938  !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
1939  return 1;
1940  break;
1941  }
1942 
1943  msr.index = index;
1944  msr.host_initiated = host_initiated;
1945 
1946  ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
1947  if (!ret)
1948  *data = msr.data;
1949  return ret;
1950 }
1951 
1952 static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
1953  u32 index, u64 *data, bool host_initiated)
1954 {
1955  int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
1956 
1957  if (ret == KVM_MSR_RET_INVALID) {
1958  /* Unconditionally clear *data for simplicity */
1959  *data = 0;
1960  if (kvm_msr_ignored_check(index, 0, false))
1961  ret = 0;
1962  }
1963 
1964  return ret;
1965 }
1966 
1967 static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1968 {
1969  if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
1970  return KVM_MSR_RET_FILTERED;
1971  return kvm_get_msr_ignored_check(vcpu, index, data, false);
1972 }
1973 
1974 static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
1975 {
1976  if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
1977  return KVM_MSR_RET_FILTERED;
1978  return kvm_set_msr_ignored_check(vcpu, index, data, false);
1979 }
1980 
1981 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1982 {
1983  return kvm_get_msr_ignored_check(vcpu, index, data, false);
1984 }
1986 
1987 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
1988 {
1989  return kvm_set_msr_ignored_check(vcpu, index, data, false);
1990 }
1992 
1993 static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
1994 {
1995  if (!vcpu->run->msr.error) {
1996  kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
1997  kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1998  }
1999 }
2000 
2001 static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
2002 {
2003  return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
2004 }
2005 
2006 static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
2007 {
2009  return complete_emulated_msr_access(vcpu);
2010 }
2011 
2012 static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
2013 {
2014  return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
2015 }
2016 
2017 static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
2018 {
2020  return complete_fast_msr_access(vcpu);
2021 }
2022 
2023 static u64 kvm_msr_reason(int r)
2024 {
2025  switch (r) {
2026  case KVM_MSR_RET_INVALID:
2027  return KVM_MSR_EXIT_REASON_UNKNOWN;
2028  case KVM_MSR_RET_FILTERED:
2029  return KVM_MSR_EXIT_REASON_FILTER;
2030  default:
2031  return KVM_MSR_EXIT_REASON_INVAL;
2032  }
2033 }
2034 
2035 static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
2036  u32 exit_reason, u64 data,
2037  int (*completion)(struct kvm_vcpu *vcpu),
2038  int r)
2039 {
2040  u64 msr_reason = kvm_msr_reason(r);
2041 
2042  /* Check if the user wanted to know about this MSR fault */
2043  if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
2044  return 0;
2045 
2046  vcpu->run->exit_reason = exit_reason;
2047  vcpu->run->msr.error = 0;
2048  memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
2049  vcpu->run->msr.reason = msr_reason;
2050  vcpu->run->msr.index = index;
2051  vcpu->run->msr.data = data;
2052  vcpu->arch.complete_userspace_io = completion;
2053 
2054  return 1;
2055 }
2056 
2057 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
2058 {
2059  u32 ecx = kvm_rcx_read(vcpu);
2060  u64 data;
2061  int r;
2062 
2063  r = kvm_get_msr_with_filter(vcpu, ecx, &data);
2064 
2065  if (!r) {
2066  trace_kvm_msr_read(ecx, data);
2067 
2068  kvm_rax_write(vcpu, data & -1u);
2069  kvm_rdx_write(vcpu, (data >> 32) & -1u);
2070  } else {
2071  /* MSR read failed? See if we should ask user space */
2072  if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
2073  complete_fast_rdmsr, r))
2074  return 0;
2075  trace_kvm_msr_read_ex(ecx);
2076  }
2077 
2078  return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
2079 }
2081 
2082 int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
2083 {
2084  u32 ecx = kvm_rcx_read(vcpu);
2085  u64 data = kvm_read_edx_eax(vcpu);
2086  int r;
2087 
2088  r = kvm_set_msr_with_filter(vcpu, ecx, data);
2089 
2090  if (!r) {
2091  trace_kvm_msr_write(ecx, data);
2092  } else {
2093  /* MSR write failed? See if we should ask user space */
2094  if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
2096  return 0;
2097  /* Signal all other negative errors to userspace */
2098  if (r < 0)
2099  return r;
2100  trace_kvm_msr_write_ex(ecx, data);
2101  }
2102 
2103  return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
2104 }
2106 
2107 int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
2108 {
2109  return kvm_skip_emulated_instruction(vcpu);
2110 }
2111 
2112 int kvm_emulate_invd(struct kvm_vcpu *vcpu)
2113 {
2114  /* Treat an INVD instruction as a NOP and just skip it. */
2115  return kvm_emulate_as_nop(vcpu);
2116 }
2118 
2119 int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
2120 {
2121  kvm_queue_exception(vcpu, UD_VECTOR);
2122  return 1;
2123 }
2125 
2126 
2127 static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
2128 {
2129  if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) &&
2130  !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
2131  return kvm_handle_invalid_op(vcpu);
2132 
2133  pr_warn_once("%s instruction emulated as NOP!\n", insn);
2134  return kvm_emulate_as_nop(vcpu);
2135 }
2136 int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
2137 {
2138  return kvm_emulate_monitor_mwait(vcpu, "MWAIT");
2139 }
2141 
2142 int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
2143 {
2144  return kvm_emulate_monitor_mwait(vcpu, "MONITOR");
2145 }
2147 
2148 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
2149 {
2150  xfer_to_guest_mode_prepare();
2151  return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
2152  xfer_to_guest_mode_work_pending();
2153 }
2154 
2155 /*
2156  * The fast path for frequent and performance sensitive wrmsr emulation,
2157  * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
2158  * the latency of virtual IPI by avoiding the expensive bits of transitioning
2159  * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
2160  * other cases which must be called after interrupts are enabled on the host.
2161  */
2162 static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
2163 {
2164  if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
2165  return 1;
2166 
2167  if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
2168  ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
2169  ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
2170  ((u32)(data >> 32) != X2APIC_BROADCAST))
2171  return kvm_x2apic_icr_write(vcpu->arch.apic, data);
2172 
2173  return 1;
2174 }
2175 
2176 static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
2177 {
2178  if (!kvm_can_use_hv_timer(vcpu))
2179  return 1;
2180 
2181  kvm_set_lapic_tscdeadline_msr(vcpu, data);
2182  return 0;
2183 }
2184 
2185 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
2186 {
2187  u32 msr = kvm_rcx_read(vcpu);
2188  u64 data;
2189  fastpath_t ret = EXIT_FASTPATH_NONE;
2190 
2191  kvm_vcpu_srcu_read_lock(vcpu);
2192 
2193  switch (msr) {
2194  case APIC_BASE_MSR + (APIC_ICR >> 4):
2195  data = kvm_read_edx_eax(vcpu);
2196  if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
2198  ret = EXIT_FASTPATH_EXIT_HANDLED;
2199  }
2200  break;
2201  case MSR_IA32_TSC_DEADLINE:
2202  data = kvm_read_edx_eax(vcpu);
2203  if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
2205  ret = EXIT_FASTPATH_REENTER_GUEST;
2206  }
2207  break;
2208  default:
2209  break;
2210  }
2211 
2212  if (ret != EXIT_FASTPATH_NONE)
2213  trace_kvm_msr_write(msr, data);
2214 
2215  kvm_vcpu_srcu_read_unlock(vcpu);
2216 
2217  return ret;
2218 }
2220 
2221 /*
2222  * Adapt set_msr() to msr_io()'s calling convention
2223  */
2224 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2225 {
2226  return kvm_get_msr_ignored_check(vcpu, index, data, true);
2227 }
2228 
2229 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2230 {
2231  u64 val;
2232 
2233  /*
2234  * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does
2235  * not support modifying the guest vCPU model on the fly, e.g. changing
2236  * the nVMX capabilities while L2 is running is nonsensical. Ignore
2237  * writes of the same value, e.g. to allow userspace to blindly stuff
2238  * all MSRs when emulating RESET.
2239  */
2240  if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) {
2241  if (do_get_msr(vcpu, index, &val) || *data != val)
2242  return -EINVAL;
2243 
2244  return 0;
2245  }
2246 
2247  return kvm_set_msr_ignored_check(vcpu, index, *data, true);
2248 }
2249 
2250 #ifdef CONFIG_X86_64
2251 struct pvclock_clock {
2252  int vclock_mode;
2253  u64 cycle_last;
2254  u64 mask;
2255  u32 mult;
2256  u32 shift;
2257  u64 base_cycles;
2258  u64 offset;
2259 };
2260 
2261 struct pvclock_gtod_data {
2262  seqcount_t seq;
2263 
2264  struct pvclock_clock clock; /* extract of a clocksource struct */
2265  struct pvclock_clock raw_clock; /* extract of a clocksource struct */
2266 
2267  ktime_t offs_boot;
2268  u64 wall_time_sec;
2269 };
2270 
2271 static struct pvclock_gtod_data pvclock_gtod_data;
2272 
2273 static void update_pvclock_gtod(struct timekeeper *tk)
2274 {
2275  struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
2276 
2277  write_seqcount_begin(&vdata->seq);
2278 
2279  /* copy pvclock gtod data */
2280  vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
2281  vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
2282  vdata->clock.mask = tk->tkr_mono.mask;
2283  vdata->clock.mult = tk->tkr_mono.mult;
2284  vdata->clock.shift = tk->tkr_mono.shift;
2285  vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
2286  vdata->clock.offset = tk->tkr_mono.base;
2287 
2288  vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
2289  vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
2290  vdata->raw_clock.mask = tk->tkr_raw.mask;
2291  vdata->raw_clock.mult = tk->tkr_raw.mult;
2292  vdata->raw_clock.shift = tk->tkr_raw.shift;
2293  vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
2294  vdata->raw_clock.offset = tk->tkr_raw.base;
2295 
2296  vdata->wall_time_sec = tk->xtime_sec;
2297 
2298  vdata->offs_boot = tk->offs_boot;
2299 
2300  write_seqcount_end(&vdata->seq);
2301 }
2302 
2303 static s64 get_kvmclock_base_ns(void)
2304 {
2305  /* Count up from boot time, but with the frequency of the raw clock. */
2306  return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
2307 }
2308 #else
2309 static s64 get_kvmclock_base_ns(void)
2310 {
2311  /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
2312  return ktime_get_boottime_ns();
2313 }
2314 #endif
2315 
2316 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
2317 {
2318  int version;
2319  int r;
2320  struct pvclock_wall_clock wc;
2321  u32 wc_sec_hi;
2322  u64 wall_nsec;
2323 
2324  if (!wall_clock)
2325  return;
2326 
2327  r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
2328  if (r)
2329  return;
2330 
2331  if (version & 1)
2332  ++version; /* first time write, random junk */
2333 
2334  ++version;
2335 
2336  if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
2337  return;
2338 
2339  wall_nsec = kvm_get_wall_clock_epoch(kvm);
2340 
2341  wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
2342  wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
2343  wc.version = version;
2344 
2345  kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
2346 
2347  if (sec_hi_ofs) {
2348  wc_sec_hi = wall_nsec >> 32;
2349  kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
2350  &wc_sec_hi, sizeof(wc_sec_hi));
2351  }
2352 
2353  version++;
2354  kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
2355 }
2356 
2357 static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
2358  bool old_msr, bool host_initiated)
2359 {
2360  struct kvm_arch *ka = &vcpu->kvm->arch;
2361 
2362  if (vcpu->vcpu_id == 0 && !host_initiated) {
2363  if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2364  kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2365 
2366  ka->boot_vcpu_runs_old_kvmclock = old_msr;
2367  }
2368 
2369  vcpu->arch.time = system_time;
2370  kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2371 
2372  /* we verify if the enable bit is set... */
2373  if (system_time & 1)
2374  kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
2375  sizeof(struct pvclock_vcpu_time_info));
2376  else
2377  kvm_gpc_deactivate(&vcpu->arch.pv_time);
2378 
2379  return;
2380 }
2381 
2382 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
2383 {
2384  do_shl32_div32(dividend, divisor);
2385  return dividend;
2386 }
2387 
2388 static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
2389  s8 *pshift, u32 *pmultiplier)
2390 {
2391  uint64_t scaled64;
2392  int32_t shift = 0;
2393  uint64_t tps64;
2394  uint32_t tps32;
2395 
2396  tps64 = base_hz;
2397  scaled64 = scaled_hz;
2398  while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
2399  tps64 >>= 1;
2400  shift--;
2401  }
2402 
2403  tps32 = (uint32_t)tps64;
2404  while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
2405  if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
2406  scaled64 >>= 1;
2407  else
2408  tps32 <<= 1;
2409  shift++;
2410  }
2411 
2412  *pshift = shift;
2413  *pmultiplier = div_frac(scaled64, tps32);
2414 }
2415 
2416 #ifdef CONFIG_X86_64
2417 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
2418 #endif
2419 
2420 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
2421 static unsigned long max_tsc_khz;
2422 
2423 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
2424 {
2425  u64 v = (u64)khz * (1000000 + ppm);
2426  do_div(v, 1000000);
2427  return v;
2428 }
2429 
2430 static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
2431 
2432 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2433 {
2434  u64 ratio;
2435 
2436  /* Guest TSC same frequency as host TSC? */
2437  if (!scale) {
2439  return 0;
2440  }
2441 
2442  /* TSC scaling supported? */
2443  if (!kvm_caps.has_tsc_control) {
2444  if (user_tsc_khz > tsc_khz) {
2445  vcpu->arch.tsc_catchup = 1;
2446  vcpu->arch.tsc_always_catchup = 1;
2447  return 0;
2448  } else {
2449  pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
2450  return -1;
2451  }
2452  }
2453 
2454  /* TSC scaling required - calculate ratio */
2455  ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits,
2456  user_tsc_khz, tsc_khz);
2457 
2458  if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) {
2459  pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
2460  user_tsc_khz);
2461  return -1;
2462  }
2463 
2464  kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
2465  return 0;
2466 }
2467 
2468 static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
2469 {
2470  u32 thresh_lo, thresh_hi;
2471  int use_scaling = 0;
2472 
2473  /* tsc_khz can be zero if TSC calibration fails */
2474  if (user_tsc_khz == 0) {
2475  /* set tsc_scaling_ratio to a safe value */
2477  return -1;
2478  }
2479 
2480  /* Compute a scale to convert nanoseconds in TSC cycles */
2481  kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
2482  &vcpu->arch.virtual_tsc_shift,
2483  &vcpu->arch.virtual_tsc_mult);
2484  vcpu->arch.virtual_tsc_khz = user_tsc_khz;
2485 
2486  /*
2487  * Compute the variation in TSC rate which is acceptable
2488  * within the range of tolerance and decide if the
2489  * rate being applied is within that bounds of the hardware
2490  * rate. If so, no scaling or compensation need be done.
2491  */
2492  thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
2493  thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
2494  if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
2495  pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n",
2496  user_tsc_khz, thresh_lo, thresh_hi);
2497  use_scaling = 1;
2498  }
2499  return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
2500 }
2501 
2502 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
2503 {
2504  u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
2505  vcpu->arch.virtual_tsc_mult,
2506  vcpu->arch.virtual_tsc_shift);
2507  tsc += vcpu->arch.this_tsc_write;
2508  return tsc;
2509 }
2510 
2511 #ifdef CONFIG_X86_64
2512 static inline bool gtod_is_based_on_tsc(int mode)
2513 {
2514  return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
2515 }
2516 #endif
2517 
2518 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu, bool new_generation)
2519 {
2520 #ifdef CONFIG_X86_64
2521  struct kvm_arch *ka = &vcpu->kvm->arch;
2522  struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2523 
2524  /*
2525  * To use the masterclock, the host clocksource must be based on TSC
2526  * and all vCPUs must have matching TSCs. Note, the count for matching
2527  * vCPUs doesn't include the reference vCPU, hence "+1".
2528  */
2529  bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 ==
2530  atomic_read(&vcpu->kvm->online_vcpus)) &&
2531  gtod_is_based_on_tsc(gtod->clock.vclock_mode);
2532 
2533  /*
2534  * Request a masterclock update if the masterclock needs to be toggled
2535  * on/off, or when starting a new generation and the masterclock is
2536  * enabled (compute_guest_tsc() requires the masterclock snapshot to be
2537  * taken _after_ the new generation is created).
2538  */
2539  if ((ka->use_master_clock && new_generation) ||
2540  (ka->use_master_clock != use_master_clock))
2541  kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2542 
2543  trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
2544  atomic_read(&vcpu->kvm->online_vcpus),
2545  ka->use_master_clock, gtod->clock.vclock_mode);
2546 #endif
2547 }
2548 
2549 /*
2550  * Multiply tsc by a fixed point number represented by ratio.
2551  *
2552  * The most significant 64-N bits (mult) of ratio represent the
2553  * integral part of the fixed point number; the remaining N bits
2554  * (frac) represent the fractional part, ie. ratio represents a fixed
2555  * point number (mult + frac * 2^(-N)).
2556  *
2557  * N equals to kvm_caps.tsc_scaling_ratio_frac_bits.
2558  */
2559 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
2560 {
2561  return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits);
2562 }
2563 
2564 u64 kvm_scale_tsc(u64 tsc, u64 ratio)
2565 {
2566  u64 _tsc = tsc;
2567 
2568  if (ratio != kvm_caps.default_tsc_scaling_ratio)
2569  _tsc = __scale_tsc(ratio, tsc);
2570 
2571  return _tsc;
2572 }
2573 
2574 static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2575 {
2576  u64 tsc;
2577 
2578  tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
2579 
2580  return target_tsc - tsc;
2581 }
2582 
2583 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2584 {
2585  return vcpu->arch.l1_tsc_offset +
2586  kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
2587 }
2589 
2590 u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
2591 {
2592  u64 nested_offset;
2593 
2594  if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio)
2595  nested_offset = l1_offset;
2596  else
2597  nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
2599 
2600  nested_offset += l2_offset;
2601  return nested_offset;
2602 }
2604 
2605 u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
2606 {
2607  if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio)
2608  return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
2610 
2611  return l1_multiplier;
2612 }
2614 
2615 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
2616 {
2617  trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2618  vcpu->arch.l1_tsc_offset,
2619  l1_offset);
2620 
2621  vcpu->arch.l1_tsc_offset = l1_offset;
2622 
2623  /*
2624  * If we are here because L1 chose not to trap WRMSR to TSC then
2625  * according to the spec this should set L1's TSC (as opposed to
2626  * setting L1's offset for L2).
2627  */
2628  if (is_guest_mode(vcpu))
2629  vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2630  l1_offset,
2631  static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
2632  static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
2633  else
2634  vcpu->arch.tsc_offset = l1_offset;
2635 
2636  static_call(kvm_x86_write_tsc_offset)(vcpu);
2637 }
2638 
2639 static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
2640 {
2641  vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
2642 
2643  /* Userspace is changing the multiplier while L2 is active */
2644  if (is_guest_mode(vcpu))
2645  vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2646  l1_multiplier,
2647  static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
2648  else
2649  vcpu->arch.tsc_scaling_ratio = l1_multiplier;
2650 
2652  static_call(kvm_x86_write_tsc_multiplier)(vcpu);
2653 }
2654 
2655 static inline bool kvm_check_tsc_unstable(void)
2656 {
2657 #ifdef CONFIG_X86_64
2658  /*
2659  * TSC is marked unstable when we're running on Hyper-V,
2660  * 'TSC page' clocksource is good.
2661  */
2662  if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
2663  return false;
2664 #endif
2665  return check_tsc_unstable();
2666 }
2667 
2668 /*
2669  * Infers attempts to synchronize the guest's tsc from host writes. Sets the
2670  * offset for the vcpu and tracks the TSC matching generation that the vcpu
2671  * participates in.
2672  */
2673 static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
2674  u64 ns, bool matched)
2675 {
2676  struct kvm *kvm = vcpu->kvm;
2677 
2678  lockdep_assert_held(&kvm->arch.tsc_write_lock);
2679 
2680  /*
2681  * We also track th most recent recorded KHZ, write and time to
2682  * allow the matching interval to be extended at each write.
2683  */
2684  kvm->arch.last_tsc_nsec = ns;
2685  kvm->arch.last_tsc_write = tsc;
2686  kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
2687  kvm->arch.last_tsc_offset = offset;
2688 
2689  vcpu->arch.last_guest_tsc = tsc;
2690 
2691  kvm_vcpu_write_tsc_offset(vcpu, offset);
2692 
2693  if (!matched) {
2694  /*
2695  * We split periods of matched TSC writes into generations.
2696  * For each generation, we track the original measured
2697  * nanosecond time, offset, and write, so if TSCs are in
2698  * sync, we can match exact offset, and if not, we can match
2699  * exact software computation in compute_guest_tsc()
2700  *
2701  * These values are tracked in kvm->arch.cur_xxx variables.
2702  */
2703  kvm->arch.cur_tsc_generation++;
2704  kvm->arch.cur_tsc_nsec = ns;
2705  kvm->arch.cur_tsc_write = tsc;
2706  kvm->arch.cur_tsc_offset = offset;
2707  kvm->arch.nr_vcpus_matched_tsc = 0;
2708  } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
2709  kvm->arch.nr_vcpus_matched_tsc++;
2710  }
2711 
2712  /* Keep track of which generation this VCPU has synchronized to */
2713  vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2714  vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2715  vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2716 
2717  kvm_track_tsc_matching(vcpu, !matched);
2718 }
2719 
2720 static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
2721 {
2722  u64 data = user_value ? *user_value : 0;
2723  struct kvm *kvm = vcpu->kvm;
2724  u64 offset, ns, elapsed;
2725  unsigned long flags;
2726  bool matched = false;
2727  bool synchronizing = false;
2728 
2729  raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
2730  offset = kvm_compute_l1_tsc_offset(vcpu, data);
2731  ns = get_kvmclock_base_ns();
2732  elapsed = ns - kvm->arch.last_tsc_nsec;
2733 
2734  if (vcpu->arch.virtual_tsc_khz) {
2735  if (data == 0) {
2736  /*
2737  * Force synchronization when creating a vCPU, or when
2738  * userspace explicitly writes a zero value.
2739  */
2740  synchronizing = true;
2741  } else if (kvm->arch.user_set_tsc) {
2742  u64 tsc_exp = kvm->arch.last_tsc_write +
2743  nsec_to_cycles(vcpu, elapsed);
2744  u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2745  /*
2746  * Here lies UAPI baggage: when a user-initiated TSC write has
2747  * a small delta (1 second) of virtual cycle time against the
2748  * previously set vCPU, we assume that they were intended to be
2749  * in sync and the delta was only due to the racy nature of the
2750  * legacy API.
2751  *
2752  * This trick falls down when restoring a guest which genuinely
2753  * has been running for less time than the 1 second of imprecision
2754  * which we allow for in the legacy API. In this case, the first
2755  * value written by userspace (on any vCPU) should not be subject
2756  * to this 'correction' to make it sync up with values that only
2757  * come from the kernel's default vCPU creation. Make the 1-second
2758  * slop hack only trigger if the user_set_tsc flag is already set.
2759  */
2760  synchronizing = data < tsc_exp + tsc_hz &&
2761  data + tsc_hz > tsc_exp;
2762  }
2763  }
2764 
2765  if (user_value)
2766  kvm->arch.user_set_tsc = true;
2767 
2768  /*
2769  * For a reliable TSC, we can match TSC offsets, and for an unstable
2770  * TSC, we add elapsed time in this computation. We could let the
2771  * compensation code attempt to catch up if we fall behind, but
2772  * it's better to try to match offsets from the beginning.
2773  */
2774  if (synchronizing &&
2775  vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
2776  if (!kvm_check_tsc_unstable()) {
2777  offset = kvm->arch.cur_tsc_offset;
2778  } else {
2779  u64 delta = nsec_to_cycles(vcpu, elapsed);
2780  data += delta;
2781  offset = kvm_compute_l1_tsc_offset(vcpu, data);
2782  }
2783  matched = true;
2784  }
2785 
2786  __kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
2787  raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
2788 }
2789 
2790 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
2791  s64 adjustment)
2792 {
2793  u64 tsc_offset = vcpu->arch.l1_tsc_offset;
2794  kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
2795 }
2796 
2797 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
2798 {
2799  if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
2800  WARN_ON(adjustment < 0);
2801  adjustment = kvm_scale_tsc((u64) adjustment,
2802  vcpu->arch.l1_tsc_scaling_ratio);
2803  adjust_tsc_offset_guest(vcpu, adjustment);
2804 }
2805 
2806 #ifdef CONFIG_X86_64
2807 
2808 static u64 read_tsc(void)
2809 {
2810  u64 ret = (u64)rdtsc_ordered();
2811  u64 last = pvclock_gtod_data.clock.cycle_last;
2812 
2813  if (likely(ret >= last))
2814  return ret;
2815 
2816  /*
2817  * GCC likes to generate cmov here, but this branch is extremely
2818  * predictable (it's just a function of time and the likely is
2819  * very likely) and there's a data dependence, so force GCC
2820  * to generate a branch instead. I don't barrier() because
2821  * we don't actually need a barrier, and if this function
2822  * ever gets inlined it will generate worse code.
2823  */
2824  asm volatile ("");
2825  return last;
2826 }
2827 
2828 static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2829  int *mode)
2830 {
2831  u64 tsc_pg_val;
2832  long v;
2833 
2834  switch (clock->vclock_mode) {
2835  case VDSO_CLOCKMODE_HVCLOCK:
2836  if (hv_read_tsc_page_tsc(hv_get_tsc_page(),
2837  tsc_timestamp, &tsc_pg_val)) {
2838  /* TSC page valid */
2839  *mode = VDSO_CLOCKMODE_HVCLOCK;
2840  v = (tsc_pg_val - clock->cycle_last) &
2841  clock->mask;
2842  } else {
2843  /* TSC page invalid */
2844  *mode = VDSO_CLOCKMODE_NONE;
2845  }
2846  break;
2847  case VDSO_CLOCKMODE_TSC:
2848  *mode = VDSO_CLOCKMODE_TSC;
2849  *tsc_timestamp = read_tsc();
2850  v = (*tsc_timestamp - clock->cycle_last) &
2851  clock->mask;
2852  break;
2853  default:
2854  *mode = VDSO_CLOCKMODE_NONE;
2855  }
2856 
2857  if (*mode == VDSO_CLOCKMODE_NONE)
2858  *tsc_timestamp = v = 0;
2859 
2860  return v * clock->mult;
2861 }
2862 
2863 static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
2864 {
2865  struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2866  unsigned long seq;
2867  int mode;
2868  u64 ns;
2869 
2870  do {
2871  seq = read_seqcount_begin(&gtod->seq);
2872  ns = gtod->raw_clock.base_cycles;
2873  ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2874  ns >>= gtod->raw_clock.shift;
2875  ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
2876  } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2877  *t = ns;
2878 
2879  return mode;
2880 }
2881 
2882 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
2883 {
2884  struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2885  unsigned long seq;
2886  int mode;
2887  u64 ns;
2888 
2889  do {
2890  seq = read_seqcount_begin(&gtod->seq);
2891  ts->tv_sec = gtod->wall_time_sec;
2892  ns = gtod->clock.base_cycles;
2893  ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
2894  ns >>= gtod->clock.shift;
2895  } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2896 
2897  ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
2898  ts->tv_nsec = ns;
2899 
2900  return mode;
2901 }
2902 
2903 /* returns true if host is using TSC based clocksource */
2904 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
2905 {
2906  /* checked again under seqlock below */
2907  if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2908  return false;
2909 
2910  return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
2911  tsc_timestamp));
2912 }
2913 
2914 /* returns true if host is using TSC based clocksource */
2915 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
2916  u64 *tsc_timestamp)
2917 {
2918  /* checked again under seqlock below */
2919  if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2920  return false;
2921 
2922  return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
2923 }
2924 #endif
2925 
2926 /*
2927  *
2928  * Assuming a stable TSC across physical CPUS, and a stable TSC
2929  * across virtual CPUs, the following condition is possible.
2930  * Each numbered line represents an event visible to both
2931  * CPUs at the next numbered event.
2932  *
2933  * "timespecX" represents host monotonic time. "tscX" represents
2934  * RDTSC value.
2935  *
2936  * VCPU0 on CPU0 | VCPU1 on CPU1
2937  *
2938  * 1. read timespec0,tsc0
2939  * 2. | timespec1 = timespec0 + N
2940  * | tsc1 = tsc0 + M
2941  * 3. transition to guest | transition to guest
2942  * 4. ret0 = timespec0 + (rdtsc - tsc0) |
2943  * 5. | ret1 = timespec1 + (rdtsc - tsc1)
2944  * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
2945  *
2946  * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
2947  *
2948  * - ret0 < ret1
2949  * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
2950  * ...
2951  * - 0 < N - M => M < N
2952  *
2953  * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
2954  * always the case (the difference between two distinct xtime instances
2955  * might be smaller then the difference between corresponding TSC reads,
2956  * when updating guest vcpus pvclock areas).
2957  *
2958  * To avoid that problem, do not allow visibility of distinct
2959  * system_timestamp/tsc_timestamp values simultaneously: use a master
2960  * copy of host monotonic time values. Update that master copy
2961  * in lockstep.
2962  *
2963  * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
2964  *
2965  */
2966 
2967 static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
2968 {
2969 #ifdef CONFIG_X86_64
2970  struct kvm_arch *ka = &kvm->arch;
2971  int vclock_mode;
2972  bool host_tsc_clocksource, vcpus_matched;
2973 
2974  lockdep_assert_held(&kvm->arch.tsc_write_lock);
2975  vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2976  atomic_read(&kvm->online_vcpus));
2977 
2978  /*
2979  * If the host uses TSC clock, then passthrough TSC as stable
2980  * to the guest.
2981  */
2982  host_tsc_clocksource = kvm_get_time_and_clockread(
2983  &ka->master_kernel_ns,
2984  &ka->master_cycle_now);
2985 
2986  ka->use_master_clock = host_tsc_clocksource && vcpus_matched
2987  && !ka->backwards_tsc_observed
2988  && !ka->boot_vcpu_runs_old_kvmclock;
2989 
2990  if (ka->use_master_clock)
2991  atomic_set(&kvm_guest_has_master_clock, 1);
2992 
2993  vclock_mode = pvclock_gtod_data.clock.vclock_mode;
2994  trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
2995  vcpus_matched);
2996 #endif
2997 }
2998 
2999 static void kvm_make_mclock_inprogress_request(struct kvm *kvm)
3000 {
3001  kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
3002 }
3003 
3004 static void __kvm_start_pvclock_update(struct kvm *kvm)
3005 {
3006  raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
3007  write_seqcount_begin(&kvm->arch.pvclock_sc);
3008 }
3009 
3010 static void kvm_start_pvclock_update(struct kvm *kvm)
3011 {
3013 
3014  /* no guest entries from this point */
3016 }
3017 
3018 static void kvm_end_pvclock_update(struct kvm *kvm)
3019 {
3020  struct kvm_arch *ka = &kvm->arch;
3021  struct kvm_vcpu *vcpu;
3022  unsigned long i;
3023 
3024  write_seqcount_end(&ka->pvclock_sc);
3025  raw_spin_unlock_irq(&ka->tsc_write_lock);
3026  kvm_for_each_vcpu(i, vcpu, kvm)
3027  kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3028 
3029  /* guest entries allowed */
3030  kvm_for_each_vcpu(i, vcpu, kvm)
3031  kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
3032 }
3033 
3034 static void kvm_update_masterclock(struct kvm *kvm)
3035 {
3040 }
3041 
3042 /*
3043  * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
3044  * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz
3045  * can change during boot even if the TSC is constant, as it's possible for KVM
3046  * to be loaded before TSC calibration completes. Ideally, KVM would get a
3047  * notification when calibration completes, but practically speaking calibration
3048  * will complete before userspace is alive enough to create VMs.
3049  */
3050 static unsigned long get_cpu_tsc_khz(void)
3051 {
3052  if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
3053  return tsc_khz;
3054  else
3055  return __this_cpu_read(cpu_tsc_khz);
3056 }
3057 
3058 /* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
3059 static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
3060 {
3061  struct kvm_arch *ka = &kvm->arch;
3062  struct pvclock_vcpu_time_info hv_clock;
3063 
3064  /* both __this_cpu_read() and rdtsc() should be on the same cpu */
3065  get_cpu();
3066 
3067  data->flags = 0;
3068  if (ka->use_master_clock &&
3069  (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {
3070 #ifdef CONFIG_X86_64
3071  struct timespec64 ts;
3072 
3073  if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
3074  data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
3075  data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
3076  } else
3077 #endif
3078  data->host_tsc = rdtsc();
3079 
3080  data->flags |= KVM_CLOCK_TSC_STABLE;
3081  hv_clock.tsc_timestamp = ka->master_cycle_now;
3082  hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
3083  kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
3084  &hv_clock.tsc_shift,
3085  &hv_clock.tsc_to_system_mul);
3086  data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
3087  } else {
3088  data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
3089  }
3090 
3091  put_cpu();
3092 }
3093 
3094 static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
3095 {
3096  struct kvm_arch *ka = &kvm->arch;
3097  unsigned seq;
3098 
3099  do {
3100  seq = read_seqcount_begin(&ka->pvclock_sc);
3101  __get_kvmclock(kvm, data);
3102  } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3103 }
3104 
3105 u64 get_kvmclock_ns(struct kvm *kvm)
3106 {
3107  struct kvm_clock_data data;
3108 
3109  get_kvmclock(kvm, &data);
3110  return data.clock;
3111 }
3112 
3113 static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
3114  struct gfn_to_pfn_cache *gpc,
3115  unsigned int offset,
3116  bool force_tsc_unstable)
3117 {
3118  struct kvm_vcpu_arch *vcpu = &v->arch;
3119  struct pvclock_vcpu_time_info *guest_hv_clock;
3120  unsigned long flags;
3121 
3122  read_lock_irqsave(&gpc->lock, flags);
3123  while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
3124  read_unlock_irqrestore(&gpc->lock, flags);
3125 
3126  if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
3127  return;
3128 
3129  read_lock_irqsave(&gpc->lock, flags);
3130  }
3131 
3132  guest_hv_clock = (void *)(gpc->khva + offset);
3133 
3134  /*
3135  * This VCPU is paused, but it's legal for a guest to read another
3136  * VCPU's kvmclock, so we really have to follow the specification where
3137  * it says that version is odd if data is being modified, and even after
3138  * it is consistent.
3139  */
3140 
3141  guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
3142  smp_wmb();
3143 
3144  /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
3145  vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
3146 
3147  if (vcpu->pvclock_set_guest_stopped_request) {
3148  vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
3149  vcpu->pvclock_set_guest_stopped_request = false;
3150  }
3151 
3152  memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
3153 
3154  if (force_tsc_unstable)
3155  guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;
3156 
3157  smp_wmb();
3158 
3159  guest_hv_clock->version = ++vcpu->hv_clock.version;
3160 
3161  mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
3162  read_unlock_irqrestore(&gpc->lock, flags);
3163 
3164  trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
3165 }
3166 
3167 static int kvm_guest_time_update(struct kvm_vcpu *v)
3168 {
3169  unsigned long flags, tgt_tsc_khz;
3170  unsigned seq;
3171  struct kvm_vcpu_arch *vcpu = &v->arch;
3172  struct kvm_arch *ka = &v->kvm->arch;
3173  s64 kernel_ns;
3174  u64 tsc_timestamp, host_tsc;
3175  u8 pvclock_flags;
3176  bool use_master_clock;
3177 #ifdef CONFIG_KVM_XEN
3178  /*
3179  * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
3180  * explicitly told to use TSC as its clocksource Xen will not set this bit.
3181  * This default behaviour led to bugs in some guest kernels which cause
3182  * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
3183  */
3184  bool xen_pvclock_tsc_unstable =
3185  ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
3186 #endif
3187 
3188  kernel_ns = 0;
3189  host_tsc = 0;
3190 
3191  /*
3192  * If the host uses TSC clock, then passthrough TSC as stable
3193  * to the guest.
3194  */
3195  do {
3196  seq = read_seqcount_begin(&ka->pvclock_sc);
3197  use_master_clock = ka->use_master_clock;
3198  if (use_master_clock) {
3199  host_tsc = ka->master_cycle_now;
3200  kernel_ns = ka->master_kernel_ns;
3201  }
3202  } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3203 
3204  /* Keep irq disabled to prevent changes to the clock */
3205  local_irq_save(flags);
3206  tgt_tsc_khz = get_cpu_tsc_khz();
3207  if (unlikely(tgt_tsc_khz == 0)) {
3208  local_irq_restore(flags);
3209  kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
3210  return 1;
3211  }
3212  if (!use_master_clock) {
3213  host_tsc = rdtsc();
3214  kernel_ns = get_kvmclock_base_ns();
3215  }
3216 
3217  tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
3218 
3219  /*
3220  * We may have to catch up the TSC to match elapsed wall clock
3221  * time for two reasons, even if kvmclock is used.
3222  * 1) CPU could have been running below the maximum TSC rate
3223  * 2) Broken TSC compensation resets the base at each VCPU
3224  * entry to avoid unknown leaps of TSC even when running
3225  * again on the same CPU. This may cause apparent elapsed
3226  * time to disappear, and the guest to stand still or run
3227  * very slowly.
3228  */
3229  if (vcpu->tsc_catchup) {
3230  u64 tsc = compute_guest_tsc(v, kernel_ns);
3231  if (tsc > tsc_timestamp) {
3232  adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
3233  tsc_timestamp = tsc;
3234  }
3235  }
3236 
3237  local_irq_restore(flags);
3238 
3239  /* With all the info we got, fill in the values */
3240 
3242  tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
3243  v->arch.l1_tsc_scaling_ratio);
3244 
3245  if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
3246  kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
3247  &vcpu->hv_clock.tsc_shift,
3248  &vcpu->hv_clock.tsc_to_system_mul);
3249  vcpu->hw_tsc_khz = tgt_tsc_khz;
3251  }
3252 
3253  vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
3254  vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
3255  vcpu->last_guest_tsc = tsc_timestamp;
3256 
3257  /* If the host uses TSC clocksource, then it is stable */
3258  pvclock_flags = 0;
3259  if (use_master_clock)
3260  pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
3261 
3262  vcpu->hv_clock.flags = pvclock_flags;
3263 
3264  if (vcpu->pv_time.active)
3265  kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
3266 #ifdef CONFIG_KVM_XEN
3267  if (vcpu->xen.vcpu_info_cache.active)
3268  kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
3269  offsetof(struct compat_vcpu_info, time),
3270  xen_pvclock_tsc_unstable);
3271  if (vcpu->xen.vcpu_time_info_cache.active)
3272  kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
3273  xen_pvclock_tsc_unstable);
3274 #endif
3275  kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
3276  return 0;
3277 }
3278 
3279 /*
3280  * The pvclock_wall_clock ABI tells the guest the wall clock time at
3281  * which it started (i.e. its epoch, when its kvmclock was zero).
3282  *
3283  * In fact those clocks are subtly different; wall clock frequency is
3284  * adjusted by NTP and has leap seconds, while the kvmclock is a
3285  * simple function of the TSC without any such adjustment.
3286  *
3287  * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
3288  * that and kvmclock, but even that would be subject to change over
3289  * time.
3290  *
3291  * Attempt to calculate the epoch at a given moment using the *same*
3292  * TSC reading via kvm_get_walltime_and_clockread() to obtain both
3293  * wallclock and kvmclock times, and subtracting one from the other.
3294  *
3295  * Fall back to using their values at slightly different moments by
3296  * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
3297  */
3298 uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
3299 {
3300 #ifdef CONFIG_X86_64
3301  struct pvclock_vcpu_time_info hv_clock;
3302  struct kvm_arch *ka = &kvm->arch;
3303  unsigned long seq, local_tsc_khz;
3304  struct timespec64 ts;
3305  uint64_t host_tsc;
3306 
3307  do {
3308  seq = read_seqcount_begin(&ka->pvclock_sc);
3309 
3310  local_tsc_khz = 0;
3311  if (!ka->use_master_clock)
3312  break;
3313 
3314  /*
3315  * The TSC read and the call to get_cpu_tsc_khz() must happen
3316  * on the same CPU.
3317  */
3318  get_cpu();
3319 
3320  local_tsc_khz = get_cpu_tsc_khz();
3321 
3322  if (local_tsc_khz &&
3323  !kvm_get_walltime_and_clockread(&ts, &host_tsc))
3324  local_tsc_khz = 0; /* Fall back to old method */
3325 
3326  put_cpu();
3327 
3328  /*
3329  * These values must be snapshotted within the seqcount loop.
3330  * After that, it's just mathematics which can happen on any
3331  * CPU at any time.
3332  */
3333  hv_clock.tsc_timestamp = ka->master_cycle_now;
3334  hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
3335 
3336  } while (read_seqcount_retry(&ka->pvclock_sc, seq));
3337 
3338  /*
3339  * If the conditions were right, and obtaining the wallclock+TSC was
3340  * successful, calculate the KVM clock at the corresponding time and
3341  * subtract one from the other to get the guest's epoch in nanoseconds
3342  * since 1970-01-01.
3343  */
3344  if (local_tsc_khz) {
3345  kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
3346  &hv_clock.tsc_shift,
3347  &hv_clock.tsc_to_system_mul);
3348  return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
3349  __pvclock_read_cycles(&hv_clock, host_tsc);
3350  }
3351 #endif
3352  return ktime_get_real_ns() - get_kvmclock_ns(kvm);
3353 }
3354 
3355 /*
3356  * kvmclock updates which are isolated to a given vcpu, such as
3357  * vcpu->cpu migration, should not allow system_timestamp from
3358  * the rest of the vcpus to remain static. Otherwise ntp frequency
3359  * correction applies to one vcpu's system_timestamp but not
3360  * the others.
3361  *
3362  * So in those cases, request a kvmclock update for all vcpus.
3363  * We need to rate-limit these requests though, as they can
3364  * considerably slow guests that have a large number of vcpus.
3365  * The time for a remote vcpu to update its kvmclock is bound
3366  * by the delay we use to rate-limit the updates.
3367  */
3368 
3369 #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
3370 
3371 static void kvmclock_update_fn(struct work_struct *work)
3372 {
3373  unsigned long i;
3374  struct delayed_work *dwork = to_delayed_work(work);
3375  struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
3376  kvmclock_update_work);
3377  struct kvm *kvm = container_of(ka, struct kvm, arch);
3378  struct kvm_vcpu *vcpu;
3379 
3380  kvm_for_each_vcpu(i, vcpu, kvm) {
3381  kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3382  kvm_vcpu_kick(vcpu);
3383  }
3384 }
3385 
3386 static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
3387 {
3388  struct kvm *kvm = v->kvm;
3389 
3390  kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
3391  schedule_delayed_work(&kvm->arch.kvmclock_update_work,
3393 }
3394 
3395 #define KVMCLOCK_SYNC_PERIOD (300 * HZ)
3396 
3397 static void kvmclock_sync_fn(struct work_struct *work)
3398 {
3399  struct delayed_work *dwork = to_delayed_work(work);
3400  struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
3401  kvmclock_sync_work);
3402  struct kvm *kvm = container_of(ka, struct kvm, arch);
3403 
3404  schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
3405  schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
3407 }
3408 
3409 /* These helpers are safe iff @msr is known to be an MCx bank MSR. */
3410 static bool is_mci_control_msr(u32 msr)
3411 {
3412  return (msr & 3) == 0;
3413 }
3414 static bool is_mci_status_msr(u32 msr)
3415 {
3416  return (msr & 3) == 1;
3417 }
3418 
3419 /*
3420  * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
3421  */
3422 static bool can_set_mci_status(struct kvm_vcpu *vcpu)
3423 {
3424  /* McStatusWrEn enabled? */
3426  return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
3427 
3428  return false;
3429 }
3430 
3431 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3432 {
3433  u64 mcg_cap = vcpu->arch.mcg_cap;
3434  unsigned bank_num = mcg_cap & 0xff;
3435  u32 msr = msr_info->index;
3436  u64 data = msr_info->data;
3437  u32 offset, last_msr;
3438 
3439  switch (msr) {
3440  case MSR_IA32_MCG_STATUS:
3441  vcpu->arch.mcg_status = data;
3442  break;
3443  case MSR_IA32_MCG_CTL:
3444  if (!(mcg_cap & MCG_CTL_P) &&
3445  (data || !msr_info->host_initiated))
3446  return 1;
3447  if (data != 0 && data != ~(u64)0)
3448  return 1;
3449  vcpu->arch.mcg_ctl = data;
3450  break;
3451  case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
3452  last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
3453  if (msr > last_msr)
3454  return 1;
3455 
3456  if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
3457  return 1;
3458  /* An attempt to write a 1 to a reserved bit raises #GP */
3459  if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
3460  return 1;
3461  offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
3462  last_msr + 1 - MSR_IA32_MC0_CTL2);
3463  vcpu->arch.mci_ctl2_banks[offset] = data;
3464  break;
3465  case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3466  last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
3467  if (msr > last_msr)
3468  return 1;
3469 
3470  /*
3471  * Only 0 or all 1s can be written to IA32_MCi_CTL, all other
3472  * values are architecturally undefined. But, some Linux
3473  * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB
3474  * issue on AMD K8s, allow bit 10 to be clear when setting all
3475  * other bits in order to avoid an uncaught #GP in the guest.
3476  *
3477  * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable,
3478  * single-bit ECC data errors.
3479  */
3480  if (is_mci_control_msr(msr) &&
3481  data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
3482  return 1;
3483 
3484  /*
3485  * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR.
3486  * AMD-based CPUs allow non-zero values, but if and only if
3487  * HWCR[McStatusWrEn] is set.
3488  */
3489  if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
3490  data != 0 && !can_set_mci_status(vcpu))
3491  return 1;
3492 
3493  offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
3494  last_msr + 1 - MSR_IA32_MC0_CTL);
3495  vcpu->arch.mce_banks[offset] = data;
3496  break;
3497  default:
3498  return 1;
3499  }
3500  return 0;
3501 }
3502 
3503 static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
3504 {
3505  u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
3506 
3507  return (vcpu->arch.apf.msr_en_val & mask) == mask;
3508 }
3509 
3510 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
3511 {
3512  gpa_t gpa = data & ~0x3f;
3513 
3514  /* Bits 4:5 are reserved, Should be zero */
3515  if (data & 0x30)
3516  return 1;
3517 
3518  if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
3519  (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
3520  return 1;
3521 
3522  if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
3523  (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
3524  return 1;
3525 
3526  if (!lapic_in_kernel(vcpu))
3527  return data ? 1 : 0;
3528 
3529  vcpu->arch.apf.msr_en_val = data;
3530 
3531  if (!kvm_pv_async_pf_enabled(vcpu)) {
3534  return 0;
3535  }
3536 
3537  if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
3538  sizeof(u64)))
3539  return 1;
3540 
3541  vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
3542  vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
3543 
3545 
3546  return 0;
3547 }
3548 
3549 static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
3550 {
3551  /* Bits 8-63 are reserved */
3552  if (data >> 8)
3553  return 1;
3554 
3555  if (!lapic_in_kernel(vcpu))
3556  return 1;
3557 
3558  vcpu->arch.apf.msr_int_val = data;
3559 
3560  vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
3561 
3562  return 0;
3563 }
3564 
3565 static void kvmclock_reset(struct kvm_vcpu *vcpu)
3566 {
3567  kvm_gpc_deactivate(&vcpu->arch.pv_time);
3568  vcpu->arch.time = 0;
3569 }
3570 
3571 static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
3572 {
3573  ++vcpu->stat.tlb_flush;
3574  static_call(kvm_x86_flush_tlb_all)(vcpu);
3575 
3576  /* Flushing all ASIDs flushes the current ASID... */
3577  kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
3578 }
3579 
3580 static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
3581 {
3582  ++vcpu->stat.tlb_flush;
3583 
3584  if (!tdp_enabled) {
3585  /*
3586  * A TLB flush on behalf of the guest is equivalent to
3587  * INVPCID(all), toggling CR4.PGE, etc., which requires
3588  * a forced sync of the shadow page tables. Ensure all the
3589  * roots are synced and the guest TLB in hardware is clean.
3590  */
3591  kvm_mmu_sync_roots(vcpu);
3593  }
3594 
3595  static_call(kvm_x86_flush_tlb_guest)(vcpu);
3596 
3597  /*
3598  * Flushing all "guest" TLB is always a superset of Hyper-V's fine
3599  * grained flushing.
3600  */
3602 }
3603 
3604 
3605 static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
3606 {
3607  ++vcpu->stat.tlb_flush;
3608  static_call(kvm_x86_flush_tlb_current)(vcpu);
3609 }
3610 
3611 /*
3612  * Service "local" TLB flush requests, which are specific to the current MMU
3613  * context. In addition to the generic event handling in vcpu_enter_guest(),
3614  * TLB flushes that are targeted at an MMU context also need to be serviced
3615  * prior before nested VM-Enter/VM-Exit.
3616  */
3617 void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
3618 {
3619  if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
3621 
3622  if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
3624 }
3626 
3627 static void record_steal_time(struct kvm_vcpu *vcpu)
3628 {
3629  struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
3630  struct kvm_steal_time __user *st;
3631  struct kvm_memslots *slots;
3632  gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
3633  u64 steal;
3634  u32 version;
3635 
3636  if (kvm_xen_msr_enabled(vcpu->kvm)) {
3638  return;
3639  }
3640 
3641  if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3642  return;
3643 
3644  if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
3645  return;
3646 
3647  slots = kvm_memslots(vcpu->kvm);
3648 
3649  if (unlikely(slots->generation != ghc->generation ||
3650  gpa != ghc->gpa ||
3651  kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
3652  /* We rely on the fact that it fits in a single page. */
3653  BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
3654 
3655  if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
3656  kvm_is_error_hva(ghc->hva) || !ghc->memslot)
3657  return;
3658  }
3659 
3660  st = (struct kvm_steal_time __user *)ghc->hva;
3661  /*
3662  * Doing a TLB flush here, on the guest's behalf, can avoid
3663  * expensive IPIs.
3664  */
3665  if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
3666  u8 st_preempted = 0;
3667  int err = -EFAULT;
3668 
3669  if (!user_access_begin(st, sizeof(*st)))
3670  return;
3671 
3672  asm volatile("1: xchgb %0, %2\n"
3673  "xor %1, %1\n"
3674  "2:\n"
3675  _ASM_EXTABLE_UA(1b, 2b)
3676  : "+q" (st_preempted),
3677  "+&r" (err),
3678  "+m" (st->preempted));
3679  if (err)
3680  goto out;
3681 
3682  user_access_end();
3683 
3684  vcpu->arch.st.preempted = 0;
3685 
3686  trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3687  st_preempted & KVM_VCPU_FLUSH_TLB);
3688  if (st_preempted & KVM_VCPU_FLUSH_TLB)
3690 
3691  if (!user_access_begin(st, sizeof(*st)))
3692  goto dirty;
3693  } else {
3694  if (!user_access_begin(st, sizeof(*st)))
3695  return;
3696 
3697  unsafe_put_user(0, &st->preempted, out);
3698  vcpu->arch.st.preempted = 0;
3699  }
3700 
3701  unsafe_get_user(version, &st->version, out);
3702  if (version & 1)
3703  version += 1; /* first time write, random junk */
3704 
3705  version += 1;
3706  unsafe_put_user(version, &st->version, out);
3707 
3708  smp_wmb();
3709 
3710  unsafe_get_user(steal, &st->steal, out);
3711  steal += current->sched_info.run_delay -
3712  vcpu->arch.st.last_steal;
3713  vcpu->arch.st.last_steal = current->sched_info.run_delay;
3714  unsafe_put_user(steal, &st->steal, out);
3715 
3716  version += 1;
3717  unsafe_put_user(version, &st->version, out);
3718 
3719  out:
3720  user_access_end();
3721  dirty:
3722  mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
3723 }
3724 
3725 static bool kvm_is_msr_to_save(u32 msr_index)
3726 {
3727  unsigned int i;
3728 
3729  for (i = 0; i < num_msrs_to_save; i++) {
3730  if (msrs_to_save[i] == msr_index)
3731  return true;
3732  }
3733 
3734  return false;
3735 }
3736 
3737 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3738 {
3739  u32 msr = msr_info->index;
3740  u64 data = msr_info->data;
3741 
3742  if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
3743  return kvm_xen_write_hypercall_page(vcpu, data);
3744 
3745  switch (msr) {
3746  case MSR_AMD64_NB_CFG:
3747  case MSR_IA32_UCODE_WRITE:
3748  case MSR_VM_HSAVE_PA:
3749  case MSR_AMD64_PATCH_LOADER:
3750  case MSR_AMD64_BU_CFG2:
3751  case MSR_AMD64_DC_CFG:
3752  case MSR_AMD64_TW_CFG:
3753  case MSR_F15H_EX_CFG:
3754  break;
3755 
3756  case MSR_IA32_UCODE_REV:
3757  if (msr_info->host_initiated)
3758  vcpu->arch.microcode_version = data;
3759  break;
3760  case MSR_IA32_ARCH_CAPABILITIES:
3761  if (!msr_info->host_initiated)
3762  return 1;
3763  vcpu->arch.arch_capabilities = data;
3764  break;
3765  case MSR_IA32_PERF_CAPABILITIES:
3766  if (!msr_info->host_initiated)
3767  return 1;
3768  if (data & ~kvm_caps.supported_perf_cap)
3769  return 1;
3770 
3771  /*
3772  * Note, this is not just a performance optimization! KVM
3773  * disallows changing feature MSRs after the vCPU has run; PMU
3774  * refresh will bug the VM if called after the vCPU has run.
3775  */
3776  if (vcpu->arch.perf_capabilities == data)
3777  break;
3778 
3779  vcpu->arch.perf_capabilities = data;
3780  kvm_pmu_refresh(vcpu);
3781  break;
3782  case MSR_IA32_PRED_CMD: {
3783  u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
3784 
3785  if (!msr_info->host_initiated) {
3786  if ((!guest_has_pred_cmd_msr(vcpu)))
3787  return 1;
3788 
3789  if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
3790  !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
3791  reserved_bits |= PRED_CMD_IBPB;
3792 
3793  if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB))
3794  reserved_bits |= PRED_CMD_SBPB;
3795  }
3796 
3797  if (!boot_cpu_has(X86_FEATURE_IBPB))
3798  reserved_bits |= PRED_CMD_IBPB;
3799 
3800  if (!boot_cpu_has(X86_FEATURE_SBPB))
3801  reserved_bits |= PRED_CMD_SBPB;
3802 
3803  if (data & reserved_bits)
3804  return 1;
3805 
3806  if (!data)
3807  break;
3808 
3809  wrmsrl(MSR_IA32_PRED_CMD, data);
3810  break;
3811  }
3812  case MSR_IA32_FLUSH_CMD:
3813  if (!msr_info->host_initiated &&
3814  !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
3815  return 1;
3816 
3817  if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
3818  return 1;
3819  if (!data)
3820  break;
3821 
3822  wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
3823  break;
3824  case MSR_EFER:
3825  return set_efer(vcpu, msr_info);
3826  case MSR_K7_HWCR:
3827  data &= ~(u64)0x40; /* ignore flush filter disable */
3828  data &= ~(u64)0x100; /* ignore ignne emulation enable */
3829  data &= ~(u64)0x8; /* ignore TLB cache disable */
3830 
3831  /*
3832  * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
3833  * through at least v6.6 whine if TscFreqSel is clear,
3834  * depending on F/M/S.
3835  */
3836  if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
3837  kvm_pr_unimpl_wrmsr(vcpu, msr, data);
3838  return 1;
3839  }
3840  vcpu->arch.msr_hwcr = data;
3841  break;
3842  case MSR_FAM10H_MMIO_CONF_BASE:
3843  if (data != 0) {
3844  kvm_pr_unimpl_wrmsr(vcpu, msr, data);
3845  return 1;
3846  }
3847  break;
3848  case MSR_IA32_CR_PAT:
3849  if (!kvm_pat_valid(data))
3850  return 1;
3851 
3852  vcpu->arch.pat = data;
3853  break;
3854  case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
3855  case MSR_MTRRdefType:
3856  return kvm_mtrr_set_msr(vcpu, msr, data);
3857  case MSR_IA32_APICBASE:
3858  return kvm_set_apic_base(vcpu, msr_info);
3859  case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
3860  return kvm_x2apic_msr_write(vcpu, msr, data);
3861  case MSR_IA32_TSC_DEADLINE:
3862  kvm_set_lapic_tscdeadline_msr(vcpu, data);
3863  break;
3864  case MSR_IA32_TSC_ADJUST:
3865  if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
3866  if (!msr_info->host_initiated) {
3867  s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
3868  adjust_tsc_offset_guest(vcpu, adj);
3869  /* Before back to guest, tsc_timestamp must be adjusted
3870  * as well, otherwise guest's percpu pvclock time could jump.
3871  */
3872  kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3873  }
3874  vcpu->arch.ia32_tsc_adjust_msr = data;
3875  }
3876  break;
3877  case MSR_IA32_MISC_ENABLE: {
3878  u64 old_val = vcpu->arch.ia32_misc_enable_msr;
3879 
3880  if (!msr_info->host_initiated) {
3881  /* RO bits */
3882  if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)
3883  return 1;
3884 
3885  /* R bits, i.e. writes are ignored, but don't fault. */
3886  data = data & ~MSR_IA32_MISC_ENABLE_EMON;
3887  data |= old_val & MSR_IA32_MISC_ENABLE_EMON;
3888  }
3889 
3890  if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
3891  ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
3892  if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
3893  return 1;
3894  vcpu->arch.ia32_misc_enable_msr = data;
3896  } else {
3897  vcpu->arch.ia32_misc_enable_msr = data;
3898  }
3899  break;
3900  }
3901  case MSR_IA32_SMBASE:
3902  if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
3903  return 1;
3904  vcpu->arch.smbase = data;
3905  break;
3906  case MSR_IA32_POWER_CTL:
3907  vcpu->arch.msr_ia32_power_ctl = data;
3908  break;
3909  case MSR_IA32_TSC:
3910  if (msr_info->host_initiated) {
3911  kvm_synchronize_tsc(vcpu, &data);
3912  } else {
3913  u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
3914  adjust_tsc_offset_guest(vcpu, adj);
3915  vcpu->arch.ia32_tsc_adjust_msr += adj;
3916  }
3917  break;
3918  case MSR_IA32_XSS:
3919  if (!msr_info->host_initiated &&
3920  !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3921  return 1;
3922  /*
3923  * KVM supports exposing PT to the guest, but does not support
3924  * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
3925  * XSAVES/XRSTORS to save/restore PT MSRs.
3926  */
3927  if (data & ~kvm_caps.supported_xss)
3928  return 1;
3929  vcpu->arch.ia32_xss = data;
3931  break;
3932  case MSR_SMI_COUNT:
3933  if (!msr_info->host_initiated)
3934  return 1;
3935  vcpu->arch.smi_count = data;
3936  break;
3937  case MSR_KVM_WALL_CLOCK_NEW:
3938  if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3939  return 1;
3940 
3941  vcpu->kvm->arch.wall_clock = data;
3942  kvm_write_wall_clock(vcpu->kvm, data, 0);
3943  break;
3944  case MSR_KVM_WALL_CLOCK:
3945  if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3946  return 1;
3947 
3948  vcpu->kvm->arch.wall_clock = data;
3949  kvm_write_wall_clock(vcpu->kvm, data, 0);
3950  break;
3951  case MSR_KVM_SYSTEM_TIME_NEW:
3952  if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3953  return 1;
3954 
3955  kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
3956  break;
3957  case MSR_KVM_SYSTEM_TIME:
3958  if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3959  return 1;
3960 
3961  kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
3962  break;
3963  case MSR_KVM_ASYNC_PF_EN:
3964  if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3965  return 1;
3966 
3967  if (kvm_pv_enable_async_pf(vcpu, data))
3968  return 1;
3969  break;
3970  case MSR_KVM_ASYNC_PF_INT:
3971  if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3972  return 1;
3973 
3974  if (kvm_pv_enable_async_pf_int(vcpu, data))
3975  return 1;
3976  break;
3977  case MSR_KVM_ASYNC_PF_ACK:
3978  if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3979  return 1;
3980  if (data & 0x1) {
3981  vcpu->arch.apf.pageready_pending = false;
3983  }
3984  break;
3985  case MSR_KVM_STEAL_TIME:
3986  if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3987  return 1;
3988 
3989  if (unlikely(!sched_info_on()))
3990  return 1;
3991 
3992  if (data & KVM_STEAL_RESERVED_MASK)
3993  return 1;
3994 
3995  vcpu->arch.st.msr_val = data;
3996 
3997  if (!(data & KVM_MSR_ENABLED))
3998  break;
3999 
4000  kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
4001 
4002  break;
4003  case MSR_KVM_PV_EOI_EN:
4004  if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
4005  return 1;
4006 
4007  if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8)))
4008  return 1;
4009  break;
4010 
4011  case MSR_KVM_POLL_CONTROL:
4012  if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
4013  return 1;
4014 
4015  /* only enable bit supported */
4016  if (data & (-1ULL << 1))
4017  return 1;
4018 
4019  vcpu->arch.msr_kvm_poll_control = data;
4020  break;
4021 
4022  case MSR_IA32_MCG_CTL:
4023  case MSR_IA32_MCG_STATUS:
4024  case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4025  case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4026  return set_msr_mce(vcpu, msr_info);
4027 
4028  case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
4029  case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
4030  case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
4031  case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
4032  if (kvm_pmu_is_valid_msr(vcpu, msr))
4033  return kvm_pmu_set_msr(vcpu, msr_info);
4034 
4035  if (data)
4036  kvm_pr_unimpl_wrmsr(vcpu, msr, data);
4037  break;
4038  case MSR_K7_CLK_CTL:
4039  /*
4040  * Ignore all writes to this no longer documented MSR.
4041  * Writes are only relevant for old K7 processors,
4042  * all pre-dating SVM, but a recommended workaround from
4043  * AMD for these chips. It is possible to specify the
4044  * affected processor models on the command line, hence
4045  * the need to ignore the workaround.
4046  */
4047  break;
4048 #ifdef CONFIG_KVM_HYPERV
4049  case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
4050  case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
4051  case HV_X64_MSR_SYNDBG_OPTIONS:
4052  case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
4053  case HV_X64_MSR_CRASH_CTL:
4054  case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
4055  case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
4056  case HV_X64_MSR_TSC_EMULATION_CONTROL:
4057  case HV_X64_MSR_TSC_EMULATION_STATUS:
4058  case HV_X64_MSR_TSC_INVARIANT_CONTROL:
4059  return kvm_hv_set_msr_common(vcpu, msr, data,
4060  msr_info->host_initiated);
4061 #endif
4062  case MSR_IA32_BBL_CR_CTL3:
4063  /* Drop writes to this legacy MSR -- see rdmsr
4064  * counterpart for further detail.
4065  */
4066  kvm_pr_unimpl_wrmsr(vcpu, msr, data);
4067  break;
4068  case MSR_AMD64_OSVW_ID_LENGTH:
4069  if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
4070  return 1;
4071  vcpu->arch.osvw.length = data;
4072  break;
4073  case MSR_AMD64_OSVW_STATUS:
4074  if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
4075  return 1;
4076  vcpu->arch.osvw.status = data;
4077  break;
4078  case MSR_PLATFORM_INFO:
4079  if (!msr_info->host_initiated ||
4080  (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
4081  cpuid_fault_enabled(vcpu)))
4082  return 1;
4083  vcpu->arch.msr_platform_info = data;
4084  break;
4085  case MSR_MISC_FEATURES_ENABLES:
4086  if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
4087  (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
4088  !supports_cpuid_fault(vcpu)))
4089  return 1;
4090  vcpu->arch.msr_misc_features_enables = data;
4091  break;
4092 #ifdef CONFIG_X86_64
4093  case MSR_IA32_XFD:
4094  if (!msr_info->host_initiated &&
4095  !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
4096  return 1;
4097 
4098  if (data & ~kvm_guest_supported_xfd(vcpu))
4099  return 1;
4100 
4101  fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
4102  break;
4103  case MSR_IA32_XFD_ERR:
4104  if (!msr_info->host_initiated &&
4105  !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
4106  return 1;
4107 
4108  if (data & ~kvm_guest_supported_xfd(vcpu))
4109  return 1;
4110 
4111  vcpu->arch.guest_fpu.xfd_err = data;
4112  break;
4113 #endif
4114  default:
4115  if (kvm_pmu_is_valid_msr(vcpu, msr))
4116  return kvm_pmu_set_msr(vcpu, msr_info);
4117 
4118  /*
4119  * Userspace is allowed to write '0' to MSRs that KVM reports
4120  * as to-be-saved, even if an MSRs isn't fully supported.
4121  */
4122  if (msr_info->host_initiated && !data &&
4123  kvm_is_msr_to_save(msr))
4124  break;
4125 
4126  return KVM_MSR_RET_INVALID;
4127  }
4128  return 0;
4129 }
4131 
4132 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
4133 {
4134  u64 data;
4135  u64 mcg_cap = vcpu->arch.mcg_cap;
4136  unsigned bank_num = mcg_cap & 0xff;
4137  u32 offset, last_msr;
4138 
4139  switch (msr) {
4140  case MSR_IA32_P5_MC_ADDR:
4141  case MSR_IA32_P5_MC_TYPE:
4142  data = 0;
4143  break;
4144  case MSR_IA32_MCG_CAP:
4145  data = vcpu->arch.mcg_cap;
4146  break;
4147  case MSR_IA32_MCG_CTL:
4148  if (!(mcg_cap & MCG_CTL_P) && !host)
4149  return 1;
4150  data = vcpu->arch.mcg_ctl;
4151  break;
4152  case MSR_IA32_MCG_STATUS:
4153  data = vcpu->arch.mcg_status;
4154  break;
4155  case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4156  last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
4157  if (msr > last_msr)
4158  return 1;
4159 
4160  if (!(mcg_cap & MCG_CMCI_P) && !host)
4161  return 1;
4162  offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
4163  last_msr + 1 - MSR_IA32_MC0_CTL2);
4164  data = vcpu->arch.mci_ctl2_banks[offset];
4165  break;
4166  case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4167  last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
4168  if (msr > last_msr)
4169  return 1;
4170 
4171  offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
4172  last_msr + 1 - MSR_IA32_MC0_CTL);
4173  data = vcpu->arch.mce_banks[offset];
4174  break;
4175  default:
4176  return 1;
4177  }
4178  *pdata = data;
4179  return 0;
4180 }
4181 
4182 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
4183 {
4184  switch (msr_info->index) {
4185  case MSR_IA32_PLATFORM_ID:
4186  case MSR_IA32_EBL_CR_POWERON:
4187  case MSR_IA32_LASTBRANCHFROMIP:
4188  case MSR_IA32_LASTBRANCHTOIP:
4189  case MSR_IA32_LASTINTFROMIP:
4190  case MSR_IA32_LASTINTTOIP:
4191  case MSR_AMD64_SYSCFG:
4192  case MSR_K8_TSEG_ADDR:
4193  case MSR_K8_TSEG_MASK:
4194  case MSR_VM_HSAVE_PA:
4195  case MSR_K8_INT_PENDING_MSG:
4196  case MSR_AMD64_NB_CFG:
4197  case MSR_FAM10H_MMIO_CONF_BASE:
4198  case MSR_AMD64_BU_CFG2:
4199  case MSR_IA32_PERF_CTL:
4200  case MSR_AMD64_DC_CFG:
4201  case MSR_AMD64_TW_CFG:
4202  case MSR_F15H_EX_CFG:
4203  /*
4204  * Intel Sandy Bridge CPUs must support the RAPL (running average power
4205  * limit) MSRs. Just return 0, as we do not want to expose the host
4206  * data here. Do not conditionalize this on CPUID, as KVM does not do
4207  * so for existing CPU-specific MSRs.
4208  */
4209  case MSR_RAPL_POWER_UNIT:
4210  case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
4211  case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
4212  case MSR_PKG_ENERGY_STATUS: /* Total package */
4213  case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
4214  msr_info->data = 0;
4215  break;
4216  case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
4217  case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
4218  case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
4219  case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
4220  if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4221  return kvm_pmu_get_msr(vcpu, msr_info);
4222  msr_info->data = 0;
4223  break;
4224  case MSR_IA32_UCODE_REV:
4225  msr_info->data = vcpu->arch.microcode_version;
4226  break;
4227  case MSR_IA32_ARCH_CAPABILITIES:
4228  if (!msr_info->host_initiated &&
4229  !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
4230  return 1;
4231  msr_info->data = vcpu->arch.arch_capabilities;
4232  break;
4233  case MSR_IA32_PERF_CAPABILITIES:
4234  if (!msr_info->host_initiated &&
4235  !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
4236  return 1;
4237  msr_info->data = vcpu->arch.perf_capabilities;
4238  break;
4239  case MSR_IA32_POWER_CTL:
4240  msr_info->data = vcpu->arch.msr_ia32_power_ctl;
4241  break;
4242  case MSR_IA32_TSC: {
4243  /*
4244  * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
4245  * even when not intercepted. AMD manual doesn't explicitly
4246  * state this but appears to behave the same.
4247  *
4248  * On userspace reads and writes, however, we unconditionally
4249  * return L1's TSC value to ensure backwards-compatible
4250  * behavior for migration.
4251  */
4252  u64 offset, ratio;
4253 
4254  if (msr_info->host_initiated) {
4255  offset = vcpu->arch.l1_tsc_offset;
4256  ratio = vcpu->arch.l1_tsc_scaling_ratio;
4257  } else {
4258  offset = vcpu->arch.tsc_offset;
4259  ratio = vcpu->arch.tsc_scaling_ratio;
4260  }
4261 
4262  msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
4263  break;
4264  }
4265  case MSR_IA32_CR_PAT:
4266  msr_info->data = vcpu->arch.pat;
4267  break;
4268  case MSR_MTRRcap:
4269  case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
4270  case MSR_MTRRdefType:
4271  return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
4272  case 0xcd: /* fsb frequency */
4273  msr_info->data = 3;
4274  break;
4275  /*
4276  * MSR_EBC_FREQUENCY_ID
4277  * Conservative value valid for even the basic CPU models.
4278  * Models 0,1: 000 in bits 23:21 indicating a bus speed of
4279  * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
4280  * and 266MHz for model 3, or 4. Set Core Clock
4281  * Frequency to System Bus Frequency Ratio to 1 (bits
4282  * 31:24) even though these are only valid for CPU
4283  * models > 2, however guests may end up dividing or
4284  * multiplying by zero otherwise.
4285  */
4286  case MSR_EBC_FREQUENCY_ID:
4287  msr_info->data = 1 << 24;
4288  break;
4289  case MSR_IA32_APICBASE:
4290  msr_info->data = kvm_get_apic_base(vcpu);
4291  break;
4292  case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
4293  return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
4294  case MSR_IA32_TSC_DEADLINE:
4295  msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
4296  break;
4297  case MSR_IA32_TSC_ADJUST:
4298  msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
4299  break;
4300  case MSR_IA32_MISC_ENABLE:
4301  msr_info->data = vcpu->arch.ia32_misc_enable_msr;
4302  break;
4303  case MSR_IA32_SMBASE:
4304  if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
4305  return 1;
4306  msr_info->data = vcpu->arch.smbase;
4307  break;
4308  case MSR_SMI_COUNT:
4309  msr_info->data = vcpu->arch.smi_count;
4310  break;
4311  case MSR_IA32_PERF_STATUS:
4312  /* TSC increment by tick */
4313  msr_info->data = 1000ULL;
4314  /* CPU multiplier */
4315  msr_info->data |= (((uint64_t)4ULL) << 40);
4316  break;
4317  case MSR_EFER:
4318  msr_info->data = vcpu->arch.efer;
4319  break;
4320  case MSR_KVM_WALL_CLOCK:
4321  if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
4322  return 1;
4323 
4324  msr_info->data = vcpu->kvm->arch.wall_clock;
4325  break;
4326  case MSR_KVM_WALL_CLOCK_NEW:
4327  if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
4328  return 1;
4329 
4330  msr_info->data = vcpu->kvm->arch.wall_clock;
4331  break;
4332  case MSR_KVM_SYSTEM_TIME:
4333  if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
4334  return 1;
4335 
4336  msr_info->data = vcpu->arch.time;
4337  break;
4338  case MSR_KVM_SYSTEM_TIME_NEW:
4339  if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
4340  return 1;
4341 
4342  msr_info->data = vcpu->arch.time;
4343  break;
4344  case MSR_KVM_ASYNC_PF_EN:
4345  if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
4346  return 1;
4347 
4348  msr_info->data = vcpu->arch.apf.msr_en_val;
4349  break;
4350  case MSR_KVM_ASYNC_PF_INT:
4351  if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
4352  return 1;
4353 
4354  msr_info->data = vcpu->arch.apf.msr_int_val;
4355  break;
4356  case MSR_KVM_ASYNC_PF_ACK:
4357  if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
4358  return 1;
4359 
4360  msr_info->data = 0;
4361  break;
4362  case MSR_KVM_STEAL_TIME:
4363  if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
4364  return 1;
4365 
4366  msr_info->data = vcpu->arch.st.msr_val;
4367  break;
4368  case MSR_KVM_PV_EOI_EN:
4369  if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
4370  return 1;
4371 
4372  msr_info->data = vcpu->arch.pv_eoi.msr_val;
4373  break;
4374  case MSR_KVM_POLL_CONTROL:
4375  if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
4376  return 1;
4377 
4378  msr_info->data = vcpu->arch.msr_kvm_poll_control;
4379  break;
4380  case MSR_IA32_P5_MC_ADDR:
4381  case MSR_IA32_P5_MC_TYPE:
4382  case MSR_IA32_MCG_CAP:
4383  case MSR_IA32_MCG_CTL:
4384  case MSR_IA32_MCG_STATUS:
4385  case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4386  case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4387  return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
4388  msr_info->host_initiated);
4389  case MSR_IA32_XSS:
4390  if (!msr_info->host_initiated &&
4391  !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
4392  return 1;
4393  msr_info->data = vcpu->arch.ia32_xss;
4394  break;
4395  case MSR_K7_CLK_CTL:
4396  /*
4397  * Provide expected ramp-up count for K7. All other
4398  * are set to zero, indicating minimum divisors for
4399  * every field.
4400  *
4401  * This prevents guest kernels on AMD host with CPU
4402  * type 6, model 8 and higher from exploding due to
4403  * the rdmsr failing.
4404  */
4405  msr_info->data = 0x20000000;
4406  break;
4407 #ifdef CONFIG_KVM_HYPERV
4408  case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
4409  case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
4410  case HV_X64_MSR_SYNDBG_OPTIONS:
4411  case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
4412  case HV_X64_MSR_CRASH_CTL:
4413  case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
4414  case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
4415  case HV_X64_MSR_TSC_EMULATION_CONTROL:
4416  case HV_X64_MSR_TSC_EMULATION_STATUS:
4417  case HV_X64_MSR_TSC_INVARIANT_CONTROL:
4418  return kvm_hv_get_msr_common(vcpu,
4419  msr_info->index, &msr_info->data,
4420  msr_info->host_initiated);
4421 #endif
4422  case MSR_IA32_BBL_CR_CTL3:
4423  /* This legacy MSR exists but isn't fully documented in current
4424  * silicon. It is however accessed by winxp in very narrow
4425  * scenarios where it sets bit #19, itself documented as
4426  * a "reserved" bit. Best effort attempt to source coherent
4427  * read data here should the balance of the register be
4428  * interpreted by the guest:
4429  *
4430  * L2 cache control register 3: 64GB range, 256KB size,
4431  * enabled, latency 0x1, configured
4432  */
4433  msr_info->data = 0xbe702111;
4434  break;
4435  case MSR_AMD64_OSVW_ID_LENGTH:
4436  if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
4437  return 1;
4438  msr_info->data = vcpu->arch.osvw.length;
4439  break;
4440  case MSR_AMD64_OSVW_STATUS:
4441  if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
4442  return 1;
4443  msr_info->data = vcpu->arch.osvw.status;
4444  break;
4445  case MSR_PLATFORM_INFO:
4446  if (!msr_info->host_initiated &&
4447  !vcpu->kvm->arch.guest_can_read_msr_platform_info)
4448  return 1;
4449  msr_info->data = vcpu->arch.msr_platform_info;
4450  break;
4451  case MSR_MISC_FEATURES_ENABLES:
4452  msr_info->data = vcpu->arch.msr_misc_features_enables;
4453  break;
4454  case MSR_K7_HWCR:
4455  msr_info->data = vcpu->arch.msr_hwcr;
4456  break;
4457 #ifdef CONFIG_X86_64
4458  case MSR_IA32_XFD:
4459  if (!msr_info->host_initiated &&
4460  !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
4461  return 1;
4462 
4463  msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
4464  break;
4465  case MSR_IA32_XFD_ERR:
4466  if (!msr_info->host_initiated &&
4467  !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
4468  return 1;
4469 
4470  msr_info->data = vcpu->arch.guest_fpu.xfd_err;
4471  break;
4472 #endif
4473  default:
4474  if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4475  return kvm_pmu_get_msr(vcpu, msr_info);
4476 
4477  /*
4478  * Userspace is allowed to read MSRs that KVM reports as
4479  * to-be-saved, even if an MSR isn't fully supported.
4480  */
4481  if (msr_info->host_initiated &&
4482  kvm_is_msr_to_save(msr_info->index)) {
4483  msr_info->data = 0;
4484  break;
4485  }
4486 
4487  return KVM_MSR_RET_INVALID;
4488  }
4489  return 0;
4490 }
4492 
4493 /*
4494  * Read or write a bunch of msrs. All parameters are kernel addresses.
4495  *
4496  * @return number of msrs set successfully.
4497  */
4498 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
4499  struct kvm_msr_entry *entries,
4500  int (*do_msr)(struct kvm_vcpu *vcpu,
4501  unsigned index, u64 *data))
4502 {
4503  int i;
4504 
4505  for (i = 0; i < msrs->nmsrs; ++i)
4506  if (do_msr(vcpu, entries[i].index, &entries[i].data))
4507  break;
4508 
4509  return i;
4510 }
4511 
4512 /*
4513  * Read or write a bunch of msrs. Parameters are user addresses.
4514  *
4515  * @return number of msrs set successfully.
4516  */
4517 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
4518  int (*do_msr)(struct kvm_vcpu *vcpu,
4519  unsigned index, u64 *data),
4520  int writeback)
4521 {
4522  struct kvm_msrs msrs;
4523  struct kvm_msr_entry *entries;
4524  unsigned size;
4525  int r;
4526 
4527  r = -EFAULT;
4528  if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
4529  goto out;
4530 
4531  r = -E2BIG;
4532  if (msrs.nmsrs >= MAX_IO_MSRS)
4533  goto out;
4534 
4535  size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
4536  entries = memdup_user(user_msrs->entries, size);
4537  if (IS_ERR(entries)) {
4538  r = PTR_ERR(entries);
4539  goto out;
4540  }
4541 
4542  r = __msr_io(vcpu, &msrs, entries, do_msr);
4543 
4544  if (writeback && copy_to_user(user_msrs->entries, entries, size))
4545  r = -EFAULT;
4546 
4547  kfree(entries);
4548 out:
4549  return r;
4550 }
4551 
4552 static inline bool kvm_can_mwait_in_guest(void)
4553 {
4554  return boot_cpu_has(X86_FEATURE_MWAIT) &&
4555  !boot_cpu_has_bug(X86_BUG_MONITOR) &&
4556  boot_cpu_has(X86_FEATURE_ARAT);
4557 }
4558 
4559 #ifdef CONFIG_KVM_HYPERV
4560 static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
4561  struct kvm_cpuid2 __user *cpuid_arg)
4562 {
4563  struct kvm_cpuid2 cpuid;
4564  int r;
4565 
4566  r = -EFAULT;
4567  if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4568  return r;
4569 
4570  r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
4571  if (r)
4572  return r;
4573 
4574  r = -EFAULT;
4575  if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4576  return r;
4577 
4578  return 0;
4579 }
4580 #endif
4581 
4582 static bool kvm_is_vm_type_supported(unsigned long type)
4583 {
4584  return type == KVM_X86_DEFAULT_VM ||
4585  (type == KVM_X86_SW_PROTECTED_VM &&
4586  IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
4587 }
4588 
4589 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
4590 {
4591  int r = 0;
4592 
4593  switch (ext) {
4594  case KVM_CAP_IRQCHIP:
4595  case KVM_CAP_HLT:
4596  case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
4597  case KVM_CAP_SET_TSS_ADDR:
4598  case KVM_CAP_EXT_CPUID:
4599  case KVM_CAP_EXT_EMUL_CPUID:
4600  case KVM_CAP_CLOCKSOURCE:
4601  case KVM_CAP_PIT:
4602  case KVM_CAP_NOP_IO_DELAY:
4603  case KVM_CAP_MP_STATE:
4604  case KVM_CAP_SYNC_MMU:
4605  case KVM_CAP_USER_NMI:
4606  case KVM_CAP_REINJECT_CONTROL:
4607  case KVM_CAP_IRQ_INJECT_STATUS:
4608  case KVM_CAP_IOEVENTFD:
4609  case KVM_CAP_IOEVENTFD_NO_LENGTH:
4610  case KVM_CAP_PIT2:
4611  case KVM_CAP_PIT_STATE2:
4612  case KVM_CAP_SET_IDENTITY_MAP_ADDR:
4613  case KVM_CAP_VCPU_EVENTS:
4614 #ifdef CONFIG_KVM_HYPERV
4615  case KVM_CAP_HYPERV:
4616  case KVM_CAP_HYPERV_VAPIC:
4617  case KVM_CAP_HYPERV_SPIN:
4618  case KVM_CAP_HYPERV_TIME:
4619  case KVM_CAP_HYPERV_SYNIC:
4620  case KVM_CAP_HYPERV_SYNIC2:
4621  case KVM_CAP_HYPERV_VP_INDEX:
4622  case KVM_CAP_HYPERV_EVENTFD:
4623  case KVM_CAP_HYPERV_TLBFLUSH:
4624  case KVM_CAP_HYPERV_SEND_IPI:
4625  case KVM_CAP_HYPERV_CPUID:
4626  case KVM_CAP_HYPERV_ENFORCE_CPUID:
4627  case KVM_CAP_SYS_HYPERV_CPUID:
4628 #endif
4629  case KVM_CAP_PCI_SEGMENT:
4630  case KVM_CAP_DEBUGREGS:
4631  case KVM_CAP_X86_ROBUST_SINGLESTEP:
4632  case KVM_CAP_XSAVE:
4633  case KVM_CAP_ASYNC_PF:
4634  case KVM_CAP_ASYNC_PF_INT:
4635  case KVM_CAP_GET_TSC_KHZ:
4636  case KVM_CAP_KVMCLOCK_CTRL:
4637  case KVM_CAP_READONLY_MEM:
4638  case KVM_CAP_IOAPIC_POLARITY_IGNORED:
4639  case KVM_CAP_TSC_DEADLINE_TIMER:
4640  case KVM_CAP_DISABLE_QUIRKS:
4641  case KVM_CAP_SET_BOOT_CPU_ID:
4642  case KVM_CAP_SPLIT_IRQCHIP:
4643  case KVM_CAP_IMMEDIATE_EXIT:
4644  case KVM_CAP_PMU_EVENT_FILTER:
4645  case KVM_CAP_PMU_EVENT_MASKED_EVENTS:
4646  case KVM_CAP_GET_MSR_FEATURES:
4647  case KVM_CAP_MSR_PLATFORM_INFO:
4648  case KVM_CAP_EXCEPTION_PAYLOAD:
4649  case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
4650  case KVM_CAP_SET_GUEST_DEBUG:
4651  case KVM_CAP_LAST_CPU:
4652  case KVM_CAP_X86_USER_SPACE_MSR:
4653  case KVM_CAP_X86_MSR_FILTER:
4654  case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
4655 #ifdef CONFIG_X86_SGX_KVM
4656  case KVM_CAP_SGX_ATTRIBUTE:
4657 #endif
4658  case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
4659  case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
4660  case KVM_CAP_SREGS2:
4661  case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
4662  case KVM_CAP_VCPU_ATTRIBUTES:
4663  case KVM_CAP_SYS_ATTRIBUTES:
4664  case KVM_CAP_VAPIC:
4665  case KVM_CAP_ENABLE_CAP:
4666  case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
4667  case KVM_CAP_IRQFD_RESAMPLE:
4668  case KVM_CAP_MEMORY_FAULT_INFO:
4669  r = 1;
4670  break;
4671  case KVM_CAP_EXIT_HYPERCALL:
4673  break;
4674  case KVM_CAP_SET_GUEST_DEBUG2:
4675  return KVM_GUESTDBG_VALID_MASK;
4676 #ifdef CONFIG_KVM_XEN
4677  case KVM_CAP_XEN_HVM:
4678  r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
4679  KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
4680  KVM_XEN_HVM_CONFIG_SHARED_INFO |
4681  KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
4682  KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
4683  KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
4684  if (sched_info_on())
4685  r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
4686  KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
4687  break;
4688 #endif
4689  case KVM_CAP_SYNC_REGS:
4690  r = KVM_SYNC_X86_VALID_FIELDS;
4691  break;
4692  case KVM_CAP_ADJUST_CLOCK:
4693  r = KVM_CLOCK_VALID_FLAGS;
4694  break;
4695  case KVM_CAP_X86_DISABLE_EXITS:
4696  r = KVM_X86_DISABLE_EXITS_PAUSE;
4697 
4698  if (!mitigate_smt_rsb) {
4699  r |= KVM_X86_DISABLE_EXITS_HLT |
4700  KVM_X86_DISABLE_EXITS_CSTATE;
4701 
4702  if (kvm_can_mwait_in_guest())
4703  r |= KVM_X86_DISABLE_EXITS_MWAIT;
4704  }
4705  break;
4706  case KVM_CAP_X86_SMM:
4707  if (!IS_ENABLED(CONFIG_KVM_SMM))
4708  break;
4709 
4710  /* SMBASE is usually relocated above 1M on modern chipsets,
4711  * and SMM handlers might indeed rely on 4G segment limits,
4712  * so do not report SMM to be available if real mode is
4713  * emulated via vm86 mode. Still, do not go to great lengths
4714  * to avoid userspace's usage of the feature, because it is a
4715  * fringe case that is not enabled except via specific settings
4716  * of the module parameters.
4717  */
4718  r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
4719  break;
4720  case KVM_CAP_NR_VCPUS:
4721  r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
4722  break;
4723  case KVM_CAP_MAX_VCPUS:
4724  r = KVM_MAX_VCPUS;
4725  break;
4726  case KVM_CAP_MAX_VCPU_ID:
4727  r = KVM_MAX_VCPU_IDS;
4728  break;
4729  case KVM_CAP_PV_MMU: /* obsolete */
4730  r = 0;
4731  break;
4732  case KVM_CAP_MCE:
4733  r = KVM_MAX_MCE_BANKS;
4734  break;
4735  case KVM_CAP_XCRS:
4736  r = boot_cpu_has(X86_FEATURE_XSAVE);
4737  break;
4738  case KVM_CAP_TSC_CONTROL:
4739  case KVM_CAP_VM_TSC_CONTROL:
4741  break;
4742  case KVM_CAP_X2APIC_API:
4744  break;
4745  case KVM_CAP_NESTED_STATE:
4746  r = kvm_x86_ops.nested_ops->get_state ?
4747  kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
4748  break;
4749 #ifdef CONFIG_KVM_HYPERV
4750  case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
4751  r = kvm_x86_ops.enable_l2_tlb_flush != NULL;
4752  break;
4753  case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
4754  r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
4755  break;
4756 #endif
4757  case KVM_CAP_SMALLER_MAXPHYADDR:
4758  r = (int) allow_smaller_maxphyaddr;
4759  break;
4760  case KVM_CAP_STEAL_TIME:
4761  r = sched_info_on();
4762  break;
4763  case KVM_CAP_X86_BUS_LOCK_EXIT:
4765  r = KVM_BUS_LOCK_DETECTION_OFF |
4766  KVM_BUS_LOCK_DETECTION_EXIT;
4767  else
4768  r = 0;
4769  break;
4770  case KVM_CAP_XSAVE2: {
4772  if (r < sizeof(struct kvm_xsave))
4773  r = sizeof(struct kvm_xsave);
4774  break;
4775  }
4776  case KVM_CAP_PMU_CAPABILITY:
4778  break;
4779  case KVM_CAP_DISABLE_QUIRKS2:
4780  r = KVM_X86_VALID_QUIRKS;
4781  break;
4782  case KVM_CAP_X86_NOTIFY_VMEXIT:
4784  break;
4785  case KVM_CAP_VM_TYPES:
4786  r = BIT(KVM_X86_DEFAULT_VM);
4787  if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM))
4788  r |= BIT(KVM_X86_SW_PROTECTED_VM);
4789  break;
4790  default:
4791  break;
4792  }
4793  return r;
4794 }
4795 
4796 static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
4797 {
4798  void __user *uaddr = (void __user*)(unsigned long)attr->addr;
4799 
4800  if ((u64)(unsigned long)uaddr != attr->addr)
4801  return ERR_PTR_USR(-EFAULT);
4802  return uaddr;
4803 }
4804 
4805 static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
4806 {
4807  u64 __user *uaddr = kvm_get_attr_addr(attr);
4808 
4809  if (attr->group)
4810  return -ENXIO;
4811 
4812  if (IS_ERR(uaddr))
4813  return PTR_ERR(uaddr);
4814 
4815  switch (attr->attr) {
4816  case KVM_X86_XCOMP_GUEST_SUPP:
4817  if (put_user(kvm_caps.supported_xcr0, uaddr))
4818  return -EFAULT;
4819  return 0;
4820  default:
4821  return -ENXIO;
4822  }
4823 }
4824 
4825 static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
4826 {
4827  if (attr->group)
4828  return -ENXIO;
4829 
4830  switch (attr->attr) {
4831  case KVM_X86_XCOMP_GUEST_SUPP:
4832  return 0;
4833  default:
4834  return -ENXIO;
4835  }
4836 }
4837 
4838 long kvm_arch_dev_ioctl(struct file *filp,
4839  unsigned int ioctl, unsigned long arg)
4840 {
4841  void __user *argp = (void __user *)arg;
4842  long r;
4843 
4844  switch (ioctl) {
4845  case KVM_GET_MSR_INDEX_LIST: {
4846  struct kvm_msr_list __user *user_msr_list = argp;
4847  struct kvm_msr_list msr_list;
4848  unsigned n;
4849 
4850  r = -EFAULT;
4851  if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
4852  goto out;
4853  n = msr_list.nmsrs;
4854  msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
4855  if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
4856  goto out;
4857  r = -E2BIG;
4858  if (n < msr_list.nmsrs)
4859  goto out;
4860  r = -EFAULT;
4861  if (copy_to_user(user_msr_list->indices, &msrs_to_save,
4862  num_msrs_to_save * sizeof(u32)))
4863  goto out;
4864  if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
4865  &emulated_msrs,
4866  num_emulated_msrs * sizeof(u32)))
4867  goto out;
4868  r = 0;
4869  break;
4870  }
4871  case KVM_GET_SUPPORTED_CPUID:
4872  case KVM_GET_EMULATED_CPUID: {
4873  struct kvm_cpuid2 __user *cpuid_arg = argp;
4874  struct kvm_cpuid2 cpuid;
4875 
4876  r = -EFAULT;
4877  if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4878  goto out;
4879 
4880  r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
4881  ioctl);
4882  if (r)
4883  goto out;
4884 
4885  r = -EFAULT;
4886  if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4887  goto out;
4888  r = 0;
4889  break;
4890  }
4891  case KVM_X86_GET_MCE_CAP_SUPPORTED:
4892  r = -EFAULT;
4893  if (copy_to_user(argp, &kvm_caps.supported_mce_cap,
4894  sizeof(kvm_caps.supported_mce_cap)))
4895  goto out;
4896  r = 0;
4897  break;
4898  case KVM_GET_MSR_FEATURE_INDEX_LIST: {
4899  struct kvm_msr_list __user *user_msr_list = argp;
4900  struct kvm_msr_list msr_list;
4901  unsigned int n;
4902 
4903  r = -EFAULT;
4904  if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
4905  goto out;
4906  n = msr_list.nmsrs;
4907  msr_list.nmsrs = num_msr_based_features;
4908  if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
4909  goto out;
4910  r = -E2BIG;
4911  if (n < msr_list.nmsrs)
4912  goto out;
4913  r = -EFAULT;
4914  if (copy_to_user(user_msr_list->indices, &msr_based_features,
4915  num_msr_based_features * sizeof(u32)))
4916  goto out;
4917  r = 0;
4918  break;
4919  }
4920  case KVM_GET_MSRS:
4921  r = msr_io(NULL, argp, do_get_msr_feature, 1);
4922  break;
4923 #ifdef CONFIG_KVM_HYPERV
4924  case KVM_GET_SUPPORTED_HV_CPUID:
4925  r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
4926  break;
4927 #endif
4928  case KVM_GET_DEVICE_ATTR: {
4929  struct kvm_device_attr attr;
4930  r = -EFAULT;
4931  if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4932  break;
4933  r = kvm_x86_dev_get_attr(&attr);
4934  break;
4935  }
4936  case KVM_HAS_DEVICE_ATTR: {
4937  struct kvm_device_attr attr;
4938  r = -EFAULT;
4939  if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4940  break;
4941  r = kvm_x86_dev_has_attr(&attr);
4942  break;
4943  }
4944  default:
4945  r = -EINVAL;
4946  break;
4947  }
4948 out:
4949  return r;
4950 }
4951 
4952 static void wbinvd_ipi(void *garbage)
4953 {
4954  wbinvd();
4955 }
4956 
4957 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
4958 {
4959  return kvm_arch_has_noncoherent_dma(vcpu->kvm);
4960 }
4961 
4962 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
4963 {
4964  /* Address WBINVD may be executed by guest */
4965  if (need_emulate_wbinvd(vcpu)) {
4966  if (static_call(kvm_x86_has_wbinvd_exit)())
4967  cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
4968  else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
4969  smp_call_function_single(vcpu->cpu,
4970  wbinvd_ipi, NULL, 1);
4971  }
4972 
4973  static_call(kvm_x86_vcpu_load)(vcpu, cpu);
4974 
4975  /* Save host pkru register if supported */
4976  vcpu->arch.host_pkru = read_pkru();
4977 
4978  /* Apply any externally detected TSC adjustments (due to suspend) */
4979  if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
4980  adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
4981  vcpu->arch.tsc_offset_adjustment = 0;
4982  kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4983  }
4984 
4985  if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
4986  s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
4987  rdtsc() - vcpu->arch.last_host_tsc;
4988  if (tsc_delta < 0)
4989  mark_tsc_unstable("KVM discovered backwards TSC");
4990 
4991  if (kvm_check_tsc_unstable()) {
4992  u64 offset = kvm_compute_l1_tsc_offset(vcpu,
4993  vcpu->arch.last_guest_tsc);
4994  kvm_vcpu_write_tsc_offset(vcpu, offset);
4995  vcpu->arch.tsc_catchup = 1;
4996  }
4997 
4998  if (kvm_lapic_hv_timer_in_use(vcpu))
5000 
5001  /*
5002  * On a host with synchronized TSC, there is no need to update
5003  * kvmclock on vcpu->cpu migration
5004  */
5005  if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
5006  kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
5007  if (vcpu->cpu != cpu)
5008  kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
5009  vcpu->cpu = cpu;
5010  }
5011 
5012  kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
5013 }
5014 
5015 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
5016 {
5017  struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
5018  struct kvm_steal_time __user *st;
5019  struct kvm_memslots *slots;
5020  static const u8 preempted = KVM_VCPU_PREEMPTED;
5021  gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
5022 
5023  /*
5024  * The vCPU can be marked preempted if and only if the VM-Exit was on
5025  * an instruction boundary and will not trigger guest emulation of any
5026  * kind (see vcpu_run). Vendor specific code controls (conservatively)
5027  * when this is true, for example allowing the vCPU to be marked
5028  * preempted if and only if the VM-Exit was due to a host interrupt.
5029  */
5030  if (!vcpu->arch.at_instruction_boundary) {
5031  vcpu->stat.preemption_other++;
5032  return;
5033  }
5034 
5035  vcpu->stat.preemption_reported++;
5036  if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
5037  return;
5038 
5039  if (vcpu->arch.st.preempted)
5040  return;
5041 
5042  /* This happens on process exit */
5043  if (unlikely(current->mm != vcpu->kvm->mm))
5044  return;
5045 
5046  slots = kvm_memslots(vcpu->kvm);
5047 
5048  if (unlikely(slots->generation != ghc->generation ||
5049  gpa != ghc->gpa ||
5050  kvm_is_error_hva(ghc->hva) || !ghc->memslot))
5051  return;
5052 
5053  st = (struct kvm_steal_time __user *)ghc->hva;
5054  BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
5055 
5056  if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
5057  vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
5058 
5059  mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
5060 }
5061 
5062 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
5063 {
5064  int idx;
5065 
5066  if (vcpu->preempted) {
5067  if (!vcpu->arch.guest_state_protected)
5068  vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
5069 
5070  /*
5071  * Take the srcu lock as memslots will be accessed to check the gfn
5072  * cache generation against the memslots generation.
5073  */
5074  idx = srcu_read_lock(&vcpu->kvm->srcu);
5075  if (kvm_xen_msr_enabled(vcpu->kvm))
5077  else
5079  srcu_read_unlock(&vcpu->kvm->srcu, idx);
5080  }
5081 
5082  static_call(kvm_x86_vcpu_put)(vcpu);
5083  vcpu->arch.last_host_tsc = rdtsc();
5084 }
5085 
5086 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
5087  struct kvm_lapic_state *s)
5088 {
5089  static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
5090 
5091  return kvm_apic_get_state(vcpu, s);
5092 }
5093 
5094 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
5095  struct kvm_lapic_state *s)
5096 {
5097  int r;
5098 
5099  r = kvm_apic_set_state(vcpu, s);
5100  if (r)
5101  return r;
5102  update_cr8_intercept(vcpu);
5103 
5104  return 0;
5105 }
5106 
5107 static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
5108 {
5109  /*
5110  * We can accept userspace's request for interrupt injection
5111  * as long as we have a place to store the interrupt number.
5112  * The actual injection will happen when the CPU is able to
5113  * deliver the interrupt.
5114  */
5115  if (kvm_cpu_has_extint(vcpu))
5116  return false;
5117 
5118  /* Acknowledging ExtINT does not happen if LINT0 is masked. */
5119  return (!lapic_in_kernel(vcpu) ||
5120  kvm_apic_accept_pic_intr(vcpu));
5121 }
5122 
5123 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
5124 {
5125  /*
5126  * Do not cause an interrupt window exit if an exception
5127  * is pending or an event needs reinjection; userspace
5128  * might want to inject the interrupt manually using KVM_SET_REGS
5129  * or KVM_SET_SREGS. For that to work, we must be at an
5130  * instruction boundary and with no events half-injected.
5131  */
5132  return (kvm_arch_interrupt_allowed(vcpu) &&
5133  kvm_cpu_accept_dm_intr(vcpu) &&
5134  !kvm_event_needs_reinjection(vcpu) &&
5135  !kvm_is_exception_pending(vcpu));
5136 }
5137 
5138 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
5139  struct kvm_interrupt *irq)
5140 {
5141  if (irq->irq >= KVM_NR_INTERRUPTS)
5142  return -EINVAL;
5143 
5144  if (!irqchip_in_kernel(vcpu->kvm)) {
5145  kvm_queue_interrupt(vcpu, irq->irq, false);
5146  kvm_make_request(KVM_REQ_EVENT, vcpu);
5147  return 0;
5148  }
5149 
5150  /*
5151  * With in-kernel LAPIC, we only use this to inject EXTINT, so
5152  * fail for in-kernel 8259.
5153  */
5154  if (pic_in_kernel(vcpu->kvm))
5155  return -ENXIO;
5156 
5157  if (vcpu->arch.pending_external_vector != -1)
5158  return -EEXIST;
5159 
5160  vcpu->arch.pending_external_vector = irq->irq;
5161  kvm_make_request(KVM_REQ_EVENT, vcpu);
5162  return 0;
5163 }
5164 
5165 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
5166 {
5167  kvm_inject_nmi(vcpu);
5168 
5169  return 0;
5170 }
5171 
5172 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
5173  struct kvm_tpr_access_ctl *tac)
5174 {
5175  if (tac->flags)
5176  return -EINVAL;
5177  vcpu->arch.tpr_access_reporting = !!tac->enabled;
5178  return 0;
5179 }
5180 
5181 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
5182  u64 mcg_cap)
5183 {
5184  int r;
5185  unsigned bank_num = mcg_cap & 0xff, bank;
5186 
5187  r = -EINVAL;
5188  if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
5189  goto out;
5190  if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000))
5191  goto out;
5192  r = 0;
5193  vcpu->arch.mcg_cap = mcg_cap;
5194  /* Init IA32_MCG_CTL to all 1s */
5195  if (mcg_cap & MCG_CTL_P)
5196  vcpu->arch.mcg_ctl = ~(u64)0;
5197  /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
5198  for (bank = 0; bank < bank_num; bank++) {
5199  vcpu->arch.mce_banks[bank*4] = ~(u64)0;
5200  if (mcg_cap & MCG_CMCI_P)
5201  vcpu->arch.mci_ctl2_banks[bank] = 0;
5202  }
5203 
5205 
5206  static_call(kvm_x86_setup_mce)(vcpu);
5207 out:
5208  return r;
5209 }
5210 
5211 /*
5212  * Validate this is an UCNA (uncorrectable no action) error by checking the
5213  * MCG_STATUS and MCi_STATUS registers:
5214  * - none of the bits for Machine Check Exceptions are set
5215  * - both the VAL (valid) and UC (uncorrectable) bits are set
5216  * MCI_STATUS_PCC - Processor Context Corrupted
5217  * MCI_STATUS_S - Signaled as a Machine Check Exception
5218  * MCI_STATUS_AR - Software recoverable Action Required
5219  */
5220 static bool is_ucna(struct kvm_x86_mce *mce)
5221 {
5222  return !mce->mcg_status &&
5223  !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) &&
5224  (mce->status & MCI_STATUS_VAL) &&
5225  (mce->status & MCI_STATUS_UC);
5226 }
5227 
5228 static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks)
5229 {
5230  u64 mcg_cap = vcpu->arch.mcg_cap;
5231 
5232  banks[1] = mce->status;
5233  banks[2] = mce->addr;
5234  banks[3] = mce->misc;
5235  vcpu->arch.mcg_status = mce->mcg_status;
5236 
5237  if (!(mcg_cap & MCG_CMCI_P) ||
5238  !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
5239  return 0;
5240 
5241  if (lapic_in_kernel(vcpu))
5242  kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
5243 
5244  return 0;
5245 }
5246 
5247 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
5248  struct kvm_x86_mce *mce)
5249 {
5250  u64 mcg_cap = vcpu->arch.mcg_cap;
5251  unsigned bank_num = mcg_cap & 0xff;
5252  u64 *banks = vcpu->arch.mce_banks;
5253 
5254  if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
5255  return -EINVAL;
5256 
5257  banks += array_index_nospec(4 * mce->bank, 4 * bank_num);
5258 
5259  if (is_ucna(mce))
5260  return kvm_vcpu_x86_set_ucna(vcpu, mce, banks);
5261 
5262  /*
5263  * if IA32_MCG_CTL is not all 1s, the uncorrected error
5264  * reporting is disabled
5265  */
5266  if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
5267  vcpu->arch.mcg_ctl != ~(u64)0)
5268  return 0;
5269  /*
5270  * if IA32_MCi_CTL is not all 1s, the uncorrected error
5271  * reporting is disabled for the bank
5272  */
5273  if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
5274  return 0;
5275  if (mce->status & MCI_STATUS_UC) {
5276  if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
5277  !kvm_is_cr4_bit_set(vcpu, X86_CR4_MCE)) {
5278  kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5279  return 0;
5280  }
5281  if (banks[1] & MCI_STATUS_VAL)
5282  mce->status |= MCI_STATUS_OVER;
5283  banks[2] = mce->addr;
5284  banks[3] = mce->misc;
5285  vcpu->arch.mcg_status = mce->mcg_status;
5286  banks[1] = mce->status;
5287  kvm_queue_exception(vcpu, MC_VECTOR);
5288  } else if (!(banks[1] & MCI_STATUS_VAL)
5289  || !(banks[1] & MCI_STATUS_UC)) {
5290  if (banks[1] & MCI_STATUS_VAL)
5291  mce->status |= MCI_STATUS_OVER;
5292  banks[2] = mce->addr;
5293  banks[3] = mce->misc;
5294  banks[1] = mce->status;
5295  } else
5296  banks[1] |= MCI_STATUS_OVER;
5297  return 0;
5298 }
5299 
5300 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
5301  struct kvm_vcpu_events *events)
5302 {
5303  struct kvm_queued_exception *ex;
5304 
5305  process_nmi(vcpu);
5306 
5307 #ifdef CONFIG_KVM_SMM
5308  if (kvm_check_request(KVM_REQ_SMI, vcpu))
5309  process_smi(vcpu);
5310 #endif
5311 
5312  /*
5313  * KVM's ABI only allows for one exception to be migrated. Luckily,
5314  * the only time there can be two queued exceptions is if there's a
5315  * non-exiting _injected_ exception, and a pending exiting exception.
5316  * In that case, ignore the VM-Exiting exception as it's an extension
5317  * of the injected exception.
5318  */
5319  if (vcpu->arch.exception_vmexit.pending &&
5320  !vcpu->arch.exception.pending &&
5321  !vcpu->arch.exception.injected)
5322  ex = &vcpu->arch.exception_vmexit;
5323  else
5324  ex = &vcpu->arch.exception;
5325 
5326  /*
5327  * In guest mode, payload delivery should be deferred if the exception
5328  * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1
5329  * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability,
5330  * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not
5331  * propagate the payload and so it cannot be safely deferred. Deliver
5332  * the payload if the capability hasn't been requested.
5333  */
5334  if (!vcpu->kvm->arch.exception_payload_enabled &&
5335  ex->pending && ex->has_payload)
5337 
5338  memset(events, 0, sizeof(*events));
5339 
5340  /*
5341  * The API doesn't provide the instruction length for software
5342  * exceptions, so don't report them. As long as the guest RIP
5343  * isn't advanced, we should expect to encounter the exception
5344  * again.
5345  */
5346  if (!kvm_exception_is_soft(ex->vector)) {
5347  events->exception.injected = ex->injected;
5348  events->exception.pending = ex->pending;
5349  /*
5350  * For ABI compatibility, deliberately conflate
5351  * pending and injected exceptions when
5352  * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
5353  */
5354  if (!vcpu->kvm->arch.exception_payload_enabled)
5355  events->exception.injected |= ex->pending;
5356  }
5357  events->exception.nr = ex->vector;
5358  events->exception.has_error_code = ex->has_error_code;
5359  events->exception.error_code = ex->error_code;
5360  events->exception_has_payload = ex->has_payload;
5361  events->exception_payload = ex->payload;
5362 
5363  events->interrupt.injected =
5364  vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
5365  events->interrupt.nr = vcpu->arch.interrupt.nr;
5366  events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
5367 
5368  events->nmi.injected = vcpu->arch.nmi_injected;
5369  events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
5370  events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
5371 
5372  /* events->sipi_vector is never valid when reporting to user space */
5373 
5374 #ifdef CONFIG_KVM_SMM
5375  events->smi.smm = is_smm(vcpu);
5376  events->smi.pending = vcpu->arch.smi_pending;
5377  events->smi.smm_inside_nmi =
5378  !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
5379 #endif
5380  events->smi.latched_init = kvm_lapic_latched_init(vcpu);
5381 
5382  events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
5383  | KVM_VCPUEVENT_VALID_SHADOW
5384  | KVM_VCPUEVENT_VALID_SMM);
5385  if (vcpu->kvm->arch.exception_payload_enabled)
5386  events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
5387  if (vcpu->kvm->arch.triple_fault_event) {
5388  events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5389  events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
5390  }
5391 }
5392 
5393 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
5394  struct kvm_vcpu_events *events)
5395 {
5396  if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
5397  | KVM_VCPUEVENT_VALID_SIPI_VECTOR
5398  | KVM_VCPUEVENT_VALID_SHADOW
5399  | KVM_VCPUEVENT_VALID_SMM
5400  | KVM_VCPUEVENT_VALID_PAYLOAD
5401  | KVM_VCPUEVENT_VALID_TRIPLE_FAULT))
5402  return -EINVAL;
5403 
5404  if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
5405  if (!vcpu->kvm->arch.exception_payload_enabled)
5406  return -EINVAL;
5407  if (events->exception.pending)
5408  events->exception.injected = 0;
5409  else
5410  events->exception_has_payload = 0;
5411  } else {
5412  events->exception.pending = 0;
5413  events->exception_has_payload = 0;
5414  }
5415 
5416  if ((events->exception.injected || events->exception.pending) &&
5417  (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
5418  return -EINVAL;
5419 
5420  /* INITs are latched while in SMM */
5421  if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
5422  (events->smi.smm || events->smi.pending) &&
5423  vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
5424  return -EINVAL;
5425 
5426  process_nmi(vcpu);
5427 
5428  /*
5429  * Flag that userspace is stuffing an exception, the next KVM_RUN will
5430  * morph the exception to a VM-Exit if appropriate. Do this only for
5431  * pending exceptions, already-injected exceptions are not subject to
5432  * intercpetion. Note, userspace that conflates pending and injected
5433  * is hosed, and will incorrectly convert an injected exception into a
5434  * pending exception, which in turn may cause a spurious VM-Exit.
5435  */
5436  vcpu->arch.exception_from_userspace = events->exception.pending;
5437 
5438  vcpu->arch.exception_vmexit.pending = false;
5439 
5440  vcpu->arch.exception.injected = events->exception.injected;
5441  vcpu->arch.exception.pending = events->exception.pending;
5442  vcpu->arch.exception.vector = events->exception.nr;
5443  vcpu->arch.exception.has_error_code = events->exception.has_error_code;
5444  vcpu->arch.exception.error_code = events->exception.error_code;
5445  vcpu->arch.exception.has_payload = events->exception_has_payload;
5446  vcpu->arch.exception.payload = events->exception_payload;
5447 
5448  vcpu->arch.interrupt.injected = events->interrupt.injected;
5449  vcpu->arch.interrupt.nr = events->interrupt.nr;
5450  vcpu->arch.interrupt.soft = events->interrupt.soft;
5451  if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
5452  static_call(kvm_x86_set_interrupt_shadow)(vcpu,
5453  events->interrupt.shadow);
5454 
5455  vcpu->arch.nmi_injected = events->nmi.injected;
5456  if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
5457  vcpu->arch.nmi_pending = 0;
5458  atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
5459  if (events->nmi.pending)
5460  kvm_make_request(KVM_REQ_NMI, vcpu);
5461  }
5462  static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
5463 
5464  if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
5465  lapic_in_kernel(vcpu))
5466  vcpu->arch.apic->sipi_vector = events->sipi_vector;
5467 
5468  if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
5469 #ifdef CONFIG_KVM_SMM
5470  if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
5471  kvm_leave_nested(vcpu);
5472  kvm_smm_changed(vcpu, events->smi.smm);
5473  }
5474 
5475  vcpu->arch.smi_pending = events->smi.pending;
5476 
5477  if (events->smi.smm) {
5478  if (events->smi.smm_inside_nmi)
5479  vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
5480  else
5481  vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
5482  }
5483 
5484 #else
5485  if (events->smi.smm || events->smi.pending ||
5486  events->smi.smm_inside_nmi)
5487  return -EINVAL;
5488 #endif
5489 
5490  if (lapic_in_kernel(vcpu)) {
5491  if (events->smi.latched_init)
5492  set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5493  else
5494  clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5495  }
5496  }
5497 
5498  if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
5499  if (!vcpu->kvm->arch.triple_fault_event)
5500  return -EINVAL;
5501  if (events->triple_fault.pending)
5502  kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5503  else
5504  kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5505  }
5506 
5507  kvm_make_request(KVM_REQ_EVENT, vcpu);
5508 
5509  return 0;
5510 }
5511 
5512 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
5513  struct kvm_debugregs *dbgregs)
5514 {
5515  unsigned long val;
5516 
5517  memset(dbgregs, 0, sizeof(*dbgregs));
5518  memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
5519  kvm_get_dr(vcpu, 6, &val);
5520  dbgregs->dr6 = val;
5521  dbgregs->dr7 = vcpu->arch.dr7;
5522 }
5523 
5524 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
5525  struct kvm_debugregs *dbgregs)
5526 {
5527  if (dbgregs->flags)
5528  return -EINVAL;
5529 
5530  if (!kvm_dr6_valid(dbgregs->dr6))
5531  return -EINVAL;
5532  if (!kvm_dr7_valid(dbgregs->dr7))
5533  return -EINVAL;
5534 
5535  memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
5536  kvm_update_dr0123(vcpu);
5537  vcpu->arch.dr6 = dbgregs->dr6;
5538  vcpu->arch.dr7 = dbgregs->dr7;
5539  kvm_update_dr7(vcpu);
5540 
5541  return 0;
5542 }
5543 
5544 
5545 static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
5546  u8 *state, unsigned int size)
5547 {
5548  /*
5549  * Only copy state for features that are enabled for the guest. The
5550  * state itself isn't problematic, but setting bits in the header for
5551  * features that are supported in *this* host but not exposed to the
5552  * guest can result in KVM_SET_XSAVE failing when live migrating to a
5553  * compatible host without the features that are NOT exposed to the
5554  * guest.
5555  *
5556  * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if
5557  * XSAVE/XCRO are not exposed to the guest, and even if XSAVE isn't
5558  * supported by the host.
5559  */
5560  u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 |
5561  XFEATURE_MASK_FPSSE;
5562 
5563  if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5564  return;
5565 
5566  fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
5567  supported_xcr0, vcpu->arch.pkru);
5568 }
5569 
5570 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
5571  struct kvm_xsave *guest_xsave)
5572 {
5573  kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
5574  sizeof(guest_xsave->region));
5575 }
5576 
5577 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
5578  struct kvm_xsave *guest_xsave)
5579 {
5580  if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5581  return 0;
5582 
5583  return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
5584  guest_xsave->region,
5586  &vcpu->arch.pkru);
5587 }
5588 
5589 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
5590  struct kvm_xcrs *guest_xcrs)
5591 {
5592  if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
5593  guest_xcrs->nr_xcrs = 0;
5594  return;
5595  }
5596 
5597  guest_xcrs->nr_xcrs = 1;
5598  guest_xcrs->flags = 0;
5599  guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
5600  guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
5601 }
5602 
5603 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
5604  struct kvm_xcrs *guest_xcrs)
5605 {
5606  int i, r = 0;
5607 
5608  if (!boot_cpu_has(X86_FEATURE_XSAVE))
5609  return -EINVAL;
5610 
5611  if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
5612  return -EINVAL;
5613 
5614  for (i = 0; i < guest_xcrs->nr_xcrs; i++)
5615  /* Only support XCR0 currently */
5616  if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
5617  r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
5618  guest_xcrs->xcrs[i].value);
5619  break;
5620  }
5621  if (r)
5622  r = -EINVAL;
5623  return r;
5624 }
5625 
5626 /*
5627  * kvm_set_guest_paused() indicates to the guest kernel that it has been
5628  * stopped by the hypervisor. This function will be called from the host only.
5629  * EINVAL is returned when the host attempts to set the flag for a guest that
5630  * does not support pv clocks.
5631  */
5632 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
5633 {
5634  if (!vcpu->arch.pv_time.active)
5635  return -EINVAL;
5636  vcpu->arch.pvclock_set_guest_stopped_request = true;
5637  kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5638  return 0;
5639 }
5640 
5641 static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
5642  struct kvm_device_attr *attr)
5643 {
5644  int r;
5645 
5646  switch (attr->attr) {
5647  case KVM_VCPU_TSC_OFFSET:
5648  r = 0;
5649  break;
5650  default:
5651  r = -ENXIO;
5652  }
5653 
5654  return r;
5655 }
5656 
5657 static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
5658  struct kvm_device_attr *attr)
5659 {
5660  u64 __user *uaddr = kvm_get_attr_addr(attr);
5661  int r;
5662 
5663  if (IS_ERR(uaddr))
5664  return PTR_ERR(uaddr);
5665 
5666  switch (attr->attr) {
5667  case KVM_VCPU_TSC_OFFSET:
5668  r = -EFAULT;
5669  if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
5670  break;
5671  r = 0;
5672  break;
5673  default:
5674  r = -ENXIO;
5675  }
5676 
5677  return r;
5678 }
5679 
5680 static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
5681  struct kvm_device_attr *attr)
5682 {
5683  u64 __user *uaddr = kvm_get_attr_addr(attr);
5684  struct kvm *kvm = vcpu->kvm;
5685  int r;
5686 
5687  if (IS_ERR(uaddr))
5688  return PTR_ERR(uaddr);
5689 
5690  switch (attr->attr) {
5691  case KVM_VCPU_TSC_OFFSET: {
5692  u64 offset, tsc, ns;
5693  unsigned long flags;
5694  bool matched;
5695 
5696  r = -EFAULT;
5697  if (get_user(offset, uaddr))
5698  break;
5699 
5700  raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
5701 
5702  matched = (vcpu->arch.virtual_tsc_khz &&
5703  kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
5704  kvm->arch.last_tsc_offset == offset);
5705 
5706  tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
5707  ns = get_kvmclock_base_ns();
5708 
5709  kvm->arch.user_set_tsc = true;
5710  __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
5711  raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
5712 
5713  r = 0;
5714  break;
5715  }
5716  default:
5717  r = -ENXIO;
5718  }
5719 
5720  return r;
5721 }
5722 
5723 static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu,
5724  unsigned int ioctl,
5725  void __user *argp)
5726 {
5727  struct kvm_device_attr attr;
5728  int r;
5729 
5730  if (copy_from_user(&attr, argp, sizeof(attr)))
5731  return -EFAULT;
5732 
5733  if (attr.group != KVM_VCPU_TSC_CTRL)
5734  return -ENXIO;
5735 
5736  switch (ioctl) {
5737  case KVM_HAS_DEVICE_ATTR:
5738  r = kvm_arch_tsc_has_attr(vcpu, &attr);
5739  break;
5740  case KVM_GET_DEVICE_ATTR:
5741  r = kvm_arch_tsc_get_attr(vcpu, &attr);
5742  break;
5743  case KVM_SET_DEVICE_ATTR:
5744  r = kvm_arch_tsc_set_attr(vcpu, &attr);
5745  break;
5746  }
5747 
5748  return r;
5749 }
5750 
5751 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
5752  struct kvm_enable_cap *cap)
5753 {
5754  if (cap->flags)
5755  return -EINVAL;
5756 
5757  switch (cap->cap) {
5758 #ifdef CONFIG_KVM_HYPERV
5759  case KVM_CAP_HYPERV_SYNIC2:
5760  if (cap->args[0])
5761  return -EINVAL;
5762  fallthrough;
5763 
5764  case KVM_CAP_HYPERV_SYNIC:
5765  if (!irqchip_in_kernel(vcpu->kvm))
5766  return -EINVAL;
5767  return kvm_hv_activate_synic(vcpu, cap->cap ==
5768  KVM_CAP_HYPERV_SYNIC2);
5769  case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
5770  {
5771  int r;
5772  uint16_t vmcs_version;
5773  void __user *user_ptr;
5774 
5775  if (!kvm_x86_ops.nested_ops->enable_evmcs)
5776  return -ENOTTY;
5777  r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
5778  if (!r) {
5779  user_ptr = (void __user *)(uintptr_t)cap->args[0];
5780  if (copy_to_user(user_ptr, &vmcs_version,
5781  sizeof(vmcs_version)))
5782  r = -EFAULT;
5783  }
5784  return r;
5785  }
5786  case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
5787  if (!kvm_x86_ops.enable_l2_tlb_flush)
5788  return -ENOTTY;
5789 
5790  return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu);
5791 
5792  case KVM_CAP_HYPERV_ENFORCE_CPUID:
5793  return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
5794 #endif
5795 
5796  case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
5797  vcpu->arch.pv_cpuid.enforce = cap->args[0];
5798  if (vcpu->arch.pv_cpuid.enforce)
5799  kvm_update_pv_runtime(vcpu);
5800 
5801  return 0;
5802  default:
5803  return -EINVAL;
5804  }
5805 }
5806 
5807 long kvm_arch_vcpu_ioctl(struct file *filp,
5808  unsigned int ioctl, unsigned long arg)
5809 {
5810  struct kvm_vcpu *vcpu = filp->private_data;
5811  void __user *argp = (void __user *)arg;
5812  int r;
5813  union {
5814  struct kvm_sregs2 *sregs2;
5815  struct kvm_lapic_state *lapic;
5816  struct kvm_xsave *xsave;
5817  struct kvm_xcrs *xcrs;
5818  void *buffer;
5819  } u;
5820 
5821  vcpu_load(vcpu);
5822 
5823  u.buffer = NULL;
5824  switch (ioctl) {
5825  case KVM_GET_LAPIC: {
5826  r = -EINVAL;
5827  if (!lapic_in_kernel(vcpu))
5828  goto out;
5829  u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
5830  GFP_KERNEL_ACCOUNT);
5831 
5832  r = -ENOMEM;
5833  if (!u.lapic)
5834  goto out;
5835  r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
5836  if (r)
5837  goto out;
5838  r = -EFAULT;
5839  if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
5840  goto out;
5841  r = 0;
5842  break;
5843  }
5844  case KVM_SET_LAPIC: {
5845  r = -EINVAL;
5846  if (!lapic_in_kernel(vcpu))
5847  goto out;
5848  u.lapic = memdup_user(argp, sizeof(*u.lapic));
5849  if (IS_ERR(u.lapic)) {
5850  r = PTR_ERR(u.lapic);
5851  goto out_nofree;
5852  }
5853 
5854  r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
5855  break;
5856  }
5857  case KVM_INTERRUPT: {
5858  struct kvm_interrupt irq;
5859 
5860  r = -EFAULT;
5861  if (copy_from_user(&irq, argp, sizeof(irq)))
5862  goto out;
5863  r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
5864  break;
5865  }
5866  case KVM_NMI: {
5867  r = kvm_vcpu_ioctl_nmi(vcpu);
5868  break;
5869  }
5870  case KVM_SMI: {
5871  r = kvm_inject_smi(vcpu);
5872  break;
5873  }
5874  case KVM_SET_CPUID: {
5875  struct kvm_cpuid __user *cpuid_arg = argp;
5876  struct kvm_cpuid cpuid;
5877 
5878  r = -EFAULT;
5879  if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
5880  goto out;
5881  r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
5882  break;
5883  }
5884  case KVM_SET_CPUID2: {
5885  struct kvm_cpuid2 __user *cpuid_arg = argp;
5886  struct kvm_cpuid2 cpuid;
5887 
5888  r = -EFAULT;
5889  if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
5890  goto out;
5891  r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
5892  cpuid_arg->entries);
5893  break;
5894  }
5895  case KVM_GET_CPUID2: {
5896  struct kvm_cpuid2 __user *cpuid_arg = argp;
5897  struct kvm_cpuid2 cpuid;
5898 
5899  r = -EFAULT;
5900  if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
5901  goto out;
5902  r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
5903  cpuid_arg->entries);
5904  if (r)
5905  goto out;
5906  r = -EFAULT;
5907  if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
5908  goto out;
5909  r = 0;
5910  break;
5911  }
5912  case KVM_GET_MSRS: {
5913  int idx = srcu_read_lock(&vcpu->kvm->srcu);
5914  r = msr_io(vcpu, argp, do_get_msr, 1);
5915  srcu_read_unlock(&vcpu->kvm->srcu, idx);
5916  break;
5917  }
5918  case KVM_SET_MSRS: {
5919  int idx = srcu_read_lock(&vcpu->kvm->srcu);
5920  r = msr_io(vcpu, argp, do_set_msr, 0);
5921  srcu_read_unlock(&vcpu->kvm->srcu, idx);
5922  break;
5923  }
5924  case KVM_TPR_ACCESS_REPORTING: {
5925  struct kvm_tpr_access_ctl tac;
5926 
5927  r = -EFAULT;
5928  if (copy_from_user(&tac, argp, sizeof(tac)))
5929  goto out;
5930  r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
5931  if (r)
5932  goto out;
5933  r = -EFAULT;
5934  if (copy_to_user(argp, &tac, sizeof(tac)))
5935  goto out;
5936  r = 0;
5937  break;
5938  };
5939  case KVM_SET_VAPIC_ADDR: {
5940  struct kvm_vapic_addr va;
5941  int idx;
5942 
5943  r = -EINVAL;
5944  if (!lapic_in_kernel(vcpu))
5945  goto out;
5946  r = -EFAULT;
5947  if (copy_from_user(&va, argp, sizeof(va)))
5948  goto out;
5949  idx = srcu_read_lock(&vcpu->kvm->srcu);
5950  r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
5951  srcu_read_unlock(&vcpu->kvm->srcu, idx);
5952  break;
5953  }
5954  case KVM_X86_SETUP_MCE: {
5955  u64 mcg_cap;
5956 
5957  r = -EFAULT;
5958  if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
5959  goto out;
5960  r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
5961  break;
5962  }
5963  case KVM_X86_SET_MCE: {
5964  struct kvm_x86_mce mce;
5965 
5966  r = -EFAULT;
5967  if (copy_from_user(&mce, argp, sizeof(mce)))
5968  goto out;
5969  r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
5970  break;
5971  }
5972  case KVM_GET_VCPU_EVENTS: {
5973  struct kvm_vcpu_events events;
5974 
5975  kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
5976 
5977  r = -EFAULT;
5978  if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
5979  break;
5980  r = 0;
5981  break;
5982  }
5983  case KVM_SET_VCPU_EVENTS: {
5984  struct kvm_vcpu_events events;
5985 
5986  r = -EFAULT;
5987  if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
5988  break;
5989 
5990  r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
5991  break;
5992  }
5993  case KVM_GET_DEBUGREGS: {
5994  struct kvm_debugregs dbgregs;
5995 
5996  kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
5997 
5998  r = -EFAULT;
5999  if (copy_to_user(argp, &dbgregs,
6000  sizeof(struct kvm_debugregs)))
6001  break;
6002  r = 0;
6003  break;
6004  }
6005  case KVM_SET_DEBUGREGS: {
6006  struct kvm_debugregs dbgregs;
6007 
6008  r = -EFAULT;
6009  if (copy_from_user(&dbgregs, argp,
6010  sizeof(struct kvm_debugregs)))
6011  break;
6012 
6013  r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
6014  break;
6015  }
6016  case KVM_GET_XSAVE: {
6017  r = -EINVAL;
6018  if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
6019  break;
6020 
6021  u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
6022  r = -ENOMEM;
6023  if (!u.xsave)
6024  break;
6025 
6026  kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
6027 
6028  r = -EFAULT;
6029  if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
6030  break;
6031  r = 0;
6032  break;
6033  }
6034  case KVM_SET_XSAVE: {
6035  int size = vcpu->arch.guest_fpu.uabi_size;
6036 
6037  u.xsave = memdup_user(argp, size);
6038  if (IS_ERR(u.xsave)) {
6039  r = PTR_ERR(u.xsave);
6040  goto out_nofree;
6041  }
6042 
6043  r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
6044  break;
6045  }
6046 
6047  case KVM_GET_XSAVE2: {
6048  int size = vcpu->arch.guest_fpu.uabi_size;
6049 
6050  u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
6051  r = -ENOMEM;
6052  if (!u.xsave)
6053  break;
6054 
6055  kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
6056 
6057  r = -EFAULT;
6058  if (copy_to_user(argp, u.xsave, size))
6059  break;
6060 
6061  r = 0;
6062  break;
6063  }
6064 
6065  case KVM_GET_XCRS: {
6066  u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
6067  r = -ENOMEM;
6068  if (!u.xcrs)
6069  break;
6070 
6071  kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
6072 
6073  r = -EFAULT;
6074  if (copy_to_user(argp, u.xcrs,
6075  sizeof(struct kvm_xcrs)))
6076  break;
6077  r = 0;
6078  break;
6079  }
6080  case KVM_SET_XCRS: {
6081  u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
6082  if (IS_ERR(u.xcrs)) {
6083  r = PTR_ERR(u.xcrs);
6084  goto out_nofree;
6085  }
6086 
6087  r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
6088  break;
6089  }
6090  case KVM_SET_TSC_KHZ: {
6091  u32 user_tsc_khz;
6092 
6093  r = -EINVAL;
6094  user_tsc_khz = (u32)arg;
6095 
6096  if (kvm_caps.has_tsc_control &&
6097  user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
6098  goto out;
6099 
6100  if (user_tsc_khz == 0)
6101  user_tsc_khz = tsc_khz;
6102 
6103  if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
6104  r = 0;
6105 
6106  goto out;
6107  }
6108  case KVM_GET_TSC_KHZ: {
6109  r = vcpu->arch.virtual_tsc_khz;
6110  goto out;
6111  }
6112  case KVM_KVMCLOCK_CTRL: {
6113  r = kvm_set_guest_paused(vcpu);
6114  goto out;
6115  }
6116  case KVM_ENABLE_CAP: {
6117  struct kvm_enable_cap cap;
6118 
6119  r = -EFAULT;
6120  if (copy_from_user(&cap, argp, sizeof(cap)))
6121  goto out;
6122  r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
6123  break;
6124  }
6125  case KVM_GET_NESTED_STATE: {
6126  struct kvm_nested_state __user *user_kvm_nested_state = argp;
6127  u32 user_data_size;
6128 
6129  r = -EINVAL;
6130  if (!kvm_x86_ops.nested_ops->get_state)
6131  break;
6132 
6133  BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
6134  r = -EFAULT;
6135  if (get_user(user_data_size, &user_kvm_nested_state->size))
6136  break;
6137 
6138  r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
6139  user_data_size);
6140  if (r < 0)
6141  break;
6142 
6143  if (r > user_data_size) {
6144  if (put_user(r, &user_kvm_nested_state->size))
6145  r = -EFAULT;
6146  else
6147  r = -E2BIG;
6148  break;
6149  }
6150 
6151  r = 0;
6152  break;
6153  }
6154  case KVM_SET_NESTED_STATE: {
6155  struct kvm_nested_state __user *user_kvm_nested_state = argp;
6156  struct kvm_nested_state kvm_state;
6157  int idx;
6158 
6159  r = -EINVAL;
6160  if (!kvm_x86_ops.nested_ops->set_state)
6161  break;
6162 
6163  r = -EFAULT;
6164  if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
6165  break;
6166 
6167  r = -EINVAL;
6168  if (kvm_state.size < sizeof(kvm_state))
6169  break;
6170 
6171  if (kvm_state.flags &
6172  ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
6173  | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
6174  | KVM_STATE_NESTED_GIF_SET))
6175  break;
6176 
6177  /* nested_run_pending implies guest_mode. */
6178  if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
6179  && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
6180  break;
6181 
6182  idx = srcu_read_lock(&vcpu->kvm->srcu);
6183  r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
6184  srcu_read_unlock(&vcpu->kvm->srcu, idx);
6185  break;
6186  }
6187 #ifdef CONFIG_KVM_HYPERV
6188  case KVM_GET_SUPPORTED_HV_CPUID:
6189  r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
6190  break;
6191 #endif
6192 #ifdef CONFIG_KVM_XEN
6193  case KVM_XEN_VCPU_GET_ATTR: {
6194  struct kvm_xen_vcpu_attr xva;
6195 
6196  r = -EFAULT;
6197  if (copy_from_user(&xva, argp, sizeof(xva)))
6198  goto out;
6199  r = kvm_xen_vcpu_get_attr(vcpu, &xva);
6200  if (!r && copy_to_user(argp, &xva, sizeof(xva)))
6201  r = -EFAULT;
6202  break;
6203  }
6204  case KVM_XEN_VCPU_SET_ATTR: {
6205  struct kvm_xen_vcpu_attr xva;
6206 
6207  r = -EFAULT;
6208  if (copy_from_user(&xva, argp, sizeof(xva)))
6209  goto out;
6210  r = kvm_xen_vcpu_set_attr(vcpu, &xva);
6211  break;
6212  }
6213 #endif
6214  case KVM_GET_SREGS2: {
6215  u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
6216  r = -ENOMEM;
6217  if (!u.sregs2)
6218  goto out;
6219  __get_sregs2(vcpu, u.sregs2);
6220  r = -EFAULT;
6221  if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
6222  goto out;
6223  r = 0;
6224  break;
6225  }
6226  case KVM_SET_SREGS2: {
6227  u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
6228  if (IS_ERR(u.sregs2)) {
6229  r = PTR_ERR(u.sregs2);
6230  u.sregs2 = NULL;
6231  goto out;
6232  }
6233  r = __set_sregs2(vcpu, u.sregs2);
6234  break;
6235  }
6236  case KVM_HAS_DEVICE_ATTR:
6237  case KVM_GET_DEVICE_ATTR:
6238  case KVM_SET_DEVICE_ATTR:
6239  r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
6240  break;
6241  default:
6242  r = -EINVAL;
6243  }
6244 out:
6245  kfree(u.buffer);
6246 out_nofree:
6247  vcpu_put(vcpu);
6248  return r;
6249 }
6250 
6251 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
6252 {
6253  return VM_FAULT_SIGBUS;
6254 }
6255 
6256 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
6257 {
6258  int ret;
6259 
6260  if (addr > (unsigned int)(-3 * PAGE_SIZE))
6261  return -EINVAL;
6262  ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
6263  return ret;
6264 }
6265 
6266 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
6267  u64 ident_addr)
6268 {
6269  return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
6270 }
6271 
6272 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
6273  unsigned long kvm_nr_mmu_pages)
6274 {
6275  if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
6276  return -EINVAL;
6277 
6278  mutex_lock(&kvm->slots_lock);
6279 
6280  kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
6281  kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
6282 
6283  mutex_unlock(&kvm->slots_lock);
6284  return 0;
6285 }
6286 
6287 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
6288 {
6289  struct kvm_pic *pic = kvm->arch.vpic;
6290  int r;
6291 
6292  r = 0;
6293  switch (chip->chip_id) {
6294  case KVM_IRQCHIP_PIC_MASTER:
6295  memcpy(&chip->chip.pic, &pic->pics[0],
6296  sizeof(struct kvm_pic_state));
6297  break;
6298  case KVM_IRQCHIP_PIC_SLAVE:
6299  memcpy(&chip->chip.pic, &pic->pics[1],
6300  sizeof(struct kvm_pic_state));
6301  break;
6302  case KVM_IRQCHIP_IOAPIC:
6303  kvm_get_ioapic(kvm, &chip->chip.ioapic);
6304  break;
6305  default:
6306  r = -EINVAL;
6307  break;
6308  }
6309  return r;
6310 }
6311 
6312 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
6313 {
6314  struct kvm_pic *pic = kvm->arch.vpic;
6315  int r;
6316 
6317  r = 0;
6318  switch (chip->chip_id) {
6319  case KVM_IRQCHIP_PIC_MASTER:
6320  spin_lock(&pic->lock);
6321  memcpy(&pic->pics[0], &chip->chip.pic,
6322  sizeof(struct kvm_pic_state));
6323  spin_unlock(&pic->lock);
6324  break;
6325  case KVM_IRQCHIP_PIC_SLAVE:
6326  spin_lock(&pic->lock);
6327  memcpy(&pic->pics[1], &chip->chip.pic,
6328  sizeof(struct kvm_pic_state));
6329  spin_unlock(&pic->lock);
6330  break;
6331  case KVM_IRQCHIP_IOAPIC:
6332  kvm_set_ioapic(kvm, &chip->chip.ioapic);
6333  break;
6334  default:
6335  r = -EINVAL;
6336  break;
6337  }
6338  kvm_pic_update_irq(pic);
6339  return r;
6340 }
6341 
6342 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
6343 {
6344  struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
6345 
6346  BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
6347 
6348  mutex_lock(&kps->lock);
6349  memcpy(ps, &kps->channels, sizeof(*ps));
6350  mutex_unlock(&kps->lock);
6351  return 0;
6352 }
6353 
6354 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
6355 {
6356  int i;
6357  struct kvm_pit *pit = kvm->arch.vpit;
6358 
6359  mutex_lock(&pit->pit_state.lock);
6360  memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
6361  for (i = 0; i < 3; i++)
6362  kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
6363  mutex_unlock(&pit->pit_state.lock);
6364  return 0;
6365 }
6366 
6367 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
6368 {
6369  mutex_lock(&kvm->arch.vpit->pit_state.lock);
6370  memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
6371  sizeof(ps->channels));
6372  ps->flags = kvm->arch.vpit->pit_state.flags;
6373  mutex_unlock(&kvm->arch.vpit->pit_state.lock);
6374  memset(&ps->reserved, 0, sizeof(ps->reserved));
6375  return 0;
6376 }
6377 
6378 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
6379 {
6380  int start = 0;
6381  int i;
6382  u32 prev_legacy, cur_legacy;
6383  struct kvm_pit *pit = kvm->arch.vpit;
6384 
6385  mutex_lock(&pit->pit_state.lock);
6386  prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
6387  cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
6388  if (!prev_legacy && cur_legacy)
6389  start = 1;
6390  memcpy(&pit->pit_state.channels, &ps->channels,
6391  sizeof(pit->pit_state.channels));
6392  pit->pit_state.flags = ps->flags;
6393  for (i = 0; i < 3; i++)
6394  kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
6395  start && i == 0);
6396  mutex_unlock(&pit->pit_state.lock);
6397  return 0;
6398 }
6399 
6400 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
6401  struct kvm_reinject_control *control)
6402 {
6403  struct kvm_pit *pit = kvm->arch.vpit;
6404 
6405  /* pit->pit_state.lock was overloaded to prevent userspace from getting
6406  * an inconsistent state after running multiple KVM_REINJECT_CONTROL
6407  * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
6408  */
6409  mutex_lock(&pit->pit_state.lock);
6410  kvm_pit_set_reinject(pit, control->pit_reinject);
6411  mutex_unlock(&pit->pit_state.lock);
6412 
6413  return 0;
6414 }
6415 
6416 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
6417 {
6418 
6419  /*
6420  * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called
6421  * before reporting dirty_bitmap to userspace. KVM flushes the buffers
6422  * on all VM-Exits, thus we only need to kick running vCPUs to force a
6423  * VM-Exit.
6424  */
6425  struct kvm_vcpu *vcpu;
6426  unsigned long i;
6427 
6428  if (!kvm_x86_ops.cpu_dirty_log_size)
6429  return;
6430 
6431  kvm_for_each_vcpu(i, vcpu, kvm)
6432  kvm_vcpu_kick(vcpu);
6433 }
6434 
6435 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
6436  bool line_status)
6437 {
6438  if (!irqchip_in_kernel(kvm))
6439  return -ENXIO;
6440 
6441  irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
6442  irq_event->irq, irq_event->level,
6443  line_status);
6444  return 0;
6445 }
6446 
6447 int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
6448  struct kvm_enable_cap *cap)
6449 {
6450  int r;
6451 
6452  if (cap->flags)
6453  return -EINVAL;
6454 
6455  switch (cap->cap) {
6456  case KVM_CAP_DISABLE_QUIRKS2:
6457  r = -EINVAL;
6458  if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
6459  break;
6460  fallthrough;
6461  case KVM_CAP_DISABLE_QUIRKS:
6462  kvm->arch.disabled_quirks = cap->args[0];
6463  r = 0;
6464  break;
6465  case KVM_CAP_SPLIT_IRQCHIP: {
6466  mutex_lock(&kvm->lock);
6467  r = -EINVAL;
6468  if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
6469  goto split_irqchip_unlock;
6470  r = -EEXIST;
6471  if (irqchip_in_kernel(kvm))
6472  goto split_irqchip_unlock;
6473  if (kvm->created_vcpus)
6474  goto split_irqchip_unlock;
6475  r = kvm_setup_empty_irq_routing(kvm);
6476  if (r)
6477  goto split_irqchip_unlock;
6478  /* Pairs with irqchip_in_kernel. */
6479  smp_wmb();
6480  kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
6481  kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
6482  kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
6483  r = 0;
6484 split_irqchip_unlock:
6485  mutex_unlock(&kvm->lock);
6486  break;
6487  }
6488  case KVM_CAP_X2APIC_API:
6489  r = -EINVAL;
6490  if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
6491  break;
6492 
6493  if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
6494  kvm->arch.x2apic_format = true;
6495  if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
6496  kvm->arch.x2apic_broadcast_quirk_disabled = true;
6497 
6498  r = 0;
6499  break;
6500  case KVM_CAP_X86_DISABLE_EXITS:
6501  r = -EINVAL;
6502  if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
6503  break;
6504 
6505  if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
6506  kvm->arch.pause_in_guest = true;
6507 
6508 #define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
6509  "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests."
6510 
6511  if (!mitigate_smt_rsb) {
6512  if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() &&
6513  (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
6514  pr_warn_once(SMT_RSB_MSG);
6515 
6516  if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
6518  kvm->arch.mwait_in_guest = true;
6519  if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
6520  kvm->arch.hlt_in_guest = true;
6521  if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
6522  kvm->arch.cstate_in_guest = true;
6523  }
6524 
6525  r = 0;
6526  break;
6527  case KVM_CAP_MSR_PLATFORM_INFO:
6528  kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
6529  r = 0;
6530  break;
6531  case KVM_CAP_EXCEPTION_PAYLOAD:
6532  kvm->arch.exception_payload_enabled = cap->args[0];
6533  r = 0;
6534  break;
6535  case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
6536  kvm->arch.triple_fault_event = cap->args[0];
6537  r = 0;
6538  break;
6539  case KVM_CAP_X86_USER_SPACE_MSR:
6540  r = -EINVAL;
6541  if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
6542  break;
6543  kvm->arch.user_space_msr_mask = cap->args[0];
6544  r = 0;
6545  break;
6546  case KVM_CAP_X86_BUS_LOCK_EXIT:
6547  r = -EINVAL;
6548  if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
6549  break;
6550 
6551  if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
6552  (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
6553  break;
6554 
6556  cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
6557  kvm->arch.bus_lock_detection_enabled = true;
6558  r = 0;
6559  break;
6560 #ifdef CONFIG_X86_SGX_KVM
6561  case KVM_CAP_SGX_ATTRIBUTE: {
6562  unsigned long allowed_attributes = 0;
6563 
6564  r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
6565  if (r)
6566  break;
6567 
6568  /* KVM only supports the PROVISIONKEY privileged attribute. */
6569  if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
6570  !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
6571  kvm->arch.sgx_provisioning_allowed = true;
6572  else
6573  r = -EINVAL;
6574  break;
6575  }
6576 #endif
6577  case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
6578  r = -EINVAL;
6579  if (!kvm_x86_ops.vm_copy_enc_context_from)
6580  break;
6581 
6582  r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
6583  break;
6584  case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
6585  r = -EINVAL;
6586  if (!kvm_x86_ops.vm_move_enc_context_from)
6587  break;
6588 
6589  r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
6590  break;
6591  case KVM_CAP_EXIT_HYPERCALL:
6592  if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
6593  r = -EINVAL;
6594  break;
6595  }
6596  kvm->arch.hypercall_exit_enabled = cap->args[0];
6597  r = 0;
6598  break;
6599  case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
6600  r = -EINVAL;
6601  if (cap->args[0] & ~1)
6602  break;
6603  kvm->arch.exit_on_emulation_error = cap->args[0];
6604  r = 0;
6605  break;
6606  case KVM_CAP_PMU_CAPABILITY:
6607  r = -EINVAL;
6608  if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
6609  break;
6610 
6611  mutex_lock(&kvm->lock);
6612  if (!kvm->created_vcpus) {
6613  kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
6614  r = 0;
6615  }
6616  mutex_unlock(&kvm->lock);
6617  break;
6618  case KVM_CAP_MAX_VCPU_ID:
6619  r = -EINVAL;
6620  if (cap->args[0] > KVM_MAX_VCPU_IDS)
6621  break;
6622 
6623  mutex_lock(&kvm->lock);
6624  if (kvm->arch.max_vcpu_ids == cap->args[0]) {
6625  r = 0;
6626  } else if (!kvm->arch.max_vcpu_ids) {
6627  kvm->arch.max_vcpu_ids = cap->args[0];
6628  r = 0;
6629  }
6630  mutex_unlock(&kvm->lock);
6631  break;
6632  case KVM_CAP_X86_NOTIFY_VMEXIT:
6633  r = -EINVAL;
6634  if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
6635  break;
6637  break;
6638  if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
6639  break;
6640  mutex_lock(&kvm->lock);
6641  if (!kvm->created_vcpus) {
6642  kvm->arch.notify_window = cap->args[0] >> 32;
6643  kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
6644  r = 0;
6645  }
6646  mutex_unlock(&kvm->lock);
6647  break;
6648  case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
6649  r = -EINVAL;
6650 
6651  /*
6652  * Since the risk of disabling NX hugepages is a guest crashing
6653  * the system, ensure the userspace process has permission to
6654  * reboot the system.
6655  *
6656  * Note that unlike the reboot() syscall, the process must have
6657  * this capability in the root namespace because exposing
6658  * /dev/kvm into a container does not limit the scope of the
6659  * iTLB multihit bug to that container. In other words,
6660  * this must use capable(), not ns_capable().
6661  */
6662  if (!capable(CAP_SYS_BOOT)) {
6663  r = -EPERM;
6664  break;
6665  }
6666 
6667  if (cap->args[0])
6668  break;
6669 
6670  mutex_lock(&kvm->lock);
6671  if (!kvm->created_vcpus) {
6672  kvm->arch.disable_nx_huge_pages = true;
6673  r = 0;
6674  }
6675  mutex_unlock(&kvm->lock);
6676  break;
6677  default:
6678  r = -EINVAL;
6679  break;
6680  }
6681  return r;
6682 }
6683 
6684 static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
6685 {
6686  struct kvm_x86_msr_filter *msr_filter;
6687 
6688  msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
6689  if (!msr_filter)
6690  return NULL;
6691 
6692  msr_filter->default_allow = default_allow;
6693  return msr_filter;
6694 }
6695 
6696 static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
6697 {
6698  u32 i;
6699 
6700  if (!msr_filter)
6701  return;
6702 
6703  for (i = 0; i < msr_filter->count; i++)
6704  kfree(msr_filter->ranges[i].bitmap);
6705 
6706  kfree(msr_filter);
6707 }
6708 
6709 static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
6710  struct kvm_msr_filter_range *user_range)
6711 {
6712  unsigned long *bitmap;
6713  size_t bitmap_size;
6714 
6715  if (!user_range->nmsrs)
6716  return 0;
6717 
6718  if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
6719  return -EINVAL;
6720 
6721  if (!user_range->flags)
6722  return -EINVAL;
6723 
6724  bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
6725  if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
6726  return -EINVAL;
6727 
6728  bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
6729  if (IS_ERR(bitmap))
6730  return PTR_ERR(bitmap);
6731 
6732  msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
6733  .flags = user_range->flags,
6734  .base = user_range->base,
6735  .nmsrs = user_range->nmsrs,
6736  .bitmap = bitmap,
6737  };
6738 
6739  msr_filter->count++;
6740  return 0;
6741 }
6742 
6743 static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
6744  struct kvm_msr_filter *filter)
6745 {
6746  struct kvm_x86_msr_filter *new_filter, *old_filter;
6747  bool default_allow;
6748  bool empty = true;
6749  int r;
6750  u32 i;
6751 
6752  if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
6753  return -EINVAL;
6754 
6755  for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
6756  empty &= !filter->ranges[i].nmsrs;
6757 
6758  default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
6759  if (empty && !default_allow)
6760  return -EINVAL;
6761 
6762  new_filter = kvm_alloc_msr_filter(default_allow);
6763  if (!new_filter)
6764  return -ENOMEM;
6765 
6766  for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
6767  r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
6768  if (r) {
6769  kvm_free_msr_filter(new_filter);
6770  return r;
6771  }
6772  }
6773 
6774  mutex_lock(&kvm->lock);
6775  old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
6776  mutex_is_locked(&kvm->lock));
6777  mutex_unlock(&kvm->lock);
6778  synchronize_srcu(&kvm->srcu);
6779 
6780  kvm_free_msr_filter(old_filter);
6781 
6782  kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
6783 
6784  return 0;
6785 }
6786 
6787 #ifdef CONFIG_KVM_COMPAT
6788 /* for KVM_X86_SET_MSR_FILTER */
6789 struct kvm_msr_filter_range_compat {
6790  __u32 flags;
6791  __u32 nmsrs;
6792  __u32 base;
6793  __u32 bitmap;
6794 };
6795 
6796 struct kvm_msr_filter_compat {
6797  __u32 flags;
6798  struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES];
6799 };
6800 
6801 #define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat)
6802 
6803 long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
6804  unsigned long arg)
6805 {
6806  void __user *argp = (void __user *)arg;
6807  struct kvm *kvm = filp->private_data;
6808  long r = -ENOTTY;
6809 
6810  switch (ioctl) {
6811  case KVM_X86_SET_MSR_FILTER_COMPAT: {
6812  struct kvm_msr_filter __user *user_msr_filter = argp;
6813  struct kvm_msr_filter_compat filter_compat;
6814  struct kvm_msr_filter filter;
6815  int i;
6816 
6817  if (copy_from_user(&filter_compat, user_msr_filter,
6818  sizeof(filter_compat)))
6819  return -EFAULT;
6820 
6821  filter.flags = filter_compat.flags;
6822  for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
6823  struct kvm_msr_filter_range_compat *cr;
6824 
6825  cr = &filter_compat.ranges[i];
6826  filter.ranges[i] = (struct kvm_msr_filter_range) {
6827  .flags = cr->flags,
6828  .nmsrs = cr->nmsrs,
6829  .base = cr->base,
6830  .bitmap = (__u8 *)(ulong)cr->bitmap,
6831  };
6832  }
6833 
6834  r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
6835  break;
6836  }
6837  }
6838 
6839  return r;
6840 }
6841 #endif
6842 
6843 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
6844 static int kvm_arch_suspend_notifier(struct kvm *kvm)
6845 {
6846  struct kvm_vcpu *vcpu;
6847  unsigned long i;
6848  int ret = 0;
6849 
6850  mutex_lock(&kvm->lock);
6851  kvm_for_each_vcpu(i, vcpu, kvm) {
6852  if (!vcpu->arch.pv_time.active)
6853  continue;
6854 
6855  ret = kvm_set_guest_paused(vcpu);
6856  if (ret) {
6857  kvm_err("Failed to pause guest VCPU%d: %d\n",
6858  vcpu->vcpu_id, ret);
6859  break;
6860  }
6861  }
6862  mutex_unlock(&kvm->lock);
6863 
6864  return ret ? NOTIFY_BAD : NOTIFY_DONE;
6865 }
6866 
6867 int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
6868 {
6869  switch (state) {
6870  case PM_HIBERNATION_PREPARE:
6871  case PM_SUSPEND_PREPARE:
6872  return kvm_arch_suspend_notifier(kvm);
6873  }
6874 
6875  return NOTIFY_DONE;
6876 }
6877 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
6878 
6879 static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
6880 {
6881  struct kvm_clock_data data = { 0 };
6882 
6883  get_kvmclock(kvm, &data);
6884  if (copy_to_user(argp, &data, sizeof(data)))
6885  return -EFAULT;
6886 
6887  return 0;
6888 }
6889 
6890 static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
6891 {
6892  struct kvm_arch *ka = &kvm->arch;
6893  struct kvm_clock_data data;
6894  u64 now_raw_ns;
6895 
6896  if (copy_from_user(&data, argp, sizeof(data)))
6897  return -EFAULT;
6898 
6899  /*
6900  * Only KVM_CLOCK_REALTIME is used, but allow passing the
6901  * result of KVM_GET_CLOCK back to KVM_SET_CLOCK.
6902  */
6903  if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
6904  return -EINVAL;
6905 
6909 
6910  /*
6911  * This pairs with kvm_guest_time_update(): when masterclock is
6912  * in use, we use master_kernel_ns + kvmclock_offset to set
6913  * unsigned 'system_time' so if we use get_kvmclock_ns() (which
6914  * is slightly ahead) here we risk going negative on unsigned
6915  * 'system_time' when 'data.clock' is very small.
6916  */
6917  if (data.flags & KVM_CLOCK_REALTIME) {
6918  u64 now_real_ns = ktime_get_real_ns();
6919 
6920  /*
6921  * Avoid stepping the kvmclock backwards.
6922  */
6923  if (now_real_ns > data.realtime)
6924  data.clock += now_real_ns - data.realtime;
6925  }
6926 
6927  if (ka->use_master_clock)
6928  now_raw_ns = ka->master_kernel_ns;
6929  else
6930  now_raw_ns = get_kvmclock_base_ns();
6931  ka->kvmclock_offset = data.clock - now_raw_ns;
6933  return 0;
6934 }
6935 
6936 int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
6937 {
6938  struct kvm *kvm = filp->private_data;
6939  void __user *argp = (void __user *)arg;
6940  int r = -ENOTTY;
6941  /*
6942  * This union makes it completely explicit to gcc-3.x
6943  * that these two variables' stack usage should be
6944  * combined, not added together.
6945  */
6946  union {
6947  struct kvm_pit_state ps;
6948  struct kvm_pit_state2 ps2;
6949  struct kvm_pit_config pit_config;
6950  } u;
6951 
6952  switch (ioctl) {
6953  case KVM_SET_TSS_ADDR:
6954  r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
6955  break;
6956  case KVM_SET_IDENTITY_MAP_ADDR: {
6957  u64 ident_addr;
6958 
6959  mutex_lock(&kvm->lock);
6960  r = -EINVAL;
6961  if (kvm->created_vcpus)
6962  goto set_identity_unlock;
6963  r = -EFAULT;
6964  if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
6965  goto set_identity_unlock;
6966  r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
6967 set_identity_unlock:
6968  mutex_unlock(&kvm->lock);
6969  break;
6970  }
6971  case KVM_SET_NR_MMU_PAGES:
6972  r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
6973  break;
6974  case KVM_CREATE_IRQCHIP: {
6975  mutex_lock(&kvm->lock);
6976 
6977  r = -EEXIST;
6978  if (irqchip_in_kernel(kvm))
6979  goto create_irqchip_unlock;
6980 
6981  r = -EINVAL;
6982  if (kvm->created_vcpus)
6983  goto create_irqchip_unlock;
6984 
6985  r = kvm_pic_init(kvm);
6986  if (r)
6987  goto create_irqchip_unlock;
6988 
6989  r = kvm_ioapic_init(kvm);
6990  if (r) {
6991  kvm_pic_destroy(kvm);
6992  goto create_irqchip_unlock;
6993  }
6994 
6996  if (r) {
6997  kvm_ioapic_destroy(kvm);
6998  kvm_pic_destroy(kvm);
6999  goto create_irqchip_unlock;
7000  }
7001  /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
7002  smp_wmb();
7003  kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
7004  kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
7005  create_irqchip_unlock:
7006  mutex_unlock(&kvm->lock);
7007  break;
7008  }
7009  case KVM_CREATE_PIT:
7010  u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
7011  goto create_pit;
7012  case KVM_CREATE_PIT2:
7013  r = -EFAULT;
7014  if (copy_from_user(&u.pit_config, argp,
7015  sizeof(struct kvm_pit_config)))
7016  goto out;
7017  create_pit:
7018  mutex_lock(&kvm->lock);
7019  r = -EEXIST;
7020  if (kvm->arch.vpit)
7021  goto create_pit_unlock;
7022  r = -ENOENT;
7023  if (!pic_in_kernel(kvm))
7024  goto create_pit_unlock;
7025  r = -ENOMEM;
7026  kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
7027  if (kvm->arch.vpit)
7028  r = 0;
7029  create_pit_unlock:
7030  mutex_unlock(&kvm->lock);
7031  break;
7032  case KVM_GET_IRQCHIP: {
7033  /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
7034  struct kvm_irqchip *chip;
7035 
7036  chip = memdup_user(argp, sizeof(*chip));
7037  if (IS_ERR(chip)) {
7038  r = PTR_ERR(chip);
7039  goto out;
7040  }
7041 
7042  r = -ENXIO;
7043  if (!irqchip_kernel(kvm))
7044  goto get_irqchip_out;
7045  r = kvm_vm_ioctl_get_irqchip(kvm, chip);
7046  if (r)
7047  goto get_irqchip_out;
7048  r = -EFAULT;
7049  if (copy_to_user(argp, chip, sizeof(*chip)))
7050  goto get_irqchip_out;
7051  r = 0;
7052  get_irqchip_out:
7053  kfree(chip);
7054  break;
7055  }
7056  case KVM_SET_IRQCHIP: {
7057  /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
7058  struct kvm_irqchip *chip;
7059 
7060  chip = memdup_user(argp, sizeof(*chip));
7061  if (IS_ERR(chip)) {
7062  r = PTR_ERR(chip);
7063  goto out;
7064  }
7065 
7066  r = -ENXIO;
7067  if (!irqchip_kernel(kvm))
7068  goto set_irqchip_out;
7069  r = kvm_vm_ioctl_set_irqchip(kvm, chip);
7070  set_irqchip_out:
7071  kfree(chip);
7072  break;
7073  }
7074  case KVM_GET_PIT: {
7075  r = -EFAULT;
7076  if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
7077  goto out;
7078  r = -ENXIO;
7079  if (!kvm->arch.vpit)
7080  goto out;
7081  r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
7082  if (r)
7083  goto out;
7084  r = -EFAULT;
7085  if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
7086  goto out;
7087  r = 0;
7088  break;
7089  }
7090  case KVM_SET_PIT: {
7091  r = -EFAULT;
7092  if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
7093  goto out;
7094  mutex_lock(&kvm->lock);
7095  r = -ENXIO;
7096  if (!kvm->arch.vpit)
7097  goto set_pit_out;
7098  r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
7099 set_pit_out:
7100  mutex_unlock(&kvm->lock);
7101  break;
7102  }
7103  case KVM_GET_PIT2: {
7104  r = -ENXIO;
7105  if (!kvm->arch.vpit)
7106  goto out;
7107  r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
7108  if (r)
7109  goto out;
7110  r = -EFAULT;
7111  if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
7112  goto out;
7113  r = 0;
7114  break;
7115  }
7116  case KVM_SET_PIT2: {
7117  r = -EFAULT;
7118  if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
7119  goto out;
7120  mutex_lock(&kvm->lock);
7121  r = -ENXIO;
7122  if (!kvm->arch.vpit)
7123  goto set_pit2_out;
7124  r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
7125 set_pit2_out:
7126  mutex_unlock(&kvm->lock);
7127  break;
7128  }
7129  case KVM_REINJECT_CONTROL: {
7130  struct kvm_reinject_control control;
7131  r = -EFAULT;
7132  if (copy_from_user(&control, argp, sizeof(control)))
7133  goto out;
7134  r = -ENXIO;
7135  if (!kvm->arch.vpit)
7136  goto out;
7137  r = kvm_vm_ioctl_reinject(kvm, &control);
7138  break;
7139  }
7140  case KVM_SET_BOOT_CPU_ID:
7141  r = 0;
7142  mutex_lock(&kvm->lock);
7143  if (kvm->created_vcpus)
7144  r = -EBUSY;
7145  else
7146  kvm->arch.bsp_vcpu_id = arg;
7147  mutex_unlock(&kvm->lock);
7148  break;
7149 #ifdef CONFIG_KVM_XEN
7150  case KVM_XEN_HVM_CONFIG: {
7151  struct kvm_xen_hvm_config xhc;
7152  r = -EFAULT;
7153  if (copy_from_user(&xhc, argp, sizeof(xhc)))
7154  goto out;
7155  r = kvm_xen_hvm_config(kvm, &xhc);
7156  break;
7157  }
7158  case KVM_XEN_HVM_GET_ATTR: {
7159  struct kvm_xen_hvm_attr xha;
7160 
7161  r = -EFAULT;
7162  if (copy_from_user(&xha, argp, sizeof(xha)))
7163  goto out;
7164  r = kvm_xen_hvm_get_attr(kvm, &xha);
7165  if (!r && copy_to_user(argp, &xha, sizeof(xha)))
7166  r = -EFAULT;
7167  break;
7168  }
7169  case KVM_XEN_HVM_SET_ATTR: {
7170  struct kvm_xen_hvm_attr xha;
7171 
7172  r = -EFAULT;
7173  if (copy_from_user(&xha, argp, sizeof(xha)))
7174  goto out;
7175  r = kvm_xen_hvm_set_attr(kvm, &xha);
7176  break;
7177  }
7178  case KVM_XEN_HVM_EVTCHN_SEND: {
7179  struct kvm_irq_routing_xen_evtchn uxe;
7180 
7181  r = -EFAULT;
7182  if (copy_from_user(&uxe, argp, sizeof(uxe)))
7183  goto out;
7184  r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
7185  break;
7186  }
7187 #endif
7188  case KVM_SET_CLOCK:
7189  r = kvm_vm_ioctl_set_clock(kvm, argp);
7190  break;
7191  case KVM_GET_CLOCK:
7192  r = kvm_vm_ioctl_get_clock(kvm, argp);
7193  break;
7194  case KVM_SET_TSC_KHZ: {
7195  u32 user_tsc_khz;
7196 
7197  r = -EINVAL;
7198  user_tsc_khz = (u32)arg;
7199 
7200  if (kvm_caps.has_tsc_control &&
7201  user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
7202  goto out;
7203 
7204  if (user_tsc_khz == 0)
7205  user_tsc_khz = tsc_khz;
7206 
7207  WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
7208  r = 0;
7209 
7210  goto out;
7211  }
7212  case KVM_GET_TSC_KHZ: {
7213  r = READ_ONCE(kvm->arch.default_tsc_khz);
7214  goto out;
7215  }
7216  case KVM_MEMORY_ENCRYPT_OP: {
7217  r = -ENOTTY;
7218  if (!kvm_x86_ops.mem_enc_ioctl)
7219  goto out;
7220 
7221  r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
7222  break;
7223  }
7224  case KVM_MEMORY_ENCRYPT_REG_REGION: {
7225  struct kvm_enc_region region;
7226 
7227  r = -EFAULT;
7228  if (copy_from_user(&region, argp, sizeof(region)))
7229  goto out;
7230 
7231  r = -ENOTTY;
7232  if (!kvm_x86_ops.mem_enc_register_region)
7233  goto out;
7234 
7235  r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
7236  break;
7237  }
7238  case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
7239  struct kvm_enc_region region;
7240 
7241  r = -EFAULT;
7242  if (copy_from_user(&region, argp, sizeof(region)))
7243  goto out;
7244 
7245  r = -ENOTTY;
7246  if (!kvm_x86_ops.mem_enc_unregister_region)
7247  goto out;
7248 
7249  r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
7250  break;
7251  }
7252 #ifdef CONFIG_KVM_HYPERV
7253  case KVM_HYPERV_EVENTFD: {
7254  struct kvm_hyperv_eventfd hvevfd;
7255 
7256  r = -EFAULT;
7257  if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
7258  goto out;
7259  r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
7260  break;
7261  }
7262 #endif
7263  case KVM_SET_PMU_EVENT_FILTER:
7264  r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
7265  break;
7266  case KVM_X86_SET_MSR_FILTER: {
7267  struct kvm_msr_filter __user *user_msr_filter = argp;
7268  struct kvm_msr_filter filter;
7269 
7270  if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
7271  return -EFAULT;
7272 
7273  r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
7274  break;
7275  }
7276  default:
7277  r = -ENOTTY;
7278  }
7279 out:
7280  return r;
7281 }
7282 
7283 static void kvm_probe_feature_msr(u32 msr_index)
7284 {
7285  struct kvm_msr_entry msr = {
7286  .index = msr_index,
7287  };
7288 
7289  if (kvm_get_msr_feature(&msr))
7290  return;
7291 
7293 }
7294 
7295 static void kvm_probe_msr_to_save(u32 msr_index)
7296 {
7297  u32 dummy[2];
7298 
7299  if (rdmsr_safe(msr_index, &dummy[0], &dummy[1]))
7300  return;
7301 
7302  /*
7303  * Even MSRs that are valid in the host may not be exposed to guests in
7304  * some cases.
7305  */
7306  switch (msr_index) {
7307  case MSR_IA32_BNDCFGS:
7308  if (!kvm_mpx_supported())
7309  return;
7310  break;
7311  case MSR_TSC_AUX:
7312  if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
7313  !kvm_cpu_cap_has(X86_FEATURE_RDPID))
7314  return;
7315  break;
7316  case MSR_IA32_UMWAIT_CONTROL:
7317  if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
7318  return;
7319  break;
7320  case MSR_IA32_RTIT_CTL:
7321  case MSR_IA32_RTIT_STATUS:
7322  if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
7323  return;
7324  break;
7325  case MSR_IA32_RTIT_CR3_MATCH:
7326  if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
7327  !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
7328  return;
7329  break;
7330  case MSR_IA32_RTIT_OUTPUT_BASE:
7331  case MSR_IA32_RTIT_OUTPUT_MASK:
7332  if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
7333  (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
7334  !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
7335  return;
7336  break;
7337  case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
7338  if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
7339  (msr_index - MSR_IA32_RTIT_ADDR0_A >=
7340  intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
7341  return;
7342  break;
7343  case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
7344  if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
7345  kvm_pmu_cap.num_counters_gp)
7346  return;
7347  break;
7348  case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
7349  if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
7350  kvm_pmu_cap.num_counters_gp)
7351  return;
7352  break;
7353  case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX:
7354  if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
7355  kvm_pmu_cap.num_counters_fixed)
7356  return;
7357  break;
7358  case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
7359  case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
7360  case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
7361  if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
7362  return;
7363  break;
7364  case MSR_IA32_XFD:
7365  case MSR_IA32_XFD_ERR:
7366  if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
7367  return;
7368  break;
7369  case MSR_IA32_TSX_CTRL:
7370  if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR))
7371  return;
7372  break;
7373  default:
7374  break;
7375  }
7376 
7377  msrs_to_save[num_msrs_to_save++] = msr_index;
7378 }
7379 
7380 static void kvm_init_msr_lists(void)
7381 {
7382  unsigned i;
7383 
7384  BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
7385  "Please update the fixed PMCs in msrs_to_save_pmu[]");
7386 
7387  num_msrs_to_save = 0;
7388  num_emulated_msrs = 0;
7390 
7391  for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++)
7393 
7394  if (enable_pmu) {
7395  for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++)
7397  }
7398 
7399  for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
7400  if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i]))
7401  continue;
7402 
7404  }
7405 
7408 
7409  for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++)
7411 }
7412 
7413 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
7414  const void *v)
7415 {
7416  int handled = 0;
7417  int n;
7418 
7419  do {
7420  n = min(len, 8);
7421  if (!(lapic_in_kernel(vcpu) &&
7422  !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
7423  && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
7424  break;
7425  handled += n;
7426  addr += n;
7427  len -= n;
7428  v += n;
7429  } while (len);
7430 
7431  return handled;
7432 }
7433 
7434 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
7435 {
7436  int handled = 0;
7437  int n;
7438 
7439  do {
7440  n = min(len, 8);
7441  if (!(lapic_in_kernel(vcpu) &&
7442  !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
7443  addr, n, v))
7444  && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
7445  break;
7446  trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
7447  handled += n;
7448  addr += n;
7449  len -= n;
7450  v += n;
7451  } while (len);
7452 
7453  return handled;
7454 }
7455 
7456 void kvm_set_segment(struct kvm_vcpu *vcpu,
7457  struct kvm_segment *var, int seg)
7458 {
7459  static_call(kvm_x86_set_segment)(vcpu, var, seg);
7460 }
7461 
7462 void kvm_get_segment(struct kvm_vcpu *vcpu,
7463  struct kvm_segment *var, int seg)
7464 {
7465  static_call(kvm_x86_get_segment)(vcpu, var, seg);
7466 }
7467 
7468 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
7469  struct x86_exception *exception)
7470 {
7471  struct kvm_mmu *mmu = vcpu->arch.mmu;
7472  gpa_t t_gpa;
7473 
7474  BUG_ON(!mmu_is_nested(vcpu));
7475 
7476  /* NPT walks are always user-walks */
7477  access |= PFERR_USER_MASK;
7478  t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
7479 
7480  return t_gpa;
7481 }
7482 
7483 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
7484  struct x86_exception *exception)
7485 {
7486  struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7487 
7488  u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
7489  return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7490 }
7492 
7493 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
7494  struct x86_exception *exception)
7495 {
7496  struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7497 
7498  u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
7499  access |= PFERR_WRITE_MASK;
7500  return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7501 }
7503 
7504 /* uses this to access any guest's mapped memory without checking CPL */
7505 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
7506  struct x86_exception *exception)
7507 {
7508  struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7509 
7510  return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
7511 }
7512 
7513 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
7514  struct kvm_vcpu *vcpu, u64 access,
7515  struct x86_exception *exception)
7516 {
7517  struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7518  void *data = val;
7519  int r = X86EMUL_CONTINUE;
7520 
7521  while (bytes) {
7522  gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
7523  unsigned offset = addr & (PAGE_SIZE-1);
7524  unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
7525  int ret;
7526 
7527  if (gpa == INVALID_GPA)
7528  return X86EMUL_PROPAGATE_FAULT;
7529  ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
7530  offset, toread);
7531  if (ret < 0) {
7532  r = X86EMUL_IO_NEEDED;
7533  goto out;
7534  }
7535 
7536  bytes -= toread;
7537  data += toread;
7538  addr += toread;
7539  }
7540 out:
7541  return r;
7542 }
7543 
7544 /* used for instruction fetching */
7545 static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
7546  gva_t addr, void *val, unsigned int bytes,
7547  struct x86_exception *exception)
7548 {
7549  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7550  struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7551  u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
7552  unsigned offset;
7553  int ret;
7554 
7555  /* Inline kvm_read_guest_virt_helper for speed. */
7556  gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
7557  exception);
7558  if (unlikely(gpa == INVALID_GPA))
7559  return X86EMUL_PROPAGATE_FAULT;
7560 
7561  offset = addr & (PAGE_SIZE-1);
7562  if (WARN_ON(offset + bytes > PAGE_SIZE))
7563  bytes = (unsigned)PAGE_SIZE - offset;
7564  ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
7565  offset, bytes);
7566  if (unlikely(ret < 0))
7567  return X86EMUL_IO_NEEDED;
7568 
7569  return X86EMUL_CONTINUE;
7570 }
7571 
7572 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
7573  gva_t addr, void *val, unsigned int bytes,
7574  struct x86_exception *exception)
7575 {
7576  u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
7577 
7578  /*
7579  * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
7580  * is returned, but our callers are not ready for that and they blindly
7581  * call kvm_inject_page_fault. Ensure that they at least do not leak
7582  * uninitialized kernel stack memory into cr2 and error code.
7583  */
7584  memset(exception, 0, sizeof(*exception));
7585  return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
7586  exception);
7587 }
7589 
7590 static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
7591  gva_t addr, void *val, unsigned int bytes,
7592  struct x86_exception *exception, bool system)
7593 {
7594  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7595  u64 access = 0;
7596 
7597  if (system)
7598  access |= PFERR_IMPLICIT_ACCESS;
7599  else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
7600  access |= PFERR_USER_MASK;
7601 
7602  return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
7603 }
7604 
7605 static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
7606  struct kvm_vcpu *vcpu, u64 access,
7607  struct x86_exception *exception)
7608 {
7609  struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7610  void *data = val;
7611  int r = X86EMUL_CONTINUE;
7612 
7613  while (bytes) {
7614  gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
7615  unsigned offset = addr & (PAGE_SIZE-1);
7616  unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
7617  int ret;
7618 
7619  if (gpa == INVALID_GPA)
7620  return X86EMUL_PROPAGATE_FAULT;
7621  ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
7622  if (ret < 0) {
7623  r = X86EMUL_IO_NEEDED;
7624  goto out;
7625  }
7626 
7627  bytes -= towrite;
7628  data += towrite;
7629  addr += towrite;
7630  }
7631 out:
7632  return r;
7633 }
7634 
7635 static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
7636  unsigned int bytes, struct x86_exception *exception,
7637  bool system)
7638 {
7639  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7640  u64 access = PFERR_WRITE_MASK;
7641 
7642  if (system)
7643  access |= PFERR_IMPLICIT_ACCESS;
7644  else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
7645  access |= PFERR_USER_MASK;
7646 
7647  return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
7648  access, exception);
7649 }
7650 
7651 int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
7652  unsigned int bytes, struct x86_exception *exception)
7653 {
7654  /* kvm_write_guest_virt_system can pull in tons of pages. */
7655  vcpu->arch.l1tf_flush_l1d = true;
7656 
7657  return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
7658  PFERR_WRITE_MASK, exception);
7659 }
7661 
7662 static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
7663  void *insn, int insn_len)
7664 {
7665  return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type,
7666  insn, insn_len);
7667 }
7668 
7669 int handle_ud(struct kvm_vcpu *vcpu)
7670 {
7671  static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
7672  int fep_flags = READ_ONCE(force_emulation_prefix);
7673  int emul_type = EMULTYPE_TRAP_UD;
7674  char sig[5]; /* ud2; .ascii "kvm" */
7675  struct x86_exception e;
7676  int r;
7677 
7678  r = kvm_check_emulate_insn(vcpu, emul_type, NULL, 0);
7679  if (r != X86EMUL_CONTINUE)
7680  return 1;
7681 
7682  if (fep_flags &&
7684  sig, sizeof(sig), &e) == 0 &&
7685  memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
7686  if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF)
7687  kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF);
7688  kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
7689  emul_type = EMULTYPE_TRAP_UD_FORCED;
7690  }
7691 
7692  return kvm_emulate_instruction(vcpu, emul_type);
7693 }
7695 
7696 static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
7697  gpa_t gpa, bool write)
7698 {
7699  /* For APIC access vmexit */
7700  if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
7701  return 1;
7702 
7703  if (vcpu_match_mmio_gpa(vcpu, gpa)) {
7704  trace_vcpu_match_mmio(gva, gpa, write, true);
7705  return 1;
7706  }
7707 
7708  return 0;
7709 }
7710 
7711 static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
7712  gpa_t *gpa, struct x86_exception *exception,
7713  bool write)
7714 {
7715  struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7716  u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
7717  | (write ? PFERR_WRITE_MASK : 0);
7718 
7719  /*
7720  * currently PKRU is only applied to ept enabled guest so
7721  * there is no pkey in EPT page table for L1 guest or EPT
7722  * shadow page table for L2 guest.
7723  */
7724  if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
7725  !permission_fault(vcpu, vcpu->arch.walk_mmu,
7726  vcpu->arch.mmio_access, 0, access))) {
7727  *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
7728  (gva & (PAGE_SIZE - 1));
7729  trace_vcpu_match_mmio(gva, *gpa, write, false);
7730  return 1;
7731  }
7732 
7733  *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7734 
7735  if (*gpa == INVALID_GPA)
7736  return -1;
7737 
7738  return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
7739 }
7740 
7741 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
7742  const void *val, int bytes)
7743 {
7744  int ret;
7745 
7746  ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
7747  if (ret < 0)
7748  return 0;
7749  kvm_page_track_write(vcpu, gpa, val, bytes);
7750  return 1;
7751 }
7752 
7754  int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
7755  int bytes);
7756  int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
7757  void *val, int bytes);
7758  int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
7759  int bytes, void *val);
7760  int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
7761  void *val, int bytes);
7762  bool write;
7763 };
7764 
7765 static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
7766 {
7767  if (vcpu->mmio_read_completed) {
7768  trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
7769  vcpu->mmio_fragments[0].gpa, val);
7770  vcpu->mmio_read_completed = 0;
7771  return 1;
7772  }
7773 
7774  return 0;
7775 }
7776 
7777 static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
7778  void *val, int bytes)
7779 {
7780  return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
7781 }
7782 
7783 static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
7784  void *val, int bytes)
7785 {
7786  return emulator_write_phys(vcpu, gpa, val, bytes);
7787 }
7788 
7789 static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
7790 {
7791  trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
7792  return vcpu_mmio_write(vcpu, gpa, bytes, val);
7793 }
7794 
7795 static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
7796  void *val, int bytes)
7797 {
7798  trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
7799  return X86EMUL_IO_NEEDED;
7800 }
7801 
7802 static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
7803  void *val, int bytes)
7804 {
7805  struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
7806 
7807  memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
7808  return X86EMUL_CONTINUE;
7809 }
7810 
7811 static const struct read_write_emulator_ops read_emultor = {
7813  .read_write_emulate = read_emulate,
7814  .read_write_mmio = vcpu_mmio_read,
7815  .read_write_exit_mmio = read_exit_mmio,
7816 };
7817 
7818 static const struct read_write_emulator_ops write_emultor = {
7820  .read_write_mmio = write_mmio,
7821  .read_write_exit_mmio = write_exit_mmio,
7822  .write = true,
7823 };
7824 
7825 static int emulator_read_write_onepage(unsigned long addr, void *val,
7826  unsigned int bytes,
7827  struct x86_exception *exception,
7828  struct kvm_vcpu *vcpu,
7829  const struct read_write_emulator_ops *ops)
7830 {
7831  gpa_t gpa;
7832  int handled, ret;
7833  bool write = ops->write;
7834  struct kvm_mmio_fragment *frag;
7835  struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7836 
7837  /*
7838  * If the exit was due to a NPF we may already have a GPA.
7839  * If the GPA is present, use it to avoid the GVA to GPA table walk.
7840  * Note, this cannot be used on string operations since string
7841  * operation using rep will only have the initial GPA from the NPF
7842  * occurred.
7843  */
7844  if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
7845  (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
7846  gpa = ctxt->gpa_val;
7847  ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
7848  } else {
7849  ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
7850  if (ret < 0)
7851  return X86EMUL_PROPAGATE_FAULT;
7852  }
7853 
7854  if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
7855  return X86EMUL_CONTINUE;
7856 
7857  /*
7858  * Is this MMIO handled locally?
7859  */
7860  handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
7861  if (handled == bytes)
7862  return X86EMUL_CONTINUE;
7863 
7864  gpa += handled;
7865  bytes -= handled;
7866  val += handled;
7867 
7868  WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
7869  frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
7870  frag->gpa = gpa;
7871  frag->data = val;
7872  frag->len = bytes;
7873  return X86EMUL_CONTINUE;
7874 }
7875 
7876 static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
7877  unsigned long addr,
7878  void *val, unsigned int bytes,
7879  struct x86_exception *exception,
7880  const struct read_write_emulator_ops *ops)
7881 {
7882  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7883  gpa_t gpa;
7884  int rc;
7885 
7886  if (ops->read_write_prepare &&
7887  ops->read_write_prepare(vcpu, val, bytes))
7888  return X86EMUL_CONTINUE;
7889 
7890  vcpu->mmio_nr_fragments = 0;
7891 
7892  /* Crossing a page boundary? */
7893  if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
7894  int now;
7895 
7896  now = -addr & ~PAGE_MASK;
7897  rc = emulator_read_write_onepage(addr, val, now, exception,
7898  vcpu, ops);
7899 
7900  if (rc != X86EMUL_CONTINUE)
7901  return rc;
7902  addr += now;
7903  if (ctxt->mode != X86EMUL_MODE_PROT64)
7904  addr = (u32)addr;
7905  val += now;
7906  bytes -= now;
7907  }
7908 
7909  rc = emulator_read_write_onepage(addr, val, bytes, exception,
7910  vcpu, ops);
7911  if (rc != X86EMUL_CONTINUE)
7912  return rc;
7913 
7914  if (!vcpu->mmio_nr_fragments)
7915  return rc;
7916 
7917  gpa = vcpu->mmio_fragments[0].gpa;
7918 
7919  vcpu->mmio_needed = 1;
7920  vcpu->mmio_cur_fragment = 0;
7921 
7922  vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
7923  vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
7924  vcpu->run->exit_reason = KVM_EXIT_MMIO;
7925  vcpu->run->mmio.phys_addr = gpa;
7926 
7927  return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
7928 }
7929 
7931  unsigned long addr,
7932  void *val,
7933  unsigned int bytes,
7934  struct x86_exception *exception)
7935 {
7936  return emulator_read_write(ctxt, addr, val, bytes,
7937  exception, &read_emultor);
7938 }
7939 
7941  unsigned long addr,
7942  const void *val,
7943  unsigned int bytes,
7944  struct x86_exception *exception)
7945 {
7946  return emulator_read_write(ctxt, addr, (void *)val, bytes,
7947  exception, &write_emultor);
7948 }
7949 
7950 #define emulator_try_cmpxchg_user(t, ptr, old, new) \
7951  (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t))
7952 
7954  unsigned long addr,
7955  const void *old,
7956  const void *new,
7957  unsigned int bytes,
7958  struct x86_exception *exception)
7959 {
7960  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7961  u64 page_line_mask;
7962  unsigned long hva;
7963  gpa_t gpa;
7964  int r;
7965 
7966  /* guests cmpxchg8b have to be emulated atomically */
7967  if (bytes > 8 || (bytes & (bytes - 1)))
7968  goto emul_write;
7969 
7970  gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
7971 
7972  if (gpa == INVALID_GPA ||
7973  (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
7974  goto emul_write;
7975 
7976  /*
7977  * Emulate the atomic as a straight write to avoid #AC if SLD is
7978  * enabled in the host and the access splits a cache line.
7979  */
7980  if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
7981  page_line_mask = ~(cache_line_size() - 1);
7982  else
7983  page_line_mask = PAGE_MASK;
7984 
7985  if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
7986  goto emul_write;
7987 
7988  hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
7989  if (kvm_is_error_hva(hva))
7990  goto emul_write;
7991 
7992  hva += offset_in_page(gpa);
7993 
7994  switch (bytes) {
7995  case 1:
7996  r = emulator_try_cmpxchg_user(u8, hva, old, new);
7997  break;
7998  case 2:
7999  r = emulator_try_cmpxchg_user(u16, hva, old, new);
8000  break;
8001  case 4:
8002  r = emulator_try_cmpxchg_user(u32, hva, old, new);
8003  break;
8004  case 8:
8005  r = emulator_try_cmpxchg_user(u64, hva, old, new);
8006  break;
8007  default:
8008  BUG();
8009  }
8010 
8011  if (r < 0)
8012  return X86EMUL_UNHANDLEABLE;
8013 
8014  /*
8015  * Mark the page dirty _before_ checking whether or not the CMPXCHG was
8016  * successful, as the old value is written back on failure. Note, for
8017  * live migration, this is unnecessarily conservative as CMPXCHG writes
8018  * back the original value and the access is atomic, but KVM's ABI is
8019  * that all writes are dirty logged, regardless of the value written.
8020  */
8021  kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));
8022 
8023  if (r)
8024  return X86EMUL_CMPXCHG_FAILED;
8025 
8026  kvm_page_track_write(vcpu, gpa, new, bytes);
8027 
8028  return X86EMUL_CONTINUE;
8029 
8030 emul_write:
8031  pr_warn_once("emulating exchange as write\n");
8032 
8033  return emulator_write_emulated(ctxt, addr, new, bytes, exception);
8034 }
8035 
8036 static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
8037  unsigned short port, void *data,
8038  unsigned int count, bool in)
8039 {
8040  unsigned i;
8041  int r;
8042 
8043  WARN_ON_ONCE(vcpu->arch.pio.count);
8044  for (i = 0; i < count; i++) {
8045  if (in)
8046  r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data);
8047  else
8048  r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data);
8049 
8050  if (r) {
8051  if (i == 0)
8052  goto userspace_io;
8053 
8054  /*
8055  * Userspace must have unregistered the device while PIO
8056  * was running. Drop writes / read as 0.
8057  */
8058  if (in)
8059  memset(data, 0, size * (count - i));
8060  break;
8061  }
8062 
8063  data += size;
8064  }
8065  return 1;
8066 
8067 userspace_io:
8068  vcpu->arch.pio.port = port;
8069  vcpu->arch.pio.in = in;
8070  vcpu->arch.pio.count = count;
8071  vcpu->arch.pio.size = size;
8072 
8073  if (in)
8074  memset(vcpu->arch.pio_data, 0, size * count);
8075  else
8076  memcpy(vcpu->arch.pio_data, data, size * count);
8077 
8078  vcpu->run->exit_reason = KVM_EXIT_IO;
8079  vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
8080  vcpu->run->io.size = size;
8081  vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
8082  vcpu->run->io.count = count;
8083  vcpu->run->io.port = port;
8084  return 0;
8085 }
8086 
8087 static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
8088  unsigned short port, void *val, unsigned int count)
8089 {
8090  int r = emulator_pio_in_out(vcpu, size, port, val, count, true);
8091  if (r)
8092  trace_kvm_pio(KVM_PIO_IN, port, size, count, val);
8093 
8094  return r;
8095 }
8096 
8097 static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val)
8098 {
8099  int size = vcpu->arch.pio.size;
8100  unsigned int count = vcpu->arch.pio.count;
8101  memcpy(val, vcpu->arch.pio_data, size * count);
8102  trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
8103  vcpu->arch.pio.count = 0;
8104 }
8105 
8107  int size, unsigned short port, void *val,
8108  unsigned int count)
8109 {
8110  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8111  if (vcpu->arch.pio.count) {
8112  /*
8113  * Complete a previous iteration that required userspace I/O.
8114  * Note, @count isn't guaranteed to match pio.count as userspace
8115  * can modify ECX before rerunning the vCPU. Ignore any such
8116  * shenanigans as KVM doesn't support modifying the rep count,
8117  * and the emulator ensures @count doesn't overflow the buffer.
8118  */
8119  complete_emulator_pio_in(vcpu, val);
8120  return 1;
8121  }
8122 
8123  return emulator_pio_in(vcpu, size, port, val, count);
8124 }
8125 
8126 static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
8127  unsigned short port, const void *val,
8128  unsigned int count)
8129 {
8130  trace_kvm_pio(KVM_PIO_OUT, port, size, count, val);
8131  return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
8132 }
8133 
8135  int size, unsigned short port,
8136  const void *val, unsigned int count)
8137 {
8138  return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
8139 }
8140 
8141 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
8142 {
8143  return static_call(kvm_x86_get_segment_base)(vcpu, seg);
8144 }
8145 
8146 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
8147 {
8148  kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
8149 }
8150 
8151 static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
8152 {
8153  if (!need_emulate_wbinvd(vcpu))
8154  return X86EMUL_CONTINUE;
8155 
8156  if (static_call(kvm_x86_has_wbinvd_exit)()) {
8157  int cpu = get_cpu();
8158 
8159  cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
8160  on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
8161  wbinvd_ipi, NULL, 1);
8162  put_cpu();
8163  cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
8164  } else
8165  wbinvd();
8166  return X86EMUL_CONTINUE;
8167 }
8168 
8169 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
8170 {
8172  return kvm_skip_emulated_instruction(vcpu);
8173 }
8175 
8176 
8177 
8178 static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
8179 {
8181 }
8182 
8183 static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
8184  unsigned long *dest)
8185 {
8186  kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
8187 }
8188 
8189 static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
8190  unsigned long value)
8191 {
8192 
8193  return kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
8194 }
8195 
8196 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
8197 {
8198  return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
8199 }
8200 
8201 static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
8202 {
8203  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8204  unsigned long value;
8205 
8206  switch (cr) {
8207  case 0:
8208  value = kvm_read_cr0(vcpu);
8209  break;
8210  case 2:
8211  value = vcpu->arch.cr2;
8212  break;
8213  case 3:
8214  value = kvm_read_cr3(vcpu);
8215  break;
8216  case 4:
8217  value = kvm_read_cr4(vcpu);
8218  break;
8219  case 8:
8220  value = kvm_get_cr8(vcpu);
8221  break;
8222  default:
8223  kvm_err("%s: unexpected cr %u\n", __func__, cr);
8224  return 0;
8225  }
8226 
8227  return value;
8228 }
8229 
8230 static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
8231 {
8232  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8233  int res = 0;
8234 
8235  switch (cr) {
8236  case 0:
8237  res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
8238  break;
8239  case 2:
8240  vcpu->arch.cr2 = val;
8241  break;
8242  case 3:
8243  res = kvm_set_cr3(vcpu, val);
8244  break;
8245  case 4:
8246  res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
8247  break;
8248  case 8:
8249  res = kvm_set_cr8(vcpu, val);
8250  break;
8251  default:
8252  kvm_err("%s: unexpected cr %u\n", __func__, cr);
8253  res = -1;
8254  }
8255 
8256  return res;
8257 }
8258 
8259 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
8260 {
8261  return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt));
8262 }
8263 
8264 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
8265 {
8266  static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt);
8267 }
8268 
8269 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
8270 {
8271  static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt);
8272 }
8273 
8274 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
8275 {
8276  static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt);
8277 }
8278 
8279 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
8280 {
8281  static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt);
8282 }
8283 
8285  struct x86_emulate_ctxt *ctxt, int seg)
8286 {
8287  return get_segment_base(emul_to_vcpu(ctxt), seg);
8288 }
8289 
8290 static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
8291  struct desc_struct *desc, u32 *base3,
8292  int seg)
8293 {
8294  struct kvm_segment var;
8295 
8296  kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
8297  *selector = var.selector;
8298 
8299  if (var.unusable) {
8300  memset(desc, 0, sizeof(*desc));
8301  if (base3)
8302  *base3 = 0;
8303  return false;
8304  }
8305 
8306  if (var.g)
8307  var.limit >>= 12;
8308  set_desc_limit(desc, var.limit);
8309  set_desc_base(desc, (unsigned long)var.base);
8310 #ifdef CONFIG_X86_64
8311  if (base3)
8312  *base3 = var.base >> 32;
8313 #endif
8314  desc->type = var.type;
8315  desc->s = var.s;
8316  desc->dpl = var.dpl;
8317  desc->p = var.present;
8318  desc->avl = var.avl;
8319  desc->l = var.l;
8320  desc->d = var.db;
8321  desc->g = var.g;
8322 
8323  return true;
8324 }
8325 
8326 static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
8327  struct desc_struct *desc, u32 base3,
8328  int seg)
8329 {
8330  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8331  struct kvm_segment var;
8332 
8333  var.selector = selector;
8334  var.base = get_desc_base(desc);
8335 #ifdef CONFIG_X86_64
8336  var.base |= ((u64)base3) << 32;
8337 #endif
8338  var.limit = get_desc_limit(desc);
8339  if (desc->g)
8340  var.limit = (var.limit << 12) | 0xfff;
8341  var.type = desc->type;
8342  var.dpl = desc->dpl;
8343  var.db = desc->d;
8344  var.s = desc->s;
8345  var.l = desc->l;
8346  var.g = desc->g;
8347  var.avl = desc->avl;
8348  var.present = desc->p;
8349  var.unusable = !var.present;
8350  var.padding = 0;
8351 
8352  kvm_set_segment(vcpu, &var, seg);
8353  return;
8354 }
8355 
8357  u32 msr_index, u64 *pdata)
8358 {
8359  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8360  int r;
8361 
8362  r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
8363  if (r < 0)
8364  return X86EMUL_UNHANDLEABLE;
8365 
8366  if (r) {
8367  if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
8369  return X86EMUL_IO_NEEDED;
8370 
8371  trace_kvm_msr_read_ex(msr_index);
8372  return X86EMUL_PROPAGATE_FAULT;
8373  }
8374 
8375  trace_kvm_msr_read(msr_index, *pdata);
8376  return X86EMUL_CONTINUE;
8377 }
8378 
8380  u32 msr_index, u64 data)
8381 {
8382  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8383  int r;
8384 
8385  r = kvm_set_msr_with_filter(vcpu, msr_index, data);
8386  if (r < 0)
8387  return X86EMUL_UNHANDLEABLE;
8388 
8389  if (r) {
8390  if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
8392  return X86EMUL_IO_NEEDED;
8393 
8394  trace_kvm_msr_write_ex(msr_index, data);
8395  return X86EMUL_PROPAGATE_FAULT;
8396  }
8397 
8398  trace_kvm_msr_write(msr_index, data);
8399  return X86EMUL_CONTINUE;
8400 }
8401 
8402 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
8403  u32 msr_index, u64 *pdata)
8404 {
8405  return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
8406 }
8407 
8408 static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
8409  u32 pmc)
8410 {
8411  if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
8412  return 0;
8413  return -EINVAL;
8414 }
8415 
8416 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
8417  u32 pmc, u64 *pdata)
8418 {
8419  return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
8420 }
8421 
8422 static void emulator_halt(struct x86_emulate_ctxt *ctxt)
8423 {
8424  emul_to_vcpu(ctxt)->arch.halt_request = 1;
8425 }
8426 
8427 static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
8428  struct x86_instruction_info *info,
8429  enum x86_intercept_stage stage)
8430 {
8431  return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage,
8432  &ctxt->exception);
8433 }
8434 
8435 static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
8436  u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
8437  bool exact_only)
8438 {
8439  return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
8440 }
8441 
8443 {
8444  return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
8445 }
8446 
8448 {
8449  return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
8450 }
8451 
8453 {
8454  return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
8455 }
8456 
8457 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
8458 {
8459  return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
8460 }
8461 
8462 static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
8463 {
8464  kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val);
8465 }
8466 
8467 static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
8468 {
8469  static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
8470 }
8471 
8472 static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt)
8473 {
8474  return is_smm(emul_to_vcpu(ctxt));
8475 }
8476 
8477 static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt)
8478 {
8479  return is_guest_mode(emul_to_vcpu(ctxt));
8480 }
8481 
8482 #ifndef CONFIG_KVM_SMM
8483 static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
8484 {
8485  WARN_ON_ONCE(1);
8486  return X86EMUL_UNHANDLEABLE;
8487 }
8488 #endif
8489 
8490 static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
8491 {
8492  kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
8493 }
8494 
8495 static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
8496 {
8497  return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
8498 }
8499 
8500 static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt)
8501 {
8502  struct kvm *kvm = emul_to_vcpu(ctxt)->kvm;
8503 
8504  if (!kvm->vm_bugged)
8505  kvm_vm_bugged(kvm);
8506 }
8507 
8509  gva_t addr, unsigned int flags)
8510 {
8511  if (!kvm_x86_ops.get_untagged_addr)
8512  return addr;
8513 
8514  return static_call(kvm_x86_get_untagged_addr)(emul_to_vcpu(ctxt), addr, flags);
8515 }
8516 
8517 static const struct x86_emulate_ops emulate_ops = {
8519  .read_gpr = emulator_read_gpr,
8520  .write_gpr = emulator_write_gpr,
8521  .read_std = emulator_read_std,
8522  .write_std = emulator_write_std,
8523  .fetch = kvm_fetch_guest_virt,
8524  .read_emulated = emulator_read_emulated,
8525  .write_emulated = emulator_write_emulated,
8526  .cmpxchg_emulated = emulator_cmpxchg_emulated,
8527  .invlpg = emulator_invlpg,
8528  .pio_in_emulated = emulator_pio_in_emulated,
8529  .pio_out_emulated = emulator_pio_out_emulated,
8530  .get_segment = emulator_get_segment,
8531  .set_segment = emulator_set_segment,
8532  .get_cached_segment_base = emulator_get_cached_segment_base,
8533  .get_gdt = emulator_get_gdt,
8534  .get_idt = emulator_get_idt,
8535  .set_gdt = emulator_set_gdt,
8536  .set_idt = emulator_set_idt,
8537  .get_cr = emulator_get_cr,
8538  .set_cr = emulator_set_cr,
8539  .cpl = emulator_get_cpl,
8540  .get_dr = emulator_get_dr,
8541  .set_dr = emulator_set_dr,
8542  .set_msr_with_filter = emulator_set_msr_with_filter,
8543  .get_msr_with_filter = emulator_get_msr_with_filter,
8544  .get_msr = emulator_get_msr,
8545  .check_pmc = emulator_check_pmc,
8546  .read_pmc = emulator_read_pmc,
8547  .halt = emulator_halt,
8548  .wbinvd = emulator_wbinvd,
8549  .fix_hypercall = emulator_fix_hypercall,
8550  .intercept = emulator_intercept,
8551  .get_cpuid = emulator_get_cpuid,
8552  .guest_has_movbe = emulator_guest_has_movbe,
8553  .guest_has_fxsr = emulator_guest_has_fxsr,
8554  .guest_has_rdpid = emulator_guest_has_rdpid,
8555  .set_nmi_mask = emulator_set_nmi_mask,
8556  .is_smm = emulator_is_smm,
8557  .is_guest_mode = emulator_is_guest_mode,
8558  .leave_smm = emulator_leave_smm,
8559  .triple_fault = emulator_triple_fault,
8560  .set_xcr = emulator_set_xcr,
8561  .get_untagged_addr = emulator_get_untagged_addr,
8562 };
8563 
8564 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
8565 {
8566  u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
8567  /*
8568  * an sti; sti; sequence only disable interrupts for the first
8569  * instruction. So, if the last instruction, be it emulated or
8570  * not, left the system with the INT_STI flag enabled, it
8571  * means that the last instruction is an sti. We should not
8572  * leave the flag on in this case. The same goes for mov ss
8573  */
8574  if (int_shadow & mask)
8575  mask = 0;
8576  if (unlikely(int_shadow || mask)) {
8577  static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask);
8578  if (!mask)
8579  kvm_make_request(KVM_REQ_EVENT, vcpu);
8580  }
8581 }
8582 
8583 static void inject_emulated_exception(struct kvm_vcpu *vcpu)
8584 {
8585  struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8586 
8587  if (ctxt->exception.vector == PF_VECTOR)
8589  else if (ctxt->exception.error_code_valid)
8591  ctxt->exception.error_code);
8592  else
8594 }
8595 
8596 static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
8597 {
8598  struct x86_emulate_ctxt *ctxt;
8599 
8600  ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
8601  if (!ctxt) {
8602  pr_err("failed to allocate vcpu's emulator\n");
8603  return NULL;
8604  }
8605 
8606  ctxt->vcpu = vcpu;
8607  ctxt->ops = &emulate_ops;
8608  vcpu->arch.emulate_ctxt = ctxt;
8609 
8610  return ctxt;
8611 }
8612 
8613 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
8614 {
8615  struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8616  int cs_db, cs_l;
8617 
8618  static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
8619 
8620  ctxt->gpa_available = false;
8621  ctxt->eflags = kvm_get_rflags(vcpu);
8622  ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
8623 
8624  ctxt->eip = kvm_rip_read(vcpu);
8625  ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
8626  (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
8627  (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
8628  cs_db ? X86EMUL_MODE_PROT32 :
8630  ctxt->interruptibility = 0;
8631  ctxt->have_exception = false;
8632  ctxt->exception.vector = -1;
8633  ctxt->perm_ok = false;
8634 
8635  init_decode_cache(ctxt);
8636  vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
8637 }
8638 
8639 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
8640 {
8641  struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8642  int ret;
8643 
8645 
8646  ctxt->op_bytes = 2;
8647  ctxt->ad_bytes = 2;
8648  ctxt->_eip = ctxt->eip + inc_eip;
8649  ret = emulate_int_real(ctxt, irq);
8650 
8651  if (ret != X86EMUL_CONTINUE) {
8652  kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
8653  } else {
8654  ctxt->eip = ctxt->_eip;
8655  kvm_rip_write(vcpu, ctxt->eip);
8656  kvm_set_rflags(vcpu, ctxt->eflags);
8657  }
8658 }
8660 
8661 static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
8662  u8 ndata, u8 *insn_bytes, u8 insn_size)
8663 {
8664  struct kvm_run *run = vcpu->run;
8665  u64 info[5];
8666  u8 info_start;
8667 
8668  /*
8669  * Zero the whole array used to retrieve the exit info, as casting to
8670  * u32 for select entries will leave some chunks uninitialized.
8671  */
8672  memset(&info, 0, sizeof(info));
8673 
8674  static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
8675  &info[2], (u32 *)&info[3],
8676  (u32 *)&info[4]);
8677 
8678  run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
8679  run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
8680 
8681  /*
8682  * There's currently space for 13 entries, but 5 are used for the exit
8683  * reason and info. Restrict to 4 to reduce the maintenance burden
8684  * when expanding kvm_run.emulation_failure in the future.
8685  */
8686  if (WARN_ON_ONCE(ndata > 4))
8687  ndata = 4;
8688 
8689  /* Always include the flags as a 'data' entry. */
8690  info_start = 1;
8691  run->emulation_failure.flags = 0;
8692 
8693  if (insn_size) {
8694  BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
8695  sizeof(run->emulation_failure.insn_bytes) != 16));
8696  info_start += 2;
8697  run->emulation_failure.flags |=
8698  KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
8699  run->emulation_failure.insn_size = insn_size;
8700  memset(run->emulation_failure.insn_bytes, 0x90,
8701  sizeof(run->emulation_failure.insn_bytes));
8702  memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
8703  }
8704 
8705  memcpy(&run->internal.data[info_start], info, sizeof(info));
8706  memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
8707  ndata * sizeof(data[0]));
8708 
8709  run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
8710 }
8711 
8712 static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
8713 {
8714  struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8715 
8717  ctxt->fetch.end - ctxt->fetch.data);
8718 }
8719 
8720 void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
8721  u8 ndata)
8722 {
8723  prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
8724 }
8726 
8728 {
8730 }
8732 
8733 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
8734 {
8735  struct kvm *kvm = vcpu->kvm;
8736 
8737  ++vcpu->stat.insn_emulation_fail;
8739 
8740  if (emulation_type & EMULTYPE_VMWARE_GP) {
8741  kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
8742  return 1;
8743  }
8744 
8745  if (kvm->arch.exit_on_emulation_error ||
8746  (emulation_type & EMULTYPE_SKIP)) {
8748  return 0;
8749  }
8750 
8751  kvm_queue_exception(vcpu, UD_VECTOR);
8752 
8753  if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
8755  return 0;
8756  }
8757 
8758  return 1;
8759 }
8760 
8761 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
8762  int emulation_type)
8763 {
8764  gpa_t gpa = cr2_or_gpa;
8765  kvm_pfn_t pfn;
8766 
8767  if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
8768  return false;
8769 
8770  if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
8771  WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
8772  return false;
8773 
8774  if (!vcpu->arch.mmu->root_role.direct) {
8775  /*
8776  * Write permission should be allowed since only
8777  * write access need to be emulated.
8778  */
8779  gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
8780 
8781  /*
8782  * If the mapping is invalid in guest, let cpu retry
8783  * it to generate fault.
8784  */
8785  if (gpa == INVALID_GPA)
8786  return true;
8787  }
8788 
8789  /*
8790  * Do not retry the unhandleable instruction if it faults on the
8791  * readonly host memory, otherwise it will goto a infinite loop:
8792  * retry instruction -> write #PF -> emulation fail -> retry
8793  * instruction -> ...
8794  */
8795  pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
8796 
8797  /*
8798  * If the instruction failed on the error pfn, it can not be fixed,
8799  * report the error to userspace.
8800  */
8801  if (is_error_noslot_pfn(pfn))
8802  return false;
8803 
8804  kvm_release_pfn_clean(pfn);
8805 
8806  /* The instructions are well-emulated on direct mmu. */
8807  if (vcpu->arch.mmu->root_role.direct) {
8808  unsigned int indirect_shadow_pages;
8809 
8810  write_lock(&vcpu->kvm->mmu_lock);
8811  indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
8812  write_unlock(&vcpu->kvm->mmu_lock);
8813 
8814  if (indirect_shadow_pages)
8815  kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
8816 
8817  return true;
8818  }
8819 
8820  /*
8821  * if emulation was due to access to shadowed page table
8822  * and it failed try to unshadow page and re-enter the
8823  * guest to let CPU execute the instruction.
8824  */
8825  kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
8826 
8827  /*
8828  * If the access faults on its page table, it can not
8829  * be fixed by unprotecting shadow page and it should
8830  * be reported to userspace.
8831  */
8832  return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
8833 }
8834 
8835 static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
8836  gpa_t cr2_or_gpa, int emulation_type)
8837 {
8838  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8839  unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
8840 
8841  last_retry_eip = vcpu->arch.last_retry_eip;
8842  last_retry_addr = vcpu->arch.last_retry_addr;
8843 
8844  /*
8845  * If the emulation is caused by #PF and it is non-page_table
8846  * writing instruction, it means the VM-EXIT is caused by shadow
8847  * page protected, we can zap the shadow page and retry this
8848  * instruction directly.
8849  *
8850  * Note: if the guest uses a non-page-table modifying instruction
8851  * on the PDE that points to the instruction, then we will unmap
8852  * the instruction and go to an infinite loop. So, we cache the
8853  * last retried eip and the last fault address, if we meet the eip
8854  * and the address again, we can break out of the potential infinite
8855  * loop.
8856  */
8857  vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
8858 
8859  if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
8860  return false;
8861 
8862  if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
8863  WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
8864  return false;
8865 
8866  if (x86_page_table_writing_insn(ctxt))
8867  return false;
8868 
8869  if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
8870  return false;
8871 
8872  vcpu->arch.last_retry_eip = ctxt->eip;
8873  vcpu->arch.last_retry_addr = cr2_or_gpa;
8874 
8875  if (!vcpu->arch.mmu->root_role.direct)
8876  gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
8877 
8878  kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
8879 
8880  return true;
8881 }
8882 
8883 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
8884 static int complete_emulated_pio(struct kvm_vcpu *vcpu);
8885 
8886 static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
8887  unsigned long *db)
8888 {
8889  u32 dr6 = 0;
8890  int i;
8891  u32 enable, rwlen;
8892 
8893  enable = dr7;
8894  rwlen = dr7 >> 16;
8895  for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
8896  if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
8897  dr6 |= (1 << i);
8898  return dr6;
8899 }
8900 
8901 static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
8902 {
8903  struct kvm_run *kvm_run = vcpu->run;
8904 
8905  if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
8906  kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
8907  kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
8908  kvm_run->debug.arch.exception = DB_VECTOR;
8909  kvm_run->exit_reason = KVM_EXIT_DEBUG;
8910  return 0;
8911  }
8912  kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
8913  return 1;
8914 }
8915 
8916 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
8917 {
8918  unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
8919  int r;
8920 
8921  r = static_call(kvm_x86_skip_emulated_instruction)(vcpu);
8922  if (unlikely(!r))
8923  return 0;
8924 
8925  kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
8926 
8927  /*
8928  * rflags is the old, "raw" value of the flags. The new value has
8929  * not been saved yet.
8930  *
8931  * This is correct even for TF set by the guest, because "the
8932  * processor will not generate this exception after the instruction
8933  * that sets the TF flag".
8934  */
8935  if (unlikely(rflags & X86_EFLAGS_TF))
8936  r = kvm_vcpu_do_singlestep(vcpu);
8937  return r;
8938 }
8940 
8941 static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
8942 {
8943  u32 shadow;
8944 
8945  if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
8946  return true;
8947 
8948  /*
8949  * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
8950  * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first
8951  * to avoid the relatively expensive CPUID lookup.
8952  */
8953  shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
8954  return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
8955  guest_cpuid_is_intel(vcpu);
8956 }
8957 
8958 static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
8959  int emulation_type, int *r)
8960 {
8961  WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE);
8962 
8963  /*
8964  * Do not check for code breakpoints if hardware has already done the
8965  * checks, as inferred from the emulation type. On NO_DECODE and SKIP,
8966  * the instruction has passed all exception checks, and all intercepted
8967  * exceptions that trigger emulation have lower priority than code
8968  * breakpoints, i.e. the fact that the intercepted exception occurred
8969  * means any code breakpoints have already been serviced.
8970  *
8971  * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as
8972  * hardware has checked the RIP of the magic prefix, but not the RIP of
8973  * the instruction being emulated. The intent of forced emulation is
8974  * to behave as if KVM intercepted the instruction without an exception
8975  * and without a prefix.
8976  */
8977  if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
8978  EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF))
8979  return false;
8980 
8981  if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
8982  (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
8983  struct kvm_run *kvm_run = vcpu->run;
8984  unsigned long eip = kvm_get_linear_rip(vcpu);
8985  u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
8986  vcpu->arch.guest_debug_dr7,
8987  vcpu->arch.eff_db);
8988 
8989  if (dr6 != 0) {
8990  kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
8991  kvm_run->debug.arch.pc = eip;
8992  kvm_run->debug.arch.exception = DB_VECTOR;
8993  kvm_run->exit_reason = KVM_EXIT_DEBUG;
8994  *r = 0;
8995  return true;
8996  }
8997  }
8998 
8999  if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
9001  unsigned long eip = kvm_get_linear_rip(vcpu);
9002  u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
9003  vcpu->arch.dr7,
9004  vcpu->arch.db);
9005 
9006  if (dr6 != 0) {
9007  kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
9008  *r = 1;
9009  return true;
9010  }
9011  }
9012 
9013  return false;
9014 }
9015 
9017 {
9018  switch (ctxt->opcode_len) {
9019  case 1:
9020  switch (ctxt->b) {
9021  case 0xe4: /* IN */
9022  case 0xe5:
9023  case 0xec:
9024  case 0xed:
9025  case 0xe6: /* OUT */
9026  case 0xe7:
9027  case 0xee:
9028  case 0xef:
9029  case 0x6c: /* INS */
9030  case 0x6d:
9031  case 0x6e: /* OUTS */
9032  case 0x6f:
9033  return true;
9034  }
9035  break;
9036  case 2:
9037  switch (ctxt->b) {
9038  case 0x33: /* RDPMC */
9039  return true;
9040  }
9041  break;
9042  }
9043 
9044  return false;
9045 }
9046 
9047 /*
9048  * Decode an instruction for emulation. The caller is responsible for handling
9049  * code breakpoints. Note, manually detecting code breakpoints is unnecessary
9050  * (and wrong) when emulating on an intercepted fault-like exception[*], as
9051  * code breakpoints have higher priority and thus have already been done by
9052  * hardware.
9053  *
9054  * [*] Except #MC, which is higher priority, but KVM should never emulate in
9055  * response to a machine check.
9056  */
9057 int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
9058  void *insn, int insn_len)
9059 {
9060  struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9061  int r;
9062 
9064 
9065  r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
9066 
9068  ++vcpu->stat.insn_emulation;
9069 
9070  return r;
9071 }
9073 
9074 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
9075  int emulation_type, void *insn, int insn_len)
9076 {
9077  int r;
9078  struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9079  bool writeback = true;
9080 
9081  r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
9082  if (r != X86EMUL_CONTINUE) {
9084  return 1;
9085 
9086  WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE);
9087  return handle_emulation_failure(vcpu, emulation_type);
9088  }
9089 
9090  vcpu->arch.l1tf_flush_l1d = true;
9091 
9092  if (!(emulation_type & EMULTYPE_NO_DECODE)) {
9094 
9095  /*
9096  * Return immediately if RIP hits a code breakpoint, such #DBs
9097  * are fault-like and are higher priority than any faults on
9098  * the code fetch itself.
9099  */
9100  if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r))
9101  return r;
9102 
9103  r = x86_decode_emulated_instruction(vcpu, emulation_type,
9104  insn, insn_len);
9105  if (r != EMULATION_OK) {
9106  if ((emulation_type & EMULTYPE_TRAP_UD) ||
9107  (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
9108  kvm_queue_exception(vcpu, UD_VECTOR);
9109  return 1;
9110  }
9111  if (reexecute_instruction(vcpu, cr2_or_gpa,
9112  emulation_type))
9113  return 1;
9114 
9115  if (ctxt->have_exception &&
9116  !(emulation_type & EMULTYPE_SKIP)) {
9117  /*
9118  * #UD should result in just EMULATION_FAILED, and trap-like
9119  * exception should not be encountered during decode.
9120  */
9121  WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
9124  return 1;
9125  }
9126  return handle_emulation_failure(vcpu, emulation_type);
9127  }
9128  }
9129 
9130  if ((emulation_type & EMULTYPE_VMWARE_GP) &&
9131  !is_vmware_backdoor_opcode(ctxt)) {
9132  kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
9133  return 1;
9134  }
9135 
9136  /*
9137  * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for
9138  * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
9139  * The caller is responsible for updating interruptibility state and
9140  * injecting single-step #DBs.
9141  */
9142  if (emulation_type & EMULTYPE_SKIP) {
9143  if (ctxt->mode != X86EMUL_MODE_PROT64)
9144  ctxt->eip = (u32)ctxt->_eip;
9145  else
9146  ctxt->eip = ctxt->_eip;
9147 
9148  if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) {
9149  r = 1;
9150  goto writeback;
9151  }
9152 
9153  kvm_rip_write(vcpu, ctxt->eip);
9154  if (ctxt->eflags & X86_EFLAGS_RF)
9155  kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
9156  return 1;
9157  }
9158 
9159  if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
9160  return 1;
9161 
9162  /* this is needed for vmware backdoor interface to work since it
9163  changes registers values during IO operation */
9164  if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
9165  vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
9167  }
9168 
9169 restart:
9170  if (emulation_type & EMULTYPE_PF) {
9171  /* Save the faulting GPA (cr2) in the address field */
9172  ctxt->exception.address = cr2_or_gpa;
9173 
9174  /* With shadow page tables, cr2 contains a GVA or nGPA. */
9175  if (vcpu->arch.mmu->root_role.direct) {
9176  ctxt->gpa_available = true;
9177  ctxt->gpa_val = cr2_or_gpa;
9178  }
9179  } else {
9180  /* Sanitize the address out of an abundance of paranoia. */
9181  ctxt->exception.address = 0;
9182  }
9183 
9184  r = x86_emulate_insn(ctxt);
9185 
9186  if (r == EMULATION_INTERCEPTED)
9187  return 1;
9188 
9189  if (r == EMULATION_FAILED) {
9190  if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
9191  return 1;
9192 
9193  return handle_emulation_failure(vcpu, emulation_type);
9194  }
9195 
9196  if (ctxt->have_exception) {
9197  WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write);
9198  vcpu->mmio_needed = false;
9199  r = 1;
9201  } else if (vcpu->arch.pio.count) {
9202  if (!vcpu->arch.pio.in) {
9203  /* FIXME: return into emulator if single-stepping. */
9204  vcpu->arch.pio.count = 0;
9205  } else {
9206  writeback = false;
9207  vcpu->arch.complete_userspace_io = complete_emulated_pio;
9208  }
9209  r = 0;
9210  } else if (vcpu->mmio_needed) {
9211  ++vcpu->stat.mmio_exits;
9212 
9213  if (!vcpu->mmio_is_write)
9214  writeback = false;
9215  r = 0;
9216  vcpu->arch.complete_userspace_io = complete_emulated_mmio;
9217  } else if (vcpu->arch.complete_userspace_io) {
9218  writeback = false;
9219  r = 0;
9220  } else if (r == EMULATION_RESTART)
9221  goto restart;
9222  else
9223  r = 1;
9224 
9225 writeback:
9226  if (writeback) {
9227  unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
9229  vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
9230 
9231  /*
9232  * Note, EXCPT_DB is assumed to be fault-like as the emulator
9233  * only supports code breakpoints and general detect #DB, both
9234  * of which are fault-like.
9235  */
9236  if (!ctxt->have_exception ||
9238  kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
9239  if (ctxt->is_branch)
9240  kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
9241  kvm_rip_write(vcpu, ctxt->eip);
9242  if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
9244  static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
9245  __kvm_set_rflags(vcpu, ctxt->eflags);
9246  }
9247 
9248  /*
9249  * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
9250  * do nothing, and it will be requested again as soon as
9251  * the shadow expires. But we still need to check here,
9252  * because POPF has no interrupt shadow.
9253  */
9254  if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
9255  kvm_make_request(KVM_REQ_EVENT, vcpu);
9256  } else
9257  vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
9258 
9259  return r;
9260 }
9261 
9262 int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
9263 {
9264  return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
9265 }
9267 
9269  void *insn, int insn_len)
9270 {
9271  return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
9272 }
9274 
9275 static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
9276 {
9277  vcpu->arch.pio.count = 0;
9278  return 1;
9279 }
9280 
9281 static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
9282 {
9283  vcpu->arch.pio.count = 0;
9284 
9285  if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
9286  return 1;
9287 
9289 }
9290 
9291 static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
9292  unsigned short port)
9293 {
9294  unsigned long val = kvm_rax_read(vcpu);
9295  int ret = emulator_pio_out(vcpu, size, port, &val, 1);
9296 
9297  if (ret)
9298  return ret;
9299 
9300  /*
9301  * Workaround userspace that relies on old KVM behavior of %rip being
9302  * incremented prior to exiting to userspace to handle "OUT 0x7e".
9303  */
9304  if (port == 0x7e &&
9305  kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
9306  vcpu->arch.complete_userspace_io =
9309  } else {
9310  vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
9311  vcpu->arch.complete_userspace_io = complete_fast_pio_out;
9312  }
9313  return 0;
9314 }
9315 
9316 static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
9317 {
9318  unsigned long val;
9319 
9320  /* We should only ever be called with arch.pio.count equal to 1 */
9321  BUG_ON(vcpu->arch.pio.count != 1);
9322 
9323  if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
9324  vcpu->arch.pio.count = 0;
9325  return 1;
9326  }
9327 
9328  /* For size less than 4 we merge, else we zero extend */
9329  val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
9330 
9332  kvm_rax_write(vcpu, val);
9333 
9335 }
9336 
9337 static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
9338  unsigned short port)
9339 {
9340  unsigned long val;
9341  int ret;
9342 
9343  /* For size less than 4 we merge, else we zero extend */
9344  val = (size < 4) ? kvm_rax_read(vcpu) : 0;
9345 
9346  ret = emulator_pio_in(vcpu, size, port, &val, 1);
9347  if (ret) {
9348  kvm_rax_write(vcpu, val);
9349  return ret;
9350  }
9351 
9352  vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
9353  vcpu->arch.complete_userspace_io = complete_fast_pio_in;
9354 
9355  return 0;
9356 }
9357 
9358 int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
9359 {
9360  int ret;
9361 
9362  if (in)
9363  ret = kvm_fast_pio_in(vcpu, size, port);
9364  else
9365  ret = kvm_fast_pio_out(vcpu, size, port);
9366  return ret && kvm_skip_emulated_instruction(vcpu);
9367 }
9369 
9370 static int kvmclock_cpu_down_prep(unsigned int cpu)
9371 {
9372  __this_cpu_write(cpu_tsc_khz, 0);
9373  return 0;
9374 }
9375 
9376 static void tsc_khz_changed(void *data)
9377 {
9378  struct cpufreq_freqs *freq = data;
9379  unsigned long khz;
9380 
9381  WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC));
9382 
9383  if (data)
9384  khz = freq->new;
9385  else
9386  khz = cpufreq_quick_get(raw_smp_processor_id());
9387  if (!khz)
9388  khz = tsc_khz;
9389  __this_cpu_write(cpu_tsc_khz, khz);
9390 }
9391 
9392 #ifdef CONFIG_X86_64
9393 static void kvm_hyperv_tsc_notifier(void)
9394 {
9395  struct kvm *kvm;
9396  int cpu;
9397 
9398  mutex_lock(&kvm_lock);
9399  list_for_each_entry(kvm, &vm_list, vm_list)
9401 
9402  /* no guest entries from this point */
9403  hyperv_stop_tsc_emulation();
9404 
9405  /* TSC frequency always matches when on Hyper-V */
9406  if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
9407  for_each_present_cpu(cpu)
9408  per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
9409  }
9410  kvm_caps.max_guest_tsc_khz = tsc_khz;
9411 
9412  list_for_each_entry(kvm, &vm_list, vm_list) {
9416  }
9417 
9418  mutex_unlock(&kvm_lock);
9419 }
9420 #endif
9421 
9422 static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
9423 {
9424  struct kvm *kvm;
9425  struct kvm_vcpu *vcpu;
9426  int send_ipi = 0;
9427  unsigned long i;
9428 
9429  /*
9430  * We allow guests to temporarily run on slowing clocks,
9431  * provided we notify them after, or to run on accelerating
9432  * clocks, provided we notify them before. Thus time never
9433  * goes backwards.
9434  *
9435  * However, we have a problem. We can't atomically update
9436  * the frequency of a given CPU from this function; it is
9437  * merely a notifier, which can be called from any CPU.
9438  * Changing the TSC frequency at arbitrary points in time
9439  * requires a recomputation of local variables related to
9440  * the TSC for each VCPU. We must flag these local variables
9441  * to be updated and be sure the update takes place with the
9442  * new frequency before any guests proceed.
9443  *
9444  * Unfortunately, the combination of hotplug CPU and frequency
9445  * change creates an intractable locking scenario; the order
9446  * of when these callouts happen is undefined with respect to
9447  * CPU hotplug, and they can race with each other. As such,
9448  * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
9449  * undefined; you can actually have a CPU frequency change take
9450  * place in between the computation of X and the setting of the
9451  * variable. To protect against this problem, all updates of
9452  * the per_cpu tsc_khz variable are done in an interrupt
9453  * protected IPI, and all callers wishing to update the value
9454  * must wait for a synchronous IPI to complete (which is trivial
9455  * if the caller is on the CPU already). This establishes the
9456  * necessary total order on variable updates.
9457  *
9458  * Note that because a guest time update may take place
9459  * anytime after the setting of the VCPU's request bit, the
9460  * correct TSC value must be set before the request. However,
9461  * to ensure the update actually makes it to any guest which
9462  * starts running in hardware virtualization between the set
9463  * and the acquisition of the spinlock, we must also ping the
9464  * CPU after setting the request bit.
9465  *
9466  */
9467 
9468  smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
9469 
9470  mutex_lock(&kvm_lock);
9471  list_for_each_entry(kvm, &vm_list, vm_list) {
9472  kvm_for_each_vcpu(i, vcpu, kvm) {
9473  if (vcpu->cpu != cpu)
9474  continue;
9475  kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
9476  if (vcpu->cpu != raw_smp_processor_id())
9477  send_ipi = 1;
9478  }
9479  }
9480  mutex_unlock(&kvm_lock);
9481 
9482  if (freq->old < freq->new && send_ipi) {
9483  /*
9484  * We upscale the frequency. Must make the guest
9485  * doesn't see old kvmclock values while running with
9486  * the new frequency, otherwise we risk the guest sees
9487  * time go backwards.
9488  *
9489  * In case we update the frequency for another cpu
9490  * (which might be in guest context) send an interrupt
9491  * to kick the cpu out of guest context. Next time
9492  * guest context is entered kvmclock will be updated,
9493  * so the guest will not see stale values.
9494  */
9495  smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
9496  }
9497 }
9498 
9499 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
9500  void *data)
9501 {
9502  struct cpufreq_freqs *freq = data;
9503  int cpu;
9504 
9505  if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
9506  return 0;
9507  if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
9508  return 0;
9509 
9510  for_each_cpu(cpu, freq->policy->cpus)
9511  __kvmclock_cpufreq_notifier(freq, cpu);
9512 
9513  return 0;
9514 }
9515 
9516 static struct notifier_block kvmclock_cpufreq_notifier_block = {
9517  .notifier_call = kvmclock_cpufreq_notifier
9518 };
9519 
9520 static int kvmclock_cpu_online(unsigned int cpu)
9521 {
9522  tsc_khz_changed(NULL);
9523  return 0;
9524 }
9525 
9526 static void kvm_timer_init(void)
9527 {
9528  if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
9529  max_tsc_khz = tsc_khz;
9530 
9531  if (IS_ENABLED(CONFIG_CPU_FREQ)) {
9532  struct cpufreq_policy *policy;
9533  int cpu;
9534 
9535  cpu = get_cpu();
9536  policy = cpufreq_cpu_get(cpu);
9537  if (policy) {
9538  if (policy->cpuinfo.max_freq)
9539  max_tsc_khz = policy->cpuinfo.max_freq;
9540  cpufreq_cpu_put(policy);
9541  }
9542  put_cpu();
9543  }
9544  cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
9545  CPUFREQ_TRANSITION_NOTIFIER);
9546 
9547  cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
9549  }
9550 }
9551 
9552 #ifdef CONFIG_X86_64
9553 static void pvclock_gtod_update_fn(struct work_struct *work)
9554 {
9555  struct kvm *kvm;
9556  struct kvm_vcpu *vcpu;
9557  unsigned long i;
9558 
9559  mutex_lock(&kvm_lock);
9560  list_for_each_entry(kvm, &vm_list, vm_list)
9561  kvm_for_each_vcpu(i, vcpu, kvm)
9562  kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
9563  atomic_set(&kvm_guest_has_master_clock, 0);
9564  mutex_unlock(&kvm_lock);
9565 }
9566 
9567 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
9568 
9569 /*
9570  * Indirection to move queue_work() out of the tk_core.seq write held
9571  * region to prevent possible deadlocks against time accessors which
9572  * are invoked with work related locks held.
9573  */
9574 static void pvclock_irq_work_fn(struct irq_work *w)
9575 {
9576  queue_work(system_long_wq, &pvclock_gtod_work);
9577 }
9578 
9579 static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
9580 
9581 /*
9582  * Notification about pvclock gtod data update.
9583  */
9584 static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
9585  void *priv)
9586 {
9587  struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
9588  struct timekeeper *tk = priv;
9589 
9590  update_pvclock_gtod(tk);
9591 
9592  /*
9593  * Disable master clock if host does not trust, or does not use,
9594  * TSC based clocksource. Delegate queue_work() to irq_work as
9595  * this is invoked with tk_core.seq write held.
9596  */
9597  if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
9598  atomic_read(&kvm_guest_has_master_clock) != 0)
9599  irq_work_queue(&pvclock_irq_work);
9600  return 0;
9601 }
9602 
9603 static struct notifier_block pvclock_gtod_notifier = {
9604  .notifier_call = pvclock_gtod_notify,
9605 };
9606 #endif
9607 
9608 static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
9609 {
9610  memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
9611 
9612 #define __KVM_X86_OP(func) \
9613  static_call_update(kvm_x86_##func, kvm_x86_ops.func);
9614 #define KVM_X86_OP(func) \
9615  WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
9616 #define KVM_X86_OP_OPTIONAL __KVM_X86_OP
9617 #define KVM_X86_OP_OPTIONAL_RET0(func) \
9618  static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
9619  (void *)__static_call_return0);
9620 #include <asm/kvm-x86-ops.h>
9621 #undef __KVM_X86_OP
9622 
9623  kvm_pmu_ops_update(ops->pmu_ops);
9624 }
9625 
9627 {
9628  int cpu = smp_processor_id();
9629  struct cpuinfo_x86 *c = &cpu_data(cpu);
9630 
9631  /*
9632  * Compatibility checks are done when loading KVM and when enabling
9633  * hardware, e.g. during CPU hotplug, to ensure all online CPUs are
9634  * compatible, i.e. KVM should never perform a compatibility check on
9635  * an offline CPU.
9636  */
9637  WARN_ON(!cpu_online(cpu));
9638 
9639  if (__cr4_reserved_bits(cpu_has, c) !=
9640  __cr4_reserved_bits(cpu_has, &boot_cpu_data))
9641  return -EIO;
9642 
9643  return static_call(kvm_x86_check_processor_compatibility)();
9644 }
9645 
9646 static void kvm_x86_check_cpu_compat(void *ret)
9647 {
9648  *(int *)ret = kvm_x86_check_processor_compatibility();
9649 }
9650 
9651 static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
9652 {
9653  u64 host_pat;
9654  int r, cpu;
9655 
9656  if (kvm_x86_ops.hardware_enable) {
9657  pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
9658  return -EEXIST;
9659  }
9660 
9661  /*
9662  * KVM explicitly assumes that the guest has an FPU and
9663  * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
9664  * vCPU's FPU state as a fxregs_state struct.
9665  */
9666  if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
9667  pr_err("inadequate fpu\n");
9668  return -EOPNOTSUPP;
9669  }
9670 
9671  if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
9672  pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
9673  return -EOPNOTSUPP;
9674  }
9675 
9676  /*
9677  * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes
9678  * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something
9679  * other than WB. Note, EPT doesn't utilize the PAT, but don't bother
9680  * with an exception. PAT[0] is set to WB on RESET and also by the
9681  * kernel, i.e. failure indicates a kernel bug or broken firmware.
9682  */
9683  if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
9684  (host_pat & GENMASK(2, 0)) != 6) {
9685  pr_err("host PAT[0] is not WB\n");
9686  return -EIO;
9687  }
9688 
9690  if (!x86_emulator_cache) {
9691  pr_err("failed to allocate cache for x86 emulator\n");
9692  return -ENOMEM;
9693  }
9694 
9695  user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
9696  if (!user_return_msrs) {
9697  pr_err("failed to allocate percpu kvm_user_return_msrs\n");
9698  r = -ENOMEM;
9699  goto out_free_x86_emulator_cache;
9700  }
9701  kvm_nr_uret_msrs = 0;
9702 
9704  if (r)
9705  goto out_free_percpu;
9706 
9707  if (boot_cpu_has(X86_FEATURE_XSAVE)) {
9708  host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
9710  }
9711 
9712  rdmsrl_safe(MSR_EFER, &host_efer);
9713 
9714  if (boot_cpu_has(X86_FEATURE_XSAVES))
9715  rdmsrl(MSR_IA32_XSS, host_xss);
9716 
9717  kvm_init_pmu_capability(ops->pmu_ops);
9718 
9719  if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
9720  rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities);
9721 
9722  r = ops->hardware_setup();
9723  if (r != 0)
9724  goto out_mmu_exit;
9725 
9726  kvm_ops_update(ops);
9727 
9728  for_each_online_cpu(cpu) {
9729  smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
9730  if (r < 0)
9731  goto out_unwind_ops;
9732  }
9733 
9734  /*
9735  * Point of no return! DO NOT add error paths below this point unless
9736  * absolutely necessary, as most operations from this point forward
9737  * require unwinding.
9738  */
9739  kvm_timer_init();
9740 
9741  if (pi_inject_timer == -1)
9742  pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
9743 #ifdef CONFIG_X86_64
9744  pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
9745 
9746  if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
9747  set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
9748 #endif
9749 
9750  kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
9751 
9752  if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
9753  kvm_caps.supported_xss = 0;
9754 
9755 #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
9757 #undef __kvm_cpu_cap_has
9758 
9759  if (kvm_caps.has_tsc_control) {
9760  /*
9761  * Make sure the user can only configure tsc_khz values that
9762  * fit into a signed integer.
9763  * A min value is not calculated because it will always
9764  * be 1 on all machines.
9765  */
9766  u64 max = min(0x7fffffffULL,
9769  }
9772  return 0;
9773 
9774 out_unwind_ops:
9775  kvm_x86_ops.hardware_enable = NULL;
9776  static_call(kvm_x86_hardware_unsetup)();
9777 out_mmu_exit:
9779 out_free_percpu:
9780  free_percpu(user_return_msrs);
9781 out_free_x86_emulator_cache:
9782  kmem_cache_destroy(x86_emulator_cache);
9783  return r;
9784 }
9785 
9786 int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
9787 {
9788  int r;
9789 
9790  mutex_lock(&vendor_module_lock);
9791  r = __kvm_x86_vendor_init(ops);
9792  mutex_unlock(&vendor_module_lock);
9793 
9794  return r;
9795 }
9797 
9799 {
9800  kvm_unregister_perf_callbacks();
9801 
9802 #ifdef CONFIG_X86_64
9803  if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
9804  clear_hv_tscchange_cb();
9805 #endif
9806  kvm_lapic_exit();
9807 
9808  if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
9809  cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
9810  CPUFREQ_TRANSITION_NOTIFIER);
9811  cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
9812  }
9813 #ifdef CONFIG_X86_64
9814  pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
9815  irq_work_sync(&pvclock_irq_work);
9816  cancel_work_sync(&pvclock_gtod_work);
9817 #endif
9818  static_call(kvm_x86_hardware_unsetup)();
9820  free_percpu(user_return_msrs);
9821  kmem_cache_destroy(x86_emulator_cache);
9822 #ifdef CONFIG_KVM_XEN
9823  static_key_deferred_flush(&kvm_xen_enabled);
9824  WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
9825 #endif
9826  mutex_lock(&vendor_module_lock);
9827  kvm_x86_ops.hardware_enable = NULL;
9828  mutex_unlock(&vendor_module_lock);
9829 }
9831 
9832 static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
9833 {
9834  /*
9835  * The vCPU has halted, e.g. executed HLT. Update the run state if the
9836  * local APIC is in-kernel, the run loop will detect the non-runnable
9837  * state and halt the vCPU. Exit to userspace if the local APIC is
9838  * managed by userspace, in which case userspace is responsible for
9839  * handling wake events.
9840  */
9841  ++vcpu->stat.halt_exits;
9842  if (lapic_in_kernel(vcpu)) {
9843  vcpu->arch.mp_state = state;
9844  return 1;
9845  } else {
9846  vcpu->run->exit_reason = reason;
9847  return 0;
9848  }
9849 }
9850 
9851 int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
9852 {
9853  return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
9854 }
9856 
9857 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
9858 {
9859  int ret = kvm_skip_emulated_instruction(vcpu);
9860  /*
9861  * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
9862  * KVM_EXIT_DEBUG here.
9863  */
9864  return kvm_emulate_halt_noskip(vcpu) && ret;
9865 }
9867 
9868 int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
9869 {
9870  int ret = kvm_skip_emulated_instruction(vcpu);
9871 
9872  return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
9873  KVM_EXIT_AP_RESET_HOLD) && ret;
9874 }
9876 
9877 #ifdef CONFIG_X86_64
9878 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
9879  unsigned long clock_type)
9880 {
9881  struct kvm_clock_pairing clock_pairing;
9882  struct timespec64 ts;
9883  u64 cycle;
9884  int ret;
9885 
9886  if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
9887  return -KVM_EOPNOTSUPP;
9888 
9889  /*
9890  * When tsc is in permanent catchup mode guests won't be able to use
9891  * pvclock_read_retry loop to get consistent view of pvclock
9892  */
9893  if (vcpu->arch.tsc_always_catchup)
9894  return -KVM_EOPNOTSUPP;
9895 
9896  if (!kvm_get_walltime_and_clockread(&ts, &cycle))
9897  return -KVM_EOPNOTSUPP;
9898 
9899  clock_pairing.sec = ts.tv_sec;
9900  clock_pairing.nsec = ts.tv_nsec;
9901  clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
9902  clock_pairing.flags = 0;
9903  memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
9904 
9905  ret = 0;
9906  if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
9907  sizeof(struct kvm_clock_pairing)))
9908  ret = -KVM_EFAULT;
9909 
9910  return ret;
9911 }
9912 #endif
9913 
9914 /*
9915  * kvm_pv_kick_cpu_op: Kick a vcpu.
9916  *
9917  * @apicid - apicid of vcpu to be kicked.
9918  */
9919 static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
9920 {
9921  /*
9922  * All other fields are unused for APIC_DM_REMRD, but may be consumed by
9923  * common code, e.g. for tracing. Defer initialization to the compiler.
9924  */
9925  struct kvm_lapic_irq lapic_irq = {
9926  .delivery_mode = APIC_DM_REMRD,
9927  .dest_mode = APIC_DEST_PHYSICAL,
9928  .shorthand = APIC_DEST_NOSHORT,
9929  .dest_id = apicid,
9930  };
9931 
9932  kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
9933 }
9934 
9935 bool kvm_apicv_activated(struct kvm *kvm)
9936 {
9937  return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
9938 }
9940 
9941 bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
9942 {
9943  ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
9944  ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
9945 
9946  return (vm_reasons | vcpu_reasons) == 0;
9947 }
9949 
9950 static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
9951  enum kvm_apicv_inhibit reason, bool set)
9952 {
9953  if (set)
9954  __set_bit(reason, inhibits);
9955  else
9956  __clear_bit(reason, inhibits);
9957 
9958  trace_kvm_apicv_inhibit_changed(reason, set, *inhibits);
9959 }
9960 
9961 static void kvm_apicv_init(struct kvm *kvm)
9962 {
9963  unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons;
9964 
9965  init_rwsem(&kvm->arch.apicv_update_lock);
9966 
9967  set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true);
9968 
9969  if (!enable_apicv)
9970  set_or_clear_apicv_inhibit(inhibits,
9971  APICV_INHIBIT_REASON_DISABLE, true);
9972 }
9973 
9974 static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
9975 {
9976  struct kvm_vcpu *target = NULL;
9977  struct kvm_apic_map *map;
9978 
9979  vcpu->stat.directed_yield_attempted++;
9980 
9981  if (single_task_running())
9982  goto no_yield;
9983 
9984  rcu_read_lock();
9985  map = rcu_dereference(vcpu->kvm->arch.apic_map);
9986 
9987  if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
9988  target = map->phys_map[dest_id]->vcpu;
9989 
9990  rcu_read_unlock();
9991 
9992  if (!target || !READ_ONCE(target->ready))
9993  goto no_yield;
9994 
9995  /* Ignore requests to yield to self */
9996  if (vcpu == target)
9997  goto no_yield;
9998 
9999  if (kvm_vcpu_yield_to(target) <= 0)
10000  goto no_yield;
10001 
10002  vcpu->stat.directed_yield_successful++;
10003 
10004 no_yield:
10005  return;
10006 }
10007 
10008 static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
10009 {
10010  u64 ret = vcpu->run->hypercall.ret;
10011 
10012  if (!is_64_bit_mode(vcpu))
10013  ret = (u32)ret;
10014  kvm_rax_write(vcpu, ret);
10015  ++vcpu->stat.hypercalls;
10016  return kvm_skip_emulated_instruction(vcpu);
10017 }
10018 
10019 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
10020 {
10021  unsigned long nr, a0, a1, a2, a3, ret;
10022  int op_64_bit;
10023 
10024  if (kvm_xen_hypercall_enabled(vcpu->kvm))
10025  return kvm_xen_hypercall(vcpu);
10026 
10027  if (kvm_hv_hypercall_enabled(vcpu))
10028  return kvm_hv_hypercall(vcpu);
10029 
10030  nr = kvm_rax_read(vcpu);
10031  a0 = kvm_rbx_read(vcpu);
10032  a1 = kvm_rcx_read(vcpu);
10033  a2 = kvm_rdx_read(vcpu);
10034  a3 = kvm_rsi_read(vcpu);
10035 
10036  trace_kvm_hypercall(nr, a0, a1, a2, a3);
10037 
10038  op_64_bit = is_64_bit_hypercall(vcpu);
10039  if (!op_64_bit) {
10040  nr &= 0xFFFFFFFF;
10041  a0 &= 0xFFFFFFFF;
10042  a1 &= 0xFFFFFFFF;
10043  a2 &= 0xFFFFFFFF;
10044  a3 &= 0xFFFFFFFF;
10045  }
10046 
10047  if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
10048  ret = -KVM_EPERM;
10049  goto out;
10050  }
10051 
10052  ret = -KVM_ENOSYS;
10053 
10054  switch (nr) {
10055  case KVM_HC_VAPIC_POLL_IRQ:
10056  ret = 0;
10057  break;
10058  case KVM_HC_KICK_CPU:
10059  if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
10060  break;
10061 
10062  kvm_pv_kick_cpu_op(vcpu->kvm, a1);
10063  kvm_sched_yield(vcpu, a1);
10064  ret = 0;
10065  break;
10066 #ifdef CONFIG_X86_64
10067  case KVM_HC_CLOCK_PAIRING:
10068  ret = kvm_pv_clock_pairing(vcpu, a0, a1);
10069  break;
10070 #endif
10071  case KVM_HC_SEND_IPI:
10072  if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
10073  break;
10074 
10075  ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
10076  break;
10077  case KVM_HC_SCHED_YIELD:
10078  if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
10079  break;
10080 
10081  kvm_sched_yield(vcpu, a0);
10082  ret = 0;
10083  break;
10084  case KVM_HC_MAP_GPA_RANGE: {
10085  u64 gpa = a0, npages = a1, attrs = a2;
10086 
10087  ret = -KVM_ENOSYS;
10088  if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
10089  break;
10090 
10091  if (!PAGE_ALIGNED(gpa) || !npages ||
10092  gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
10093  ret = -KVM_EINVAL;
10094  break;
10095  }
10096 
10097  vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
10098  vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
10099  vcpu->run->hypercall.args[0] = gpa;
10100  vcpu->run->hypercall.args[1] = npages;
10101  vcpu->run->hypercall.args[2] = attrs;
10102  vcpu->run->hypercall.flags = 0;
10103  if (op_64_bit)
10104  vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
10105 
10106  WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
10107  vcpu->arch.complete_userspace_io = complete_hypercall_exit;
10108  return 0;
10109  }
10110  default:
10111  ret = -KVM_ENOSYS;
10112  break;
10113  }
10114 out:
10115  if (!op_64_bit)
10116  ret = (u32)ret;
10117  kvm_rax_write(vcpu, ret);
10118 
10119  ++vcpu->stat.hypercalls;
10120  return kvm_skip_emulated_instruction(vcpu);
10121 }
10123 
10125 {
10126  struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
10127  char instruction[3];
10128  unsigned long rip = kvm_rip_read(vcpu);
10129 
10130  /*
10131  * If the quirk is disabled, synthesize a #UD and let the guest pick up
10132  * the pieces.
10133  */
10134  if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
10135  ctxt->exception.error_code_valid = false;
10136  ctxt->exception.vector = UD_VECTOR;
10137  ctxt->have_exception = true;
10138  return X86EMUL_PROPAGATE_FAULT;
10139  }
10140 
10141  static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
10142 
10143  return emulator_write_emulated(ctxt, rip, instruction, 3,
10144  &ctxt->exception);
10145 }
10146 
10147 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
10148 {
10149  return vcpu->run->request_interrupt_window &&
10150  likely(!pic_in_kernel(vcpu->kvm));
10151 }
10152 
10153 /* Called within kvm->srcu read side. */
10154 static void post_kvm_run_save(struct kvm_vcpu *vcpu)
10155 {
10156  struct kvm_run *kvm_run = vcpu->run;
10157 
10158  kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
10159  kvm_run->cr8 = kvm_get_cr8(vcpu);
10160  kvm_run->apic_base = kvm_get_apic_base(vcpu);
10161 
10162  kvm_run->ready_for_interrupt_injection =
10163  pic_in_kernel(vcpu->kvm) ||
10165 
10166  if (is_smm(vcpu))
10167  kvm_run->flags |= KVM_RUN_X86_SMM;
10168 }
10169 
10170 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
10171 {
10172  int max_irr, tpr;
10173 
10174  if (!kvm_x86_ops.update_cr8_intercept)
10175  return;
10176 
10177  if (!lapic_in_kernel(vcpu))
10178  return;
10179 
10180  if (vcpu->arch.apic->apicv_active)
10181  return;
10182 
10183  if (!vcpu->arch.apic->vapic_addr)
10184  max_irr = kvm_lapic_find_highest_irr(vcpu);
10185  else
10186  max_irr = -1;
10187 
10188  if (max_irr != -1)
10189  max_irr >>= 4;
10190 
10191  tpr = kvm_lapic_get_cr8(vcpu);
10192 
10193  static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
10194 }
10195 
10196 
10197 int kvm_check_nested_events(struct kvm_vcpu *vcpu)
10198 {
10199  if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
10200  kvm_x86_ops.nested_ops->triple_fault(vcpu);
10201  return 1;
10202  }
10203 
10204  return kvm_x86_ops.nested_ops->check_events(vcpu);
10205 }
10206 
10207 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
10208 {
10209  /*
10210  * Suppress the error code if the vCPU is in Real Mode, as Real Mode
10211  * exceptions don't report error codes. The presence of an error code
10212  * is carried with the exception and only stripped when the exception
10213  * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do
10214  * report an error code despite the CPU being in Real Mode.
10215  */
10216  vcpu->arch.exception.has_error_code &= is_protmode(vcpu);
10217 
10218  trace_kvm_inj_exception(vcpu->arch.exception.vector,
10219  vcpu->arch.exception.has_error_code,
10220  vcpu->arch.exception.error_code,
10221  vcpu->arch.exception.injected);
10222 
10223  static_call(kvm_x86_inject_exception)(vcpu);
10224 }
10225 
10226 /*
10227  * Check for any event (interrupt or exception) that is ready to be injected,
10228  * and if there is at least one event, inject the event with the highest
10229  * priority. This handles both "pending" events, i.e. events that have never
10230  * been injected into the guest, and "injected" events, i.e. events that were
10231  * injected as part of a previous VM-Enter, but weren't successfully delivered
10232  * and need to be re-injected.
10233  *
10234  * Note, this is not guaranteed to be invoked on a guest instruction boundary,
10235  * i.e. doesn't guarantee that there's an event window in the guest. KVM must
10236  * be able to inject exceptions in the "middle" of an instruction, and so must
10237  * also be able to re-inject NMIs and IRQs in the middle of an instruction.
10238  * I.e. for exceptions and re-injected events, NOT invoking this on instruction
10239  * boundaries is necessary and correct.
10240  *
10241  * For simplicity, KVM uses a single path to inject all events (except events
10242  * that are injected directly from L1 to L2) and doesn't explicitly track
10243  * instruction boundaries for asynchronous events. However, because VM-Exits
10244  * that can occur during instruction execution typically result in KVM skipping
10245  * the instruction or injecting an exception, e.g. instruction and exception
10246  * intercepts, and because pending exceptions have higher priority than pending
10247  * interrupts, KVM still honors instruction boundaries in most scenarios.
10248  *
10249  * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
10250  * the instruction or inject an exception, then KVM can incorrecty inject a new
10251  * asynchronous event if the event became pending after the CPU fetched the
10252  * instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation)
10253  * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be
10254  * injected on the restarted instruction instead of being deferred until the
10255  * instruction completes.
10256  *
10257  * In practice, this virtualization hole is unlikely to be observed by the
10258  * guest, and even less likely to cause functional problems. To detect the
10259  * hole, the guest would have to trigger an event on a side effect of an early
10260  * phase of instruction execution, e.g. on the instruction fetch from memory.
10261  * And for it to be a functional problem, the guest would need to depend on the
10262  * ordering between that side effect, the instruction completing, _and_ the
10263  * delivery of the asynchronous event.
10264  */
10265 static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
10266  bool *req_immediate_exit)
10267 {
10268  bool can_inject;
10269  int r;
10270 
10271  /*
10272  * Process nested events first, as nested VM-Exit supersedes event
10273  * re-injection. If there's an event queued for re-injection, it will
10274  * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
10275  */
10276  if (is_guest_mode(vcpu))
10277  r = kvm_check_nested_events(vcpu);
10278  else
10279  r = 0;
10280 
10281  /*
10282  * Re-inject exceptions and events *especially* if immediate entry+exit
10283  * to/from L2 is needed, as any event that has already been injected
10284  * into L2 needs to complete its lifecycle before injecting a new event.
10285  *
10286  * Don't re-inject an NMI or interrupt if there is a pending exception.
10287  * This collision arises if an exception occurred while vectoring the
10288  * injected event, KVM intercepted said exception, and KVM ultimately
10289  * determined the fault belongs to the guest and queues the exception
10290  * for injection back into the guest.
10291  *
10292  * "Injected" interrupts can also collide with pending exceptions if
10293  * userspace ignores the "ready for injection" flag and blindly queues
10294  * an interrupt. In that case, prioritizing the exception is correct,
10295  * as the exception "occurred" before the exit to userspace. Trap-like
10296  * exceptions, e.g. most #DBs, have higher priority than interrupts.
10297  * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
10298  * priority, they're only generated (pended) during instruction
10299  * execution, and interrupts are recognized at instruction boundaries.
10300  * Thus a pending fault-like exception means the fault occurred on the
10301  * *previous* instruction and must be serviced prior to recognizing any
10302  * new events in order to fully complete the previous instruction.
10303  */
10304  if (vcpu->arch.exception.injected)
10305  kvm_inject_exception(vcpu);
10306  else if (kvm_is_exception_pending(vcpu))
10307  ; /* see above */
10308  else if (vcpu->arch.nmi_injected)
10309  static_call(kvm_x86_inject_nmi)(vcpu);
10310  else if (vcpu->arch.interrupt.injected)
10311  static_call(kvm_x86_inject_irq)(vcpu, true);
10312 
10313  /*
10314  * Exceptions that morph to VM-Exits are handled above, and pending
10315  * exceptions on top of injected exceptions that do not VM-Exit should
10316  * either morph to #DF or, sadly, override the injected exception.
10317  */
10318  WARN_ON_ONCE(vcpu->arch.exception.injected &&
10319  vcpu->arch.exception.pending);
10320 
10321  /*
10322  * Bail if immediate entry+exit to/from the guest is needed to complete
10323  * nested VM-Enter or event re-injection so that a different pending
10324  * event can be serviced (or if KVM needs to exit to userspace).
10325  *
10326  * Otherwise, continue processing events even if VM-Exit occurred. The
10327  * VM-Exit will have cleared exceptions that were meant for L2, but
10328  * there may now be events that can be injected into L1.
10329  */
10330  if (r < 0)
10331  goto out;
10332 
10333  /*
10334  * A pending exception VM-Exit should either result in nested VM-Exit
10335  * or force an immediate re-entry and exit to/from L2, and exception
10336  * VM-Exits cannot be injected (flag should _never_ be set).
10337  */
10338  WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
10339  vcpu->arch.exception_vmexit.pending);
10340 
10341  /*
10342  * New events, other than exceptions, cannot be injected if KVM needs
10343  * to re-inject a previous event. See above comments on re-injecting
10344  * for why pending exceptions get priority.
10345  */
10346  can_inject = !kvm_event_needs_reinjection(vcpu);
10347 
10348  if (vcpu->arch.exception.pending) {
10349  /*
10350  * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
10351  * value pushed on the stack. Trap-like exception and all #DBs
10352  * leave RF as-is (KVM follows Intel's behavior in this regard;
10353  * AMD states that code breakpoint #DBs excplitly clear RF=0).
10354  *
10355  * Note, most versions of Intel's SDM and AMD's APM incorrectly
10356  * describe the behavior of General Detect #DBs, which are
10357  * fault-like. They do _not_ set RF, a la code breakpoints.
10358  */
10359  if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
10360  __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
10361  X86_EFLAGS_RF);
10362 
10363  if (vcpu->arch.exception.vector == DB_VECTOR) {
10364  kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
10365  if (vcpu->arch.dr7 & DR7_GD) {
10366  vcpu->arch.dr7 &= ~DR7_GD;
10367  kvm_update_dr7(vcpu);
10368  }
10369  }
10370 
10371  kvm_inject_exception(vcpu);
10372 
10373  vcpu->arch.exception.pending = false;
10374  vcpu->arch.exception.injected = true;
10375 
10376  can_inject = false;
10377  }
10378 
10379  /* Don't inject interrupts if the user asked to avoid doing so */
10380  if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
10381  return 0;
10382 
10383  /*
10384  * Finally, inject interrupt events. If an event cannot be injected
10385  * due to architectural conditions (e.g. IF=0) a window-open exit
10386  * will re-request KVM_REQ_EVENT. Sometimes however an event is pending
10387  * and can architecturally be injected, but we cannot do it right now:
10388  * an interrupt could have arrived just now and we have to inject it
10389  * as a vmexit, or there could already an event in the queue, which is
10390  * indicated by can_inject. In that case we request an immediate exit
10391  * in order to make progress and get back here for another iteration.
10392  * The kvm_x86_ops hooks communicate this by returning -EBUSY.
10393  */
10394 #ifdef CONFIG_KVM_SMM
10395  if (vcpu->arch.smi_pending) {
10396  r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
10397  if (r < 0)
10398  goto out;
10399  if (r) {
10400  vcpu->arch.smi_pending = false;
10401  ++vcpu->arch.smi_count;
10402  enter_smm(vcpu);
10403  can_inject = false;
10404  } else
10405  static_call(kvm_x86_enable_smi_window)(vcpu);
10406  }
10407 #endif
10408 
10409  if (vcpu->arch.nmi_pending) {
10410  r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
10411  if (r < 0)
10412  goto out;
10413  if (r) {
10414  --vcpu->arch.nmi_pending;
10415  vcpu->arch.nmi_injected = true;
10416  static_call(kvm_x86_inject_nmi)(vcpu);
10417  can_inject = false;
10418  WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
10419  }
10420  if (vcpu->arch.nmi_pending)
10421  static_call(kvm_x86_enable_nmi_window)(vcpu);
10422  }
10423 
10424  if (kvm_cpu_has_injectable_intr(vcpu)) {
10425  r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
10426  if (r < 0)
10427  goto out;
10428  if (r) {
10429  int irq = kvm_cpu_get_interrupt(vcpu);
10430 
10431  if (!WARN_ON_ONCE(irq == -1)) {
10432  kvm_queue_interrupt(vcpu, irq, false);
10433  static_call(kvm_x86_inject_irq)(vcpu, false);
10434  WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
10435  }
10436  }
10437  if (kvm_cpu_has_injectable_intr(vcpu))
10438  static_call(kvm_x86_enable_irq_window)(vcpu);
10439  }
10440 
10441  if (is_guest_mode(vcpu) &&
10442  kvm_x86_ops.nested_ops->has_events &&
10443  kvm_x86_ops.nested_ops->has_events(vcpu))
10444  *req_immediate_exit = true;
10445 
10446  /*
10447  * KVM must never queue a new exception while injecting an event; KVM
10448  * is done emulating and should only propagate the to-be-injected event
10449  * to the VMCS/VMCB. Queueing a new exception can put the vCPU into an
10450  * infinite loop as KVM will bail from VM-Enter to inject the pending
10451  * exception and start the cycle all over.
10452  *
10453  * Exempt triple faults as they have special handling and won't put the
10454  * vCPU into an infinite loop. Triple fault can be queued when running
10455  * VMX without unrestricted guest, as that requires KVM to emulate Real
10456  * Mode events (see kvm_inject_realmode_interrupt()).
10457  */
10458  WARN_ON_ONCE(vcpu->arch.exception.pending ||
10459  vcpu->arch.exception_vmexit.pending);
10460  return 0;
10461 
10462 out:
10463  if (r == -EBUSY) {
10464  *req_immediate_exit = true;
10465  r = 0;
10466  }
10467  return r;
10468 }
10469 
10470 static void process_nmi(struct kvm_vcpu *vcpu)
10471 {
10472  unsigned int limit;
10473 
10474  /*
10475  * x86 is limited to one NMI pending, but because KVM can't react to
10476  * incoming NMIs as quickly as bare metal, e.g. if the vCPU is
10477  * scheduled out, KVM needs to play nice with two queued NMIs showing
10478  * up at the same time. To handle this scenario, allow two NMIs to be
10479  * (temporarily) pending so long as NMIs are not blocked and KVM is not
10480  * waiting for a previous NMI injection to complete (which effectively
10481  * blocks NMIs). KVM will immediately inject one of the two NMIs, and
10482  * will request an NMI window to handle the second NMI.
10483  */
10484  if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
10485  limit = 1;
10486  else
10487  limit = 2;
10488 
10489  /*
10490  * Adjust the limit to account for pending virtual NMIs, which aren't
10491  * tracked in vcpu->arch.nmi_pending.
10492  */
10493  if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
10494  limit--;
10495 
10496  vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
10497  vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
10498 
10499  if (vcpu->arch.nmi_pending &&
10500  (static_call(kvm_x86_set_vnmi_pending)(vcpu)))
10501  vcpu->arch.nmi_pending--;
10502 
10503  if (vcpu->arch.nmi_pending)
10504  kvm_make_request(KVM_REQ_EVENT, vcpu);
10505 }
10506 
10507 /* Return total number of NMIs pending injection to the VM */
10508 int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu)
10509 {
10510  return vcpu->arch.nmi_pending +
10511  static_call(kvm_x86_is_vnmi_pending)(vcpu);
10512 }
10513 
10515  unsigned long *vcpu_bitmap)
10516 {
10517  kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap);
10518 }
10519 
10520 void kvm_make_scan_ioapic_request(struct kvm *kvm)
10521 {
10522  kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
10523 }
10524 
10525 void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
10526 {
10527  struct kvm_lapic *apic = vcpu->arch.apic;
10528  bool activate;
10529 
10530  if (!lapic_in_kernel(vcpu))
10531  return;
10532 
10533  down_read(&vcpu->kvm->arch.apicv_update_lock);
10534  preempt_disable();
10535 
10536  /* Do not activate APICV when APIC is disabled */
10537  activate = kvm_vcpu_apicv_activated(vcpu) &&
10539 
10540  if (apic->apicv_active == activate)
10541  goto out;
10542 
10543  apic->apicv_active = activate;
10545  static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
10546 
10547  /*
10548  * When APICv gets disabled, we may still have injected interrupts
10549  * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
10550  * still active when the interrupt got accepted. Make sure
10551  * kvm_check_and_inject_events() is called to check for that.
10552  */
10553  if (!apic->apicv_active)
10554  kvm_make_request(KVM_REQ_EVENT, vcpu);
10555 
10556 out:
10557  preempt_enable();
10558  up_read(&vcpu->kvm->arch.apicv_update_lock);
10559 }
10561 
10562 static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
10563 {
10564  if (!lapic_in_kernel(vcpu))
10565  return;
10566 
10567  /*
10568  * Due to sharing page tables across vCPUs, the xAPIC memslot must be
10569  * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but
10570  * and hardware doesn't support x2APIC virtualization. E.g. some AMD
10571  * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in
10572  * this case so that KVM can the AVIC doorbell to inject interrupts to
10573  * running vCPUs, but KVM must not create SPTEs for the APIC base as
10574  * the vCPU would incorrectly be able to access the vAPIC page via MMIO
10575  * despite being in x2APIC mode. For simplicity, inhibiting the APIC
10576  * access page is sticky.
10577  */
10578  if (apic_x2apic_mode(vcpu->arch.apic) &&
10579  kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization)
10581 
10583 }
10584 
10586  enum kvm_apicv_inhibit reason, bool set)
10587 {
10588  unsigned long old, new;
10589 
10590  lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
10591 
10592  if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason)))
10593  return;
10594 
10595  old = new = kvm->arch.apicv_inhibit_reasons;
10596 
10597  set_or_clear_apicv_inhibit(&new, reason, set);
10598 
10599  if (!!old != !!new) {
10600  /*
10601  * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
10602  * false positives in the sanity check WARN in svm_vcpu_run().
10603  * This task will wait for all vCPUs to ack the kick IRQ before
10604  * updating apicv_inhibit_reasons, and all other vCPUs will
10605  * block on acquiring apicv_update_lock so that vCPUs can't
10606  * redo svm_vcpu_run() without seeing the new inhibit state.
10607  *
10608  * Note, holding apicv_update_lock and taking it in the read
10609  * side (handling the request) also prevents other vCPUs from
10610  * servicing the request with a stale apicv_inhibit_reasons.
10611  */
10612  kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
10613  kvm->arch.apicv_inhibit_reasons = new;
10614  if (new) {
10615  unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
10616  int idx = srcu_read_lock(&kvm->srcu);
10617 
10618  kvm_zap_gfn_range(kvm, gfn, gfn+1);
10619  srcu_read_unlock(&kvm->srcu, idx);
10620  }
10621  } else {
10622  kvm->arch.apicv_inhibit_reasons = new;
10623  }
10624 }
10625 
10626 void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
10627  enum kvm_apicv_inhibit reason, bool set)
10628 {
10629  if (!enable_apicv)
10630  return;
10631 
10632  down_write(&kvm->arch.apicv_update_lock);
10633  __kvm_set_or_clear_apicv_inhibit(kvm, reason, set);
10634  up_write(&kvm->arch.apicv_update_lock);
10635 }
10637 
10638 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
10639 {
10640  if (!kvm_apic_present(vcpu))
10641  return;
10642 
10643  bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
10644 
10645  if (irqchip_split(vcpu->kvm))
10646  kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
10647  else {
10648  static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
10649  if (ioapic_in_kernel(vcpu->kvm))
10650  kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
10651  }
10652 
10653  if (is_guest_mode(vcpu))
10654  vcpu->arch.load_eoi_exitmap_pending = true;
10655  else
10656  kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
10657 }
10658 
10659 static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
10660 {
10661  if (!kvm_apic_hw_enabled(vcpu->arch.apic))
10662  return;
10663 
10664 #ifdef CONFIG_KVM_HYPERV
10665  if (to_hv_vcpu(vcpu)) {
10666  u64 eoi_exit_bitmap[4];
10667 
10668  bitmap_or((ulong *)eoi_exit_bitmap,
10669  vcpu->arch.ioapic_handled_vectors,
10670  to_hv_synic(vcpu)->vec_bitmap, 256);
10671  static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
10672  return;
10673  }
10674 #endif
10675  static_call_cond(kvm_x86_load_eoi_exitmap)(
10676  vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
10677 }
10678 
10679 void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
10680 {
10681  static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
10682 }
10683 
10684 static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
10685 {
10686  if (!lapic_in_kernel(vcpu))
10687  return;
10688 
10689  static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
10690 }
10691 
10692 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
10693 {
10694  smp_send_reschedule(vcpu->cpu);
10695 }
10697 
10698 /*
10699  * Called within kvm->srcu read side.
10700  * Returns 1 to let vcpu_run() continue the guest execution loop without
10701  * exiting to the userspace. Otherwise, the value will be returned to the
10702  * userspace.
10703  */
10704 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
10705 {
10706  int r;
10707  bool req_int_win =
10710  fastpath_t exit_fastpath;
10711 
10712  bool req_immediate_exit = false;
10713 
10714  if (kvm_request_pending(vcpu)) {
10715  if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
10716  r = -EIO;
10717  goto out;
10718  }
10719 
10721  r = 0;
10722  goto out;
10723  }
10724 
10725  if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
10726  if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
10727  r = 0;
10728  goto out;
10729  }
10730  }
10731  if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
10733  if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
10735  if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
10737  if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
10739  if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
10741  if (unlikely(r))
10742  goto out;
10743  }
10744  if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
10746  if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
10748 
10749  /*
10750  * Note, the order matters here, as flushing "all" TLB entries
10751  * also flushes the "current" TLB entries, i.e. servicing the
10752  * flush "all" will clear any request to flush "current".
10753  */
10754  if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
10756 
10758 
10759  /*
10760  * Fall back to a "full" guest flush if Hyper-V's precise
10761  * flushing fails. Note, Hyper-V's flushing is per-vCPU, but
10762  * the flushes are considered "remote" and not "local" because
10763  * the requests can be initiated from other vCPUs.
10764  */
10765 #ifdef CONFIG_KVM_HYPERV
10766  if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) &&
10769 #endif
10770 
10771  if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
10772  vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
10773  r = 0;
10774  goto out;
10775  }
10776  if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
10777  if (is_guest_mode(vcpu))
10778  kvm_x86_ops.nested_ops->triple_fault(vcpu);
10779 
10780  if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
10781  vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
10782  vcpu->mmio_needed = 0;
10783  r = 0;
10784  goto out;
10785  }
10786  }
10787  if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
10788  /* Page is swapped out. Do synthetic halt */
10789  vcpu->arch.apf.halted = true;
10790  r = 1;
10791  goto out;
10792  }
10793  if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
10795  if (kvm_check_request(KVM_REQ_PMU, vcpu))
10797  if (kvm_check_request(KVM_REQ_PMI, vcpu))
10799 #ifdef CONFIG_KVM_SMM
10800  if (kvm_check_request(KVM_REQ_SMI, vcpu))
10801  process_smi(vcpu);
10802 #endif
10803  if (kvm_check_request(KVM_REQ_NMI, vcpu))
10804  process_nmi(vcpu);
10805  if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
10806  BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
10807  if (test_bit(vcpu->arch.pending_ioapic_eoi,
10808  vcpu->arch.ioapic_handled_vectors)) {
10809  vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
10810  vcpu->run->eoi.vector =
10811  vcpu->arch.pending_ioapic_eoi;
10812  r = 0;
10813  goto out;
10814  }
10815  }
10816  if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
10818  if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
10820  if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
10822 #ifdef CONFIG_KVM_HYPERV
10823  if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
10824  vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
10825  vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
10826  vcpu->run->system_event.ndata = 0;
10827  r = 0;
10828  goto out;
10829  }
10830  if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
10831  vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
10832  vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
10833  vcpu->run->system_event.ndata = 0;
10834  r = 0;
10835  goto out;
10836  }
10837  if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
10838  struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
10839 
10840  vcpu->run->exit_reason = KVM_EXIT_HYPERV;
10841  vcpu->run->hyperv = hv_vcpu->exit;
10842  r = 0;
10843  goto out;
10844  }
10845 
10846  /*
10847  * KVM_REQ_HV_STIMER has to be processed after
10848  * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
10849  * depend on the guest clock being up-to-date
10850  */
10851  if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
10852  kvm_hv_process_stimers(vcpu);
10853 #endif
10854  if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
10855  kvm_vcpu_update_apicv(vcpu);
10856  if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
10858  if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
10859  static_call(kvm_x86_msr_filter_changed)(vcpu);
10860 
10861  if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
10862  static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
10863  }
10864 
10865  if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
10866  kvm_xen_has_interrupt(vcpu)) {
10867  ++vcpu->stat.req_event;
10868  r = kvm_apic_accept_events(vcpu);
10869  if (r < 0) {
10870  r = 0;
10871  goto out;
10872  }
10873  if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
10874  r = 1;
10875  goto out;
10876  }
10877 
10878  r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
10879  if (r < 0) {
10880  r = 0;
10881  goto out;
10882  }
10883  if (req_int_win)
10884  static_call(kvm_x86_enable_irq_window)(vcpu);
10885 
10886  if (kvm_lapic_enabled(vcpu)) {
10887  update_cr8_intercept(vcpu);
10889  }
10890  }
10891 
10892  r = kvm_mmu_reload(vcpu);
10893  if (unlikely(r)) {
10894  goto cancel_injection;
10895  }
10896 
10897  preempt_disable();
10898 
10899  static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
10900 
10901  /*
10902  * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
10903  * IPI are then delayed after guest entry, which ensures that they
10904  * result in virtual interrupt delivery.
10905  */
10906  local_irq_disable();
10907 
10908  /* Store vcpu->apicv_active before vcpu->mode. */
10909  smp_store_release(&vcpu->mode, IN_GUEST_MODE);
10910 
10911  kvm_vcpu_srcu_read_unlock(vcpu);
10912 
10913  /*
10914  * 1) We should set ->mode before checking ->requests. Please see
10915  * the comment in kvm_vcpu_exiting_guest_mode().
10916  *
10917  * 2) For APICv, we should set ->mode before checking PID.ON. This
10918  * pairs with the memory barrier implicit in pi_test_and_set_on
10919  * (see vmx_deliver_posted_interrupt).
10920  *
10921  * 3) This also orders the write to mode from any reads to the page
10922  * tables done while the VCPU is running. Please see the comment
10923  * in kvm_flush_remote_tlbs.
10924  */
10925  smp_mb__after_srcu_read_unlock();
10926 
10927  /*
10928  * Process pending posted interrupts to handle the case where the
10929  * notification IRQ arrived in the host, or was never sent (because the
10930  * target vCPU wasn't running). Do this regardless of the vCPU's APICv
10931  * status, KVM doesn't update assigned devices when APICv is inhibited,
10932  * i.e. they can post interrupts even if APICv is temporarily disabled.
10933  */
10934  if (kvm_lapic_enabled(vcpu))
10935  static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
10936 
10937  if (kvm_vcpu_exit_request(vcpu)) {
10938  vcpu->mode = OUTSIDE_GUEST_MODE;
10939  smp_wmb();
10940  local_irq_enable();
10941  preempt_enable();
10942  kvm_vcpu_srcu_read_lock(vcpu);
10943  r = 1;
10944  goto cancel_injection;
10945  }
10946 
10947  if (req_immediate_exit) {
10948  kvm_make_request(KVM_REQ_EVENT, vcpu);
10949  static_call(kvm_x86_request_immediate_exit)(vcpu);
10950  }
10951 
10952  fpregs_assert_state_consistent();
10953  if (test_thread_flag(TIF_NEED_FPU_LOAD))
10954  switch_fpu_return();
10955 
10956  if (vcpu->arch.guest_fpu.xfd_err)
10957  wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
10958 
10959  if (unlikely(vcpu->arch.switch_db_regs)) {
10960  set_debugreg(0, 7);
10961  set_debugreg(vcpu->arch.eff_db[0], 0);
10962  set_debugreg(vcpu->arch.eff_db[1], 1);
10963  set_debugreg(vcpu->arch.eff_db[2], 2);
10964  set_debugreg(vcpu->arch.eff_db[3], 3);
10965  } else if (unlikely(hw_breakpoint_active())) {
10966  set_debugreg(0, 7);
10967  }
10968 
10969  guest_timing_enter_irqoff();
10970 
10971  for (;;) {
10972  /*
10973  * Assert that vCPU vs. VM APICv state is consistent. An APICv
10974  * update must kick and wait for all vCPUs before toggling the
10975  * per-VM state, and responding vCPUs must wait for the update
10976  * to complete before servicing KVM_REQ_APICV_UPDATE.
10977  */
10978  WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
10980 
10981  exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
10982  if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
10983  break;
10984 
10985  if (kvm_lapic_enabled(vcpu))
10986  static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
10987 
10988  if (unlikely(kvm_vcpu_exit_request(vcpu))) {
10989  exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
10990  break;
10991  }
10992 
10993  /* Note, VM-Exits that go down the "slow" path are accounted below. */
10994  ++vcpu->stat.exits;
10995  }
10996 
10997  /*
10998  * Do this here before restoring debug registers on the host. And
10999  * since we do this before handling the vmexit, a DR access vmexit
11000  * can (a) read the correct value of the debug registers, (b) set
11001  * KVM_DEBUGREG_WONT_EXIT again.
11002  */
11003  if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
11004  WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
11005  static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
11006  kvm_update_dr0123(vcpu);
11007  kvm_update_dr7(vcpu);
11008  }
11009 
11010  /*
11011  * If the guest has used debug registers, at least dr7
11012  * will be disabled while returning to the host.
11013  * If we don't have active breakpoints in the host, we don't
11014  * care about the messed up debug address registers. But if
11015  * we have some of them active, restore the old state.
11016  */
11017  if (hw_breakpoint_active())
11018  hw_breakpoint_restore();
11019 
11020  vcpu->arch.last_vmentry_cpu = vcpu->cpu;
11021  vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
11022 
11023  vcpu->mode = OUTSIDE_GUEST_MODE;
11024  smp_wmb();
11025 
11026  /*
11027  * Sync xfd before calling handle_exit_irqoff() which may
11028  * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
11029  * in #NM irqoff handler).
11030  */
11031  if (vcpu->arch.xfd_no_write_intercept)
11032  fpu_sync_guest_vmexit_xfd_state();
11033 
11034  static_call(kvm_x86_handle_exit_irqoff)(vcpu);
11035 
11036  if (vcpu->arch.guest_fpu.xfd_err)
11037  wrmsrl(MSR_IA32_XFD_ERR, 0);
11038 
11039  /*
11040  * Consume any pending interrupts, including the possible source of
11041  * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
11042  * An instruction is required after local_irq_enable() to fully unblock
11043  * interrupts on processors that implement an interrupt shadow, the
11044  * stat.exits increment will do nicely.
11045  */
11047  local_irq_enable();
11048  ++vcpu->stat.exits;
11049  local_irq_disable();
11050  kvm_after_interrupt(vcpu);
11051 
11052  /*
11053  * Wait until after servicing IRQs to account guest time so that any
11054  * ticks that occurred while running the guest are properly accounted
11055  * to the guest. Waiting until IRQs are enabled degrades the accuracy
11056  * of accounting via context tracking, but the loss of accuracy is
11057  * acceptable for all known use cases.
11058  */
11059  guest_timing_exit_irqoff();
11060 
11061  local_irq_enable();
11062  preempt_enable();
11063 
11064  kvm_vcpu_srcu_read_lock(vcpu);
11065 
11066  /*
11067  * Profile KVM exit RIPs:
11068  */
11069  if (unlikely(prof_on == KVM_PROFILING)) {
11070  unsigned long rip = kvm_rip_read(vcpu);
11071  profile_hit(KVM_PROFILING, (void *)rip);
11072  }
11073 
11074  if (unlikely(vcpu->arch.tsc_always_catchup))
11075  kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
11076 
11077  if (vcpu->arch.apic_attention)
11079 
11080  r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
11081  return r;
11082 
11083 cancel_injection:
11084  if (req_immediate_exit)
11085  kvm_make_request(KVM_REQ_EVENT, vcpu);
11086  static_call(kvm_x86_cancel_injection)(vcpu);
11087  if (unlikely(vcpu->arch.apic_attention))
11089 out:
11090  return r;
11091 }
11092 
11093 /* Called within kvm->srcu read side. */
11094 static inline int vcpu_block(struct kvm_vcpu *vcpu)
11095 {
11096  bool hv_timer;
11097 
11098  if (!kvm_arch_vcpu_runnable(vcpu)) {
11099  /*
11100  * Switch to the software timer before halt-polling/blocking as
11101  * the guest's timer may be a break event for the vCPU, and the
11102  * hypervisor timer runs only when the CPU is in guest mode.
11103  * Switch before halt-polling so that KVM recognizes an expired
11104  * timer before blocking.
11105  */
11106  hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
11107  if (hv_timer)
11109 
11110  kvm_vcpu_srcu_read_unlock(vcpu);
11111  if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
11112  kvm_vcpu_halt(vcpu);
11113  else
11114  kvm_vcpu_block(vcpu);
11115  kvm_vcpu_srcu_read_lock(vcpu);
11116 
11117  if (hv_timer)
11119 
11120  /*
11121  * If the vCPU is not runnable, a signal or another host event
11122  * of some kind is pending; service it without changing the
11123  * vCPU's activity state.
11124  */
11125  if (!kvm_arch_vcpu_runnable(vcpu))
11126  return 1;
11127  }
11128 
11129  /*
11130  * Evaluate nested events before exiting the halted state. This allows
11131  * the halt state to be recorded properly in the VMCS12's activity
11132  * state field (AMD does not have a similar field and a VM-Exit always
11133  * causes a spurious wakeup from HLT).
11134  */
11135  if (is_guest_mode(vcpu)) {
11136  if (kvm_check_nested_events(vcpu) < 0)
11137  return 0;
11138  }
11139 
11140  if (kvm_apic_accept_events(vcpu) < 0)
11141  return 0;
11142  switch(vcpu->arch.mp_state) {
11143  case KVM_MP_STATE_HALTED:
11144  case KVM_MP_STATE_AP_RESET_HOLD:
11145  vcpu->arch.pv.pv_unhalted = false;
11146  vcpu->arch.mp_state =
11147  KVM_MP_STATE_RUNNABLE;
11148  fallthrough;
11149  case KVM_MP_STATE_RUNNABLE:
11150  vcpu->arch.apf.halted = false;
11151  break;
11152  case KVM_MP_STATE_INIT_RECEIVED:
11153  break;
11154  default:
11155  WARN_ON_ONCE(1);
11156  break;
11157  }
11158  return 1;
11159 }
11160 
11161 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
11162 {
11163  return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
11164  !vcpu->arch.apf.halted);
11165 }
11166 
11167 /* Called within kvm->srcu read side. */
11168 static int vcpu_run(struct kvm_vcpu *vcpu)
11169 {
11170  int r;
11171 
11172  vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
11173  vcpu->arch.l1tf_flush_l1d = true;
11174 
11175  for (;;) {
11176  /*
11177  * If another guest vCPU requests a PV TLB flush in the middle
11178  * of instruction emulation, the rest of the emulation could
11179  * use a stale page translation. Assume that any code after
11180  * this point can start executing an instruction.
11181  */
11182  vcpu->arch.at_instruction_boundary = false;
11183  if (kvm_vcpu_running(vcpu)) {
11184  r = vcpu_enter_guest(vcpu);
11185  } else {
11186  r = vcpu_block(vcpu);
11187  }
11188 
11189  if (r <= 0)
11190  break;
11191 
11192  kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
11193  if (kvm_xen_has_pending_events(vcpu))
11195 
11196  if (kvm_cpu_has_pending_timer(vcpu))
11198 
11199  if (dm_request_for_irq_injection(vcpu) &&
11201  r = 0;
11202  vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
11203  ++vcpu->stat.request_irq_exits;
11204  break;
11205  }
11206 
11207  if (__xfer_to_guest_mode_work_pending()) {
11208  kvm_vcpu_srcu_read_unlock(vcpu);
11209  r = xfer_to_guest_mode_handle_work(vcpu);
11210  kvm_vcpu_srcu_read_lock(vcpu);
11211  if (r)
11212  return r;
11213  }
11214  }
11215 
11216  return r;
11217 }
11218 
11219 static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
11220 {
11221  return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
11222 }
11223 
11224 static int complete_emulated_pio(struct kvm_vcpu *vcpu)
11225 {
11226  BUG_ON(!vcpu->arch.pio.count);
11227 
11228  return complete_emulated_io(vcpu);
11229 }
11230 
11231 /*
11232  * Implements the following, as a state machine:
11233  *
11234  * read:
11235  * for each fragment
11236  * for each mmio piece in the fragment
11237  * write gpa, len
11238  * exit
11239  * copy data
11240  * execute insn
11241  *
11242  * write:
11243  * for each fragment
11244  * for each mmio piece in the fragment
11245  * write gpa, len
11246  * copy data
11247  * exit
11248  */
11249 static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
11250 {
11251  struct kvm_run *run = vcpu->run;
11252  struct kvm_mmio_fragment *frag;
11253  unsigned len;
11254 
11255  BUG_ON(!vcpu->mmio_needed);
11256 
11257  /* Complete previous fragment */
11258  frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
11259  len = min(8u, frag->len);
11260  if (!vcpu->mmio_is_write)
11261  memcpy(frag->data, run->mmio.data, len);
11262 
11263  if (frag->len <= 8) {
11264  /* Switch to the next fragment. */
11265  frag++;
11266  vcpu->mmio_cur_fragment++;
11267  } else {
11268  /* Go forward to the next mmio piece. */
11269  frag->data += len;
11270  frag->gpa += len;
11271  frag->len -= len;
11272  }
11273 
11274  if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
11275  vcpu->mmio_needed = 0;
11276 
11277  /* FIXME: return into emulator if single-stepping. */
11278  if (vcpu->mmio_is_write)
11279  return 1;
11280  vcpu->mmio_read_completed = 1;
11281  return complete_emulated_io(vcpu);
11282  }
11283 
11284  run->exit_reason = KVM_EXIT_MMIO;
11285  run->mmio.phys_addr = frag->gpa;
11286  if (vcpu->mmio_is_write)
11287  memcpy(run->mmio.data, frag->data, min(8u, frag->len));
11288  run->mmio.len = min(8u, frag->len);
11289  run->mmio.is_write = vcpu->mmio_is_write;
11290  vcpu->arch.complete_userspace_io = complete_emulated_mmio;
11291  return 0;
11292 }
11293 
11294 /* Swap (qemu) user FPU context for the guest FPU context. */
11295 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
11296 {
11297  /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
11298  fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
11299  trace_kvm_fpu(1);
11300 }
11301 
11302 /* When vcpu_run ends, restore user space FPU context. */
11303 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
11304 {
11305  fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
11306  ++vcpu->stat.fpu_reload;
11307  trace_kvm_fpu(0);
11308 }
11309 
11310 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
11311 {
11312  struct kvm_queued_exception *ex = &vcpu->arch.exception;
11313  struct kvm_run *kvm_run = vcpu->run;
11314  int r;
11315 
11316  vcpu_load(vcpu);
11317  kvm_sigset_activate(vcpu);
11318  kvm_run->flags = 0;
11319  kvm_load_guest_fpu(vcpu);
11320 
11321  kvm_vcpu_srcu_read_lock(vcpu);
11322  if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
11323  if (kvm_run->immediate_exit) {
11324  r = -EINTR;
11325  goto out;
11326  }
11327 
11328  /*
11329  * Don't bother switching APIC timer emulation from the
11330  * hypervisor timer to the software timer, the only way for the
11331  * APIC timer to be active is if userspace stuffed vCPU state,
11332  * i.e. put the vCPU into a nonsensical state. Only an INIT
11333  * will transition the vCPU out of UNINITIALIZED (without more
11334  * state stuffing from userspace), which will reset the local
11335  * APIC and thus cancel the timer or drop the IRQ (if the timer
11336  * already expired).
11337  */
11338  kvm_vcpu_srcu_read_unlock(vcpu);
11339  kvm_vcpu_block(vcpu);
11340  kvm_vcpu_srcu_read_lock(vcpu);
11341 
11342  if (kvm_apic_accept_events(vcpu) < 0) {
11343  r = 0;
11344  goto out;
11345  }
11346  r = -EAGAIN;
11347  if (signal_pending(current)) {
11348  r = -EINTR;
11349  kvm_run->exit_reason = KVM_EXIT_INTR;
11350  ++vcpu->stat.signal_exits;
11351  }
11352  goto out;
11353  }
11354 
11355  if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
11356  (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
11357  r = -EINVAL;
11358  goto out;
11359  }
11360 
11361  if (kvm_run->kvm_dirty_regs) {
11362  r = sync_regs(vcpu);
11363  if (r != 0)
11364  goto out;
11365  }
11366 
11367  /* re-sync apic's tpr */
11368  if (!lapic_in_kernel(vcpu)) {
11369  if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
11370  r = -EINVAL;
11371  goto out;
11372  }
11373  }
11374 
11375  /*
11376  * If userspace set a pending exception and L2 is active, convert it to
11377  * a pending VM-Exit if L1 wants to intercept the exception.
11378  */
11379  if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
11380  kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
11381  ex->error_code)) {
11382  kvm_queue_exception_vmexit(vcpu, ex->vector,
11383  ex->has_error_code, ex->error_code,
11384  ex->has_payload, ex->payload);
11385  ex->injected = false;
11386  ex->pending = false;
11387  }
11388  vcpu->arch.exception_from_userspace = false;
11389 
11390  if (unlikely(vcpu->arch.complete_userspace_io)) {
11391  int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
11392  vcpu->arch.complete_userspace_io = NULL;
11393  r = cui(vcpu);
11394  if (r <= 0)
11395  goto out;
11396  } else {
11397  WARN_ON_ONCE(vcpu->arch.pio.count);
11398  WARN_ON_ONCE(vcpu->mmio_needed);
11399  }
11400 
11401  if (kvm_run->immediate_exit) {
11402  r = -EINTR;
11403  goto out;
11404  }
11405 
11406  r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
11407  if (r <= 0)
11408  goto out;
11409 
11410  r = vcpu_run(vcpu);
11411 
11412 out:
11413  kvm_put_guest_fpu(vcpu);
11414  if (kvm_run->kvm_valid_regs)
11415  store_regs(vcpu);
11416  post_kvm_run_save(vcpu);
11417  kvm_vcpu_srcu_read_unlock(vcpu);
11418 
11419  kvm_sigset_deactivate(vcpu);
11420  vcpu_put(vcpu);
11421  return r;
11422 }
11423 
11424 static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
11425 {
11426  if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
11427  /*
11428  * We are here if userspace calls get_regs() in the middle of
11429  * instruction emulation. Registers state needs to be copied
11430  * back from emulation context to vcpu. Userspace shouldn't do
11431  * that usually, but some bad designed PV devices (vmware
11432  * backdoor interface) need this to work
11433  */
11434  emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
11435  vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
11436  }
11437  regs->rax = kvm_rax_read(vcpu);
11438  regs->rbx = kvm_rbx_read(vcpu);
11439  regs->rcx = kvm_rcx_read(vcpu);
11440  regs->rdx = kvm_rdx_read(vcpu);
11441  regs->rsi = kvm_rsi_read(vcpu);
11442  regs->rdi = kvm_rdi_read(vcpu);
11443  regs->rsp = kvm_rsp_read(vcpu);
11444  regs->rbp = kvm_rbp_read(vcpu);
11445 #ifdef CONFIG_X86_64
11446  regs->r8 = kvm_r8_read(vcpu);
11447  regs->r9 = kvm_r9_read(vcpu);
11448  regs->r10 = kvm_r10_read(vcpu);
11449  regs->r11 = kvm_r11_read(vcpu);
11450  regs->r12 = kvm_r12_read(vcpu);
11451  regs->r13 = kvm_r13_read(vcpu);
11452  regs->r14 = kvm_r14_read(vcpu);
11453  regs->r15 = kvm_r15_read(vcpu);
11454 #endif
11455 
11456  regs->rip = kvm_rip_read(vcpu);
11457  regs->rflags = kvm_get_rflags(vcpu);
11458 }
11459 
11460 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
11461 {
11462  vcpu_load(vcpu);
11463  __get_regs(vcpu, regs);
11464  vcpu_put(vcpu);
11465  return 0;
11466 }
11467 
11468 static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
11469 {
11470  vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
11471  vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
11472 
11473  kvm_rax_write(vcpu, regs->rax);
11474  kvm_rbx_write(vcpu, regs->rbx);
11475  kvm_rcx_write(vcpu, regs->rcx);
11476  kvm_rdx_write(vcpu, regs->rdx);
11477  kvm_rsi_write(vcpu, regs->rsi);
11478  kvm_rdi_write(vcpu, regs->rdi);
11479  kvm_rsp_write(vcpu, regs->rsp);
11480  kvm_rbp_write(vcpu, regs->rbp);
11481 #ifdef CONFIG_X86_64
11482  kvm_r8_write(vcpu, regs->r8);
11483  kvm_r9_write(vcpu, regs->r9);
11484  kvm_r10_write(vcpu, regs->r10);
11485  kvm_r11_write(vcpu, regs->r11);
11486  kvm_r12_write(vcpu, regs->r12);
11487  kvm_r13_write(vcpu, regs->r13);
11488  kvm_r14_write(vcpu, regs->r14);
11489  kvm_r15_write(vcpu, regs->r15);
11490 #endif
11491 
11492  kvm_rip_write(vcpu, regs->rip);
11493  kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
11494 
11495  vcpu->arch.exception.pending = false;
11496  vcpu->arch.exception_vmexit.pending = false;
11497 
11498  kvm_make_request(KVM_REQ_EVENT, vcpu);
11499 }
11500 
11501 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
11502 {
11503  vcpu_load(vcpu);
11504  __set_regs(vcpu, regs);
11505  vcpu_put(vcpu);
11506  return 0;
11507 }
11508 
11509 static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
11510 {
11511  struct desc_ptr dt;
11512 
11513  if (vcpu->arch.guest_state_protected)
11514  goto skip_protected_regs;
11515 
11516  kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
11517  kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
11518  kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
11519  kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
11520  kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
11521  kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
11522 
11523  kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
11524  kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
11525 
11526  static_call(kvm_x86_get_idt)(vcpu, &dt);
11527  sregs->idt.limit = dt.size;
11528  sregs->idt.base = dt.address;
11529  static_call(kvm_x86_get_gdt)(vcpu, &dt);
11530  sregs->gdt.limit = dt.size;
11531  sregs->gdt.base = dt.address;
11532 
11533  sregs->cr2 = vcpu->arch.cr2;
11534  sregs->cr3 = kvm_read_cr3(vcpu);
11535 
11536 skip_protected_regs:
11537  sregs->cr0 = kvm_read_cr0(vcpu);
11538  sregs->cr4 = kvm_read_cr4(vcpu);
11539  sregs->cr8 = kvm_get_cr8(vcpu);
11540  sregs->efer = vcpu->arch.efer;
11541  sregs->apic_base = kvm_get_apic_base(vcpu);
11542 }
11543 
11544 static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
11545 {
11546  __get_sregs_common(vcpu, sregs);
11547 
11548  if (vcpu->arch.guest_state_protected)
11549  return;
11550 
11551  if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
11552  set_bit(vcpu->arch.interrupt.nr,
11553  (unsigned long *)sregs->interrupt_bitmap);
11554 }
11555 
11556 static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
11557 {
11558  int i;
11559 
11560  __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
11561 
11562  if (vcpu->arch.guest_state_protected)
11563  return;
11564 
11565  if (is_pae_paging(vcpu)) {
11566  for (i = 0 ; i < 4 ; i++)
11567  sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
11568  sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
11569  }
11570 }
11571 
11572 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
11573  struct kvm_sregs *sregs)
11574 {
11575  vcpu_load(vcpu);
11576  __get_sregs(vcpu, sregs);
11577  vcpu_put(vcpu);
11578  return 0;
11579 }
11580 
11581 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
11582  struct kvm_mp_state *mp_state)
11583 {
11584  int r;
11585 
11586  vcpu_load(vcpu);
11587  if (kvm_mpx_supported())
11588  kvm_load_guest_fpu(vcpu);
11589 
11590  r = kvm_apic_accept_events(vcpu);
11591  if (r < 0)
11592  goto out;
11593  r = 0;
11594 
11595  if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
11596  vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
11597  vcpu->arch.pv.pv_unhalted)
11598  mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
11599  else
11600  mp_state->mp_state = vcpu->arch.mp_state;
11601 
11602 out:
11603  if (kvm_mpx_supported())
11604  kvm_put_guest_fpu(vcpu);
11605  vcpu_put(vcpu);
11606  return r;
11607 }
11608 
11609 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
11610  struct kvm_mp_state *mp_state)
11611 {
11612  int ret = -EINVAL;
11613 
11614  vcpu_load(vcpu);
11615 
11616  switch (mp_state->mp_state) {
11617  case KVM_MP_STATE_UNINITIALIZED:
11618  case KVM_MP_STATE_HALTED:
11619  case KVM_MP_STATE_AP_RESET_HOLD:
11620  case KVM_MP_STATE_INIT_RECEIVED:
11621  case KVM_MP_STATE_SIPI_RECEIVED:
11622  if (!lapic_in_kernel(vcpu))
11623  goto out;
11624  break;
11625 
11626  case KVM_MP_STATE_RUNNABLE:
11627  break;
11628 
11629  default:
11630  goto out;
11631  }
11632 
11633  /*
11634  * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow
11635  * forcing the guest into INIT/SIPI if those events are supposed to be
11636  * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state
11637  * if an SMI is pending as well.
11638  */
11639  if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
11640  (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
11641  mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
11642  goto out;
11643 
11644  if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
11645  vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
11646  set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
11647  } else
11648  vcpu->arch.mp_state = mp_state->mp_state;
11649  kvm_make_request(KVM_REQ_EVENT, vcpu);
11650 
11651  ret = 0;
11652 out:
11653  vcpu_put(vcpu);
11654  return ret;
11655 }
11656 
11657 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
11658  int reason, bool has_error_code, u32 error_code)
11659 {
11660  struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
11661  int ret;
11662 
11664 
11665  ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
11666  has_error_code, error_code);
11667  if (ret) {
11668  vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
11669  vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
11670  vcpu->run->internal.ndata = 0;
11671  return 0;
11672  }
11673 
11674  kvm_rip_write(vcpu, ctxt->eip);
11675  kvm_set_rflags(vcpu, ctxt->eflags);
11676  return 1;
11677 }
11679 
11680 static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
11681 {
11682  if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
11683  /*
11684  * When EFER.LME and CR0.PG are set, the processor is in
11685  * 64-bit mode (though maybe in a 32-bit code segment).
11686  * CR4.PAE and EFER.LMA must be set.
11687  */
11688  if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
11689  return false;
11690  if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3))
11691  return false;
11692  } else {
11693  /*
11694  * Not in 64-bit mode: EFER.LMA is clear and the code
11695  * segment cannot be 64-bit.
11696  */
11697  if (sregs->efer & EFER_LMA || sregs->cs.l)
11698  return false;
11699  }
11700 
11701  return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
11702  kvm_is_valid_cr0(vcpu, sregs->cr0);
11703 }
11704 
11705 static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
11706  int *mmu_reset_needed, bool update_pdptrs)
11707 {
11708  struct msr_data apic_base_msr;
11709  int idx;
11710  struct desc_ptr dt;
11711 
11712  if (!kvm_is_valid_sregs(vcpu, sregs))
11713  return -EINVAL;
11714 
11715  apic_base_msr.data = sregs->apic_base;
11716  apic_base_msr.host_initiated = true;
11717  if (kvm_set_apic_base(vcpu, &apic_base_msr))
11718  return -EINVAL;
11719 
11720  if (vcpu->arch.guest_state_protected)
11721  return 0;
11722 
11723  dt.size = sregs->idt.limit;
11724  dt.address = sregs->idt.base;
11725  static_call(kvm_x86_set_idt)(vcpu, &dt);
11726  dt.size = sregs->gdt.limit;
11727  dt.address = sregs->gdt.base;
11728  static_call(kvm_x86_set_gdt)(vcpu, &dt);
11729 
11730  vcpu->arch.cr2 = sregs->cr2;
11731  *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
11732  vcpu->arch.cr3 = sregs->cr3;
11733  kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
11734  static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3);
11735 
11736  kvm_set_cr8(vcpu, sregs->cr8);
11737 
11738  *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
11739  static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
11740 
11741  *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
11742  static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
11743 
11744  *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
11745  static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
11746 
11747  if (update_pdptrs) {
11748  idx = srcu_read_lock(&vcpu->kvm->srcu);
11749  if (is_pae_paging(vcpu)) {
11750  load_pdptrs(vcpu, kvm_read_cr3(vcpu));
11751  *mmu_reset_needed = 1;
11752  }
11753  srcu_read_unlock(&vcpu->kvm->srcu, idx);
11754  }
11755 
11756  kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
11757  kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
11758  kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
11759  kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
11760  kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
11761  kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
11762 
11763  kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
11764  kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
11765 
11766  update_cr8_intercept(vcpu);
11767 
11768  /* Older userspace won't unhalt the vcpu on reset. */
11769  if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
11770  sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
11771  !is_protmode(vcpu))
11772  vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
11773 
11774  return 0;
11775 }
11776 
11777 static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
11778 {
11779  int pending_vec, max_bits;
11780  int mmu_reset_needed = 0;
11781  int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
11782 
11783  if (ret)
11784  return ret;
11785 
11786  if (mmu_reset_needed) {
11787  kvm_mmu_reset_context(vcpu);
11788  kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
11789  }
11790 
11791  max_bits = KVM_NR_INTERRUPTS;
11792  pending_vec = find_first_bit(
11793  (const unsigned long *)sregs->interrupt_bitmap, max_bits);
11794 
11795  if (pending_vec < max_bits) {
11796  kvm_queue_interrupt(vcpu, pending_vec, false);
11797  pr_debug("Set back pending irq %d\n", pending_vec);
11798  kvm_make_request(KVM_REQ_EVENT, vcpu);
11799  }
11800  return 0;
11801 }
11802 
11803 static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
11804 {
11805  int mmu_reset_needed = 0;
11806  bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
11807  bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
11808  !(sregs2->efer & EFER_LMA);
11809  int i, ret;
11810 
11811  if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
11812  return -EINVAL;
11813 
11814  if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
11815  return -EINVAL;
11816 
11817  ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
11818  &mmu_reset_needed, !valid_pdptrs);
11819  if (ret)
11820  return ret;
11821 
11822  if (valid_pdptrs) {
11823  for (i = 0; i < 4 ; i++)
11824  kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
11825 
11826  kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
11827  mmu_reset_needed = 1;
11828  vcpu->arch.pdptrs_from_userspace = true;
11829  }
11830  if (mmu_reset_needed) {
11831  kvm_mmu_reset_context(vcpu);
11832  kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
11833  }
11834  return 0;
11835 }
11836 
11837 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
11838  struct kvm_sregs *sregs)
11839 {
11840  int ret;
11841 
11842  vcpu_load(vcpu);
11843  ret = __set_sregs(vcpu, sregs);
11844  vcpu_put(vcpu);
11845  return ret;
11846 }
11847 
11849 {
11850  bool set = false;
11851  struct kvm_vcpu *vcpu;
11852  unsigned long i;
11853 
11854  if (!enable_apicv)
11855  return;
11856 
11857  down_write(&kvm->arch.apicv_update_lock);
11858 
11859  kvm_for_each_vcpu(i, vcpu, kvm) {
11860  if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
11861  set = true;
11862  break;
11863  }
11864  }
11865  __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set);
11866  up_write(&kvm->arch.apicv_update_lock);
11867 }
11868 
11869 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
11870  struct kvm_guest_debug *dbg)
11871 {
11872  unsigned long rflags;
11873  int i, r;
11874 
11875  if (vcpu->arch.guest_state_protected)
11876  return -EINVAL;
11877 
11878  vcpu_load(vcpu);
11879 
11880  if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
11881  r = -EBUSY;
11882  if (kvm_is_exception_pending(vcpu))
11883  goto out;
11884  if (dbg->control & KVM_GUESTDBG_INJECT_DB)
11885  kvm_queue_exception(vcpu, DB_VECTOR);
11886  else
11887  kvm_queue_exception(vcpu, BP_VECTOR);
11888  }
11889 
11890  /*
11891  * Read rflags as long as potentially injected trace flags are still
11892  * filtered out.
11893  */
11894  rflags = kvm_get_rflags(vcpu);
11895 
11896  vcpu->guest_debug = dbg->control;
11897  if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
11898  vcpu->guest_debug = 0;
11899 
11900  if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
11901  for (i = 0; i < KVM_NR_DB_REGS; ++i)
11902  vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
11903  vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
11904  } else {
11905  for (i = 0; i < KVM_NR_DB_REGS; i++)
11906  vcpu->arch.eff_db[i] = vcpu->arch.db[i];
11907  }
11908  kvm_update_dr7(vcpu);
11909 
11910  if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
11911  vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
11912 
11913  /*
11914  * Trigger an rflags update that will inject or remove the trace
11915  * flags.
11916  */
11917  kvm_set_rflags(vcpu, rflags);
11918 
11919  static_call(kvm_x86_update_exception_bitmap)(vcpu);
11920 
11922 
11923  r = 0;
11924 
11925 out:
11926  vcpu_put(vcpu);
11927  return r;
11928 }
11929 
11930 /*
11931  * Translate a guest virtual address to a guest physical address.
11932  */
11933 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
11934  struct kvm_translation *tr)
11935 {
11936  unsigned long vaddr = tr->linear_address;
11937  gpa_t gpa;
11938  int idx;
11939 
11940  vcpu_load(vcpu);
11941 
11942  idx = srcu_read_lock(&vcpu->kvm->srcu);
11943  gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
11944  srcu_read_unlock(&vcpu->kvm->srcu, idx);
11945  tr->physical_address = gpa;
11946  tr->valid = gpa != INVALID_GPA;
11947  tr->writeable = 1;
11948  tr->usermode = 0;
11949 
11950  vcpu_put(vcpu);
11951  return 0;
11952 }
11953 
11954 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
11955 {
11956  struct fxregs_state *fxsave;
11957 
11958  if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
11959  return 0;
11960 
11961  vcpu_load(vcpu);
11962 
11963  fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
11964  memcpy(fpu->fpr, fxsave->st_space, 128);
11965  fpu->fcw = fxsave->cwd;
11966  fpu->fsw = fxsave->swd;
11967  fpu->ftwx = fxsave->twd;
11968  fpu->last_opcode = fxsave->fop;
11969  fpu->last_ip = fxsave->rip;
11970  fpu->last_dp = fxsave->rdp;
11971  memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
11972 
11973  vcpu_put(vcpu);
11974  return 0;
11975 }
11976 
11977 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
11978 {
11979  struct fxregs_state *fxsave;
11980 
11981  if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
11982  return 0;
11983 
11984  vcpu_load(vcpu);
11985 
11986  fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
11987 
11988  memcpy(fxsave->st_space, fpu->fpr, 128);
11989  fxsave->cwd = fpu->fcw;
11990  fxsave->swd = fpu->fsw;
11991  fxsave->twd = fpu->ftwx;
11992  fxsave->fop = fpu->last_opcode;
11993  fxsave->rip = fpu->last_ip;
11994  fxsave->rdp = fpu->last_dp;
11995  memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
11996 
11997  vcpu_put(vcpu);
11998  return 0;
11999 }
12000 
12001 static void store_regs(struct kvm_vcpu *vcpu)
12002 {
12003  BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
12004 
12005  if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
12006  __get_regs(vcpu, &vcpu->run->s.regs.regs);
12007 
12008  if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
12009  __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
12010 
12011  if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
12013  vcpu, &vcpu->run->s.regs.events);
12014 }
12015 
12016 static int sync_regs(struct kvm_vcpu *vcpu)
12017 {
12018  if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
12019  __set_regs(vcpu, &vcpu->run->s.regs.regs);
12020  vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
12021  }
12022 
12023  if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
12024  struct kvm_sregs sregs = vcpu->run->s.regs.sregs;
12025 
12026  if (__set_sregs(vcpu, &sregs))
12027  return -EINVAL;
12028 
12029  vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
12030  }
12031 
12032  if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
12033  struct kvm_vcpu_events events = vcpu->run->s.regs.events;
12034 
12035  if (kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events))
12036  return -EINVAL;
12037 
12038  vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
12039  }
12040 
12041  return 0;
12042 }
12043 
12044 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
12045 {
12046  if (kvm_check_tsc_unstable() && kvm->created_vcpus)
12047  pr_warn_once("SMP vm created on host with unstable TSC; "
12048  "guest TSC will not be reliable\n");
12049 
12050  if (!kvm->arch.max_vcpu_ids)
12051  kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
12052 
12053  if (id >= kvm->arch.max_vcpu_ids)
12054  return -EINVAL;
12055 
12056  return static_call(kvm_x86_vcpu_precreate)(kvm);
12057 }
12058 
12059 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
12060 {
12061  struct page *page;
12062  int r;
12063 
12064  vcpu->arch.last_vmentry_cpu = -1;
12065  vcpu->arch.regs_avail = ~0;
12066  vcpu->arch.regs_dirty = ~0;
12067 
12068  kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
12069 
12070  if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
12071  vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
12072  else
12073  vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
12074 
12075  r = kvm_mmu_create(vcpu);
12076  if (r < 0)
12077  return r;
12078 
12079  if (irqchip_in_kernel(vcpu->kvm)) {
12081  if (r < 0)
12082  goto fail_mmu_destroy;
12083 
12084  /*
12085  * Defer evaluating inhibits until the vCPU is first run, as
12086  * this vCPU will not get notified of any changes until this
12087  * vCPU is visible to other vCPUs (marked online and added to
12088  * the set of vCPUs). Opportunistically mark APICv active as
12089  * VMX in particularly is highly unlikely to have inhibits.
12090  * Ignore the current per-VM APICv state so that vCPU creation
12091  * is guaranteed to run with a deterministic value, the request
12092  * will ensure the vCPU gets the correct state before VM-Entry.
12093  */
12094  if (enable_apicv) {
12095  vcpu->arch.apic->apicv_active = true;
12096  kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
12097  }
12098  } else
12099  static_branch_inc(&kvm_has_noapic_vcpu);
12100 
12101  r = -ENOMEM;
12102 
12103  page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
12104  if (!page)
12105  goto fail_free_lapic;
12106  vcpu->arch.pio_data = page_address(page);
12107 
12108  vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
12109  GFP_KERNEL_ACCOUNT);
12110  vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
12111  GFP_KERNEL_ACCOUNT);
12112  if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
12113  goto fail_free_mce_banks;
12114  vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
12115 
12116  if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
12117  GFP_KERNEL_ACCOUNT))
12118  goto fail_free_mce_banks;
12119 
12120  if (!alloc_emulate_ctxt(vcpu))
12121  goto free_wbinvd_dirty_mask;
12122 
12123  if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
12124  pr_err("failed to allocate vcpu's fpu\n");
12125  goto free_emulate_ctxt;
12126  }
12127 
12128  vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
12129  vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
12130 
12131  vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
12132 
12134 
12135  vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
12136  kvm_pmu_init(vcpu);
12137 
12138  vcpu->arch.pending_external_vector = -1;
12139  vcpu->arch.preempted_in_kernel = false;
12140 
12141 #if IS_ENABLED(CONFIG_HYPERV)
12142  vcpu->arch.hv_root_tdp = INVALID_PAGE;
12143 #endif
12144 
12145  r = static_call(kvm_x86_vcpu_create)(vcpu);
12146  if (r)
12147  goto free_guest_fpu;
12148 
12149  vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
12150  vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
12151  kvm_xen_init_vcpu(vcpu);
12152  kvm_vcpu_mtrr_init(vcpu);
12153  vcpu_load(vcpu);
12154  kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
12155  kvm_vcpu_reset(vcpu, false);
12156  kvm_init_mmu(vcpu);
12157  vcpu_put(vcpu);
12158  return 0;
12159 
12160 free_guest_fpu:
12161  fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
12162 free_emulate_ctxt:
12163  kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
12164 free_wbinvd_dirty_mask:
12165  free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
12166 fail_free_mce_banks:
12167  kfree(vcpu->arch.mce_banks);
12168  kfree(vcpu->arch.mci_ctl2_banks);
12169  free_page((unsigned long)vcpu->arch.pio_data);
12170 fail_free_lapic:
12171  kvm_free_lapic(vcpu);
12172 fail_mmu_destroy:
12173  kvm_mmu_destroy(vcpu);
12174  return r;
12175 }
12176 
12177 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
12178 {
12179  struct kvm *kvm = vcpu->kvm;
12180 
12181  if (mutex_lock_killable(&vcpu->mutex))
12182  return;
12183  vcpu_load(vcpu);
12184  kvm_synchronize_tsc(vcpu, NULL);
12185  vcpu_put(vcpu);
12186 
12187  /* poll control enabled by default */
12188  vcpu->arch.msr_kvm_poll_control = 1;
12189 
12190  mutex_unlock(&vcpu->mutex);
12191 
12192  if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
12193  schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
12195 }
12196 
12197 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
12198 {
12199  int idx;
12200 
12201  kvmclock_reset(vcpu);
12202 
12203  static_call(kvm_x86_vcpu_free)(vcpu);
12204 
12205  kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
12206  free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
12207  fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
12208 
12209  kvm_xen_destroy_vcpu(vcpu);
12210  kvm_hv_vcpu_uninit(vcpu);
12211  kvm_pmu_destroy(vcpu);
12212  kfree(vcpu->arch.mce_banks);
12213  kfree(vcpu->arch.mci_ctl2_banks);
12214  kvm_free_lapic(vcpu);
12215  idx = srcu_read_lock(&vcpu->kvm->srcu);
12216  kvm_mmu_destroy(vcpu);
12217  srcu_read_unlock(&vcpu->kvm->srcu, idx);
12218  free_page((unsigned long)vcpu->arch.pio_data);
12219  kvfree(vcpu->arch.cpuid_entries);
12220  if (!lapic_in_kernel(vcpu))
12221  static_branch_dec(&kvm_has_noapic_vcpu);
12222 }
12223 
12224 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
12225 {
12226  struct kvm_cpuid_entry2 *cpuid_0x1;
12227  unsigned long old_cr0 = kvm_read_cr0(vcpu);
12228  unsigned long new_cr0;
12229 
12230  /*
12231  * Several of the "set" flows, e.g. ->set_cr0(), read other registers
12232  * to handle side effects. RESET emulation hits those flows and relies
12233  * on emulated/virtualized registers, including those that are loaded
12234  * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel
12235  * to detect improper or missing initialization.
12236  */
12237  WARN_ON_ONCE(!init_event &&
12238  (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
12239 
12240  /*
12241  * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
12242  * possible to INIT the vCPU while L2 is active. Force the vCPU back
12243  * into L1 as EFER.SVME is cleared on INIT (along with all other EFER
12244  * bits), i.e. virtualization is disabled.
12245  */
12246  if (is_guest_mode(vcpu))
12247  kvm_leave_nested(vcpu);
12248 
12249  kvm_lapic_reset(vcpu, init_event);
12250 
12251  WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
12252  vcpu->arch.hflags = 0;
12253 
12254  vcpu->arch.smi_pending = 0;
12255  vcpu->arch.smi_count = 0;
12256  atomic_set(&vcpu->arch.nmi_queued, 0);
12257  vcpu->arch.nmi_pending = 0;
12258  vcpu->arch.nmi_injected = false;
12261 
12262  memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
12263  kvm_update_dr0123(vcpu);
12264  vcpu->arch.dr6 = DR6_ACTIVE_LOW;
12265  vcpu->arch.dr7 = DR7_FIXED_1;
12266  kvm_update_dr7(vcpu);
12267 
12268  vcpu->arch.cr2 = 0;
12269 
12270  kvm_make_request(KVM_REQ_EVENT, vcpu);
12271  vcpu->arch.apf.msr_en_val = 0;
12272  vcpu->arch.apf.msr_int_val = 0;
12273  vcpu->arch.st.msr_val = 0;
12274 
12275  kvmclock_reset(vcpu);
12276 
12279  vcpu->arch.apf.halted = false;
12280 
12281  if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) {
12282  struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
12283 
12284  /*
12285  * All paths that lead to INIT are required to load the guest's
12286  * FPU state (because most paths are buried in KVM_RUN).
12287  */
12288  if (init_event)
12289  kvm_put_guest_fpu(vcpu);
12290 
12291  fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS);
12292  fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR);
12293 
12294  if (init_event)
12295  kvm_load_guest_fpu(vcpu);
12296  }
12297 
12298  if (!init_event) {
12299  vcpu->arch.smbase = 0x30000;
12300 
12301  vcpu->arch.msr_misc_features_enables = 0;
12302  vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
12303  MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
12304 
12305  __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
12306  __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
12307  }
12308 
12309  /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
12310  memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
12311  kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP);
12312 
12313  /*
12314  * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
12315  * if no CPUID match is found. Note, it's impossible to get a match at
12316  * RESET since KVM emulates RESET before exposing the vCPU to userspace,
12317  * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
12318  * on RESET. But, go through the motions in case that's ever remedied.
12319  */
12320  cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1);
12321  kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
12322 
12323  static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
12324 
12325  kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
12326  kvm_rip_write(vcpu, 0xfff0);
12327 
12328  vcpu->arch.cr3 = 0;
12329  kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
12330 
12331  /*
12332  * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions
12333  * of Intel's SDM list CD/NW as being set on INIT, but they contradict
12334  * (or qualify) that with a footnote stating that CD/NW are preserved.
12335  */
12336  new_cr0 = X86_CR0_ET;
12337  if (init_event)
12338  new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD));
12339  else
12340  new_cr0 |= X86_CR0_NW | X86_CR0_CD;
12341 
12342  static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
12343  static_call(kvm_x86_set_cr4)(vcpu, 0);
12344  static_call(kvm_x86_set_efer)(vcpu, 0);
12345  static_call(kvm_x86_update_exception_bitmap)(vcpu);
12346 
12347  /*
12348  * On the standard CR0/CR4/EFER modification paths, there are several
12349  * complex conditions determining whether the MMU has to be reset and/or
12350  * which PCIDs have to be flushed. However, CR0.WP and the paging-related
12351  * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush
12352  * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as
12353  * CR0 will be '0' prior to RESET). So we only need to check CR0.PG here.
12354  */
12355  if (old_cr0 & X86_CR0_PG) {
12356  kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
12357  kvm_mmu_reset_context(vcpu);
12358  }
12359 
12360  /*
12361  * Intel's SDM states that all TLB entries are flushed on INIT. AMD's
12362  * APM states the TLBs are untouched by INIT, but it also states that
12363  * the TLBs are flushed on "External initialization of the processor."
12364  * Flush the guest TLB regardless of vendor, there is no meaningful
12365  * benefit in relying on the guest to flush the TLB immediately after
12366  * INIT. A spurious TLB flush is benign and likely negligible from a
12367  * performance perspective.
12368  */
12369  if (init_event)
12370  kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
12371 }
12373 
12374 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
12375 {
12376  struct kvm_segment cs;
12377 
12378  kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
12379  cs.selector = vector << 8;
12380  cs.base = vector << 12;
12381  kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
12382  kvm_rip_write(vcpu, 0);
12383 }
12385 
12387 {
12388  struct kvm *kvm;
12389  struct kvm_vcpu *vcpu;
12390  unsigned long i;
12391  int ret;
12392  u64 local_tsc;
12393  u64 max_tsc = 0;
12394  bool stable, backwards_tsc = false;
12395 
12397 
12399  if (ret)
12400  return ret;
12401 
12402  ret = static_call(kvm_x86_hardware_enable)();
12403  if (ret != 0)
12404  return ret;
12405 
12406  local_tsc = rdtsc();
12407  stable = !kvm_check_tsc_unstable();
12408  list_for_each_entry(kvm, &vm_list, vm_list) {
12409  kvm_for_each_vcpu(i, vcpu, kvm) {
12410  if (!stable && vcpu->cpu == smp_processor_id())
12411  kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
12412  if (stable && vcpu->arch.last_host_tsc > local_tsc) {
12413  backwards_tsc = true;
12414  if (vcpu->arch.last_host_tsc > max_tsc)
12415  max_tsc = vcpu->arch.last_host_tsc;
12416  }
12417  }
12418  }
12419 
12420  /*
12421  * Sometimes, even reliable TSCs go backwards. This happens on
12422  * platforms that reset TSC during suspend or hibernate actions, but
12423  * maintain synchronization. We must compensate. Fortunately, we can
12424  * detect that condition here, which happens early in CPU bringup,
12425  * before any KVM threads can be running. Unfortunately, we can't
12426  * bring the TSCs fully up to date with real time, as we aren't yet far
12427  * enough into CPU bringup that we know how much real time has actually
12428  * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
12429  * variables that haven't been updated yet.
12430  *
12431  * So we simply find the maximum observed TSC above, then record the
12432  * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
12433  * the adjustment will be applied. Note that we accumulate
12434  * adjustments, in case multiple suspend cycles happen before some VCPU
12435  * gets a chance to run again. In the event that no KVM threads get a
12436  * chance to run, we will miss the entire elapsed period, as we'll have
12437  * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
12438  * loose cycle time. This isn't too big a deal, since the loss will be
12439  * uniform across all VCPUs (not to mention the scenario is extremely
12440  * unlikely). It is possible that a second hibernate recovery happens
12441  * much faster than a first, causing the observed TSC here to be
12442  * smaller; this would require additional padding adjustment, which is
12443  * why we set last_host_tsc to the local tsc observed here.
12444  *
12445  * N.B. - this code below runs only on platforms with reliable TSC,
12446  * as that is the only way backwards_tsc is set above. Also note
12447  * that this runs for ALL vcpus, which is not a bug; all VCPUs should
12448  * have the same delta_cyc adjustment applied if backwards_tsc
12449  * is detected. Note further, this adjustment is only done once,
12450  * as we reset last_host_tsc on all VCPUs to stop this from being
12451  * called multiple times (one for each physical CPU bringup).
12452  *
12453  * Platforms with unreliable TSCs don't have to deal with this, they
12454  * will be compensated by the logic in vcpu_load, which sets the TSC to
12455  * catchup mode. This will catchup all VCPUs to real time, but cannot
12456  * guarantee that they stay in perfect synchronization.
12457  */
12458  if (backwards_tsc) {
12459  u64 delta_cyc = max_tsc - local_tsc;
12460  list_for_each_entry(kvm, &vm_list, vm_list) {
12461  kvm->arch.backwards_tsc_observed = true;
12462  kvm_for_each_vcpu(i, vcpu, kvm) {
12463  vcpu->arch.tsc_offset_adjustment += delta_cyc;
12464  vcpu->arch.last_host_tsc = local_tsc;
12465  kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
12466  }
12467 
12468  /*
12469  * We have to disable TSC offset matching.. if you were
12470  * booting a VM while issuing an S4 host suspend....
12471  * you may have some problem. Solving this issue is
12472  * left as an exercise to the reader.
12473  */
12474  kvm->arch.last_tsc_nsec = 0;
12475  kvm->arch.last_tsc_write = 0;
12476  }
12477 
12478  }
12479  return 0;
12480 }
12481 
12483 {
12484  static_call(kvm_x86_hardware_disable)();
12486 }
12487 
12488 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
12489 {
12490  return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
12491 }
12492 
12493 bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
12494 {
12495  return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
12496 }
12497 
12499 EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
12500 
12501 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
12502 {
12503  struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
12504 
12505  vcpu->arch.l1tf_flush_l1d = true;
12506  if (pmu->version && unlikely(pmu->event_count)) {
12507  pmu->need_cleanup = true;
12508  kvm_make_request(KVM_REQ_PMU, vcpu);
12509  }
12510  static_call(kvm_x86_sched_in)(vcpu, cpu);
12511 }
12512 
12513 void kvm_arch_free_vm(struct kvm *kvm)
12514 {
12515 #if IS_ENABLED(CONFIG_HYPERV)
12516  kfree(kvm->arch.hv_pa_pg);
12517 #endif
12518  __kvm_arch_free_vm(kvm);
12519 }
12520 
12521 
12522 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
12523 {
12524  int ret;
12525  unsigned long flags;
12526 
12527  if (!kvm_is_vm_type_supported(type))
12528  return -EINVAL;
12529 
12530  kvm->arch.vm_type = type;
12531 
12532  ret = kvm_page_track_init(kvm);
12533  if (ret)
12534  goto out;
12535 
12536  kvm_mmu_init_vm(kvm);
12537 
12538  ret = static_call(kvm_x86_vm_init)(kvm);
12539  if (ret)
12540  goto out_uninit_mmu;
12541 
12542  INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
12543  atomic_set(&kvm->arch.noncoherent_dma_count, 0);
12544 
12545  /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
12546  set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
12547  /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
12548  set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
12549  &kvm->arch.irq_sources_bitmap);
12550 
12551  raw_spin_lock_init(&kvm->arch.tsc_write_lock);
12552  mutex_init(&kvm->arch.apic_map_lock);
12553  seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
12554  kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
12555 
12556  raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
12558  raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
12559 
12560  kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
12561  kvm->arch.guest_can_read_msr_platform_info = true;
12562  kvm->arch.enable_pmu = enable_pmu;
12563 
12564 #if IS_ENABLED(CONFIG_HYPERV)
12565  spin_lock_init(&kvm->arch.hv_root_tdp_lock);
12566  kvm->arch.hv_root_tdp = INVALID_PAGE;
12567 #endif
12568 
12569  INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
12570  INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
12571 
12572  kvm_apicv_init(kvm);
12573  kvm_hv_init_vm(kvm);
12574  kvm_xen_init_vm(kvm);
12575 
12576  return 0;
12577 
12578 out_uninit_mmu:
12579  kvm_mmu_uninit_vm(kvm);
12581 out:
12582  return ret;
12583 }
12584 
12585 int kvm_arch_post_init_vm(struct kvm *kvm)
12586 {
12587  return kvm_mmu_post_init_vm(kvm);
12588 }
12589 
12590 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
12591 {
12592  vcpu_load(vcpu);
12593  kvm_mmu_unload(vcpu);
12594  vcpu_put(vcpu);
12595 }
12596 
12597 static void kvm_unload_vcpu_mmus(struct kvm *kvm)
12598 {
12599  unsigned long i;
12600  struct kvm_vcpu *vcpu;
12601 
12602  kvm_for_each_vcpu(i, vcpu, kvm) {
12604  kvm_unload_vcpu_mmu(vcpu);
12605  }
12606 }
12607 
12608 void kvm_arch_sync_events(struct kvm *kvm)
12609 {
12610  cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
12611  cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
12612  kvm_free_pit(kvm);
12613 }
12614 
12615 /**
12616  * __x86_set_memory_region: Setup KVM internal memory slot
12617  *
12618  * @kvm: the kvm pointer to the VM.
12619  * @id: the slot ID to setup.
12620  * @gpa: the GPA to install the slot (unused when @size == 0).
12621  * @size: the size of the slot. Set to zero to uninstall a slot.
12622  *
12623  * This function helps to setup a KVM internal memory slot. Specify
12624  * @size > 0 to install a new slot, while @size == 0 to uninstall a
12625  * slot. The return code can be one of the following:
12626  *
12627  * HVA: on success (uninstall will return a bogus HVA)
12628  * -errno: on error
12629  *
12630  * The caller should always use IS_ERR() to check the return value
12631  * before use. Note, the KVM internal memory slots are guaranteed to
12632  * remain valid and unchanged until the VM is destroyed, i.e., the
12633  * GPA->HVA translation will not change. However, the HVA is a user
12634  * address, i.e. its accessibility is not guaranteed, and must be
12635  * accessed via __copy_{to,from}_user().
12636  */
12637 void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
12638  u32 size)
12639 {
12640  int i, r;
12641  unsigned long hva, old_npages;
12642  struct kvm_memslots *slots = kvm_memslots(kvm);
12643  struct kvm_memory_slot *slot;
12644 
12645  /* Called with kvm->slots_lock held. */
12646  if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
12647  return ERR_PTR_USR(-EINVAL);
12648 
12649  slot = id_to_memslot(slots, id);
12650  if (size) {
12651  if (slot && slot->npages)
12652  return ERR_PTR_USR(-EEXIST);
12653 
12654  /*
12655  * MAP_SHARED to prevent internal slot pages from being moved
12656  * by fork()/COW.
12657  */
12658  hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
12659  MAP_SHARED | MAP_ANONYMOUS, 0);
12660  if (IS_ERR_VALUE(hva))
12661  return (void __user *)hva;
12662  } else {
12663  if (!slot || !slot->npages)
12664  return NULL;
12665 
12666  old_npages = slot->npages;
12667  hva = slot->userspace_addr;
12668  }
12669 
12670  for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
12671  struct kvm_userspace_memory_region2 m;
12672 
12673  m.slot = id | (i << 16);
12674  m.flags = 0;
12675  m.guest_phys_addr = gpa;
12676  m.userspace_addr = hva;
12677  m.memory_size = size;
12678  r = __kvm_set_memory_region(kvm, &m);
12679  if (r < 0)
12680  return ERR_PTR_USR(r);
12681  }
12682 
12683  if (!size)
12684  vm_munmap(hva, old_npages * PAGE_SIZE);
12685 
12686  return (void __user *)hva;
12687 }
12689 
12690 void kvm_arch_pre_destroy_vm(struct kvm *kvm)
12691 {
12693 }
12694 
12695 void kvm_arch_destroy_vm(struct kvm *kvm)
12696 {
12697  if (current->mm == kvm->mm) {
12698  /*
12699  * Free memory regions allocated on behalf of userspace,
12700  * unless the memory map has changed due to process exit
12701  * or fd copying.
12702  */
12703  mutex_lock(&kvm->slots_lock);
12704  __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
12705  0, 0);
12706  __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
12707  0, 0);
12708  __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
12709  mutex_unlock(&kvm->slots_lock);
12710  }
12711  kvm_unload_vcpu_mmus(kvm);
12712  static_call_cond(kvm_x86_vm_destroy)(kvm);
12713  kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
12714  kvm_pic_destroy(kvm);
12715  kvm_ioapic_destroy(kvm);
12716  kvm_destroy_vcpus(kvm);
12717  kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
12718  kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
12719  kvm_mmu_uninit_vm(kvm);
12721  kvm_xen_destroy_vm(kvm);
12722  kvm_hv_destroy_vm(kvm);
12723 }
12724 
12725 static void memslot_rmap_free(struct kvm_memory_slot *slot)
12726 {
12727  int i;
12728 
12729  for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
12730  kvfree(slot->arch.rmap[i]);
12731  slot->arch.rmap[i] = NULL;
12732  }
12733 }
12734 
12735 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
12736 {
12737  int i;
12738 
12739  memslot_rmap_free(slot);
12740 
12741  for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
12742  kvfree(slot->arch.lpage_info[i - 1]);
12743  slot->arch.lpage_info[i - 1] = NULL;
12744  }
12745 
12747 }
12748 
12749 int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
12750 {
12751  const int sz = sizeof(*slot->arch.rmap[0]);
12752  int i;
12753 
12754  for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
12755  int level = i + 1;
12756  int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
12757 
12758  if (slot->arch.rmap[i])
12759  continue;
12760 
12761  slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
12762  if (!slot->arch.rmap[i]) {
12763  memslot_rmap_free(slot);
12764  return -ENOMEM;
12765  }
12766  }
12767 
12768  return 0;
12769 }
12770 
12771 static int kvm_alloc_memslot_metadata(struct kvm *kvm,
12772  struct kvm_memory_slot *slot)
12773 {
12774  unsigned long npages = slot->npages;
12775  int i, r;
12776 
12777  /*
12778  * Clear out the previous array pointers for the KVM_MR_MOVE case. The
12779  * old arrays will be freed by __kvm_set_memory_region() if installing
12780  * the new memslot is successful.
12781  */
12782  memset(&slot->arch, 0, sizeof(slot->arch));
12783 
12784  if (kvm_memslots_have_rmaps(kvm)) {
12785  r = memslot_rmap_alloc(slot, npages);
12786  if (r)
12787  return r;
12788  }
12789 
12790  for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
12791  struct kvm_lpage_info *linfo;
12792  unsigned long ugfn;
12793  int lpages;
12794  int level = i + 1;
12795 
12796  lpages = __kvm_mmu_slot_lpages(slot, npages, level);
12797 
12798  linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
12799  if (!linfo)
12800  goto out_free;
12801 
12802  slot->arch.lpage_info[i - 1] = linfo;
12803 
12804  if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
12805  linfo[0].disallow_lpage = 1;
12806  if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
12807  linfo[lpages - 1].disallow_lpage = 1;
12808  ugfn = slot->userspace_addr >> PAGE_SHIFT;
12809  /*
12810  * If the gfn and userspace address are not aligned wrt each
12811  * other, disable large page support for this slot.
12812  */
12813  if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
12814  unsigned long j;
12815 
12816  for (j = 0; j < lpages; ++j)
12817  linfo[j].disallow_lpage = 1;
12818  }
12819  }
12820 
12821 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
12822  kvm_mmu_init_memslot_memory_attributes(kvm, slot);
12823 #endif
12824 
12825  if (kvm_page_track_create_memslot(kvm, slot, npages))
12826  goto out_free;
12827 
12828  return 0;
12829 
12830 out_free:
12831  memslot_rmap_free(slot);
12832 
12833  for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
12834  kvfree(slot->arch.lpage_info[i - 1]);
12835  slot->arch.lpage_info[i - 1] = NULL;
12836  }
12837  return -ENOMEM;
12838 }
12839 
12840 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
12841 {
12842  struct kvm_vcpu *vcpu;
12843  unsigned long i;
12844 
12845  /*
12846  * memslots->generation has been incremented.
12847  * mmio generation may have reached its maximum value.
12848  */
12850 
12851  /* Force re-initialization of steal_time cache */
12852  kvm_for_each_vcpu(i, vcpu, kvm)
12853  kvm_vcpu_kick(vcpu);
12854 }
12855 
12857  const struct kvm_memory_slot *old,
12858  struct kvm_memory_slot *new,
12859  enum kvm_mr_change change)
12860 {
12861  /*
12862  * KVM doesn't support moving memslots when there are external page
12863  * trackers attached to the VM, i.e. if KVMGT is in use.
12864  */
12865  if (change == KVM_MR_MOVE && kvm_page_track_has_external_user(kvm))
12866  return -EINVAL;
12867 
12868  if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
12869  if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
12870  return -EINVAL;
12871 
12872  return kvm_alloc_memslot_metadata(kvm, new);
12873  }
12874 
12875  if (change == KVM_MR_FLAGS_ONLY)
12876  memcpy(&new->arch, &old->arch, sizeof(old->arch));
12877  else if (WARN_ON_ONCE(change != KVM_MR_DELETE))
12878  return -EIO;
12879 
12880  return 0;
12881 }
12882 
12883 
12884 static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
12885 {
12886  int nr_slots;
12887 
12888  if (!kvm_x86_ops.cpu_dirty_log_size)
12889  return;
12890 
12891  nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
12892  if ((enable && nr_slots == 1) || !nr_slots)
12893  kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
12894 }
12895 
12896 static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
12897  struct kvm_memory_slot *old,
12898  const struct kvm_memory_slot *new,
12899  enum kvm_mr_change change)
12900 {
12901  u32 old_flags = old ? old->flags : 0;
12902  u32 new_flags = new ? new->flags : 0;
12903  bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
12904 
12905  /*
12906  * Update CPU dirty logging if dirty logging is being toggled. This
12907  * applies to all operations.
12908  */
12909  if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
12910  kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
12911 
12912  /*
12913  * Nothing more to do for RO slots (which can't be dirtied and can't be
12914  * made writable) or CREATE/MOVE/DELETE of a slot.
12915  *
12916  * For a memslot with dirty logging disabled:
12917  * CREATE: No dirty mappings will already exist.
12918  * MOVE/DELETE: The old mappings will already have been cleaned up by
12919  * kvm_arch_flush_shadow_memslot()
12920  *
12921  * For a memslot with dirty logging enabled:
12922  * CREATE: No shadow pages exist, thus nothing to write-protect
12923  * and no dirty bits to clear.
12924  * MOVE/DELETE: The old mappings will already have been cleaned up by
12925  * kvm_arch_flush_shadow_memslot().
12926  */
12927  if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
12928  return;
12929 
12930  /*
12931  * READONLY and non-flags changes were filtered out above, and the only
12932  * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
12933  * logging isn't being toggled on or off.
12934  */
12935  if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
12936  return;
12937 
12938  if (!log_dirty_pages) {
12939  /*
12940  * Dirty logging tracks sptes in 4k granularity, meaning that
12941  * large sptes have to be split. If live migration succeeds,
12942  * the guest in the source machine will be destroyed and large
12943  * sptes will be created in the destination. However, if the
12944  * guest continues to run in the source machine (for example if
12945  * live migration fails), small sptes will remain around and
12946  * cause bad performance.
12947  *
12948  * Scan sptes if dirty logging has been stopped, dropping those
12949  * which can be collapsed into a single large-page spte. Later
12950  * page faults will create the large-page sptes.
12951  */
12953  } else {
12954  /*
12955  * Initially-all-set does not require write protecting any page,
12956  * because they're all assumed to be dirty.
12957  */
12958  if (kvm_dirty_log_manual_protect_and_init_set(kvm))
12959  return;
12960 
12961  if (READ_ONCE(eager_page_split))
12962  kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
12963 
12964  if (kvm_x86_ops.cpu_dirty_log_size) {
12966  kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
12967  } else {
12968  kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
12969  }
12970 
12971  /*
12972  * Unconditionally flush the TLBs after enabling dirty logging.
12973  * A flush is almost always going to be necessary (see below),
12974  * and unconditionally flushing allows the helpers to omit
12975  * the subtly complex checks when removing write access.
12976  *
12977  * Do the flush outside of mmu_lock to reduce the amount of
12978  * time mmu_lock is held. Flushing after dropping mmu_lock is
12979  * safe as KVM only needs to guarantee the slot is fully
12980  * write-protected before returning to userspace, i.e. before
12981  * userspace can consume the dirty status.
12982  *
12983  * Flushing outside of mmu_lock requires KVM to be careful when
12984  * making decisions based on writable status of an SPTE, e.g. a
12985  * !writable SPTE doesn't guarantee a CPU can't perform writes.
12986  *
12987  * Specifically, KVM also write-protects guest page tables to
12988  * monitor changes when using shadow paging, and must guarantee
12989  * no CPUs can write to those page before mmu_lock is dropped.
12990  * Because CPUs may have stale TLB entries at this point, a
12991  * !writable SPTE doesn't guarantee CPUs can't perform writes.
12992  *
12993  * KVM also allows making SPTES writable outside of mmu_lock,
12994  * e.g. to allow dirty logging without taking mmu_lock.
12995  *
12996  * To handle these scenarios, KVM uses a separate software-only
12997  * bit (MMU-writable) to track if a SPTE is !writable due to
12998  * a guest page table being write-protected (KVM clears the
12999  * MMU-writable flag when write-protecting for shadow paging).
13000  *
13001  * The use of MMU-writable is also the primary motivation for
13002  * the unconditional flush. Because KVM must guarantee that a
13003  * CPU doesn't contain stale, writable TLB entries for a
13004  * !MMU-writable SPTE, KVM must flush if it encounters any
13005  * MMU-writable SPTE regardless of whether the actual hardware
13006  * writable bit was set. I.e. KVM is almost guaranteed to need
13007  * to flush, while unconditionally flushing allows the "remove
13008  * write access" helpers to ignore MMU-writable entirely.
13009  *
13010  * See is_writable_pte() for more details (the case involving
13011  * access-tracked SPTEs is particularly relevant).
13012  */
13014  }
13015 }
13016 
13017 void kvm_arch_commit_memory_region(struct kvm *kvm,
13018  struct kvm_memory_slot *old,
13019  const struct kvm_memory_slot *new,
13020  enum kvm_mr_change change)
13021 {
13022  if (change == KVM_MR_DELETE)
13023  kvm_page_track_delete_slot(kvm, old);
13024 
13025  if (!kvm->arch.n_requested_mmu_pages &&
13026  (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
13027  unsigned long nr_mmu_pages;
13028 
13029  nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
13030  nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
13031  kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
13032  }
13033 
13034  kvm_mmu_slot_apply_flags(kvm, old, new, change);
13035 
13036  /* Free the arrays associated with the old memslot. */
13037  if (change == KVM_MR_MOVE)
13038  kvm_arch_free_memslot(kvm, old);
13039 }
13040 
13041 static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
13042 {
13043  return (is_guest_mode(vcpu) &&
13044  static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
13045 }
13046 
13047 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
13048 {
13049  if (!list_empty_careful(&vcpu->async_pf.done))
13050  return true;
13051 
13054  return true;
13055 
13056  if (vcpu->arch.pv.pv_unhalted)
13057  return true;
13058 
13059  if (kvm_is_exception_pending(vcpu))
13060  return true;
13061 
13062  if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
13063  (vcpu->arch.nmi_pending &&
13064  static_call(kvm_x86_nmi_allowed)(vcpu, false)))
13065  return true;
13066 
13067 #ifdef CONFIG_KVM_SMM
13068  if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
13069  (vcpu->arch.smi_pending &&
13070  static_call(kvm_x86_smi_allowed)(vcpu, false)))
13071  return true;
13072 #endif
13073 
13074  if (kvm_test_request(KVM_REQ_PMI, vcpu))
13075  return true;
13076 
13077  if (kvm_arch_interrupt_allowed(vcpu) &&
13078  (kvm_cpu_has_interrupt(vcpu) ||
13080  return true;
13081 
13082  if (kvm_hv_has_stimer_pending(vcpu))
13083  return true;
13084 
13085  if (is_guest_mode(vcpu) &&
13086  kvm_x86_ops.nested_ops->has_events &&
13087  kvm_x86_ops.nested_ops->has_events(vcpu))
13088  return true;
13089 
13090  if (kvm_xen_has_pending_events(vcpu))
13091  return true;
13092 
13093  return false;
13094 }
13095 
13096 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
13097 {
13098  return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
13099 }
13100 
13101 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
13102 {
13103  if (kvm_vcpu_apicv_active(vcpu) &&
13104  static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
13105  return true;
13106 
13107  return false;
13108 }
13109 
13110 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
13111 {
13112  if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
13113  return true;
13114 
13115  if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
13116 #ifdef CONFIG_KVM_SMM
13117  kvm_test_request(KVM_REQ_SMI, vcpu) ||
13118 #endif
13119  kvm_test_request(KVM_REQ_EVENT, vcpu))
13120  return true;
13121 
13122  return kvm_arch_dy_has_pending_interrupt(vcpu);
13123 }
13124 
13125 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
13126 {
13127  if (vcpu->arch.guest_state_protected)
13128  return true;
13129 
13130  if (vcpu != kvm_get_running_vcpu())
13131  return vcpu->arch.preempted_in_kernel;
13132 
13133  return static_call(kvm_x86_get_cpl)(vcpu) == 0;
13134 }
13135 
13136 unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
13137 {
13138  return kvm_rip_read(vcpu);
13139 }
13140 
13141 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
13142 {
13143  return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
13144 }
13145 
13146 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
13147 {
13148  return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
13149 }
13150 
13151 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
13152 {
13153  /* Can't read the RIP when guest state is protected, just return 0 */
13154  if (vcpu->arch.guest_state_protected)
13155  return 0;
13156 
13157  if (is_64_bit_mode(vcpu))
13158  return kvm_rip_read(vcpu);
13159  return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
13160  kvm_rip_read(vcpu));
13161 }
13163 
13164 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
13165 {
13166  return kvm_get_linear_rip(vcpu) == linear_rip;
13167 }
13169 
13170 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
13171 {
13172  unsigned long rflags;
13173 
13174  rflags = static_call(kvm_x86_get_rflags)(vcpu);
13175  if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
13176  rflags &= ~X86_EFLAGS_TF;
13177  return rflags;
13178 }
13180 
13181 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
13182 {
13183  if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
13184  kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
13185  rflags |= X86_EFLAGS_TF;
13186  static_call(kvm_x86_set_rflags)(vcpu, rflags);
13187 }
13188 
13189 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
13190 {
13191  __kvm_set_rflags(vcpu, rflags);
13192  kvm_make_request(KVM_REQ_EVENT, vcpu);
13193 }
13195 
13196 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
13197 {
13198  BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
13199 
13200  return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
13201 }
13202 
13203 static inline u32 kvm_async_pf_next_probe(u32 key)
13204 {
13205  return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
13206 }
13207 
13208 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
13209 {
13210  u32 key = kvm_async_pf_hash_fn(gfn);
13211 
13212  while (vcpu->arch.apf.gfns[key] != ~0)
13213  key = kvm_async_pf_next_probe(key);
13214 
13215  vcpu->arch.apf.gfns[key] = gfn;
13216 }
13217 
13218 static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
13219 {
13220  int i;
13221  u32 key = kvm_async_pf_hash_fn(gfn);
13222 
13223  for (i = 0; i < ASYNC_PF_PER_VCPU &&
13224  (vcpu->arch.apf.gfns[key] != gfn &&
13225  vcpu->arch.apf.gfns[key] != ~0); i++)
13226  key = kvm_async_pf_next_probe(key);
13227 
13228  return key;
13229 }
13230 
13231 bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
13232 {
13233  return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
13234 }
13235 
13236 static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
13237 {
13238  u32 i, j, k;
13239 
13240  i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
13241 
13242  if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
13243  return;
13244 
13245  while (true) {
13246  vcpu->arch.apf.gfns[i] = ~0;
13247  do {
13248  j = kvm_async_pf_next_probe(j);
13249  if (vcpu->arch.apf.gfns[j] == ~0)
13250  return;
13251  k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
13252  /*
13253  * k lies cyclically in ]i,j]
13254  * | i.k.j |
13255  * |....j i.k.| or |.k..j i...|
13256  */
13257  } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
13258  vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
13259  i = j;
13260  }
13261 }
13262 
13263 static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
13264 {
13265  u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
13266 
13267  return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
13268  sizeof(reason));
13269 }
13270 
13271 static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
13272 {
13273  unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
13274 
13275  return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
13276  &token, offset, sizeof(token));
13277 }
13278 
13279 static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
13280 {
13281  unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
13282  u32 val;
13283 
13284  if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
13285  &val, offset, sizeof(val)))
13286  return false;
13287 
13288  return !val;
13289 }
13290 
13291 static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
13292 {
13293 
13294  if (!kvm_pv_async_pf_enabled(vcpu))
13295  return false;
13296 
13297  if (vcpu->arch.apf.send_user_only &&
13298  static_call(kvm_x86_get_cpl)(vcpu) == 0)
13299  return false;
13300 
13301  if (is_guest_mode(vcpu)) {
13302  /*
13303  * L1 needs to opt into the special #PF vmexits that are
13304  * used to deliver async page faults.
13305  */
13306  return vcpu->arch.apf.delivery_as_pf_vmexit;
13307  } else {
13308  /*
13309  * Play it safe in case the guest temporarily disables paging.
13310  * The real mode IDT in particular is unlikely to have a #PF
13311  * exception setup.
13312  */
13313  return is_paging(vcpu);
13314  }
13315 }
13316 
13317 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
13318 {
13319  if (unlikely(!lapic_in_kernel(vcpu) ||
13321  kvm_is_exception_pending(vcpu)))
13322  return false;
13323 
13324  if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
13325  return false;
13326 
13327  /*
13328  * If interrupts are off we cannot even use an artificial
13329  * halt state.
13330  */
13331  return kvm_arch_interrupt_allowed(vcpu);
13332 }
13333 
13334 bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
13335  struct kvm_async_pf *work)
13336 {
13337  struct x86_exception fault;
13338 
13339  trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
13340  kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
13341 
13342  if (kvm_can_deliver_async_pf(vcpu) &&
13343  !apf_put_user_notpresent(vcpu)) {
13344  fault.vector = PF_VECTOR;
13345  fault.error_code_valid = true;
13346  fault.error_code = 0;
13347  fault.nested_page_fault = false;
13348  fault.address = work->arch.token;
13349  fault.async_page_fault = true;
13350  kvm_inject_page_fault(vcpu, &fault);
13351  return true;
13352  } else {
13353  /*
13354  * It is not possible to deliver a paravirtualized asynchronous
13355  * page fault, but putting the guest in an artificial halt state
13356  * can be beneficial nevertheless: if an interrupt arrives, we
13357  * can deliver it timely and perhaps the guest will schedule
13358  * another process. When the instruction that triggered a page
13359  * fault is retried, hopefully the page will be ready in the host.
13360  */
13361  kvm_make_request(KVM_REQ_APF_HALT, vcpu);
13362  return false;
13363  }
13364 }
13365 
13366 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
13367  struct kvm_async_pf *work)
13368 {
13369  struct kvm_lapic_irq irq = {
13370  .delivery_mode = APIC_DM_FIXED,
13371  .vector = vcpu->arch.apf.vec
13372  };
13373 
13374  if (work->wakeup_all)
13375  work->arch.token = ~0; /* broadcast wakeup */
13376  else
13377  kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
13378  trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
13379 
13380  if ((work->wakeup_all || work->notpresent_injected) &&
13381  kvm_pv_async_pf_enabled(vcpu) &&
13382  !apf_put_user_ready(vcpu, work->arch.token)) {
13383  vcpu->arch.apf.pageready_pending = true;
13384  kvm_apic_set_irq(vcpu, &irq, NULL);
13385  }
13386 
13387  vcpu->arch.apf.halted = false;
13388  vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
13389 }
13390 
13391 void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
13392 {
13393  kvm_make_request(KVM_REQ_APF_READY, vcpu);
13394  if (!vcpu->arch.apf.pageready_pending)
13395  kvm_vcpu_kick(vcpu);
13396 }
13397 
13398 bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
13399 {
13400  if (!kvm_pv_async_pf_enabled(vcpu))
13401  return true;
13402  else
13403  return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
13404 }
13405 
13406 void kvm_arch_start_assignment(struct kvm *kvm)
13407 {
13408  if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
13409  static_call_cond(kvm_x86_pi_start_assignment)(kvm);
13410 }
13412 
13413 void kvm_arch_end_assignment(struct kvm *kvm)
13414 {
13415  atomic_dec(&kvm->arch.assigned_device_count);
13416 }
13418 
13419 bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
13420 {
13421  return raw_atomic_read(&kvm->arch.assigned_device_count);
13422 }
13424 
13426 {
13427  /*
13428  * Non-coherent DMA assignment and de-assignment will affect
13429  * whether KVM honors guest MTRRs and cause changes in memtypes
13430  * in TDP.
13431  * So, pass %true unconditionally to indicate non-coherent DMA was,
13432  * or will be involved, and that zapping SPTEs might be necessary.
13433  */
13434  if (__kvm_mmu_honors_guest_mtrrs(true))
13435  kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
13436 }
13437 
13439 {
13440  if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
13442 }
13444 
13446 {
13447  if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
13449 }
13451 
13452 bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
13453 {
13454  return atomic_read(&kvm->arch.noncoherent_dma_count);
13455 }
13457 
13459 {
13460  return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
13461 }
13462 
13463 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
13464  struct irq_bypass_producer *prod)
13465 {
13466  struct kvm_kernel_irqfd *irqfd =
13467  container_of(cons, struct kvm_kernel_irqfd, consumer);
13468  int ret;
13469 
13470  irqfd->producer = prod;
13471  kvm_arch_start_assignment(irqfd->kvm);
13472  ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
13473  prod->irq, irqfd->gsi, 1);
13474 
13475  if (ret)
13476  kvm_arch_end_assignment(irqfd->kvm);
13477 
13478  return ret;
13479 }
13480 
13481 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
13482  struct irq_bypass_producer *prod)
13483 {
13484  int ret;
13485  struct kvm_kernel_irqfd *irqfd =
13486  container_of(cons, struct kvm_kernel_irqfd, consumer);
13487 
13488  WARN_ON(irqfd->producer != prod);
13489  irqfd->producer = NULL;
13490 
13491  /*
13492  * When producer of consumer is unregistered, we change back to
13493  * remapped mode, so we can re-use the current implementation
13494  * when the irq is masked/disabled or the consumer side (KVM
13495  * int this case doesn't want to receive the interrupts.
13496  */
13497  ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
13498  if (ret)
13499  printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
13500  " fails: %d\n", irqfd->consumer.token, ret);
13501 
13502  kvm_arch_end_assignment(irqfd->kvm);
13503 }
13504 
13505 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
13506  uint32_t guest_irq, bool set)
13507 {
13508  return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
13509 }
13510 
13511 bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
13512  struct kvm_kernel_irq_routing_entry *new)
13513 {
13514  if (new->type != KVM_IRQ_ROUTING_MSI)
13515  return true;
13516 
13517  return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
13518 }
13519 
13521 {
13522  return vector_hashing;
13523 }
13524 
13525 bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
13526 {
13527  return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
13528 }
13530 
13531 
13533 {
13534  /*
13535  * test that setting IA32_SPEC_CTRL to given value
13536  * is allowed by the host processor
13537  */
13538 
13539  u64 saved_value;
13540  unsigned long flags;
13541  int ret = 0;
13542 
13543  local_irq_save(flags);
13544 
13545  if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
13546  ret = 1;
13547  else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
13548  ret = 1;
13549  else
13550  wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
13551 
13552  local_irq_restore(flags);
13553 
13554  return ret;
13555 }
13557 
13558 void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
13559 {
13560  struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
13561  struct x86_exception fault;
13562  u64 access = error_code &
13563  (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
13564 
13565  if (!(error_code & PFERR_PRESENT_MASK) ||
13566  mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
13567  /*
13568  * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
13569  * tables probably do not match the TLB. Just proceed
13570  * with the error code that the processor gave.
13571  */
13572  fault.vector = PF_VECTOR;
13573  fault.error_code_valid = true;
13574  fault.error_code = error_code;
13575  fault.nested_page_fault = false;
13576  fault.address = gva;
13577  fault.async_page_fault = false;
13578  }
13579  vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
13580 }
13582 
13583 /*
13584  * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
13585  * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
13586  * indicates whether exit to userspace is needed.
13587  */
13588 int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
13589  struct x86_exception *e)
13590 {
13591  if (r == X86EMUL_PROPAGATE_FAULT) {
13592  if (KVM_BUG_ON(!e, vcpu->kvm))
13593  return -EIO;
13594 
13596  return 1;
13597  }
13598 
13599  /*
13600  * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
13601  * while handling a VMX instruction KVM could've handled the request
13602  * correctly by exiting to userspace and performing I/O but there
13603  * doesn't seem to be a real use-case behind such requests, just return
13604  * KVM_EXIT_INTERNAL_ERROR for now.
13605  */
13607 
13608  return 0;
13609 }
13611 
13612 int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
13613 {
13614  bool pcid_enabled;
13615  struct x86_exception e;
13616  struct {
13617  u64 pcid;
13618  u64 gla;
13619  } operand;
13620  int r;
13621 
13622  r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
13623  if (r != X86EMUL_CONTINUE)
13624  return kvm_handle_memory_failure(vcpu, r, &e);
13625 
13626  if (operand.pcid >> 12 != 0) {
13627  kvm_inject_gp(vcpu, 0);
13628  return 1;
13629  }
13630 
13631  pcid_enabled = kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE);
13632 
13633  switch (type) {
13634  case INVPCID_TYPE_INDIV_ADDR:
13635  /*
13636  * LAM doesn't apply to addresses that are inputs to TLB
13637  * invalidation.
13638  */
13639  if ((!pcid_enabled && (operand.pcid != 0)) ||
13640  is_noncanonical_address(operand.gla, vcpu)) {
13641  kvm_inject_gp(vcpu, 0);
13642  return 1;
13643  }
13644  kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
13645  return kvm_skip_emulated_instruction(vcpu);
13646 
13647  case INVPCID_TYPE_SINGLE_CTXT:
13648  if (!pcid_enabled && (operand.pcid != 0)) {
13649  kvm_inject_gp(vcpu, 0);
13650  return 1;
13651  }
13652 
13653  kvm_invalidate_pcid(vcpu, operand.pcid);
13654  return kvm_skip_emulated_instruction(vcpu);
13655 
13656  case INVPCID_TYPE_ALL_NON_GLOBAL:
13657  /*
13658  * Currently, KVM doesn't mark global entries in the shadow
13659  * page tables, so a non-global flush just degenerates to a
13660  * global flush. If needed, we could optimize this later by
13661  * keeping track of global entries in shadow page tables.
13662  */
13663 
13664  fallthrough;
13665  case INVPCID_TYPE_ALL_INCL_GLOBAL:
13666  kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
13667  return kvm_skip_emulated_instruction(vcpu);
13668 
13669  default:
13670  kvm_inject_gp(vcpu, 0);
13671  return 1;
13672  }
13673 }
13675 
13676 static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
13677 {
13678  struct kvm_run *run = vcpu->run;
13679  struct kvm_mmio_fragment *frag;
13680  unsigned int len;
13681 
13682  BUG_ON(!vcpu->mmio_needed);
13683 
13684  /* Complete previous fragment */
13685  frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
13686  len = min(8u, frag->len);
13687  if (!vcpu->mmio_is_write)
13688  memcpy(frag->data, run->mmio.data, len);
13689 
13690  if (frag->len <= 8) {
13691  /* Switch to the next fragment. */
13692  frag++;
13693  vcpu->mmio_cur_fragment++;
13694  } else {
13695  /* Go forward to the next mmio piece. */
13696  frag->data += len;
13697  frag->gpa += len;
13698  frag->len -= len;
13699  }
13700 
13701  if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
13702  vcpu->mmio_needed = 0;
13703 
13704  // VMG change, at this point, we're always done
13705  // RIP has already been advanced
13706  return 1;
13707  }
13708 
13709  // More MMIO is needed
13710  run->mmio.phys_addr = frag->gpa;
13711  run->mmio.len = min(8u, frag->len);
13712  run->mmio.is_write = vcpu->mmio_is_write;
13713  if (run->mmio.is_write)
13714  memcpy(run->mmio.data, frag->data, min(8u, frag->len));
13715  run->exit_reason = KVM_EXIT_MMIO;
13716 
13717  vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
13718 
13719  return 0;
13720 }
13721 
13722 int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
13723  void *data)
13724 {
13725  int handled;
13726  struct kvm_mmio_fragment *frag;
13727 
13728  if (!data)
13729  return -EINVAL;
13730 
13731  handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data);
13732  if (handled == bytes)
13733  return 1;
13734 
13735  bytes -= handled;
13736  gpa += handled;
13737  data += handled;
13738 
13739  /*TODO: Check if need to increment number of frags */
13740  frag = vcpu->mmio_fragments;
13741  vcpu->mmio_nr_fragments = 1;
13742  frag->len = bytes;
13743  frag->gpa = gpa;
13744  frag->data = data;
13745 
13746  vcpu->mmio_needed = 1;
13747  vcpu->mmio_cur_fragment = 0;
13748 
13749  vcpu->run->mmio.phys_addr = gpa;
13750  vcpu->run->mmio.len = min(8u, frag->len);
13751  vcpu->run->mmio.is_write = 1;
13752  memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
13753  vcpu->run->exit_reason = KVM_EXIT_MMIO;
13754 
13755  vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
13756 
13757  return 0;
13758 }
13760 
13761 int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
13762  void *data)
13763 {
13764  int handled;
13765  struct kvm_mmio_fragment *frag;
13766 
13767  if (!data)
13768  return -EINVAL;
13769 
13770  handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data);
13771  if (handled == bytes)
13772  return 1;
13773 
13774  bytes -= handled;
13775  gpa += handled;
13776  data += handled;
13777 
13778  /*TODO: Check if need to increment number of frags */
13779  frag = vcpu->mmio_fragments;
13780  vcpu->mmio_nr_fragments = 1;
13781  frag->len = bytes;
13782  frag->gpa = gpa;
13783  frag->data = data;
13784 
13785  vcpu->mmio_needed = 1;
13786  vcpu->mmio_cur_fragment = 0;
13787 
13788  vcpu->run->mmio.phys_addr = gpa;
13789  vcpu->run->mmio.len = min(8u, frag->len);
13790  vcpu->run->mmio.is_write = 0;
13791  vcpu->run->exit_reason = KVM_EXIT_MMIO;
13792 
13793  vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
13794 
13795  return 0;
13796 }
13798 
13799 static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
13800 {
13801  vcpu->arch.sev_pio_count -= count;
13802  vcpu->arch.sev_pio_data += count * size;
13803 }
13804 
13805 static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
13806  unsigned int port);
13807 
13808 static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
13809 {
13810  int size = vcpu->arch.pio.size;
13811  int port = vcpu->arch.pio.port;
13812 
13813  vcpu->arch.pio.count = 0;
13814  if (vcpu->arch.sev_pio_count)
13815  return kvm_sev_es_outs(vcpu, size, port);
13816  return 1;
13817 }
13818 
13819 static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
13820  unsigned int port)
13821 {
13822  for (;;) {
13823  unsigned int count =
13824  min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
13825  int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
13826 
13827  /* memcpy done already by emulator_pio_out. */
13828  advance_sev_es_emulated_pio(vcpu, count, size);
13829  if (!ret)
13830  break;
13831 
13832  /* Emulation done by the kernel. */
13833  if (!vcpu->arch.sev_pio_count)
13834  return 1;
13835  }
13836 
13837  vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
13838  return 0;
13839 }
13840 
13841 static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
13842  unsigned int port);
13843 
13844 static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
13845 {
13846  unsigned count = vcpu->arch.pio.count;
13847  int size = vcpu->arch.pio.size;
13848  int port = vcpu->arch.pio.port;
13849 
13850  complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
13851  advance_sev_es_emulated_pio(vcpu, count, size);
13852  if (vcpu->arch.sev_pio_count)
13853  return kvm_sev_es_ins(vcpu, size, port);
13854  return 1;
13855 }
13856 
13857 static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
13858  unsigned int port)
13859 {
13860  for (;;) {
13861  unsigned int count =
13862  min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
13863  if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
13864  break;
13865 
13866  /* Emulation done by the kernel. */
13867  advance_sev_es_emulated_pio(vcpu, count, size);
13868  if (!vcpu->arch.sev_pio_count)
13869  return 1;
13870  }
13871 
13872  vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
13873  return 0;
13874 }
13875 
13876 int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
13877  unsigned int port, void *data, unsigned int count,
13878  int in)
13879 {
13880  vcpu->arch.sev_pio_data = data;
13881  vcpu->arch.sev_pio_count = count;
13882  return in ? kvm_sev_es_ins(vcpu, size, port)
13883  : kvm_sev_es_outs(vcpu, size, port);
13884 }
13886 
13894 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
13895 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
13896 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
13897 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
13898 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
13901 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
13902 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
13903 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
13905 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
13906 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
13907 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
13909 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
13910 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell);
13911 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
13912 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
13914 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
13915 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
13916 
13917 static int __init kvm_x86_init(void)
13918 {
13919  pr_info("hello, kvm\n");
13921  mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
13922  return 0;
13923 }
13925 
13926 static void __exit kvm_x86_exit(void)
13927 {
13928  /*
13929  * If module_init() is implemented, module_exit() must also be
13930  * implemented to allow module unload.
13931  */
13932 }
#define irqchip_in_kernel(k)
Definition: arm_vgic.h:392
void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
Definition: async_pf.c:158
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
Definition: async_pf.c:113
int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
Definition: async_pf.c:223
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
Definition: cpuid.c:238
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries)
Definition: cpuid.c:467
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
Definition: cpuid.c:309
int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries)
Definition: cpuid.c:535
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
Definition: cpuid.c:390
u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
Definition: cpuid.c:409
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only)
Definition: cpuid.c:1531
u32 xstate_required_size(u64 xstate_bv, bool compacted)
Definition: cpuid.c:39
struct kvm_cpuid_entry2 * kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function)
Definition: cpuid.c:1455
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type)
Definition: cpuid.c:1404
int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries)
Definition: cpuid.c:512
static bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
Definition: cpuid.h:186
static __always_inline bool kvm_cpu_cap_has(unsigned int x86_feature)
Definition: cpuid.h:221
static bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
Definition: cpuid.h:179
static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, unsigned int kvm_feature)
Definition: cpuid.h:232
static bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
Definition: cpuid.h:287
static bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
Definition: cpuid.h:115
static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, unsigned int x86_feature)
Definition: cpuid.h:278
static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned int x86_feature)
Definition: cpuid.h:83
static bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
Definition: cpuid.h:191
static bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu)
Definition: cpuid.h:123
bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu)
Definition: dirty_ring.c:194
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
Definition: emulate.c:5140
static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
Definition: emulate.c:1785
void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt)
Definition: emulate.c:5492
void init_decode_cache(struct x86_emulate_ctxt *ctxt)
Definition: emulate.c:5125
int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
Definition: emulate.c:2069
int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code)
Definition: emulate.c:3020
void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
Definition: emulate.c:5497
bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt)
Definition: emulate.c:5062
int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
Definition: emulate.c:4763
bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt)
Definition: emulate.c:5502
void kvm_hv_request_tsc_page_update(struct kvm *kvm)
Definition: hyperv.c:1236
void kvm_hv_init_vm(struct kvm *kvm)
Definition: hyperv.c:2689
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
Definition: hyperv.c:1771
int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
Definition: hyperv.c:1963
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
Definition: hyperv.c:991
void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock)
Definition: hyperv.c:1158
int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries)
Definition: hyperv.c:2760
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
Definition: hyperv.c:896
int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
Definition: hyperv.c:2749
void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
Definition: hyperv.c:863
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
Definition: hyperv.c:1750
void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu)
Definition: hyperv.c:1362
void kvm_hv_destroy_vm(struct kvm *kvm)
Definition: hyperv.c:2697
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
Definition: hyperv.c:2519
int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce)
Definition: hyperv.c:2342
static void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu)
Definition: hyperv.h:288
static bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
Definition: hyperv.h:280
static bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
Definition: hyperv.h:304
struct kvm_pit * kvm_create_pit(struct kvm *kvm, u32 flags)
Definition: i8254.c:662
void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, int hpet_legacy_start)
Definition: i8254.c:403
void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
Definition: i8254.c:291
void kvm_free_pit(struct kvm *kvm)
Definition: i8254.c:736
void kvm_pic_destroy(struct kvm *kvm)
Definition: i8259.c:645
void kvm_pic_update_irq(struct kvm_pic *s)
Definition: i8259.c:181
int kvm_pic_init(struct kvm *kvm)
Definition: i8259.c:590
void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
Definition: ioapic.c:755
int kvm_ioapic_init(struct kvm *kvm)
Definition: ioapic.c:714
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
Definition: ioapic.c:765
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
Definition: ioapic.c:278
void kvm_ioapic_destroy(struct kvm *kvm)
Definition: ioapic.c:740
void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
Definition: irq_comm.c:412
static int ioapic_in_kernel(struct kvm *kvm)
Definition: ioapic.h:104
#define MAX_NR_RESERVED_IOAPIC_PINS
Definition: ioapic.h:13
static int kvm_iodevice_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int l, void *v)
Definition: iodev.h:42
static int kvm_iodevice_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int l, const void *v)
Definition: iodev.h:50
int kvm_cpu_has_extint(struct kvm_vcpu *v)
Definition: irq.c:48
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
Definition: irq.c:148
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
Definition: irq.c:24
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
Definition: irq.c:82
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
Definition: irq.c:156
int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
Definition: irq.c:138
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
Definition: irq.c:98
int kvm_setup_default_irq_routing(struct kvm *kvm)
Definition: irq_comm.c:392
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map)
Definition: irq_comm.c:47
static int irqchip_split(struct kvm *kvm)
Definition: irq.h:67
static int pic_in_kernel(struct kvm *kvm)
Definition: irq.h:85
int kvm_setup_empty_irq_routing(struct kvm *kvm)
Definition: irq_comm.c:400
static int irqchip_kernel(struct kvm *kvm)
Definition: irq.h:76
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status)
Definition: irqchip.c:70
#define X86_CR4_PDPTR_BITS
static void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg)
static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu, unsigned long cr4_bit)
static unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu)
static void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg, unsigned long val)
static void kvm_pdptr_write(struct kvm_vcpu *vcpu, int index, u64 value)
static unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg)
static ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
static ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
static u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
static void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
static ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
static void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val)
#define X86_CR0_PDPTR_BITS
static ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
static unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
static u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
static bool is_guest_mode(struct kvm_vcpu *vcpu)
#define X86EMUL_PROPAGATE_FAULT
Definition: kvm_emulate.h:85
#define EMULATION_INTERCEPTED
Definition: kvm_emulate.h:508
#define X86EMUL_UNHANDLEABLE
Definition: kvm_emulate.h:83
#define X86EMUL_CONTINUE
Definition: kvm_emulate.h:81
#define EMULATION_RESTART
Definition: kvm_emulate.h:507
#define EMULATION_OK
Definition: kvm_emulate.h:506
@ X86EMUL_MODE_PROT64
Definition: kvm_emulate.h:284
@ X86EMUL_MODE_VM86
Definition: kvm_emulate.h:281
@ X86EMUL_MODE_REAL
Definition: kvm_emulate.h:280
@ X86EMUL_MODE_PROT32
Definition: kvm_emulate.h:283
@ X86EMUL_MODE_PROT16
Definition: kvm_emulate.h:282
#define EMULATION_FAILED
Definition: kvm_emulate.h:505
#define X86EMUL_IO_NEEDED
Definition: kvm_emulate.h:88
x86_intercept_stage
Definition: kvm_emulate.h:435
#define X86EMUL_CMPXCHG_FAILED
Definition: kvm_emulate.h:87
#define X86EMUL_RETRY_INSTR
Definition: kvm_emulate.h:86
kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
Definition: kvm_main.c:3091
void kvm_destroy_vcpus(struct kvm *kvm)
Definition: kvm_main.c:522
void vcpu_put(struct kvm_vcpu *vcpu)
Definition: kvm_main.c:219
bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, unsigned long *vcpu_bitmap)
Definition: kvm_main.c:288
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len)
Definition: kvm_main.c:3449
int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val)
Definition: kvm_main.c:5878
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
Definition: kvm_main.c:3346
void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
Definition: kvm_main.c:3692
unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
Definition: kvm_main.c:2748
void vcpu_load(struct kvm_vcpu *vcpu)
Definition: kvm_main.c:208
void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
Definition: kvm_main.c:3842
bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
Definition: kvm_main.c:3764
int kvm_vcpu_yield_to(struct kvm_vcpu *target)
Definition: kvm_main.c:3969
void kvm_flush_remote_tlbs_memslot(struct kvm *kvm, const struct kvm_memory_slot *memslot)
Definition: kvm_main.c:380
void kvm_exit(void)
Definition: kvm_main.c:6495
void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
Definition: kvm_main.c:3669
bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
Definition: kvm_main.c:340
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len)
Definition: kvm_main.c:3571
int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val)
Definition: kvm_main.c:5807
struct kvm_vcpu * kvm_get_running_vcpu(void)
Definition: kvm_main.c:6338
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len)
Definition: kvm_main.c:3532
int __kvm_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem)
Definition: kvm_main.c:2020
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
Definition: kvm_main.c:3931
int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len)
Definition: kvm_main.c:3540
int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len)
Definition: kvm_main.c:3337
void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn)
Definition: kvm_main.c:3635
int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
Definition: kvm_main.c:3366
int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len)
Definition: kvm_main.c:3578
void kvm_release_pfn_clean(kvm_pfn_t pfn)
Definition: kvm_main.c:3241
void kvm_sigset_activate(struct kvm_vcpu *vcpu)
Definition: kvm_main.c:3678
int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len)
Definition: kvm_main.c:3470
void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu)
Definition: lapic.c:2634
bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
Definition: lapic.c:2083
int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
Definition: lapic.c:3210
int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
Definition: lapic.c:2810
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
Definition: lapic.c:808
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
Definition: lapic.c:2872
void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu)
Definition: lapic.c:596
bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
Definition: lapic.c:154
int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
Definition: lapic.c:3263
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
Definition: lapic.c:3115
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
Definition: lapic.c:2521
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map)
Definition: lapic.c:823
void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
Definition: lapic.c:2504
int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
Definition: lapic.c:2984
void kvm_free_lapic(struct kvm_vcpu *vcpu)
Definition: lapic.c:2468
void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
Definition: lapic.c:2578
int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
Definition: lapic.c:2970
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit)
Definition: lapic.c:852
void kvm_recalculate_apic_map(struct kvm *kvm)
Definition: lapic.c:374
int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
Definition: lapic.c:3199
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
Definition: lapic.c:2669
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
Definition: lapic.c:2530
void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
Definition: lapic.c:2192
int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
Definition: lapic.c:3237
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
Definition: lapic.c:3072
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
Definition: lapic.c:2762
void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
Definition: lapic.c:2208
int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
Definition: lapic.c:3139
int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
Definition: lapic.c:3155
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
Definition: lapic.c:2516
void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
Definition: lapic.c:2197
void kvm_lapic_exit(void)
Definition: lapic.c:3313
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
Definition: lapic.c:2494
static int apic_x2apic_mode(struct kvm_lapic *apic)
Definition: lapic.h:221
lapic_mode
Definition: lapic.h:25
@ LAPIC_MODE_X2APIC
Definition: lapic.h:29
@ LAPIC_MODE_DISABLED
Definition: lapic.h:26
@ LAPIC_MODE_XAPIC
Definition: lapic.h:28
@ LAPIC_MODE_INVALID
Definition: lapic.h:27
static enum lapic_mode kvm_apic_mode(u64 apic_base)
Definition: lapic.h:271
static bool kvm_apic_has_pending_init_or_sipi(struct kvm_vcpu *vcpu)
Definition: lapic.h:231
static bool kvm_apic_hw_enabled(struct kvm_lapic *apic)
Definition: lapic.h:195
static bool lapic_in_kernel(struct kvm_vcpu *vcpu)
Definition: lapic.h:186
static bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
Definition: lapic.h:226
#define KVM_APIC_INIT
Definition: lapic.h:12
#define X2APIC_BROADCAST
Definition: lapic.h:23
static bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu)
Definition: lapic.h:236
#define APIC_DEST_MASK
Definition: lapic.h:17
static int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
Definition: lapic.h:248
static int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
Definition: lapic.h:216
#define APIC_DEST_NOSHORT
Definition: lapic.h:16
#define APIC_SHORT_MASK
Definition: lapic.h:15
static bool kvm_apic_present(struct kvm_vcpu *vcpu)
Definition: lapic.h:211
#define KVM_APIC_SIPI
Definition: lapic.h:13
void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, int target_level)
Definition: mmu.c:6673
void kvm_mmu_uninit_vm(struct kvm *kvm)
Definition: mmu.c:6330
int kvm_mmu_post_init_vm(struct kvm *kvm)
Definition: mmu.c:7279
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, const struct kvm_memory_slot *memslot, int start_level)
Definition: mmu.c:6406
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
Definition: mmu.c:7076
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
Definition: mmu.c:7295
void kvm_mmu_vendor_module_exit(void)
Definition: mmu.c:7084
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot)
Definition: mmu.c:6753
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
Definition: mmu.c:5986
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
Definition: mmu.c:5968
void __init kvm_mmu_x86_module_init(void)
Definition: mmu.c:7006
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
Definition: mmu.c:4753
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
Definition: mmu.c:2757
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
Definition: mmu.c:2741
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
Definition: mmu.c:5581
void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
Definition: mmu.c:4062
void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, unsigned long roots)
Definition: mmu.c:5939
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
Definition: mmu.c:5621
void kvm_init_mmu(struct kvm_vcpu *vcpu)
Definition: mmu.c:5538
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
Definition: mmu.c:6373
void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
Definition: mmu.c:5676
bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
Definition: mmu.c:4609
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
Definition: mmu.c:4021
void kvm_mmu_init_vm(struct kvm *kvm)
Definition: mmu.c:6304
void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free)
Definition: mmu.c:3587
bool tdp_enabled
Definition: mmu.c:106
int kvm_mmu_create(struct kvm_vcpu *vcpu)
Definition: mmu.c:6153
int kvm_mmu_vendor_module_init(void)
Definition: mmu.c:7026
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot)
Definition: mmu.c:6769
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
Definition: mmu.c:6834
static bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm)
Definition: mmu.h:250
#define tdp_mmu_enabled
Definition: mmu.h:276
static __always_inline u64 rsvd_bits(int s, int e)
Definition: mmu.h:45
#define KVM_MMU_EFER_ROLE_BITS
Definition: mmu.h:43
#define KVM_MMU_CR0_ROLE_BITS
Definition: mmu.h:42
static unsigned long __kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages, int level)
Definition: mmu.h:292
static unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3)
Definition: mmu.h:135
static gfn_t kvm_mmu_max_gfn(void)
Definition: mmu.h:66
#define PT_PRESENT_MASK
Definition: mmu.h:14
static gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t gpa, u64 access, struct x86_exception *exception)
Definition: mmu.h:313
static int kvm_mmu_reload(struct kvm_vcpu *vcpu)
Definition: mmu.h:127
static bool kvm_memslots_have_rmaps(struct kvm *kvm)
Definition: mmu.h:279
static unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
Definition: mmu.h:144
static u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned pte_access, unsigned pte_pkey, u64 access)
Definition: mmu.h:194
static void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
Definition: mmu.h:157
#define KVM_MMU_CR4_ROLE_BITS
Definition: mmu.h:39
void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
Definition: mtrr.c:434
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
Definition: mtrr.c:378
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
Definition: mtrr.c:397
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
Definition: page_track.c:29
int kvm_page_track_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages)
Definition: page_track.c:47
static void kvm_page_track_cleanup(struct kvm *kvm)
Definition: page_track.h:39
static bool kvm_page_track_has_external_user(struct kvm *kvm)
Definition: page_track.h:46
static int kvm_page_track_init(struct kvm *kvm)
Definition: page_track.h:38
static void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes)
Definition: page_track.h:50
static void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
Definition: page_track.h:43
void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
Definition: pfncache.c:382
void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, struct kvm_vcpu *vcpu, enum pfn_cache_usage usage)
Definition: pfncache.c:340
bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
Definition: pfncache.c:79
int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len)
Definition: pfncache.c:334
int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
Definition: pfncache.c:357
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
Definition: pmu.c:594
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
Definition: pmu.c:828
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
Definition: pmu.c:625
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
Definition: pmu.c:928
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
Definition: pmu.c:602
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
Definition: pmu.c:650
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
Definition: pmu.c:503
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
Definition: pmu.c:742
struct x86_pmu_capability __read_mostly kvm_pmu_cap
Definition: pmu.c:29
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
Definition: pmu.c:568
void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
Definition: pmu.c:83
void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
Definition: pmu.c:791
bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
Definition: pmu.c:529
void kvm_pmu_init(struct kvm_vcpu *vcpu)
Definition: pmu.c:757
static void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
Definition: pmu.h:141
#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK
Definition: pmu.h:11
#define vcpu_to_pmu(vcpu)
Definition: pmu.h:7
u64 control
Definition: posted_intr.h:16
void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
Definition: smm.c:112
void process_smi(struct kvm_vcpu *vcpu)
Definition: smm.c:135
void enter_smm(struct kvm_vcpu *vcpu)
Definition: smm.c:281
static bool is_smm(struct kvm_vcpu *vcpu)
Definition: smm.h:160
static int kvm_inject_smi(struct kvm_vcpu *vcpu)
Definition: smm.h:159
u8 data[15]
Definition: kvm_emulate.h:267
Definition: x86.h:12
u64 default_tsc_scaling_ratio
Definition: x86.h:22
u64 supported_mce_cap
Definition: x86.h:28
bool has_notify_vmexit
Definition: x86.h:26
u64 supported_perf_cap
Definition: x86.h:31
u64 supported_xss
Definition: x86.h:30
u64 max_tsc_scaling_ratio
Definition: x86.h:20
u8 tsc_scaling_ratio_frac_bits
Definition: x86.h:18
bool has_tsc_control
Definition: x86.h:14
u32 max_guest_tsc_khz
Definition: x86.h:16
u64 supported_xcr0
Definition: x86.h:29
bool has_bus_lock_exit
Definition: x86.h:24
u32 flags
Definition: i8254.h:28
struct mutex lock
Definition: i8254.h:33
struct kvm_kpit_channel_state channels[3]
Definition: i8254.h:27
bool apicv_active
Definition: lapic.h:65
struct kvm_vcpu * vcpu
Definition: lapic.h:64
Definition: irq.h:49
struct kvm_kpic_state pics[2]
Definition: irq.h:54
struct kvm * kvm
Definition: irq.h:53
spinlock_t lock
Definition: irq.h:50
Definition: i8254.h:40
struct kvm * kvm
Definition: i8254.h:43
struct kvm_kpit_state pit_state
Definition: i8254.h:44
struct user_return_notifier urn
Definition: x86.c:211
struct kvm_user_return_msrs::kvm_user_return_msr_values values[KVM_MAX_NR_USER_RETURN_MSRS]
int(* read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes)
Definition: x86.c:7760
int(* read_write_prepare)(struct kvm_vcpu *vcpu, void *val, int bytes)
Definition: x86.c:7754
int(* read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
Definition: x86.c:7758
int(* read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes)
Definition: x86.c:7756
struct fetch_cache fetch
Definition: kvm_emulate.h:371
unsigned long eflags
Definition: kvm_emulate.h:312
enum x86emul_mode mode
Definition: kvm_emulate.h:315
const struct x86_emulate_ops * ops
Definition: kvm_emulate.h:309
struct x86_exception exception
Definition: kvm_emulate.h:324
unsigned long _eip
Definition: kvm_emulate.h:362
unsigned long eip
Definition: kvm_emulate.h:313
void(* vm_bugged)(struct x86_emulate_ctxt *ctxt)
Definition: kvm_emulate.h:98
bool error_code_valid
Definition: kvm_emulate.h:24
bool nested_page_fault
Definition: kvm_emulate.h:26
#define trace_kvm_msr_write(ecx, data)
Definition: trace.h:450
#define trace_kvm_emulate_insn_start(vcpu)
Definition: trace.h:848
#define trace_kvm_msr_read(ecx, data)
Definition: trace.h:449
#define KVM_PIO_OUT
Definition: trace.h:156
#define trace_kvm_msr_read_ex(ecx)
Definition: trace.h:451
#define trace_kvm_emulate_insn_failed(vcpu)
Definition: trace.h:849
#define KVM_PIO_IN
Definition: trace.h:155
#define trace_kvm_msr_write_ex(ecx, data)
Definition: trace.h:452
int kvm_spec_ctrl_test_value(u64 value)
Definition: x86.c:13532
static unsigned num_emulated_msrs
Definition: x86.c:1573
static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events)
Definition: x86.c:5300
#define emulator_try_cmpxchg_user(t, ptr, old, new)
Definition: x86.c:7950
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
Definition: x86.c:13181
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
Definition: x86.c:13151
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
Definition: x86.c:11977
static void inject_emulated_exception(struct kvm_vcpu *vcpu)
Definition: x86.c:8583
static void kvm_timer_init(void)
Definition: x86.c:9526
bool __read_mostly enable_apicv
Definition: x86.c:235
static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
Definition: x86.c:5123
long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
Definition: x86.c:4838
static unsigned num_msrs_to_save
Definition: x86.c:1502
static void kvm_user_return_msr_cpu_online(void)
Definition: x86.c:428
bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
Definition: x86.c:13452
#define EXCPT_DB
Definition: x86.c:545
static void kvm_update_masterclock(struct kvm *kvm)
Definition: x86.c:3034
static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
Definition: x86.c:2468
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
Definition: x86.c:818
static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs)
Definition: x86.c:5512
static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
Definition: x86.c:8941
static void kvm_apicv_init(struct kvm *kvm)
Definition: x86.c:9961
static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
Definition: x86.c:9651
static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:8483
static u64 kvm_get_arch_capabilities(void)
Definition: x86.c:1629
static int __init kvm_x86_init(void)
Definition: x86.c:13917
int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
Definition: x86.c:2142
void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
Definition: x86.c:12637
#define KVMCLOCK_SYNC_PERIOD
Definition: x86.c:3395
static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
Definition: x86.c:1705
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
Definition: x86.c:11556
static void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
Definition: x86.c:2797
static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port, const void *val, unsigned int count)
Definition: x86.c:8126
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
Definition: x86.c:830
void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
Definition: x86.c:10679
#define emul_to_vcpu(ctxt)
Definition: x86.c:102
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
Definition: x86.c:1373
int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
Definition: x86.c:4182
static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
Definition: x86.c:2148
int kvm_find_user_return_msr(u32 msr)
Definition: x86.c:416
static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
Definition: x86.c:2012
int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
Definition: x86.c:6936
static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, u64 mcg_cap)
Definition: x86.c:5181
static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
Definition: x86.c:6890
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
Definition: x86.c:9358
int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
Definition: x86.c:9262
static int vcpu_block(struct kvm_vcpu *vcpu)
Definition: x86.c:11094
static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
Definition: x86.c:6256
int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
Definition: x86.c:1315
static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
Definition: x86.c:9281
static void kvm_make_mclock_inprogress_request(struct kvm *kvm)
Definition: x86.c:2999
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
Definition: x86.c:13189
static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64 *banks)
Definition: x86.c:5228
#define EXCPT_CONTRIBUTORY
Definition: x86.c:521
static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
Definition: x86.c:3571
static int emulator_read_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception, bool system)
Definition: x86.c:7590
int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change)
Definition: x86.c:12856
#define EXCPT_TRAP
Definition: x86.c:542
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
Definition: x86.c:2502
int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
Definition: x86.c:2082
static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
Definition: x86.c:3503
static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, const void *old, const void *new, unsigned int bytes, struct x86_exception *exception)
Definition: x86.c:7953
#define KVM_MAX_NR_USER_RETURN_MSRS
Definition: x86.c:208
static void record_steal_time(struct kvm_vcpu *vcpu)
Definition: x86.c:3627
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
Definition: x86.c:4589
static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc, u32 base3, int seg)
Definition: x86.c:8326
vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
Definition: x86.c:6251
static int kvm_alloc_memslot_metadata(struct kvm *kvm, struct kvm_memory_slot *slot)
Definition: x86.c:12771
#define EXCPT_PF
Definition: x86.c:522
static u32 adjust_tsc_khz(u32 khz, s32 ppm)
Definition: x86.c:2423
static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only)
Definition: x86.c:8435
bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
Definition: x86.c:13231
int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, void *insn, int insn_len)
Definition: x86.c:9057
static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
Definition: x86.c:8274
bool __read_mostly eager_page_split
Definition: x86.c:196
static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
Definition: x86.c:13218
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
Definition: x86.c:8613
static void kvm_start_pvclock_update(struct kvm *kvm)
Definition: x86.c:3010
bool kvm_apicv_activated(struct kvm *kvm)
Definition: x86.c:9935
static u64 __read_mostly efer_reserved_bits
Definition: x86.c:113
static void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
Definition: x86.c:3605
static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
Definition: x86.c:8279
static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
Definition: x86.c:2224
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap)
Definition: x86.c:10514
static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
Definition: x86.c:12884
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes)
Definition: x86.c:7741
static void memslot_rmap_free(struct kvm_memory_slot *slot)
Definition: x86.c:12725
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
Definition: x86.c:12197
static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload)
Definition: x86.c:750
static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, u64 ns, bool matched)
Definition: x86.c:2673
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
Definition: x86.c:12224
static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:8500
static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
Definition: x86.c:8146
static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db)
Definition: x86.c:8886
static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector, bool has_error_code, u32 error_code, bool has_payload, unsigned long payload)
Definition: x86.c:625
u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
Definition: x86.c:2605
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set)
Definition: x86.c:13505
static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
Definition: x86.c:8462
static bool kvm_is_vm_type_supported(unsigned long type)
Definition: x86.c:4582
static int complete_emulated_io(struct kvm_vcpu *vcpu)
Definition: x86.c:11219
static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
Definition: x86.c:1361
static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
Definition: x86.c:10265
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code)
Definition: x86.c:11657
static bool is_ucna(struct kvm_x86_mce *mce)
Definition: x86.c:5220
static u32 kvm_async_pf_hash_fn(gfn_t gfn)
Definition: x86.c:13196
bool __read_mostly enable_vmware_backdoor
Definition: x86.c:176
static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce)
Definition: x86.c:5247
static void kvmclock_reset(struct kvm_vcpu *vcpu)
Definition: x86.c:3565
static bool __read_mostly vector_hashing
Definition: x86.c:173
static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port)
Definition: x86.c:9337
void kvm_arch_free_vm(struct kvm *kvm)
Definition: x86.c:12513
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
Definition: x86.c:1327
module_param(ignore_msrs, bool, 0644)
int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception)
Definition: x86.c:7651
static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, unsigned short port, void *data, unsigned int count, bool in)
Definition: x86.c:8036
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
Definition: x86.c:8564
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
Definition: x86.c:13164
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
Definition: x86.c:3580
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
Definition: x86.c:10704
static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
Definition: x86.c:8264
u64 __read_mostly host_arch_capabilities
Definition: x86.c:241
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
Definition: x86.c:13136
static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
Definition: x86.c:2001
static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
Definition: x86.c:2574
static int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control)
Definition: x86.c:6400
static void __user * kvm_get_attr_addr(struct kvm_device_attr *attr)
Definition: x86.c:4796
static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
Definition: x86.c:10562
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
Definition: x86.c:1981
static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, struct kvm_xcrs *guest_xcrs)
Definition: x86.c:5603
static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
Definition: x86.c:5632
static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
Definition: x86.c:11249
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
Definition: x86.c:2583
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
Definition: x86.c:1192
uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
Definition: x86.c:3298
static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes)
Definition: x86.c:7777
module_init(kvm_x86_init)
u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
Definition: x86.c:2590
static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
Definition: x86.c:9291
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
Definition: x86.c:11460
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
Definition: x86.c:929
static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port)
Definition: x86.c:13819
void kvm_make_scan_ioapic_request(struct kvm *kvm)
Definition: x86.c:10520
static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
Definition: x86.c:1967
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len)
Definition: x86.c:9074
bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
Definition: x86.c:13110
static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, struct kvm_msr_entry *entries, int(*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data))
Definition: x86.c:4498
static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
Definition: x86.c:1753
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
Definition: x86.c:13141
static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]
Definition: x86.c:1572
static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port, void *val, unsigned int count)
Definition: x86.c:8087
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
Definition: x86.c:5094
#define EXCPT_BENIGN
Definition: x86.c:520
bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
Definition: x86.c:13334
static struct kmem_cache * x86_emulator_cache
Definition: x86.c:319
static void kvm_end_pvclock_update(struct kvm *kvm)
Definition: x86.c:3018
static const struct read_write_emulator_ops read_emultor
Definition: x86.c:7811
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
Definition: x86.c:13146
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
Definition: x86.c:8141
static bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
Definition: x86.c:13041
bool kvm_vector_hashing_enabled(void)
Definition: x86.c:13520
static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
Definition: x86.c:913
int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void *insn, int insn_len)
Definition: x86.c:9268
static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
Definition: x86.c:13676
static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
Definition: x86.c:11803
static void kvm_x86_check_cpu_compat(void *ret)
Definition: x86.c:9646
static u64 kvm_msr_reason(int r)
Definition: x86.c:2023
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
Definition: x86.c:6416
static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 data, bool host_initiated)
Definition: x86.c:1907
static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
Definition: x86.c:13808
static bool kvm_is_immutable_feature_msr(u32 msr)
Definition: x86.c:1595
static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception)
Definition: x86.c:7545
static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, int(*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data), int writeback)
Definition: x86.c:4517
static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port)
Definition: x86.c:13857
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
Definition: x86.c:7462
static bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
Definition: x86.c:11161
static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
Definition: x86.c:2639
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception)
Definition: x86.c:7483
const struct kvm_stats_header kvm_vm_stats_header
Definition: x86.c:261
static void kvm_probe_msr_to_save(u32 msr_index)
Definition: x86.c:7295
static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t gpa, bool write)
Definition: x86.c:7696
int kvm_check_nested_events(struct kvm_vcpu *vcpu)
Definition: x86.c:10197
static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
Definition: x86.c:5641
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception)
Definition: x86.c:7493
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
Definition: x86.c:6287
u64 __read_mostly host_xss
Definition: x86.c:238
void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
Definition: x86.c:13438
static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
Definition: x86.c:3431
static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
Definition: x86.c:1993
static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
Definition: x86.c:13047
static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
Definition: x86.c:6879
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type)
Definition: x86.c:8761
static bool kvm_is_msr_to_save(u32 msr_index)
Definition: x86.c:3725
static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes)
Definition: x86.c:7802
static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
Definition: x86.c:3510
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz)
static int emulator_read_write_onepage(unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, struct kvm_vcpu *vcpu, const struct read_write_emulator_ops *ops)
Definition: x86.c:7825
static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes)
Definition: x86.c:7783
static void wbinvd_ipi(void *garbage)
Definition: x86.c:4952
static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:10124
static int __read_mostly force_emulation_prefix
Definition: x86.c:185
static void kvm_inject_exception(struct kvm_vcpu *vcpu)
Definition: x86.c:10207
static bool retry_instruction(struct x86_emulate_ctxt *ctxt, gpa_t cr2_or_gpa, int emulation_type)
Definition: x86.c:8835
static int __read_mostly lapic_timer_advance_ns
Definition: x86.c:170
static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val)
Definition: x86.c:8097
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
Definition: x86.c:12735
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
Definition: x86.c:11295
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception)
Definition: x86.c:7505
static void kvm_on_user_return(struct user_return_notifier *urn)
Definition: x86.c:362
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
Definition: x86.c:5138
static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
Definition: x86.c:9422
static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len)
Definition: x86.c:7662
static int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, const struct read_write_emulator_ops *ops)
Definition: x86.c:7876
static DEFINE_MUTEX(vendor_module_lock)
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state)
Definition: x86.c:11581
static void set_or_clear_apicv_inhibit(unsigned long *inhibits, enum kvm_apicv_inhibit reason, bool set)
Definition: x86.c:9950
static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
Definition: x86.c:8151
static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
Definition: x86.c:2432
static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
Definition: x86.c:3059
static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
Definition: x86.c:8183
static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata)
Definition: x86.c:8356
static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
Definition: x86.c:13236
static u64 __scale_tsc(u64 ratio, u64 tsc)
Definition: x86.c:2559
void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set)
Definition: x86.c:10626
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
Definition: x86.c:13366
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
Definition: x86.c:474
static void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
Definition: x86.c:2790
int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
Definition: x86.c:2119
static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, struct kvm_msr_filter *filter)
Definition: x86.c:6743
static void __exit kvm_x86_exit(void)
Definition: x86.c:13926
u64 __read_mostly host_xcr0
Definition: x86.c:317
static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
Definition: x86.c:4132
static bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
Definition: x86.c:13279
static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
Definition: x86.c:2720
bool kvm_arch_has_irq_bypass(void)
Definition: x86.c:13458
static void kvm_unload_vcpu_mmus(struct kvm *kvm)
Definition: x86.c:12597
static int exception_type(int vector)
Definition: x86.c:547
void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
Definition: x86.c:7456
static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
Definition: x86.c:13799
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
Definition: x86.c:1744
static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
Definition: x86.c:5680
static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception)
Definition: x86.c:7930
int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated)
Definition: x86.c:1925
#define EXCPT_FAULT
Definition: x86.c:541
void kvm_x86_vendor_exit(void)
Definition: x86.c:9798
static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated)
Definition: x86.c:1952
static int kvm_guest_time_update(struct kvm_vcpu *v)
Definition: x86.c:3167
static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, struct kvm_msr_filter_range *user_range)
Definition: x86.c:6709
static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
Definition: x86.c:9275
static bool __read_mostly mitigate_smt_rsb
Definition: x86.c:200
static void kvm_ops_update(struct kvm_x86_init_ops *ops)
Definition: x86.c:9608
void kvm_arch_sync_events(struct kvm *kvm)
Definition: x86.c:12608
int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception)
Definition: x86.c:7572
static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, struct gfn_to_pfn_cache *gpc, unsigned int offset, bool force_tsc_unstable)
Definition: x86.c:3113
int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
Definition: x86.c:9868
static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
Definition: x86.c:13844
struct kvm_caps kvm_caps __read_mostly
Definition: x86.c:95
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave)
Definition: x86.c:5570
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry)
#define SMT_RSB_MSG
static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception)
Definition: x86.c:7605
static int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
Definition: x86.c:13263
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs)
Definition: x86.c:5524
EXPORT_SYMBOL_GPL(kvm_caps)
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
Definition: x86.c:8169
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
Definition: x86.c:12177
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
Definition: x86.c:1266
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
Definition: x86.c:10147
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
Definition: x86.c:12522
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
Definition: x86.c:12501
#define KVM_SUPPORTED_ARCH_CAP
Definition: x86.c:1621
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
Definition: x86.c:737
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
Definition: x86.c:12374
static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
Definition: x86.c:2176
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
Definition: x86.c:2382
static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, const void *val, unsigned int bytes, struct x86_exception *exception)
Definition: x86.c:7940
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
Definition: x86.c:2615
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
Definition: x86.c:5015
static int kvm_probe_user_return_msr(u32 msr)
Definition: x86.c:389
static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:8472
static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:8442
static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
Definition: x86.c:9316
int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, void *data)
Definition: x86.c:13722
static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
Definition: x86.c:1336
static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt, gva_t addr, unsigned int flags)
Definition: x86.c:8508
#define KVM_EXIT_HYPERCALL_VALID_MASK
Definition: x86.c:118
static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
Definition: x86.c:7765
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
Definition: x86.c:11954
bool __read_mostly allow_smaller_maxphyaddr
Definition: x86.c:232
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
Definition: x86.c:13125
static struct kvm_x86_msr_filter * kvm_alloc_msr_filter(bool default_allow)
Definition: x86.c:6684
u64 kvm_scale_tsc(u64 tsc, u64 ratio)
Definition: x86.c:2564
static int complete_emulated_pio(struct kvm_vcpu *vcpu)
Definition: x86.c:11224
static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:9016
enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
Definition: x86.c:479
static bool kvm_can_mwait_in_guest(void)
Definition: x86.c:4552
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
Definition: x86.c:1074
static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
Definition: x86.c:325
int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu)
Definition: x86.c:10508
static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
Definition: x86.c:3094
static void tsc_khz_changed(void *data)
Definition: x86.c:9376
static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
Definition: x86.c:8230
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
Definition: x86.c:779
bool __read_mostly enable_pmu
Definition: x86.c:192
u32 __read_mostly kvm_nr_uret_msrs
Definition: x86.c:219
static const struct read_write_emulator_ops write_emultor
Definition: x86.c:7818
__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu)
void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
Definition: x86.c:10525
static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
Definition: x86.c:9974
static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
Definition: x86.c:2006
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
Definition: x86.c:2185
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
Definition: x86.c:10638
static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes)
Definition: x86.c:7795
u64 __read_mostly host_efer
Definition: x86.c:229
void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
Definition: x86.c:796
int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
Definition: x86.c:2107
int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
Definition: x86.c:9851
static void __kvm_start_pvclock_update(struct kvm *kvm)
Definition: x86.c:3004
unsigned int min_timer_period_us
Definition: x86.c:154
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
Definition: x86.c:13317
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
Definition: x86.c:6354
bool __read_mostly report_ignored_msrs
Definition: x86.c:150
int kvm_arch_post_init_vm(struct kvm *kvm)
Definition: x86.c:12585
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e)
Definition: x86.c:13588
static bool __read_mostly kvmclock_periodic_sync
Definition: x86.c:157
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, bool line_status)
Definition: x86.c:6435
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
Definition: x86.c:5062
static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
Definition: x86.c:7434
void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, struct kvm_queued_exception *ex)
Definition: x86.c:573
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
Definition: x86.c:9857
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state)
Definition: x86.c:11609
static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu, bool new_generation)
Definition: x86.c:2518
static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
Definition: x86.c:8189
int kvm_add_user_return_msr(u32 msr)
Definition: x86.c:404
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
Definition: x86.c:13096
static u32 kvm_async_pf_next_probe(u32 key)
Definition: x86.c:13203
static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int emulation_type, int *r)
Definition: x86.c:8958
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
Definition: x86.c:2057
static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write)
Definition: x86.c:7711
static void kvmclock_update_fn(struct work_struct *work)
Definition: x86.c:3371
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
Definition: x86.c:13612
bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
Definition: x86.c:13525
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *v)
Definition: x86.c:7413
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new)
Definition: x86.c:13511
static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
Definition: x86.c:1974
static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, bool has_payload, unsigned long payload, bool reinject)
Definition: x86.c:646
static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
Definition: x86.c:11424
const struct _kvm_stats_desc kvm_vcpu_stats_desc[]
Definition: x86.c:270
void kvm_arch_end_assignment(struct kvm *kvm)
Definition: x86.c:13413
static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
Definition: x86.c:6312
static void kvm_mmu_slot_apply_flags(struct kvm *kvm, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change)
Definition: x86.c:12896
int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
Definition: x86.c:485
static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
Definition: x86.c:5657
static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:8447
static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
Definition: x86.c:2162
static void emulator_halt(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:8422
static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
Definition: x86.c:7789
static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
Definition: x86.c:11468
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata)
Definition: x86.c:8402
static int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
Definition: x86.c:13271
int kvm_emulate_invd(struct kvm_vcpu *vcpu)
Definition: x86.c:2112
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
Definition: x86.c:12488
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, struct x86_exception *exception)
Definition: x86.c:7468
static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, struct kvm_xcrs *guest_xcrs)
Definition: x86.c:5589
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr)
Definition: x86.c:11933
static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
Definition: x86.c:8495
static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, s8 *pshift, u32 *pmultiplier)
Definition: x86.c:2388
noinstr void kvm_spurious_fault(void)
Definition: x86.c:513
EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits)
static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
Definition: x86.c:4805
static bool is_mci_status_msr(u32 msr)
Definition: x86.c:3414
static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
Definition: x86.c:1223
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
Definition: x86.c:12590
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
Definition: x86.c:8733
static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
Definition: x86.c:11544
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
Definition: x86.c:971
static void process_nmi(struct kvm_vcpu *vcpu)
Definition: x86.c:10470
static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
Definition: x86.c:13208
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
Definition: x86.c:4962
static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
Definition: x86.c:3386
static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
Definition: x86.c:8901
void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
Definition: x86.c:13445
void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, u8 ndata)
Definition: x86.c:8720
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
Definition: x86.c:824
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
Definition: x86.c:2229
static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:8477
#define KVM_SUPPORTED_XCR0
Definition: x86.c:224
int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
Definition: x86.c:6447
static u32 __read_mostly tsc_tolerance_ppm
Definition: x86.c:161
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:8259
static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
Definition: x86.c:10659
static const u32 msr_based_features_all_except_vmx[]
Definition: x86.c:1580
static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
Definition: x86.c:8269
static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
Definition: x86.c:1687
const struct kvm_stats_header kvm_vcpu_stats_header
Definition: x86.c:308
static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx)+(KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR+1)]
Definition: x86.c:1588
#define KVM_MAX_MCE_BANKS
Definition: x86.c:93
int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, void *data)
Definition: x86.c:13761
long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
Definition: x86.c:5807
static bool __read_mostly ignore_msrs
Definition: x86.c:147
static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
Definition: x86.c:13425
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
Definition: x86.c:757
void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
Definition: x86.c:1402
static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
Definition: x86.c:6367
void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
Definition: x86.c:13391
static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
Definition: x86.c:11680
static void kvm_leave_nested(struct kvm_vcpu *vcpu)
Definition: x86.c:641
int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
Definition: x86.c:1119
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception)
Definition: x86.c:7513
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
Definition: x86.c:8916
static struct kmem_cache * kvm_alloc_emulator_cache(void)
Definition: x86.c:342
static s64 get_kvmclock_base_ns(void)
Definition: x86.c:2309
static const u32 msrs_to_save_pmu[]
Definition: x86.c:1469
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
Definition: x86.c:10019
#define EXCPT_ABORT
Definition: x86.c:543
u64 get_kvmclock_ns(struct kvm *kvm)
Definition: x86.c:3105
int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
Definition: x86.c:12749
static bool kvm_check_tsc_unstable(void)
Definition: x86.c:2655
bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
Definition: x86.c:13398
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
Definition: x86.c:1012
void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
Definition: x86.c:3617
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata)
Definition: x86.c:8416
static bool is_mci_control_msr(u32 msr)
Definition: x86.c:3410
static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, struct desc_struct *desc, u32 *base3, int seg)
Definition: x86.c:8290
static void kvm_probe_feature_msr(u32 msr_index)
Definition: x86.c:7283
static void drop_user_return_notifiers(void)
Definition: x86.c:465
int kvm_arch_hardware_enable(void)
Definition: x86.c:12386
void kvm_arch_pre_destroy_vm(struct kvm *kvm)
Definition: x86.c:12690
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set)
Definition: x86.c:10585
module_exit(kvm_x86_exit)
int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port, void *data, unsigned int count, int in)
Definition: x86.c:13876
static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, int size, unsigned short port, const void *val, unsigned int count)
Definition: x86.c:8134
static unsigned long max_tsc_khz
Definition: x86.c:2421
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
Definition: x86.c:5165
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
Definition: x86.c:12044
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
Definition: x86.c:2967
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
Definition: x86.c:11572
void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
Definition: x86.c:8727
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc)
Definition: x86.c:8408
static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
Definition: x86.c:11777
static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, bool old_msr, bool host_initiated)
Definition: x86.c:2357
static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
Definition: x86.c:8201
static int kvmclock_cpu_down_prep(unsigned int cpu)
Definition: x86.c:9370
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
Definition: x86.c:8457
static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:8178
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
Definition: x86.c:731
static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
Definition: x86.c:10008
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
Definition: x86.c:5086
static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
Definition: x86.c:11303
static const struct x86_emulate_ops emulate_ops
Definition: x86.c:8517
#define KVM_CAP_PMU_VALID_MASK
Definition: x86.c:120
static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
Definition: x86.c:1723
static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u32 exit_reason, u64 data, int(*completion)(struct kvm_vcpu *vcpu), int r)
Definition: x86.c:2035
static void kvm_init_msr_lists(void)
Definition: x86.c:7380
static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, u8 ndata, u8 *insn_bytes, u8 insn_size)
Definition: x86.c:8661
static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac)
Definition: x86.c:5172
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
Definition: x86.c:8639
static int exception_class(int vector)
Definition: x86.c:524
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
Definition: x86.c:10154
static bool can_set_mci_status(struct kvm_vcpu *vcpu)
Definition: x86.c:3422
static unsigned int num_msr_based_features
Definition: x86.c:1589
static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, struct kvm_enable_cap *cap)
Definition: x86.c:5751
static struct x86_emulate_ctxt * alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
Definition: x86.c:8596
static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
Definition: x86.c:11509
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
Definition: x86.c:8196
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod)
Definition: x86.c:13463
#define KVM_X2APIC_API_VALID_FLAGS
Definition: x86.c:122
static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, bool host_initiated)
Definition: x86.c:1845
static struct kvm_user_return_msrs __percpu * user_return_msrs
Definition: x86.c:222
static void kvmclock_sync_fn(struct work_struct *work)
Definition: x86.c:3397
static int kvm_x86_check_processor_compatibility(void)
Definition: x86.c:9626
static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
Definition: x86.c:1144
#define KVM_FEP_CLEAR_RFLAGS_RF
Definition: x86.c:184
static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception, bool system)
Definition: x86.c:7635
void kvm_arch_hardware_disable(void)
Definition: x86.c:12482
static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, int *mmu_reset_needed, bool update_pdptrs)
Definition: x86.c:11705
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events)
Definition: x86.c:5393
int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
Definition: x86.c:1422
static u64 __read_mostly cr4_reserved_bits
Definition: x86.c:116
static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
Definition: x86.c:6696
void kvm_arch_destroy_vm(struct kvm *kvm)
Definition: x86.c:12695
static void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
Definition: x86.c:355
static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
Definition: x86.c:13291
static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
Definition: x86.c:9832
static unsigned long get_cpu_tsc_khz(void)
Definition: x86.c:3050
const struct _kvm_stats_desc kvm_vm_stats_desc[]
Definition: x86.c:244
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
Definition: x86.c:11310
static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage)
Definition: x86.c:8427
int handle_ud(struct kvm_vcpu *vcpu)
Definition: x86.c:7669
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
Definition: x86.c:2316
#define MAX_IO_MSRS
Definition: x86.c:92
static int sync_regs(struct kvm_vcpu *vcpu)
Definition: x86.c:12016
bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
Definition: x86.c:9941
void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload)
Definition: x86.c:743
#define __kvm_cpu_cap_has(UNUSED_, f)
static const u32 emulated_msrs_all[]
Definition: x86.c:1504
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
Definition: x86.c:11837
static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
Definition: x86.c:768
static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
Definition: x86.c:11848
void kvm_arch_start_assignment(struct kvm *kvm)
Definition: x86.c:13406
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
Definition: x86.c:6266
void kvm_enable_efer_bits(u64 mask)
Definition: x86.c:1790
int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
Definition: x86.c:9786
static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
Definition: x86.c:4825
int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
Definition: x86.c:12059
#define EXCPT_INTERRUPT
Definition: x86.c:544
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
Definition: x86.c:11869
static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]
Definition: x86.c:221
static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
Definition: x86.c:2127
static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
Definition: x86.c:6378
static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
Definition: x86.c:3549
bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
Definition: x86.c:13419
static unsigned long emulator_get_cached_segment_base(struct x86_emulate_ctxt *ctxt, int seg)
Definition: x86.c:8284
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
Definition: x86.c:866
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
Definition: x86.c:840
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
Definition: x86.c:12840
static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
Definition: x86.c:8467
static int vcpu_run(struct kvm_vcpu *vcpu)
Definition: x86.c:11168
bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
Definition: x86.c:1796
static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
Definition: x86.c:8712
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
Definition: x86.c:9499
static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
Definition: x86.c:4957
static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
Definition: x86.c:5107
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
Definition: x86.c:3737
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
Definition: x86.c:13170
static const u32 msrs_to_save_base[]
Definition: x86.c:1449
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
Definition: x86.c:1018
static int kvmclock_cpu_online(unsigned int cpu)
Definition: x86.c:9520
static void store_regs(struct kvm_vcpu *vcpu)
Definition: x86.c:12001
static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, int size, unsigned short port, void *val, unsigned int count)
Definition: x86.c:8106
static u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
Definition: x86.c:858
static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, u8 *state, unsigned int size)
Definition: x86.c:5545
static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
Definition: x86.c:2017
#define KVMCLOCK_UPDATE_DELAY
Definition: x86.c:3369
static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
Definition: x86.c:10684
bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
Definition: x86.c:1132
static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu, unsigned int ioctl, void __user *argp)
Definition: x86.c:5723
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
Definition: x86.c:10170
static struct notifier_block kvmclock_cpufreq_notifier_block
Definition: x86.c:9516
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
Definition: x86.c:1987
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
Definition: x86.c:11501
int __read_mostly pi_inject_timer
Definition: x86.c:188
static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
Definition: x86.c:9919
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
Definition: x86.c:13558
bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
Definition: x86.c:13101
int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
Definition: x86.c:442
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
Definition: x86.c:12493
void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change)
Definition: x86.c:13017
static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:8452
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
Definition: x86.c:848
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave)
Definition: x86.c:5577
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
Definition: x86.c:10692
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod)
Definition: x86.c:13481
static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
Definition: x86.c:8490
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
Definition: x86.c:1150
static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
Definition: x86.c:6342
static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data)
Definition: x86.c:8379
static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base)+ARRAY_SIZE(msrs_to_save_pmu)]
Definition: x86.c:1501
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages)
Definition: x86.c:6272
#define ERR_PTR_USR(e)
Definition: x86.c:100
void kvm_update_dr7(struct kvm_vcpu *vcpu)
Definition: x86.c:1346
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
Definition: x86.c:1041
int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
Definition: x86.c:2136
#define __cr4_reserved_bits(__cpu_has, __c)
Definition: x86.h:511
static bool is_protmode(struct kvm_vcpu *vcpu)
Definition: x86.h:138
#define KVM_MSR_RET_FILTERED
Definition: x86.h:509
static bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
Definition: x86.h:127
static bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
Definition: x86.h:255
#define do_shl32_div32(n, base)
Definition: x86.h:400
static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
Definition: x86.h:446
static bool kvm_dr6_valid(u64 data)
Definition: x86.h:469
static bool kvm_exception_is_soft(unsigned int nr)
Definition: x86.h:133
static bool kvm_mpx_supported(void)
Definition: x86.h:361
static bool kvm_pat_valid(u64 data)
Definition: x86.h:456
#define KVM_FIRST_EMULATED_VMX_MSR
Definition: x86.h:49
static bool mmu_is_nested(struct kvm_vcpu *vcpu)
Definition: x86.h:183
static bool kvm_dr7_valid(u64 data)
Definition: x86.h:464
static bool is_paging(struct kvm_vcpu *vcpu)
Definition: x86.h:198
static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu, enum kvm_intr_type intr)
Definition: x86.h:440
@ KVM_HANDLING_IRQ
Definition: x86.h:436
static void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
Definition: x86.h:122
static u64 kvm_get_filtered_xcr0(void)
Definition: x86.h:341
static bool kvm_is_exception_pending(struct kvm_vcpu *vcpu)
Definition: x86.h:100
static void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, bool soft)
Definition: x86.h:114
static bool is_pae_paging(struct kvm_vcpu *vcpu)
Definition: x86.h:203
static bool is_long_mode(struct kvm_vcpu *vcpu)
Definition: x86.h:143
#define KVM_MSR_RET_INVALID
Definition: x86.h:508
#define MSR_IA32_CR_PAT_DEFAULT
Definition: x86.h:90
static u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
Definition: x86.h:389
static bool is_pae(struct kvm_vcpu *vcpu)
Definition: x86.h:188
static bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
Definition: x86.h:95
static void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
Definition: x86.h:377
static void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
Definition: x86.h:107
static bool kvm_hlt_in_guest(struct kvm *kvm)
Definition: x86.h:414
#define KVM_LAST_EMULATED_VMX_MSR
Definition: x86.h:50
static bool is_64_bit_mode(struct kvm_vcpu *vcpu)
Definition: x86.h:152
static bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
Definition: x86.h:264
static bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
Definition: x86.h:288
static bool is_64_bit_hypercall(struct kvm_vcpu *vcpu)
Definition: x86.h:164
static u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
Definition: x86.h:208
static bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
Definition: x86.h:213
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
Definition: xen.c:972
void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
Definition: xen.c:519
void kvm_xen_init_vm(struct kvm *kvm)
Definition: xen.c:2158
void kvm_xen_destroy_vm(struct kvm *kvm)
Definition: xen.c:2165
void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
Definition: xen.c:2122
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
Definition: xen.c:736
uint32_t flags
Definition: xen.c:1
int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
Definition: xen.c:1090
int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe)
Definition: xen.c:1824
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
Definition: xen.c:626
int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
Definition: xen.c:1486
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
Definition: xen.c:1161
void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
Definition: xen.c:2105
int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
Definition: xen.c:689
void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
Definition: xen.c:2135
static bool kvm_xen_hypercall_enabled(struct kvm *kvm)
Definition: xen.h:127
static void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
Definition: xen.h:178
static int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
Definition: xen.h:132
int state
Definition: xen.h:0
uint64_t time[4]
Definition: xen.h:2
static bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
Definition: xen.h:141
static bool kvm_xen_msr_enabled(struct kvm *kvm)
Definition: xen.h:122
static void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu)
Definition: xen.h:173