KVM虚拟化常用API
kvm 模块加载后会生成/dev/kvm字符设备,/dev/kvm是一个标准的字符设备,可以使用常用的open、close、ioctl接口,使用ioctl代替read,write接口与kvm交互。KVM API从功能上可以分为三大类:
1、虚拟化system指令,针对虚拟化系统的全局性参数设置和控制。
2、VM指令,针对VM虚拟机进行控制,如:内存设置、创建VCPU等。
3、 VCPU指令,针对具体的VCPU进行参数设置。如:相关寄存器的读写、中断控制等。
通常对于KVM的操作都是从open /dev/kvm设备文件开始的,open后,会获得相应的文件描述符(fd),然后通过ioctl系统指令对该fd进行进一步的操作,比如通过KVM_CREATE_VM指令,可以创建一个虚拟机并返回虚拟机对应的文件描述符,然后根据该描述符来进一步控制虚拟机的行为,比如通过KVM_CREATE_VCPU指令来为该虚拟机创建VCPU。
一、 System指令
System ioctl指令用于控制KVM运行环境的参数,包括全局性的参数设置和虚拟机创建等工作,主要的指令字包括:
KVM_CREATE_VM 创建KVM虚拟机
KVM_GET_API_VERSION 查询当前KVM API版本
KVM_GET_MSR_INDEX_LIST 获得MSR索引列表
KVM_CHECK_EXTENSION 检查扩展支持情况
KVM_GET_VCPU_MMAP_SIZE 获取运行虚拟机(KVM_RUN)时内核与用户态空间共享的一片内存区域的大小
其中,KVM_CREATE_VM比较重要,用于创建虚拟机,并返回一个代表该虚拟机的描述符(fd)。新创建的虚拟机没有VCPU,也没有内存等资源,需要对创建虚拟机时返回的描述符,通过ioctl指令,进行进一步的配置。
二、VM指令
VM ioctl指令实现对虚拟机的控制,大多需要从KVM_CREATE_VM中返回的fd来进行操作,具体操作包括:配置内存、配置VCPU、运行虚拟机等,主要指令如下:
KVM_CREATE_VCPU 为虚拟机创建VCPU
KVM_RUN 根据kvm_run结构体信息,运行VM虚拟机(该指令需在KVM_CREATE_VCPU返回的VCPU fd上调用)
KVM_CREATE_IRQCHIP 创建虚拟APIC,且随后创建的VCPU都关联到此APIC
KVM_IRQ_LINE 对某虚拟APIC发出中断信号
KVM_GET_IRQCHIP 读取APIC的中断标志信息
KVM_SET_IRQCHIP 写入APIC的中断标志信息
KVM_GET_DIRTY_LOG 返回脏内存页的位图
KVM_CREATE_VCPU和KVM_RUN是其中两个重要的指令字:通过KVM_CREATE_VCPU为虚拟机创建VCPU,并获得对应的VCPU fd描述符后,可以对该VCPU fd调用KVM_RUN,以启动该虚拟机(或称为调度VCPU)。严格来说,KVM_RUN作用于VCPU fd,属于VCPU ioctl。
kvm结构体(struct kvm)代表一个具体的虚拟机,当通过KVM_CREATE_VM指令字创建一个虚拟机后,就会创建一个新的kvm结构体对象。kvm结构体中包括了VCPU、内存、APIC、IRQ、MMU、事件(Event)等相关信息,该结构体主要在KVM虚拟机内部使用,用于跟踪虚拟机状态。
/*
 * struct kvm - state of one virtual machine.
 *
 * One object of this type is created for every virtual machine created
 * through KVM_CREATE_VM.  It gathers the vCPU, memory, IRQ, MMU and
 * event related state of that virtual machine.
 */
struct kvm {
	spinlock_t mmu_lock;
	struct mutex slots_lock;
	struct mm_struct *mm; /* userspace tied to this vm */
	/*
	 * Memory slots allocated to the KVM virtual machine; used for the
	 * GPA -> HVA translation in memory virtualization.
	 */
	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
	/* Sized by the maximum number of vCPUs supported by KVM. */
	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];

	/*
	 * created_vcpus is protected by kvm->lock, and is incremented
	 * at the beginning of KVM_CREATE_VCPU.  online_vcpus is only
	 * incremented after storing the kvm_vcpu pointer in vcpus,
	 * and is accessed atomically.
	 */
	atomic_t online_vcpus;
	int created_vcpus;
	int last_boosted_vcpu;
	struct list_head vm_list;
	struct mutex lock;
	struct kvm_io_bus __rcu *buses[KVM_NR_BUSES];
#ifdef CONFIG_HAVE_KVM_EVENTFD
	struct {
		spinlock_t lock;
		struct list_head items;
		struct list_head resampler_list;
		struct mutex resampler_lock;
	} irqfds;
	struct list_head ioeventfds;
#endif
	/*
	 * Runtime state information of the KVM virtual machine, e.g. the
	 * state of page tables and the MMU.
	 */
	struct kvm_vm_stat stat;
	struct kvm_arch arch;
	refcount_t users_count;
#ifdef CONFIG_KVM_MMIO
	struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
	spinlock_t ring_lock;
	struct list_head coalesced_zones;
#endif

	struct mutex irq_lock;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	/*
	 * Update side is protected by irq_lock.
	 */
	struct kvm_irq_routing_table __rcu *irq_routing;
#endif
#ifdef CONFIG_HAVE_KVM_IRQFD
	struct hlist_head irq_ack_notifier_list;
#endif

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	struct mmu_notifier mmu_notifier;
	unsigned long mmu_notifier_seq;
	long mmu_notifier_count;
#endif
	long tlbs_dirty;
	struct list_head devices;
	bool manual_dirty_log_protect;
	struct dentry *debugfs_dentry;
	struct kvm_stat_data **debugfs_stat_data;
	struct srcu_struct srcu;
	struct srcu_struct irq_srcu;
	pid_t userspace_pid;
	struct kvm_mig_opt mig_opt;
};
kvm_run结构体定义在include/uapi/linux/kvm.h中,可以通过该结构体了解KVM的内部运行状态。
/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
struct kvm_run {/* in */__u8 request_interrupt_window;/*向VCPU注入一个中断,让VCPU做好相关准备工作*/__u8 immediate_exit;__u8 padding1[6];/* out */__u32 exit_reason;/*记录退出原因*/__u8 ready_for_interrupt_injection; /*响应request_interrupt_window的中断请求,当设置时,说明VCPU可以接收中断*/__u8 if_flag; /*中断使能标识,如果使用了APIC,则无效*/__u16 flags;/* in (pre_kvm_run), out (post_kvm_run) */__u64 cr8;__u64 apic_base;
#ifdef __KVM_S390/* the processor status word for s390 */__u64 psw_mask; /* psw upper half */__u64 psw_addr; /* psw lower half */
#endifunion {/* KVM_EXIT_UNKNOWN */struct {__u64 hardware_exit_reason;} hw;/* KVM_EXIT_FAIL_ENTRY */struct {__u64 hardware_entry_failure_reason;} fail_entry;/* KVM_EXIT_EXCEPTION */struct {__u32 exception;__u32 error_code;} ex;/* KVM_EXIT_IO */struct {
#define KVM_EXIT_IO_IN 0
#define KVM_EXIT_IO_OUT 1__u8 direction;__u8 size; /* bytes */__u16 port;__u32 count;__u64 data_offset; /* relative to kvm_run start */} io; /*当由于IO操作导致发生VM-Exit时,该结构体保存IO相关信息。*//* KVM_EXIT_DEBUG */struct {struct kvm_debug_exit_arch arch;} debug;/* KVM_EXIT_MMIO */struct {__u64 phys_addr;__u8 data[8];__u32 len;__u8 is_write;} mmio;/* KVM_EXIT_HYPERCALL */struct {__u64 nr;__u64 args[6];__u64 ret;__u32 longmode;__u32 pad;} hypercall; /*hypercall exit*//* KVM_EXIT_TPR_ACCESS */struct {__u64 rip;__u32 is_write;__u32 pad;} tpr_access;/* KVM_EXIT_S390_SIEIC */struct {__u8 icptcode;__u16 ipa;__u32 ipb;} s390_sieic;/* KVM_EXIT_S390_RESET */
#define KVM_S390_RESET_POR 1
#define KVM_S390_RESET_CLEAR 2
#define KVM_S390_RESET_SUBSYSTEM 4
#define KVM_S390_RESET_CPU_INIT 8
#define KVM_S390_RESET_IPL 16__u64 s390_reset_flags;/* KVM_EXIT_S390_UCONTROL */struct {__u64 trans_exc_code;__u32 pgm_code;} s390_ucontrol;/* KVM_EXIT_DCR (deprecated) */struct {__u32 dcrn;__u32 data;__u8 is_write;} dcr;/* KVM_EXIT_INTERNAL_ERROR */struct {__u32 suberror;/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */__u32 ndata;__u64 data[16];} internal;/* KVM_EXIT_OSI */struct {__u64 gprs[32];} osi;/* KVM_EXIT_PAPR_HCALL */struct {__u64 nr;__u64 ret;__u64 args[9];} papr_hcall;/* KVM_EXIT_S390_TSCH */struct {__u16 subchannel_id;__u16 subchannel_nr;__u32 io_int_parm;__u32 io_int_word;__u32 ipb;__u8 dequeued;} s390_tsch;/* KVM_EXIT_EPR */struct {__u32 epr;} epr;/* KVM_EXIT_SYSTEM_EVENT */struct {
#define KVM_SYSTEM_EVENT_SHUTDOWN 1
#define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3__u32 type;__u64 flags;} system_event;/* KVM_EXIT_S390_STSI */struct {__u64 addr;__u8 ar;__u8 reserved;__u8 fc;__u8 sel1;__u16 sel2;} s390_stsi;/* KVM_EXIT_IOAPIC_EOI */struct {__u8 vector;} eoi;/* KVM_EXIT_HYPERV */struct kvm_hyperv_exit hyperv;/* Fix the size of the union. */char padding[256];};/* 2048 is the size of the char array used to bound/pad the size* of the union that holds sync regs.*/#define SYNC_REGS_SIZE_BYTES 2048/ shared registers between kvm and userspace.* kvm_valid_regs specifies the register classes set by the host* kvm_dirty_regs specified the register classes dirtied by userspace* struct kvm_sync_regs is architecture specific, as well as the* bits for kvm_valid_regs and kvm_dirty_regs*/__u64 kvm_valid_regs;__u64 kvm_dirty_regs;union {struct kvm_sync_regs regs;char padding[SYNC_REGS_SIZE_BYTES];} s;
}
三、 VCPU指令
VCPU ioctl指令主要针对具体的VCPU进行配置,包括寄存器读写、中断设置、内存设置、时钟管理、调试开关等,可以对KVM虚拟机进行运行时配置。主要指令字包括:
1. 寄存器控制方面
KVM_GET_REGS 获取通用寄存器信息
KVM_SET_REGS 设置通用寄存器信息
KVM_GET_SREGS 获取特殊寄存器信息
KVM_SET_SREGS 设置特殊寄存器信息
KVM_GET_MSRS 获取MSR寄存器信息
KVM_SET_MSRS 设置MSR寄存器信息
KVM_GET_FPU 获取浮点寄存器信息
KVM_SET_FPU 设置浮点寄存器信息
KVM_GET_XSAVE 获取VCPU的xsave寄存器信息
KVM_SET_XSAVE 设置VCPU的xsave寄存器信息
KVM_GET_XCRS 获取VCPU的xcr寄存器信息
KVM_SET_XCRS 设置VCPU的xcr寄存器信息
2. 中断和事件管理方面
KVM_INTERRUPT 在VCPU上产生中断(当APIC无效时)
KVM_SET_SIGNAL_MASK 设置某个VCPU在执行KVM_RUN期间使用的信号(signal)屏蔽掩码
KVM_GET_CPU_EVENTS 获取VCPU中被挂起待延时处理的事件,如中断、NMI或异常
KVM_SET_CPU_EVENTS 设置VCPU的事件,如中断、NMI或异常
3. 内存管理方面
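KVM_TRANSLATE 按照VCPU当前的地址翻译模式,将客户机虚拟地址(GVA)翻译成客户机物理地址(GPA)
KVM_SET_USER_MEMORY_REGION 创建、修改或删除客户机物理内存区域(memory slot);该指令在VM fd上调用
KVM_SET_TSS_ADDR 设置TSS所用的三页内存区域在客户机物理地址空间中的地址(Intel架构专用);该指令在VM fd上调用
KVM_SET_IDENTITY_MAP_ADDR 设置恒等映射(identity map)页表所用的一页内存区域在客户机物理地址空间中的地址(Intel架构专用);该指令在VM fd上调用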
4. 其他方面(如:CPUID的设置、调试接口等)
KVM中使用kvm_vcpu结构体(定义在include/linux/kvm_host.h中)保存vcpu相关的信息。
struct kvm_vcpu {struct kvm *kvm;/*记录虚拟机相关信息*/
#ifdef CONFIG_PREEMPT_NOTIFIERSstruct preempt_notifier preempt_notifier; /*vcpu抢占通知*/
#endifint cpu;int vcpu_id;/*vcpu id*/int srcu_idx;int mode;u64 requests;unsigned long guest_debug;int pre_pcpu;struct list_head blocked_vcpu_list;struct mutex mutex;struct kvm_run *run; /*记录虚拟机运行状态*/int guest_xcr0_loaded;struct swait_queue_head wq;struct pid __rcu *pid;int sigset_active;sigset_t sigset;struct kvm_vcpu_stat stat;unsigned int halt_poll_ns;bool valid_wakeup;#ifdef CONFIG_HAS_IOMEMint mmio_needed;int mmio_read_completed;int mmio_is_write;int mmio_cur_fragment;int mmio_nr_fragments;struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS];
#endif#ifdef CONFIG_KVM_ASYNC_PFstruct {u32 queued;struct list_head queue;struct list_head done;spinlock_t lock;} async_pf;
#endif#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT/ Cpu relax intercept or pause loop exit optimization* in_spin_loop: set when a vcpu does a pause loop exit* or cpu relax intercepted.* dy_eligible: indicates whether vcpu is eligible for directed yield.*/struct {bool in_spin_loop;bool dy_eligible;} spin_loop;
#endifbool preempted;bool ready;struct kvm_vcpu_arch arch;struct dentry *debugfs_dentry;