
DPDK Series Part 14: vhost Source Code Analysis (Virtualization)


1. Basic vhost Data Structures

vhost can be implemented either in kernel space or in user space. DPDK uses the latter, which gives it finer control over the data path, shortens the forwarding path, and reduces data copies. The relevant code in DPDK is spread mainly across librte_vhost, librte_ethdev, and the drivers/net/virtio and drivers/net/vhost directories. In addition, vhost.h under drivers/net/virtio/virtio_user holds some data structures used by the virtio-user backend. Let us first look at the data structures in vhost.h under librte_vhost:

/**
 * Structure contains buffer address, length and descriptor index
 * from vring to do scatter RX.
 */
struct buf_vector {
    uint64_t buf_iova;
    uint64_t buf_addr;
    uint32_t buf_len;
    uint32_t desc_idx;
};

/*
 * A structure to hold some fields needed in zero copy code path,
 * mainly for associating an mbuf with the right desc_idx.
 */
struct zcopy_mbuf {
    struct rte_mbuf *mbuf;
    uint32_t desc_idx;
    uint16_t desc_count;
    uint16_t in_use;

    TAILQ_ENTRY(zcopy_mbuf) next;
};

struct vring_used_elem_packed {
    uint16_t id;
    uint16_t flags;
    uint32_t len;
    uint32_t count;
};

/**
 * Structure contains variables relevant to RX/TX virtqueues.
 */
struct vhost_virtqueue {
    union {
        struct vring_desc            *desc;
        struct vring_packed_desc     *desc_packed;
    };
    union {
        struct vring_avail           *avail;
        struct vring_packed_desc_event *driver_event;
    };
    union {
        struct vring_used            *used;
        struct vring_packed_desc_event *device_event;
    };
    uint32_t        size;

    uint16_t        last_avail_idx;
    uint16_t        last_used_idx;
    /* Last used index we notify to front end. */
    uint16_t        signalled_used;
    bool            signalled_used_valid;
#define VIRTIO_INVALID_EVENTFD        (-1)
#define VIRTIO_UNINITIALIZED_EVENTFD  (-2)

    /* Backend value to determine if device should started/stopped */
    int             backend;
    int             enabled;
    int             access_ok;
    rte_spinlock_t  access_lock;

    /* Used to notify the guest (trigger interrupt) */
    int             callfd;
    /* Currently unused as polling mode is enabled */
    int             kickfd;

    /* Physical address of used ring, for logging */
    uint64_t        log_guest_addr;

    /* inflight share memory info */
    union {
        struct rte_vhost_inflight_info_split  *inflight_split;
        struct rte_vhost_inflight_info_packed *inflight_packed;
    };
    struct rte_vhost_resubmit_info *resubmit_inflight;
    uint64_t        global_counter;

    uint16_t        nr_zmbuf;
    uint16_t        zmbuf_size;
    uint16_t        last_zmbuf_idx;
    struct zcopy_mbuf      *zmbufs;
    struct zcopy_mbuf_list  zmbuf_list;

    union {
        struct vring_used_elem        *shadow_used_split;
        struct vring_used_elem_packed *shadow_used_packed;
    };
    uint16_t        shadow_used_idx;
    /* Record packed ring enqueue latest desc cache aligned index */
    uint16_t        shadow_aligned_idx;
    /* Record packed ring first dequeue desc index */
    uint16_t        shadow_last_used_idx;
    struct vhost_vring_addr ring_addrs;

    struct batch_copy_elem *batch_copy_elems;
    uint16_t        batch_copy_nb_elems;
    bool            used_wrap_counter;
    bool            avail_wrap_counter;

    struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
    uint16_t        log_cache_nb_elem;

    rte_rwlock_t    iotlb_lock;
    rte_rwlock_t    iotlb_pending_lock;
    struct rte_mempool *iotlb_pool;
    TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
    int             iotlb_cache_nr;
    TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
} __rte_cache_aligned;

#define VHOST_MAX_VRING            0x100
#define VHOST_MAX_QUEUE_PAIRS      0x80

/* Declare IOMMU related bits for older kernels */
#ifndef VIRTIO_F_IOMMU_PLATFORM

#define VIRTIO_F_IOMMU_PLATFORM 33

struct vhost_iotlb_msg {
    __u64 iova;
    __u64 size;
    __u64 uaddr;
#define VHOST_ACCESS_RO      0x1
#define VHOST_ACCESS_WO      0x2
#define VHOST_ACCESS_RW      0x3
    __u8 perm;
#define VHOST_IOTLB_MISS           1
#define VHOST_IOTLB_UPDATE         2
#define VHOST_IOTLB_INVALIDATE     3
#define VHOST_IOTLB_ACCESS_FAIL    4
    __u8 type;
};

#define VHOST_IOTLB_MSG 0x1

struct vhost_msg {
    int type;
    union {
        struct vhost_iotlb_msg iotlb;
        __u8 padding[64];
    };
};
#endif
/* Declare packed ring related bits for older kernels */
#ifndef VIRTIO_F_RING_PACKED

#define VIRTIO_F_RING_PACKED 34

struct vring_packed_desc {
    uint64_t addr;
    uint32_t len;
    uint16_t id;
    uint16_t flags;
};

struct vring_packed_desc_event {
    uint16_t off_wrap;
    uint16_t flags;
};
#endif
/**
 * Device structure contains all configuration information relating
 * to the device.
 */
struct virtio_net {
    /* Frontend (QEMU) memory and memory region information */
    struct rte_vhost_memory *mem;
    uint64_t        features;
    uint64_t        protocol_features;
    int             vid;
    uint32_t        flags;
    uint16_t        vhost_hlen;
    /* to tell if we need broadcast rarp packet */
    rte_atomic16_t  broadcast_rarp;
    uint32_t        nr_vring;
    int             dequeue_zero_copy;
    int             extbuf;
    int             linearbuf;
    struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
    struct inflight_mem_info *inflight_info;
#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
    char            ifname[IF_NAME_SZ];
    uint64_t        log_size;
    uint64_t        log_base;
    uint64_t        log_addr;
    struct rte_ether_addr mac;
    uint16_t        mtu;

    struct vhost_device_ops const *notify_ops;

    uint32_t        nr_guest_pages;
    uint32_t        max_guest_pages;
    struct guest_page *guest_pages;

    int             slave_req_fd;
    rte_spinlock_t  slave_req_lock;

    int             postcopy_ufd;
    int             postcopy_listening;

    /*
     * Device id to identify a specific backend device.
     * It's set to -1 for the default software implementation.
     */
    int             vdpa_dev_id;

    /* context data for the external message handlers */
    void            *extern_data;
    /* pre and post vhost user message handlers for the device */
    struct rte_vhost_user_extern_ops extern_ops;
} __rte_cache_aligned;

The series of data structures above should look familiar; they closely resemble the ones used at the upper layers. They all exist to prepare for RX/TX, including the packed-ring layout that, as mentioned earlier, newer versions use to process descriptors.
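The packed-ring fields above, such as avail_wrap_counter and used_wrap_counter, are easier to picture with a small example. The following is a minimal sketch (not the exact DPDK code; the helper name and flag macros here are illustrative) of how a packed descriptor's availability is tested against the driver's wrap counter, as specified by virtio 1.1:

#include <stdbool.h>
#include <stdint.h>

/* Packed-ring flag bits as defined by the virtio 1.1 spec. */
#define DESC_F_AVAIL (1 << 7)
#define DESC_F_USED  (1 << 15)

/*
 * A descriptor is available to the backend when its AVAIL bit matches the
 * driver's wrap counter and its USED bit does not. This mirrors the check
 * the vhost data path performs with vq->avail_wrap_counter on desc_packed[].
 */
static inline bool
desc_is_avail_sketch(uint16_t flags, bool wrap_counter)
{
    return wrap_counter == !!(flags & DESC_F_AVAIL) &&
           wrap_counter != !!(flags & DESC_F_USED);
}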
The device-level data structures are defined in librte_ethdev/rte_ethdev_core.h:

/**
 * @internal
 * The generic data structure associated with each ethernet device.
 *
 * Pointers to burst-oriented packet receive and transmit functions are
 * located at the beginning of the structure, along with the pointer to
 * where all the data elements for the particular device are stored in shared
 * memory. This split allows the function pointer and driver data to be per-
 * process, while the actual configuration data for the device is shared.
 */
struct rte_eth_dev {
    eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */
    eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */
    eth_tx_prep_t tx_pkt_prepare; /**< Pointer to PMD transmit prepare function. */

    /**
     * Next two fields are per-device data but *data is shared between
     * primary and secondary processes and *process_private is per-process
     * private. The second one is managed by PMDs if necessary.
     */
    struct rte_eth_dev_data *data;  /**< Pointer to device data. */
    void *process_private; /**< Pointer to per-process device data. */
    const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */
    struct rte_device *device; /**< Backing device */
    struct rte_intr_handle *intr_handle; /**< Device interrupt handle */

    /** User application callbacks for NIC interrupts */
    struct rte_eth_dev_cb_list link_intr_cbs;
    /**
     * User-supplied functions called from rx_burst to post-process
     * received packets before passing them to the user
     */
    struct rte_eth_rxtx_callback *post_rx_burst_cbs[RTE_MAX_QUEUES_PER_PORT];
    /**
     * User-supplied functions called from tx_burst to pre-process
     * received packets before passing them to the driver for transmission.
     */
    struct rte_eth_rxtx_callback *pre_tx_burst_cbs[RTE_MAX_QUEUES_PER_PORT];
    enum rte_eth_dev_state state; /**< Flag indicating the port state */
    void *security_ctx; /**< Context for security ops */

    uint64_t reserved_64s[4]; /**< Reserved for future fields */
    void *reserved_ptrs[4];   /**< Reserved for future fields */
} __rte_cache_aligned;

struct rte_eth_dev_sriov;
struct rte_eth_dev_owner;

/**
 * @internal
 * The data part, with no function pointers, associated with each ethernet
 * device.
 *
 * This structure is safe to place in shared memory to be common among
 * different processes in a multi-process configuration.
 */
struct rte_eth_dev_data {
    char name[RTE_ETH_NAME_MAX_LEN]; /**< Unique identifier name */

    void **rx_queues; /**< Array of pointers to RX queues. */
    void **tx_queues; /**< Array of pointers to TX queues. */
    uint16_t nb_rx_queues; /**< Number of RX queues. */
    uint16_t nb_tx_queues; /**< Number of TX queues. */

    struct rte_eth_dev_sriov sriov;    /**< SRIOV data */

    void *dev_private;
            /**< PMD-specific private data.
             *   @see rte_eth_dev_release_port()
             */

    struct rte_eth_link dev_link;   /**< Link-level information & status. */
    struct rte_eth_conf dev_conf;   /**< Configuration applied to device. */
    uint16_t mtu;                   /**< Maximum Transmission Unit. */
    uint32_t min_rx_buf_size;
            /**< Common RX buffer size handled by all queues. */

    uint64_t rx_mbuf_alloc_failed; /**< RX ring mbuf allocation failures. */
    struct rte_ether_addr *mac_addrs;
            /**< Device Ethernet link address.
             *   @see rte_eth_dev_release_port()
             */
    uint64_t mac_pool_sel[ETH_NUM_RECEIVE_MAC_ADDR];
            /**< Bitmap associating MAC addresses to pools. */
    struct rte_ether_addr *hash_mac_addrs;
            /**< Device Ethernet MAC addresses of hash filtering.
             *   @see rte_eth_dev_release_port()
             */
    uint16_t port_id;           /**< Device [external] port identifier. */

    __extension__
    uint8_t promiscuous   : 1, /**< RX promiscuous mode ON(1) / OFF(0). */
        scattered_rx : 1,  /**< RX of scattered packets is ON(1) / OFF(0) */
        all_multicast : 1, /**< RX all multicast mode ON(1) / OFF(0). */
        dev_started : 1,   /**< Device state: STARTED(1) / STOPPED(0). */
        lro         : 1;   /**< RX LRO is ON(1) / OFF(0) */
    uint8_t rx_queue_state[RTE_MAX_QUEUES_PER_PORT];
            /**< Queues state: HAIRPIN(2) / STARTED(1) / STOPPED(0). */
    uint8_t tx_queue_state[RTE_MAX_QUEUES_PER_PORT];
            /**< Queues state: HAIRPIN(2) / STARTED(1) / STOPPED(0). */
    uint32_t dev_flags;             /**< Capabilities. */
    enum rte_kernel_driver kdrv;    /**< Kernel driver passthrough. */
    int numa_node;                  /**< NUMA node connection. */
    struct rte_vlan_filter_conf vlan_filter_conf;
            /**< VLAN filter configuration. */
    struct rte_eth_dev_owner owner; /**< The port owner. */
    uint16_t representor_id;
            /**< Switch-specific identifier.
             *   Valid if RTE_ETH_DEV_REPRESENTOR in dev_flags.
             */

    uint64_t reserved_64s[4]; /**< Reserved for future fields */
    void *reserved_ptrs[4];   /**< Reserved for future fields */
} __rte_cache_aligned;

Besides the definitions above, there are additional memory-management related structures scattered across the directories mentioned earlier; it is worth looking at them selectively while reading the code.
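One concrete example of that memory bookkeeping is translating a guest-physical address into a host-virtual one, which the data path must do for every descriptor buffer it touches. The sketch below is modeled on rte_vhost_gpa_to_vva() from rte_vhost.h; the helper name gpa_to_vva_sketch is purely illustrative:

#include <stdint.h>
#include <rte_vhost.h>

/*
 * Minimal sketch of guest-physical to host-virtual translation: walk the
 * memory regions registered by the frontend and, if the guest physical
 * address falls inside one of them, rebase it onto the region's host
 * mapping. Returns 0 when no region matches.
 */
static uint64_t
gpa_to_vva_sketch(struct rte_vhost_memory *mem, uint64_t gpa)
{
    uint32_t i;

    for (i = 0; i < mem->nregions; i++) {
        struct rte_vhost_mem_region *reg = &mem->regions[i];

        if (gpa >= reg->guest_phys_addr &&
            gpa <  reg->guest_phys_addr + reg->size)
            return gpa - reg->guest_phys_addr + reg->host_user_addr;
    }

    return 0;
}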

2. Source Code Analysis

The data structures above make the data layout clear; the code that ties them together is, in effect, DPDK's data flow. Seen from the bottom layer, I/O communication boils down to only three situations: receive, transmit, and events. Let us start with receive and transmit. One thing to note is that DPDK exposes both wrapper functions abstracted in the vhost library and higher-level wrappers used through the PMD/ethdev layer. Look at the library side first:

// Fetch packets from the frontend's (guest's) virtqueue buffers
uint16_t
rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
    struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
    struct virtio_net *dev;
    struct rte_mbuf *rarp_mbuf = NULL;
    struct vhost_virtqueue *vq;

    dev = get_device(vid);
    if (!dev)
        return 0;

    if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
        RTE_LOG(ERR, VHOST_DATA,
            "(%d) %s: built-in vhost net backend is disabled.\n",
            dev->vid, __func__);
        return 0;
    }

    if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
        RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
            dev->vid, __func__, queue_id);
        return 0;
    }

    vq = dev->virtqueue[queue_id];

    if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
        return 0;

    if (unlikely(vq->enabled == 0)) {
        count = 0;
        goto out_access_unlock;
    }

    if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
        vhost_user_iotlb_rd_lock(vq);

    if (unlikely(vq->access_ok == 0))
        if (unlikely(vring_translate(dev, vq) < 0)) {
            count = 0;
            goto out;
        }

    /*
     * Construct a RARP broadcast packet, and inject it to the "pkts"
     * array, to looks like that guest actually send such packet.
     *
     * Check user_send_rarp() for more information.
     *
     * broadcast_rarp shares a cacheline in the virtio_net structure
     * with some fields that are accessed during enqueue and
     * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
     * result in false sharing between enqueue and dequeue.
     *
     * Prevent unnecessary false sharing by reading broadcast_rarp first
     * and only performing cmpset if the read indicates it is likely to
     * be set.
     */
    if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
            rte_atomic16_cmpset((volatile uint16_t *)
                &dev->broadcast_rarp.cnt, 1, 0))) {

        rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
        if (rarp_mbuf == NULL) {
            RTE_LOG(ERR, VHOST_DATA,
                "Failed to make RARP packet.\n");
            count = 0;
            goto out;
        }
        count -= 1;
    }

    if (vq_is_packed(dev)) {
        if (unlikely(dev->dequeue_zero_copy))
            count = virtio_dev_tx_packed_zmbuf(dev, vq, mbuf_pool,
                    pkts, count);
        else
            count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts,
                    count);
    } else
        count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);

out:
    if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
        vhost_user_iotlb_rd_unlock(vq);

out_access_unlock:
    rte_spinlock_unlock(&vq->access_lock);

    if (unlikely(rarp_mbuf != NULL)) {
        /*
         * Inject it to the head of "pkts" array, so that switch's mac
         * learning table will get updated first.
         */
        memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
        pkts[0] = rarp_mbuf;
        count += 1;
    }

    return count;
}

Next is the function that adds buffers to the queue, i.e. the enqueue side:

// Add buffers (packets) to the virtio device's receive virtqueue
uint16_t
rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
    struct rte_mbuf **pkts, uint16_t count)
{
    struct virtio_net *dev = get_device(vid);

    if (!dev)
        return 0;

    if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
        RTE_LOG(ERR, VHOST_DATA,
            "(%d) %s: built-in vhost net backend is disabled.\n",
            dev->vid, __func__);
        return 0;
    }

    return virtio_dev_rx(dev, queue_id, pkts, count);
}
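Taken together, an application linking directly against librte_vhost can drive both directions with just these two calls. A minimal, hypothetical polling loop is sketched below; the vid, the queue indices and the mbuf pool are placeholders for whatever the application registered:

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_vhost.h>

#define BURST_SZ 32

/*
 * Hypothetical polling loop: dequeue packets the guest transmitted from
 * virtqueue 1 (its TX ring) and push them straight back into virtqueue 0
 * (its RX ring). A real application would forward them to a NIC or to
 * another vhost port instead of echoing them.
 */
static void
vhost_echo_loop(int vid, struct rte_mempool *mbuf_pool)
{
    struct rte_mbuf *pkts[BURST_SZ];
    uint16_t nb, sent;

    for (;;) {
        /* queue 1 = guest TX ring, which the host reads from */
        nb = rte_vhost_dequeue_burst(vid, 1, mbuf_pool, pkts, BURST_SZ);
        if (nb == 0)
            continue;

        /* queue 0 = guest RX ring, which the host writes into */
        sent = rte_vhost_enqueue_burst(vid, 0, pkts, nb);

        /* free whatever the guest's RX ring could not accept */
        while (sent < nb)
            rte_pktmbuf_free(pkts[sent++]);
    }
}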

Now look at the wrapper functions on the PMD/ethdev side:

static inline uint16_t
rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **rx_pkts, const uint16_t nb_pkts)
{
    struct rte_eth_dev *dev = &rte_eth_devices[port_id];
    uint16_t nb_rx;

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
    RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
    RTE_FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, 0);

    if (queue_id >= dev->data->nb_rx_queues) {
        RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", queue_id);
        return 0;
    }
#endif
    nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
            rx_pkts, nb_pkts);

#ifdef RTE_ETHDEV_RXTX_CALLBACKS
    struct rte_eth_rxtx_callback *cb;

    /* __ATOMIC_RELEASE memory order was used when the
     * call back was inserted into the list.
     * Since there is a clear dependency between loading
     * cb and cb->fn/cb->next, __ATOMIC_ACQUIRE memory order is
     * not required.
     */
    cb = __atomic_load_n(&dev->post_rx_burst_cbs[queue_id],
            __ATOMIC_RELAXED);

    if (unlikely(cb != NULL)) {
        do {
            nb_rx = cb->fn.rx(port_id, queue_id, rx_pkts, nb_rx,
                    nb_pkts, cb->param);
            cb = cb->next;
        } while (cb != NULL);
    }
#endif
    return nb_rx;
}

static inline uint16_t
rte_eth_tx_burst(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
    struct rte_eth_dev *dev = &rte_eth_devices[port_id];

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
    RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
    RTE_FUNC_PTR_OR_ERR_RET(*dev->tx_pkt_burst, 0);

    if (queue_id >= dev->data->nb_tx_queues) {
        RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", queue_id);
        return 0;
    }
#endif

#ifdef RTE_ETHDEV_RXTX_CALLBACKS
    struct rte_eth_rxtx_callback *cb;

    /* __ATOMIC_RELEASE memory order was used when the
     * call back was inserted into the list.
     * Since there is a clear dependency between loading
     * cb and cb->fn/cb->next, __ATOMIC_ACQUIRE memory order is
     * not required.
     */
    cb = __atomic_load_n(&dev->pre_tx_burst_cbs[queue_id],
            __ATOMIC_RELAXED);

    if (unlikely(cb != NULL)) {
        do {
            nb_pkts = cb->fn.tx(port_id, queue_id, tx_pkts, nb_pkts,
                    cb->param);
            cb = cb->next;
        } while (cb != NULL);
    }
#endif

    return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
}

Look at the last line of code above: the port ID is used to look up the device, and the packets are handed to whatever function the PMD installed in tx_pkt_burst. This function-pointer indirection is exactly how the ethdev wrappers connect to the wrapper functions in the vhost library analyzed earlier.

The following code shows how the two layers are wired together:

static int
eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
    int16_t queues, const unsigned int numa_node, uint64_t flags,
    uint64_t disable_flags)
{
......
    eth_dev->dev_ops = &ops;

    /* finally assign rx and tx ops */
    // the vhost PMD installs its burst function pointers here
    eth_dev->rx_pkt_burst = eth_vhost_rx;
    eth_dev->tx_pkt_burst = eth_vhost_tx;
}

static inline uint16_t
rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **rx_pkts, const uint16_t nb_pkts)
{
    struct rte_eth_dev *dev = &rte_eth_devices[port_id];
    uint16_t nb_rx;

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
    RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
    RTE_FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, 0);

    if (queue_id >= dev->data->nb_rx_queues) {
        RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", queue_id);
        return 0;
    }
#endif
    // analyze this line carefully: it dispatches to the function the PMD installed
    nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
            rx_pkts, nb_pkts);

#ifdef RTE_ETHDEV_RXTX_CALLBACKS
    struct rte_eth_rxtx_callback *cb;

    /* __ATOMIC_RELEASE memory order was used when the
     * call back was inserted into the list.
     * Since there is a clear dependency between loading
     * cb and cb->fn/cb->next, __ATOMIC_ACQUIRE memory order is
     * not required.
     */
    cb = __atomic_load_n(&dev->post_rx_burst_cbs[queue_id],
            __ATOMIC_RELAXED);

    if (unlikely(cb != NULL)) {
        do {
            nb_rx = cb->fn.rx(port_id, queue_id, rx_pkts, nb_rx,
                    nb_pkts, cb->param);
            cb = cb->next;
        } while (cb != NULL);
    }
#endif
    return nb_rx;
}

static uint16_t
eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
{
    struct vhost_queue *r = q;
    uint16_t i, nb_rx = 0;
    uint16_t nb_receive = nb_bufs;
    uint64_t nb_bytes = 0;

    if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
        return 0;

    rte_atomic32_set(&r->while_queuing, 1);

    if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
        goto out;

    /* Dequeue packets from guest TX queue */
    while (nb_receive) {
        uint16_t nb_pkts;
        uint16_t num = (uint16_t)RTE_MIN(nb_receive,
                         VHOST_MAX_PKT_BURST);

        // this is where rte_vhost_dequeue_burst is called
        nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
                        r->mb_pool, &bufs[nb_rx],
                        num);

        nb_rx += nb_pkts;
        nb_receive -= nb_pkts;
        if (nb_pkts < num)
            break;
    }

    r->stats.pkts += nb_rx;

    for (i = 0; likely(i < nb_rx); i++) {
        bufs[i]->port = r->port;
        bufs[i]->vlan_tci = 0;

        if (r->internal->vlan_strip)
            rte_vlan_strip(bufs[i]);

        nb_bytes += bufs[i]->pkt_len;
    }

    r->stats.bytes += nb_bytes;
    vhost_update_packet_xstats(r, bufs, nb_rx, nb_bytes, 0);

out:
    rte_atomic32_set(&r->while_queuing, 0);

    return nb_rx;
}

From this it is clear that eth_vhost_rx wraps rte_vhost_dequeue_burst; the transmit side, eth_vhost_tx, wraps rte_vhost_enqueue_burst in the same way.
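For completeness, here is a stripped-down sketch of that transmit side. It follows the same pattern as eth_vhost_rx above, but it omits the TSO/checksum handling and extended statistics of the real drivers/net/vhost implementation, so treat it as an outline rather than the actual source:

static uint16_t
eth_vhost_tx_sketch(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
{
    struct vhost_queue *r = q;
    uint16_t i, nb_tx = 0;
    uint16_t nb_send = nb_bufs;

    if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
        return 0;

    rte_atomic32_set(&r->while_queuing, 1);

    if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
        goto out;

    /* Enqueue packets to the guest RX queue */
    while (nb_send) {
        uint16_t num = (uint16_t)RTE_MIN(nb_send, VHOST_MAX_PKT_BURST);

        /* the mirror image of the RX path: hand mbufs to librte_vhost */
        uint16_t nb_pkts = rte_vhost_enqueue_burst(r->vid,
                r->virtqueue_id, &bufs[nb_tx], num);

        nb_tx += nb_pkts;
        nb_send -= nb_pkts;
        if (nb_pkts < num)
            break;
    }

    r->stats.pkts += nb_tx;

    /* mbufs accepted by the vring have been copied; free them here */
    for (i = 0; likely(i < nb_tx); i++)
        rte_pktmbuf_free(bufs[i]);

out:
    rte_atomic32_set(&r->while_queuing, 0);

    return nb_tx;
}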

3. Summary

With a basic grasp of the vhost source structure, a rough outline of the front end and back end begins to take shape. The more specific, detailed usages and the finer points still require further reading of the source. Holding on to the main line while skipping the details gives you the overall picture; once the whole is understood, dissecting a single detail reveals the actual techniques used in the implementation. Where to dig deeper depends on your own interest; do not insist on so-called complete mastery of everything, since there is simply too much to learn.