ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享> 文章详细

kvm虚拟磁盘设备全过程

2021-01-30 21:56:13  阅读:419  来源: 互联网

标签:virtio struct kvm dev vp pci 全过程 device 磁盘


读者慎入,没有虚拟化相关知识可能完全看不懂。

虚拟PCI设备配对过程:
以kvmtool中的pci disk为例:

disk pci配置空间中有:
vendor_id = 0x1af4
device_id = 0x1001
subsys_id = 2
步骤:guest Linux kernel 通过 0xcf8(地址)/0xcfc(数据)端口遍历 PCI 配置空间;这些端口访问会被 KVM 截获,并转交给用户态的 kvmtool(相当于 qemu)处理;kvmtool 把其中保存的 PCI 配置信息返回给 guest,guest 据此建立对应的 struct pci_dev,然后注册该 PCI 设备。
在drivers/virtio/virtio_pci_common.c中有 对应的pci_driver与之匹配。

/*
 * PCI driver object for virtio-pci: names the driver, points at the ID table
 * used for matching, and supplies the probe/remove callbacks.  The power
 * management ops are only set when CONFIG_PM_SLEEP is defined.
 */
static struct pci_driver virtio_pci_driver = { 
        .name           = "virtio-pci",
        .id_table       = virtio_pci_id_table,
        .probe          = virtio_pci_probe,
        .remove         = virtio_pci_remove,
#ifdef CONFIG_PM_SLEEP
        .driver.pm      = &virtio_pci_pm_ops,
#endif
};
/* PCI vendor ID used by the match table below. */
#define PCI_VENDOR_ID_REDHAT_QUMRANET 0x1af4
/* Wildcard value: all bits set. */
#define PCI_ANY_ID (~0)
/*
 * Match table of the virtio-pci driver: vendor PCI_VENDOR_ID_REDHAT_QUMRANET
 * combined with PCI_ANY_ID as the device ID, followed by an all-zero entry.
 */
static const struct pci_device_id virtio_pci_id_table[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) },
        { 0 }
};
/* Abridged view of struct pci_dev: only the identification fields are shown. */
struct pci_dev {
  /* ... other members omitted ... */
  unsigned short  vendor;
  unsigned short  device;
  unsigned short  subsystem_vendor;
  unsigned short  subsystem_device;
  unsigned int  class;    /* 3 bytes: (base,sub,prog-if) */
  /* ... other members omitted ... */
};

pci驱动和设备匹配之后,运行pci驱动中的probe函数virtio_pci_probe()

/*
 * Probe callback of the virtio-pci driver (abridged excerpt).
 *
 * Fills in the virtio_pci_device, enables the PCI device, tries the legacy
 * and/or modern transport probe depending on force_legacy, calls
 * pci_set_master() and finally registers the embedded virtio_device.
 *
 * NOTE(review): as shown here vp_dev is never assigned before it is used,
 * rc is not declared, the err_* labels are absent and nothing is returned.
 * Those parts were presumably trimmed from the original function; confirm
 * against drivers/virtio/virtio_pci_common.c.
 */
static int virtio_pci_probe(struct pci_dev *pci_dev,
          const struct pci_device_id *id)
{
  struct virtio_pci_device *vp_dev, *reg_dev = NULL;
  pci_set_drvdata(pci_dev, vp_dev);
  vp_dev->vdev.dev.parent = &pci_dev->dev;
  vp_dev->vdev.dev.release = virtio_pci_release_dev;
  vp_dev->pci_dev = pci_dev;
  INIT_LIST_HEAD(&vp_dev->virtqueues);
  spin_lock_init(&vp_dev->lock);

  /* enable the device */
  rc = pci_enable_device(pci_dev);
  if (rc)
    goto err_enable_device;

  /* force_legacy selects which transport probe is attempted first. */
  if (force_legacy) {
    rc = virtio_pci_legacy_probe(vp_dev);
    /* Also try modern mode if we can't map BAR0 (no IO space). */
    if (rc == -ENODEV || rc == -ENOMEM)
      rc = virtio_pci_modern_probe(vp_dev);
    if (rc)
      goto err_probe;
  } else {
    rc = virtio_pci_modern_probe(vp_dev);
    /* Fall back to the legacy transport when the modern probe reports -ENODEV. */
    if (rc == -ENODEV)
      rc = virtio_pci_legacy_probe(vp_dev);
  }
  pci_set_master(pci_dev);
  /* Hand the embedded virtio_device over to the virtio bus. */
  rc = register_virtio_device(&vp_dev->vdev);
}

值得注意的是上面代码中的 virtio_pci_legacy_probe() 和 register_virtio_device()。

首先看下 struct virtio_pci_device结构

/*
 * Per-device state of the virtio-pci transport (abridged): embeds the generic
 * virtio_device, keeps a pointer to the underlying pci_dev, MSI-X
 * bookkeeping and a set of function-pointer hooks.
 *
 * NOTE(review): other snippets in this article access members such as
 * virtqueues, lock, ioaddr and msix_enabled that are not listed here.
 */
struct virtio_pci_device {
  struct virtio_device vdev;
  struct pci_dev *pci_dev;
  /* Number of available vectors */
  unsigned msix_vectors;
  /* Vectors allocated, excluding per-vq vectors if any */
  unsigned msix_used_vectors;

  /* Whether we have vector per vq */
  bool per_vq_vectors;

  /* Hooks for creating and deleting a virtqueue. */
  struct virtqueue *(*setup_vq)(struct virtio_pci_device *vp_dev,
              struct virtio_pci_vq_info *info,
              unsigned idx,
              void (*callback)(struct virtqueue *vq),
              const char *name,
              bool ctx,
              u16 msix_vec);
  void (*del_vq)(struct virtio_pci_vq_info *info);

  /* Hook taking a vector number and returning a u16. */
  u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector);
};

再看下 virtio_pci_legacy_probe()

/ the PCI probing function /
intvirtio_pci_legacy_probe(struct virtio_pci_device vp_dev)
{
struct pci_dev
pci_dev =vp_dev->pci_dev;
int rc;

     if (pci_dev->device < 0x1000 ||pci_dev->device > 0x103f)
               return -ENODEV;

     rc = dma_set_mask(&pci_dev->dev,DMA_BIT_MASK(64));

     rc = pci_request_region(pci_dev, 0,"virtio-pci-legacy");

     vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor;
     vp_dev->vdev.id.device = pci_dev->subsystem_device;
     vp_dev->vdev.config = &virtio_pci_config_ops;

}

/*
 * Config operations table installed into vdev.config by the legacy probe.
 * Each member points at the matching vp_* implementation; .get is the hook
 * behind the vp_get() excerpt shown later in this article.
 */
static const struct virtio_config_ops virtio_pci_config_ops = {
  .get    = vp_get,
  .set    = vp_set,
  .get_status  = vp_get_status,
  .set_status  = vp_set_status,
  .reset    = vp_reset,
  .find_vqs  = vp_find_vqs,
  .del_vqs  = vp_del_vqs,
  .get_features  = vp_get_features,
  .finalize_features = vp_finalize_features,
  .bus_name  = vp_bus_name,
  .set_vq_affinity = vp_set_vq_affinity,
  .get_vq_affinity = vp_get_vq_affinity,
};

现在开始注册虚拟设备(磁盘)virtio_device

注册virtio_device
/*
 * Register a virtio device on the virtio bus (abridged excerpt).
 *
 * Attaches the device to virtio_bus, names it "virtio<index>", resets it
 * through its config ops, sets the ACKNOWLEDGE status bit and adds it to
 * the driver model.
 *
 * Returns the result of device_add().
 *
 * NOTE(review): dev->index is read here but not assigned in this excerpt;
 * the index allocation was presumably trimmed -- confirm against
 * drivers/virtio/virtio.c.
 */
int register_virtio_device(struct virtio_device *dev)
{
  int err;

  dev->dev.bus = &virtio_bus;
  device_initialize(&dev->dev);
  dev_set_name(&dev->dev, "virtio%u", dev->index);

  /* We always start by resetting the device, in case a previous
   * driver messed it up.  This also tests that code path a little. */
  dev->config->reset(dev);
  /* Acknowledge that we've seen the device. */
  virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);

  /* Propagate the outcome instead of falling off the end of an int function. */
  err = device_add(&dev->dev);
  return err;
}

在drivers/block/virtio_blk.c中有 register_virtio_driver(&virtio_blk)和上面的 register_virtio_device正好匹配上。

/*
 * Module init of virtio-blk (abridged excerpt): allocates the "virtio-blk"
 * workqueue, registers the "virtblk" block major and registers the
 * virtio_blk driver.
 *
 * NOTE(review): the results of register_blkdev() and
 * register_virtio_driver() are not checked and the success path has no
 * return statement in this excerpt -- presumably trimmed; confirm against
 * drivers/block/virtio_blk.c.
 */
static int __init init(void)
{
       virtblk_wq = alloc_workqueue("virtio-blk", 0, 0); 
       if (!virtblk_wq)
                return -ENOMEM;
       /* The value stored in major is later copied into vblk->disk->major. */
       major = register_blkdev(0, "virtblk");

       register_virtio_driver(&virtio_blk);
}

/*
 * Virtio device ID of a block device.  The value 2 equals the subsys_id
 * listed for the disk's PCI config space earlier in this article, which the
 * legacy probe copies into vdev.id.device.
 */
#define VIRTIO_ID_BLOCK    2 /* virtio block */

/* Match table of the virtio-blk driver, followed by an all-zero entry. */
static const struct virtio_device_id id_table[] = {
  { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
  { 0 },
};

/*
 * The virtio-blk driver object: feature tables, the ID table used for
 * matching and the probe/remove/config_changed callbacks.  The freeze and
 * restore hooks are only set when CONFIG_PM_SLEEP is defined.
 */
static struct virtio_driver virtio_blk = {
  .feature_table      = features,
  .feature_table_size    = ARRAY_SIZE(features),
  .feature_table_legacy    = features_legacy,
  .feature_table_size_legacy  = ARRAY_SIZE(features_legacy),
  .driver.name      = KBUILD_MODNAME,
  .driver.owner      = THIS_MODULE,
  .id_table      = id_table,
  .probe        = virtblk_probe,
  .remove        = virtblk_remove,
  .config_changed      = virtblk_config_changed,
#ifdef CONFIG_PM_SLEEP
  .freeze        = virtblk_freeze,
  .restore      = virtblk_restore,
#endif
};

/*
 * Per-disk state of the virtio-blk driver: the virtio device it sits on,
 * the gendisk, the blk-mq tag set and the virtqueue array.
 */
struct virtio_blk {
  struct virtio_device *vdev;

  /* The disk structure for the kernel. */
  struct gendisk *disk;

  /* Block layer tags. */
  struct blk_mq_tag_set tag_set;

  /* Process context for config space updates */
  struct work_struct config_work;

  /* What host tells us, plus 2 for header & tailer. */
  unsigned int sg_elems;

  /* Ida index - used to track minor number allocations. */
  int index;

  /* num of vqs */
  int num_vqs;
  /* Array of virtqueues; indexed as vqs[qid] in virtio_queue_rq(). */
  struct virtio_blk_vq *vqs;
};

匹配上之后执行virtblk_probe().

总的过程是:首先虚拟磁盘的 PCI 设备与 virtio-pci 驱动匹配上,匹配后执行该驱动的 probe 函数 virtio_pci_probe(),其中通过 register_virtio_device() 注册虚拟磁盘对应的 virtio 设备;
然后该 virtio 设备再与系统中的虚拟磁盘驱动 virtio_blk 匹配一次,执行 virtblk_probe()。

匹配上之后执行virtblk_probe()

/*
 * Probe callback of the virtio-blk driver (abridged excerpt).
 *
 * Sets up the virtqueues, allocates the gendisk, configures and allocates
 * the blk-mq tag set, creates the request queue and fills in the gendisk's
 * name, major/minor numbers and file operations.
 *
 * NOTE(review): vblk is used without a visible allocation, err, index and
 * sg_elems are not declared, and the out_* labels are absent -- presumably
 * trimmed from the original; confirm against drivers/block/virtio_blk.c.
 * The function body is not closed at this point in the article.
 */
static int virtblk_probe(struct virtio_device *vdev)
{
         struct virtio_blk *vblk;
         struct request_queue *q;

         err = init_vq(vblk);
         if (err)
                   goto out_free_vblk;

         vblk->disk = alloc_disk(1 << PART_BITS);
         if (!vblk->disk) {
                   err = -ENOMEM;
                   goto out_free_vq;
         }

         memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
         vblk->tag_set.ops = &virtio_mq_ops;
         vblk->tag_set.queue_depth = virtblk_queue_depth;
         vblk->tag_set.numa_node = NUMA_NO_NODE;
         vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
         /* Per-request payload: the request header plus sg_elems scatterlist entries. */
         vblk->tag_set.cmd_size =
                   sizeof(struct virtblk_req) +
                   sizeof(struct scatterlist) * sg_elems;
         vblk->tag_set.driver_data = vblk;
         vblk->tag_set.nr_hw_queues = vblk->num_vqs;

         err = blk_mq_alloc_tag_set(&vblk->tag_set);
         if (err)
                   goto out_put_disk;

         q = blk_mq_init_queue(&vblk->tag_set);
         if (IS_ERR(q)) {
                   err = -ENOMEM;
                   goto out_free_tags;
         }
         vblk->disk->queue = q;
         q->queuedata = vblk;

         virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);

         /* Tie the virtual disk to major = register_blkdev(0, "virtblk"). */
         vblk->disk->major = major;
         vblk->disk->first_minor = index_to_minor(index);
         vblk->disk->private_data = vblk;
         vblk->disk->fops = &virtblk_fops;
/*
 * blk-mq operations of virtio-blk; installed as tag_set.ops in
 * virtblk_probe().  .initialize_rq_fn is only set when
 * CONFIG_VIRTIO_BLK_SCSI is defined.
 */
static const struct blk_mq_ops virtio_mq_ops = {
         .queue_rq         = virtio_queue_rq,
         .commit_rqs    = virtio_commit_rqs,
         .complete         = virtblk_request_done,
         .init_request    = virtblk_init_request,
#ifdef CONFIG_VIRTIO_BLK_SCSI
         .initialize_rq_fn = virtblk_initialize_rq,
#endif
         .map_queues  = virtblk_map_queues,
};

staticblk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,

                               const struct blk_mq_queue_data *bd)

{
         switch (req_op(req)) {
         case REQ_OP_READ:
         case REQ_OP_WRITE:
                   type = 0;
                   break;
         case REQ_OP_FLUSH:
                   type = VIRTIO_BLK_T_FLUSH;
                   break;
         case REQ_OP_DISCARD:
                   type = VIRTIO_BLK_T_DISCARD;
                   break;
         case REQ_OP_WRITE_ZEROES:
                   type =VIRTIO_BLK_T_WRITE_ZEROES;
                   unmap = !(req->cmd_flags& REQ_NOUNMAP);
                   break;
         case REQ_OP_SCSI_IN:
         case REQ_OP_SCSI_OUT:
                   type = VIRTIO_BLK_T_SCSI_CMD;
                   break;
         case REQ_OP_DRV_IN:
                   type = VIRTIO_BLK_T_GET_ID;
                   break;
         default:
                   WARN_ON_ONCE(1);
                   return BLK_STS_IOERR;
         }
         blk_mq_start_request(req);
         if (notify)
                   virtqueue_notify(vblk->vqs[qid].vq);
         return BLK_STS_OK;
}

在drivers/virtio/virtio_ring.c中

boolvirtqueue_notify(struct virtqueue *_vq)

{
         struct vring_virtqueue *vq =to_vvq(_vq);
         if (unlikely(vq->broken))
                   return false;
         /* Prod other side to tell it aboutchanges. */
         if (!vq->notify(_vq)) {
                   vq->broken = true;
                   return false;
         }
         return true;
}
   /* Host must always specify the capacity. */
     virtio_cread(vdev, struct virtio_blk_config, capacity, &cap);

这里获取虚拟硬盘的容量:

/*
 * Read len bytes of device configuration, starting at offset, into buf,
 * one byte at a time via ioread8().
 */
static void vp_get(struct virtio_device *vdev, unsigned offset,
                   void *buf, unsigned len)
{
         struct virtio_pci_device *vp_dev = to_vp_device(vdev);
         /* Base of the read: ioaddr + VIRTIO_PCI_CONFIG_OFF(msix_enabled) + offset. */
         void __iomem *ioaddr = vp_dev->ioaddr +
                            VIRTIO_PCI_CONFIG_OFF(vp_dev->msix_enabled) +
                            offset;
         u8 *ptr = buf;
         int i;

         for (i = 0; i < len; i++)
                   ptr[i] = ioread8(ioaddr + i);
}

通过 IO 端口读取硬盘容量,这些 IO 操作会被 KVM 截获并交给 kvmtool(相当于 qemu)处理,由 kvmtool 把结果返回给 guest。kvmtool 中有相应的结构(struct virtio_blk_config)保存这些要传递给 guest 的配置信息。
kvmtool/virtio/blk.c

/*
 * Record the feature bits passed in `features` on the block device and
 * convert the fields of its virtio_blk_config with the
 * virtio_host_to_guest_u16/u32/u64 helpers.
 *
 * kvm is not used in this function; dev is the struct blk_dev.
 */
static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
{
         struct blk_dev *bdev = dev;
         struct virtio_blk_config *conf = &bdev->blk_config;
         struct virtio_blk_geometry *geo = &conf->geometry;

         bdev->features = features;

         conf->capacity = virtio_host_to_guest_u64(&bdev->vdev, conf->capacity);
         conf->size_max = virtio_host_to_guest_u32(&bdev->vdev, conf->size_max);
         conf->seg_max = virtio_host_to_guest_u32(&bdev->vdev, conf->seg_max);

         /* Geometry */
         geo->cylinders = virtio_host_to_guest_u16(&bdev->vdev, geo->cylinders);

         conf->blk_size = virtio_host_to_guest_u32(&bdev->vdev, conf->blk_size);
         conf->min_io_size = virtio_host_to_guest_u16(&bdev->vdev, conf->min_io_size);
         conf->opt_io_size = virtio_host_to_guest_u32(&bdev->vdev, conf->opt_io_size);
}
*bdev = (structblk_dev) {
                   .mutex                        = MUTEX_INITIALIZER,
                   .disk                   = disk,
                   .blk_config                 = (struct virtio_blk_config) {
                            .capacity  =disk->size / SECTOR_SIZE,
                            .seg_max          =DISK_SEG_MAX,
                   },
                   .io_efd                        = eventfd(0, 0),
                   .kvm                            = kvm,
         };
}
  set_capacity(vblk->disk, cap);
    /* Add the virtual disk to the system. */
  device_add_disk(&vdev->dev,vblk->disk);

系统里面通过虚拟pci的方式添加虚拟设备,原理都是类似的。
drivers/block/virtio_blk.c
drivers/net/virtio_net.c
drivers/char/virtio_console.c
drivers/char/hw_random/virtio-rng.c
drivers/scsi/virtio_scsi.c
block/blk-mq-virtio.c

标签:virtio,struct,kvm,dev,vp,pci,全过程,device,磁盘
来源: https://blog.51cto.com/15069487/2612712

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有