remap_pfn_range: mapping PCI BAR space to user space (pci_map_device)

There are few useful Linux PCI driver tutorials on the Internet; most spend all their time on the PCI configuration space and then stop. The configuration space, however, is the easiest part to access, and it matters mostly when the kernel scans PCI devices at boot. For PCI drivers, the I/O space and memory space of the device are used far more often.
Previously we only knew that BAR0-BAR5 in a device's configuration space hold the base addresses of its I/O space or memory space, but how do we tell whether a given BAR describes I/O space or memory address space?
The PCI network-card sample driver (pci-skeleton.c) does it like this:

pio_start = pci_resource_start(pdev, 0);
pio_end = pci_resource_end(pdev, 0);
pio_flags = pci_resource_flags(pdev, 0);
pio_len = pci_resource_len(pdev, 0);

mmio_start = pci_resource_start(pdev, 1);
mmio_end = pci_resource_end(pdev, 1);
mmio_flags = pci_resource_flags(pdev, 1);
mmio_len = pci_resource_len(pdev, 1);

/* make sure PCI base addr 0 is PIO */
if (!(pio_flags & IORESOURCE_IO)) {
    dev_err(&pdev->dev, "region #0 not a PIO resource, aborting\n");
    rc = -ENODEV;
    goto err_out;
}

/* make sure PCI base addr 1 is MMIO */
if (!(mmio_flags & IORESOURCE_MEM)) {
    dev_err(&pdev->dev, "region #1 not an MMIO resource, aborting\n");
    rc = -ENODEV;
    goto err_out;
}

As you can see, by the time a driver runs, the kernel has already determined each BAR's attributes while scanning the PCI bus, long before the driver is loaded. Of course, you can also simply ask the hardware engineer who built the device how many BARs there are and whether each one is I/O space or memory address space.
So where does the kernel get these flags? I traced the code for a long time without pinning down the exact source, but the PCI bus specification defines the mechanism: reading a BAR directly returns the base address of that BAR's space; writing all 1s to the BAR and then reading it back reveals the size and attributes of the space. Take the lowest bit that is set (after masking the low flag bits): for example, if 0xffff00 is read back, the size of that region is 0x100 bytes. If bit 0 is 0, the BAR describes memory address space; if it is 1, the BAR is I/O space. A minimal sketch of this probe follows.
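
The sketch below (my own illustration, not kernel code) shows that probe from inside a driver. The PCI core actually performs this sizing for every BAR during enumeration, temporarily disabling the device's decode while it does so, and 64-bit memory BARs span two consecutive registers, which this sketch ignores.

#include <linux/pci.h>

/* Minimal sketch: size and classify one 32-bit BAR by writing all 1s and
 * reading it back, then restoring the original value. "bar" is 0-5. */
static void probe_bar(struct pci_dev *pdev, int bar)
{
    int reg = PCI_BASE_ADDRESS_0 + bar * 4;
    u32 orig, probe, size;

    pci_read_config_dword(pdev, reg, &orig);
    pci_write_config_dword(pdev, reg, ~0U);
    pci_read_config_dword(pdev, reg, &probe);
    pci_write_config_dword(pdev, reg, orig);    /* restore the base address */

    if (orig & PCI_BASE_ADDRESS_SPACE_IO) {     /* bit 0 set: I/O space */
        size = ~(probe & (u32)PCI_BASE_ADDRESS_IO_MASK) + 1;
        dev_info(&pdev->dev, "BAR%d: I/O port, size 0x%x\n", bar, size);
    } else {                                    /* bit 0 clear: memory space */
        size = ~(probe & (u32)PCI_BASE_ADDRESS_MEM_MASK) + 1;
        dev_info(&pdev->dev, "BAR%d: memory, size 0x%x\n", bar, size);
    }
}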

Another very important point: what a BAR holds is an address in the PCI address space, which is not the same thing as the memory address seen by the CPU. On x86 with the IOMMU disabled the two values are usually identical, but on CPUs of other architectures, such as PowerPC, they can differ.
Therefore, the correct way to use BAR space:

pciaddr = pci_resource_start(pdev, 1);
if (pciaddr != 0)                        /* resource_size_t, not a pointer */
    base = ioremap(pciaddr, xx_SIZE);    /* base is the CPU virtual address */

Wrong method:

pci_read_config_dword(pdev, PCI_BASE_ADDRESS_1, &pciaddr);  /* raw BAR value: a PCI bus address */
ioremap(pciaddr, xx_SIZE);                                  /* wrong: ioremap expects a CPU physical address */

dma_mem_map

DPDK's EAL programs the IOMMU through the VFIO type1 backend; vfio_type1_dma_mem_map issues the map/unmap ioctls on the container fd:

static int
vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
        uint64_t len, int do_map)
{
    struct vfio_iommu_type1_dma_map dma_map;
    struct vfio_iommu_type1_dma_unmap dma_unmap;
    int ret;

    if (do_map != 0) {
        memset(&dma_map, 0, sizeof(dma_map));
        dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
        dma_map.vaddr = vaddr;
        dma_map.size = len;
        dma_map.iova = iova;
        dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
                VFIO_DMA_MAP_FLAG_WRITE;

        /* VFIO_IOMMU_MAP_DMA asks the IOMMU to map this IOVA range to the
         * physical pages backing vaddr */
        ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
        if (ret) {
            /**
             * In case the mapping was already done EEXIST will be
             * returned from kernel.
             */
            if (errno == EEXIST) {
                RTE_LOG(DEBUG, EAL,
                    " Memory segment is already mapped,"
                    " skipping");
            } else {
                RTE_LOG(ERR, EAL,
                    "  cannot set up DMA remapping,"
                    " error %i (%s)\n",
                    errno, strerror(errno));
                return -1;
            }
        }
    } else {
        memset(&dma_unmap, 0, sizeof(dma_unmap));
        dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
        dma_unmap.size = len;
        dma_unmap.iova = iova;

        ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
                &dma_unmap);
        if (ret) {
            RTE_LOG(ERR, EAL, "  cannot clear DMA remapping, error %i (%s)\n",
                    errno, strerror(errno));
            return -1;
        }
    }

    return 0;
}
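
For comparison, here is a minimal standalone sketch of the same ioctl sequence without DPDK: create a container, attach one IOMMU group (the group number 26, the 1 MiB buffer and the IOVA below are placeholders), select the type1 backend, and map an anonymous buffer for DMA. Error handling is abbreviated.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

int main(void)
{
    int container, group;
    void *buf;
    struct vfio_iommu_type1_dma_map map;

    container = open("/dev/vfio/vfio", O_RDWR);
    if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
        return 1;

    group = open("/dev/vfio/26", O_RDWR);       /* placeholder group number */
    ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
    ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

    /* anonymous, page-aligned buffer the device will be allowed to DMA into */
    buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    memset(&map, 0, sizeof(map));
    map.argsz = sizeof(map);
    map.vaddr = (uintptr_t)buf;
    map.iova  = 0x100000;                       /* device-visible address */
    map.size  = 1 << 20;
    map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;

    if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
        perror("VFIO_IOMMU_MAP_DMA");

    return 0;
}

The kernel pins the backing pages when VFIO_IOMMU_MAP_DMA is issued, which is why the mapped memory stays resident for the lifetime of the mapping.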

The DMA mapping above covers memory the device is allowed to access; mapping the device's BARs into user space is handled separately by the vfio-pci driver's mmap file operation in the kernel, which validates the requested range and maps the BAR's physical pages into the process with remap_pfn_range():

static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
    struct vfio_pci_device *vdev = device_data;
    struct pci_dev *pdev = vdev->pdev;
    unsigned int index;
    u64 phys_len, req_len, pgoff, req_start;
    int ret;
    index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
    if (vma->vm_end < vma->vm_start)
        return -EINVAL;
    if ((vma->vm_flags & VM_SHARED) == 0)
        return -EINVAL;
    if (index >= VFIO_PCI_ROM_REGION_INDEX)
        return -EINVAL;
    if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
        return -EINVAL;
    phys_len = pci_resource_len(pdev, index);
    req_len = vma->vm_end - vma->vm_start;
    pgoff = vma->vm_pgoff &
        ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
    req_start = pgoff << PAGE_SHIFT;
    if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
        return -EINVAL;
    if (index == vdev->msix_bar) {
        /*
         * Disallow mmaps overlapping the MSI-X table; users don't
         * get to touch this directly.  We could find somewhere
         * else to map the overlap, but page granularity is only
         * a recommendation, not a requirement, so the user needs
         * to know which bits are real.  Requiring them to mmap
         * around the table makes that clear.
         */
        /* If neither entirely above nor below, then it overlaps */
        if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
              req_start + req_len <= vdev->msix_offset))
            return -EINVAL;
    }
    /*
     * Even though we don't make use of the barmap for the mmap,
     * we need to request the region and the barmap tracks that.
     */
    if (!vdev->barmap[index]) {
        ret = pci_request_selected_regions(pdev,
                           1 << index, "vfio-pci");
        if (ret)
            return ret;
        vdev->barmap[index] = pci_iomap(pdev, index, 0);
    }
    vma->vm_private_data = vdev;
    vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
    vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
    vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
    return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                   req_len, vma->vm_page_prot);
}
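
The handler above is reached from user space by calling mmap() on the VFIO device fd at the offset VFIO reports for a BAR region. Below is a sketch of that caller side (the helper name map_bar0 is mine), assuming vfio_dev_fd was obtained with VFIO_GROUP_GET_DEVICE_FD:

#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

/* Ask VFIO for BAR0's region info, then mmap the device fd at that region's
 * offset; vma->vm_pgoff then encodes the region index, which vfio_pci_mmap()
 * translates into the BAR's physical pfn for remap_pfn_range(). */
static void *map_bar0(int vfio_dev_fd)
{
    struct vfio_region_info reg = {
        .argsz = sizeof(reg),
        .index = VFIO_PCI_BAR0_REGION_INDEX,
    };
    void *p;

    if (ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg) < 0)
        return NULL;
    if (!(reg.flags & VFIO_REGION_INFO_FLAG_MMAP) || reg.size == 0)
        return NULL;    /* BAR is not mmap-able (e.g. an I/O port BAR) */

    p = mmap(NULL, reg.size, PROT_READ | PROT_WRITE, MAP_SHARED,
             vfio_dev_fd, reg.offset);
    return p == MAP_FAILED ? NULL : p;
}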

How vfio-pci and igb_uio map hardware resources

In DPDK (version 20.02), the function rte_pci_map_device maps a PCI device's resources into user space:

/* Map a PCI device; only the skeleton code is kept here */
int
rte_pci_map_device(struct rte_pci_device *dev)
{
    int ret = -1;

    switch (dev->kdrv) {
    case RTE_KDRV_VFIO:
        ret = pci_vfio_map_resource(dev);
        break;
    case RTE_KDRV_IGB_UIO:
        ret = pci_uio_map_resource(dev);
        break;
    default:
        break;
    }

    return ret;
}
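
As a usage note, the EAL calls rte_pci_map_device automatically for drivers that set RTE_PCI_DRV_NEED_MAPPING; after it returns, BAR i is reachable through dev->mem_resource[i].addr. The probe callback below is purely hypothetical (dummy_pci_probe is not a DPDK symbol) and only shows how the mapping result is consumed:

#include <inttypes.h>
#include <stdint.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_log.h>

static int
dummy_pci_probe(struct rte_pci_driver *drv __rte_unused,
        struct rte_pci_device *dev)
{
    volatile uint32_t *bar0;

    /* normally done by the EAL when RTE_PCI_DRV_NEED_MAPPING is set */
    if (rte_pci_map_device(dev) != 0)
        return -1;

    bar0 = dev->mem_resource[0].addr;    /* user-space view of BAR0 */
    RTE_LOG(INFO, EAL, "BAR0 len=%" PRIu64 ", first register=0x%" PRIx32 "\n",
        dev->mem_resource[0].len, bar0[0]);
    return 0;
}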

1. vfio-pci
When a device is bound to vfio-pci, the function pci_vfio_map_resource is called.

1.1 function pci_vfio_map_resource

Here we analyze the main part of pci_vfio_map_resource_primary (the primary-process path of pci_vfio_map_resource).

static int
pci_vfio_map_resource_primary(struct rte_pci_device *dev)
{
    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
    char pci_addr[PATH_MAX] = {0};
    int vfio_dev_fd;
    struct rte_pci_addr *loc = &dev->addr;
    int i, ret;
    struct mapped_pci_resource *vfio_res = NULL;
    struct mapped_pci_res_list *vfio_res_list =
        RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

    struct pci_map *maps;

    dev->intr_handle.fd = -1;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
    dev->vfio_req_intr_handle.fd = -1;
#endif

    /* store PCI address string */
    snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
            loc->domain, loc->bus, loc->devid, loc->function);

    ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
                    &vfio_dev_fd, &device_info);
    if (ret)
        return ret;

    /* allocate vfio_res and get region info */
    vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
    if (vfio_res == NULL) {
        RTE_LOG(ERR, EAL,
            "%s(): cannot store vfio mmap details\n", __func__);
        goto err_vfio_dev_fd;
    }
    memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));

    /* get number of registers (up to BAR5) */
    vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
            VFIO_PCI_BAR5_REGION_INDEX + 1);

    /* map BARs */
    maps = vfio_res->maps;

    vfio_res->msix_table.bar_index = -1;
    /* get MSI-X BAR, if any (we have to know where it is because we can't
     * easily mmap it when using VFIO)
     */
    ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
    if (ret < 0) {
        RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
                pci_addr);
        goto err_vfio_res;
    }
    /* if we found our MSI-X BAR region, check if we can mmap it */
    if (vfio_res->msix_table.bar_index != -1) {
        int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
                vfio_res->msix_table.bar_index);
        if (ret < 0) {
            RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
            goto err_vfio_res;
        } else if (ret != 0) {
            /* we can map it, so we don't care where it is */
            RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
            vfio_res->msix_table.bar_index = -1;
        }
    }

    for (i = 0; i < (int) vfio_res->nb_maps; i++) {
        struct vfio_region_info *reg = NULL;
        void *bar_addr;

        ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
        if (ret < 0) {
            RTE_LOG(ERR, EAL, "  %s cannot get device region info "
                "error %i (%s)\n", pci_addr, errno,
                strerror(errno));
            goto err_vfio_res;
        }

        /* chk for io port region */
        ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
        if (ret < 0) {
            free(reg);
            goto err_vfio_res;
        } else if (ret) {
            RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
                    i);
            free(reg);
            continue;
        }

        /* skip non-mmapable BARs */
        if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
            free(reg);
            continue;
        }

        /* try mapping somewhere close to the end of hugepages */
        if (pci_map_addr == NULL)
            pci_map_addr = pci_find_max_end_va();

        bar_addr = pci_map_addr;
        pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);

        maps[i].addr = bar_addr;
        maps[i].offset = reg->offset;
        maps[i].size = reg->size;
        maps[i].path = NULL; /* vfio doesn't have per-resource paths */

        ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
        if (ret < 0) {
            RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
                    pci_addr, i, strerror(errno));
            free(reg);
            goto err_vfio_res;
        }

        dev->mem_resource[i].addr = maps[i].addr;

        free(reg);
    }

    if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
        RTE_LOG(ERR, EAL, "  %s setup device failed\n", pci_addr);
        goto err_vfio_res;
    }

    /* ... the remaining bookkeeping (interrupt-handle fds, insertion of
     * vfio_res into vfio_res_list) and the err_vfio_res / err_vfio_dev_fd
     * cleanup labels are omitted here ... */
    return 0;
}

1.1.1 rte_vfio_setup_device
The main work of this function is as follows:

First, get the IOMMU group to which the device belongs: look up the iommu_group id and open the corresponding character device.

/* This function obtains the id number of the device's IOMMU group through the sysfs file system */
int
rte_vfio_get_group_num(const char *sysfs_base,
        const char *dev_addr, int *iommu_group_num);

/* This function opens the character device /dev/vfio/<iommu_group> and returns its fd */
int
rte_vfio_get_group_fd(int iommu_group_num)
{
    struct vfio_config *vfio_cfg;

    /* get the vfio_config it belongs to */
    vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
    vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;

    return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}
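
A sketch of what rte_vfio_get_group_num does under the hood (the helper name iommu_group_num and the example PCI address are mine, not DPDK's): the device's sysfs directory contains an iommu_group symlink whose basename is the group number used to open /dev/vfio/<num>.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static int iommu_group_num(const char *pci_addr)    /* e.g. "0000:03:00.0" */
{
    char link[PATH_MAX], target[PATH_MAX];
    ssize_t n;

    snprintf(link, sizeof(link),
             "/sys/bus/pci/devices/%s/iommu_group", pci_addr);
    n = readlink(link, target, sizeof(target) - 1);
    if (n < 0)
        return -1;          /* no IOMMU group: VFIO cannot be used */
    target[n] = '\0';

    /* the link resolves to .../kernel/iommu_groups/<num> */
    return atoi(strrchr(target, '/') + 1);
}
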
Next, get the configuration of the IOMMU group to which the current device belongs:

struct vfio_config {
    int vfio_enabled;
    int vfio_container_fd;
    int vfio_active_groups;
    const struct vfio_iommu_type *vfio_iommu_type;
    struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
    struct user_mem_maps mem_maps;
};

/* get the vfio_config it belongs to */
struct vfio_config *vfio_cfg;
vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
vfio_container_fd = vfio_cfg->vfio_container_fd;
user_mem_maps = &vfio_cfg->mem_maps;
Then the group fd just opened is added to the container and the IOMMU memory mapping is set up: on Intel/x86 the type1 backend's vfio_type1_dma_map does the mapping, and the memory mapped is the memory managed by DPDK (it appears that all DPDK-managed memory is mapped).

Finally, get the device fd and the device info, and return:

/* get a file descriptor for the device */
*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);

/* test and setup the device */
ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
1.1.2 pci_vfio_get_msix_bar
This reads the device's PCI configuration space through the device fd obtained in the previous step, extracts the MSI-X configuration, and saves it into the vfio_res structure.

/* get MSI-X BAR, if any (we have to know where it is because we can't
* easily mmap it when using VFIO)
*/
ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
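
The MSI-X table's BAR can be located by walking the PCI capability list through the VFIO config-space region, which is the spirit of what pci_vfio_get_msix_bar does. The helper below, find_msix_bar, is my own sketch with abbreviated error handling, not the DPDK code:

#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/pci_regs.h>
#include <linux/vfio.h>

static int find_msix_bar(int vfio_dev_fd)
{
    struct vfio_region_info cfg = {
        .argsz = sizeof(cfg),
        .index = VFIO_PCI_CONFIG_REGION_INDEX,
    };
    uint8_t pos, id, next;
    uint32_t table;
    int guard = 64;                 /* bound the capability walk */

    if (ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &cfg) < 0)
        return -1;

    /* pointer to the first capability */
    pread(vfio_dev_fd, &pos, 1, cfg.offset + PCI_CAPABILITY_LIST);

    while (pos && guard--) {
        pread(vfio_dev_fd, &id, 1, cfg.offset + pos + PCI_CAP_LIST_ID);
        pread(vfio_dev_fd, &next, 1, cfg.offset + pos + PCI_CAP_LIST_NEXT);
        if (id == PCI_CAP_ID_MSIX) {
            pread(vfio_dev_fd, &table, 4, cfg.offset + pos + PCI_MSIX_TABLE);
            return table & PCI_MSIX_TABLE_BIR;  /* BAR index of the table */
        }
        pos = next;
    }
    return -1;                      /* device has no MSI-X capability */
}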

1.1.3 pci_vfio_get_region_info & pci_vfio_mmap_bar
Get the device's BAR region information (offset, size, flags and so on) and complete the mmap mapping of the registers, so that the user-space program can access the PCI device's registers directly.

1.1.4 pci_rte_vfio_setup_device
This function first sets up the device's interrupts (adding the interrupt handle to the EAL's interrupt list), then enables bus mastering for the device and resets it.


static int
pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
{
    if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
        RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
        return -1;
    }

    /* set bus mastering for the device */
    if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
        RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
        return -1;
    }

    /*
     * Reset the device. If the device is not capable of resetting,
     * then it updates errno as EINVAL.
     */
    if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) {
        RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n",
            errno, strerror(errno));
        return -1;
    }

    return 0;
}
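
For reference, here is a sketch (my own, with the helper name set_bus_master invented for illustration) of how the job of pci_vfio_set_bus_master can be done from user space: toggle PCI_COMMAND_MASTER in the command register through the VFIO config-space region. The region-info query stands in for however DPDK computes the config-region offset internally.

#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/pci_regs.h>
#include <linux/vfio.h>

static int set_bus_master(int vfio_dev_fd, int enable)
{
    struct vfio_region_info cfg = {
        .argsz = sizeof(cfg),
        .index = VFIO_PCI_CONFIG_REGION_INDEX,
    };
    uint16_t cmd;

    if (ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &cfg) < 0)
        return -1;
    if (pread(vfio_dev_fd, &cmd, sizeof(cmd),
              cfg.offset + PCI_COMMAND) != sizeof(cmd))
        return -1;

    if (enable)
        cmd |= PCI_COMMAND_MASTER;      /* allow the device to initiate DMA */
    else
        cmd &= ~PCI_COMMAND_MASTER;

    if (pwrite(vfio_dev_fd, &cmd, sizeof(cmd),
               cfg.offset + PCI_COMMAND) != sizeof(cmd))
        return -1;
    return 0;
}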
