linux_dsm_epyc7002/drivers/vfio/pci/vfio_pci.c
Alex Williamson 89e1f7d4c6 vfio: Add PCI device driver
Add PCI device support for VFIO.  PCI devices expose regions
for accessing config space, I/O port space, and MMIO areas
of the device.  PCI config access is virtualized in the kernel,
allowing us to ensure the integrity of the system, by preventing
various accesses while reducing duplicate support across various
userspace drivers.  I/O port supports read/write access while
MMIO also supports mmap of sufficiently sized regions.  Support
for INTx, MSI, and MSI-X interrupts is provided using eventfds to
userspace.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2012-07-31 08:16:24 -06:00

580 lines
14 KiB
C

/*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
*/
#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include "vfio_pci_private.h"
/* Identification strings passed to the MODULE_* macros at the end of this file. */
#define DRIVER_VERSION "0.2"
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC "VFIO PCI - User Level meta-driver"
/*
 * "nointxmask" module parameter (permissions S_IRUGO | S_IWUSR).
 * When set, vfio_pci_enable() does not probe the device with
 * pci_intx_mask_supported().
 */
static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
"Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
/*
 * Bring the device up for its first user (called from vfio_pci_open() when
 * the reference count goes from 0 to 1).
 *
 * In order: try a function reset and remember whether it returned 0, save
 * the PCI state so vfio_pci_disable() can reload it, initialise the
 * virtualised config space, clear a stale INTx-disable bit when PCI 2.3
 * masking is in use, cache where the MSI-X table lives, and enable the
 * device.
 *
 * Returns 0 on success, otherwise the error from vfio_config_init() or
 * pci_enable_device(); on error the stored state and config data are freed.
 */
static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	u8 msix_cap;
	u16 command;
	int err;

	/* A zero return from the reset attempt is recorded as "reset works". */
	vdev->reset_works = !pci_reset_function(pdev);

	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state) {
		/* Not fatal: only logged. */
		pr_debug("%s: Couldn't store %s saved state\n",
			 __func__, dev_name(&pdev->dev));
	}

	err = vfio_config_init(vdev);
	if (err)
		goto err_cleanup;

	if (likely(!nointxmask))
		vdev->pci_2_3 = pci_intx_mask_supported(pdev);

	/* With PCI 2.3 masking, make sure INTx is not left disabled. */
	pci_read_config_word(pdev, PCI_COMMAND, &command);
	if (vdev->pci_2_3 && (command & PCI_COMMAND_INTX_DISABLE))
		pci_write_config_word(pdev, PCI_COMMAND,
				      command & ~PCI_COMMAND_INTX_DISABLE);

	msix_cap = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
	if (!msix_cap) {
		/* No MSI-X capability: 0xFF marks "no MSI-X BAR". */
		vdev->msix_bar = 0xFF;
	} else {
		u16 msix_flags;
		u32 table_reg;

		pci_read_config_word(pdev, msix_cap + PCI_MSIX_FLAGS,
				     &msix_flags);
		pci_read_config_dword(pdev, msix_cap + PCI_MSIX_TABLE,
				      &table_reg);

		/* Table register: BAR index in the BIR bits, offset in the rest. */
		vdev->msix_bar = table_reg & PCI_MSIX_FLAGS_BIRMASK;
		vdev->msix_offset = table_reg & ~PCI_MSIX_FLAGS_BIRMASK;
		/* (QSIZE field + 1) entries, multiplied by 16 bytes each. */
		vdev->msix_size = ((msix_flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
	}

	err = pci_enable_device(pdev);
	if (!err)
		return 0;

err_cleanup:
	kfree(vdev->pci_saved_state);
	vdev->pci_saved_state = NULL;
	vfio_config_free(vdev);
	return err;
}
static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
int bar;
pci_disable_device(vdev->pdev);
vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
VFIO_IRQ_SET_ACTION_TRIGGER,
vdev->irq_type, 0, 0, NULL);
vdev->virq_disabled = false;
vfio_config_free(vdev);
pci_reset_function(vdev->pdev);
if (pci_load_and_free_saved_state(vdev->pdev,
&vdev->pci_saved_state) == 0)
pci_restore_state(vdev->pdev);
else
pr_info("%s: Couldn't reload %s saved state\n",
__func__, dev_name(&vdev->pdev->dev));
for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
if (!vdev->barmap[bar])
continue;
pci_iounmap(vdev->pdev, vdev->barmap[bar]);
pci_release_selected_regions(vdev->pdev, 1 << bar);
vdev->barmap[bar] = NULL;
}
}
static void vfio_pci_release(void *device_data)
{
struct vfio_pci_device *vdev = device_data;
if (atomic_dec_and_test(&vdev->refcnt))
vfio_pci_disable(vdev);
module_put(THIS_MODULE);
}
/*
 * vfio_device_ops.open: register one more user of the device.
 *
 * Takes a module reference and bumps the device reference count; the first
 * user also enables the device via vfio_pci_enable().
 *
 * Returns 0 on success, -ENODEV if the module reference cannot be taken, or
 * the error from vfio_pci_enable().  On an enable failure both the module
 * reference and the device reference are dropped again, so that a later
 * open retries the enable instead of silently succeeding on a device that
 * was never set up.
 *
 * NOTE(review): a second open racing with the first one's enable is not
 * serialized here - confirm whether callers provide that exclusion.
 */
static int vfio_pci_open(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;
	int ret;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* Only the 0 -> 1 transition has to bring the device up. */
	if (atomic_inc_return(&vdev->refcnt) != 1)
		return 0;

	ret = vfio_pci_enable(vdev);
	if (ret) {
		/* Undo the increment so refcnt reflects real users only. */
		atomic_dec(&vdev->refcnt);
		module_put(THIS_MODULE);
	}

	return ret;
}
/*
 * Report how many interrupts the device offers for a given VFIO IRQ index.
 *
 * @vdev:     device to query
 * @irq_type: one of VFIO_PCI_INTX_IRQ_INDEX, VFIO_PCI_MSI_IRQ_INDEX or
 *            VFIO_PCI_MSIX_IRQ_INDEX
 *
 * Returns 1 for INTx when the interrupt pin register is non-zero, the
 * vector count decoded from the capability for MSI / MSI-X, and 0 when the
 * device lacks the interrupt type or the index is not one of the above.
 */
static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
{
	struct pci_dev *pdev = vdev->pdev;
	u16 flags;
	u8 pos;

	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
		u8 pin;

		pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
		if (pin)
			return 1;
	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
		pos = pci_find_capability(pdev, PCI_CAP_ID_MSI);
		if (pos) {
			pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &flags);
			/*
			 * PCI_MSI_FLAGS_QMASK covers bits 3:1, which hold
			 * log2 of the vector count; shift the field down to
			 * bit 0 before using it as the exponent.
			 */
			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
		}
	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
		pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
		if (pos) {
			pci_read_config_word(pdev, pos + PCI_MSIX_FLAGS, &flags);
			/* The table size field is encoded as N - 1. */
			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
		}
	}

	return 0;
}
/*
 * vfio_device_ops.ioctl: device-level ioctl dispatcher.
 *
 * Handles:
 *   VFIO_DEVICE_GET_INFO        - device flags and region / IRQ counts
 *   VFIO_DEVICE_GET_REGION_INFO - offset, size and access flags of a region
 *   VFIO_DEVICE_GET_IRQ_INFO    - flags and count of an IRQ index
 *   VFIO_DEVICE_SET_IRQS        - forwards an IRQ set request (with optional
 *                                 bool / eventfd payload) under the igate
 *                                 mutex
 *   VFIO_DEVICE_RESET           - function reset, if it worked at enable time
 *
 * Returns 0 (or the callee's result) on success, -EFAULT when user memory
 * cannot be read or written, -EINVAL for malformed arguments, -ENOMEM when
 * the SET_IRQS payload cannot be allocated, and -ENOTTY for unknown
 * commands.
 *
 * The GET_* paths translate a non-zero copy_to_user() result into -EFAULT;
 * copy_to_user() reports the number of bytes left uncopied, which must not
 * be handed back to userspace as the ioctl return value.
 */
static long vfio_pci_ioctl(void *device_data,
			   unsigned int cmd, unsigned long arg)
{
	struct vfio_pci_device *vdev = device_data;
	unsigned long minsz;

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;

		if (vdev->reset_works)
			info.flags |= VFIO_DEVICE_FLAGS_RESET;

		info.num_regions = VFIO_PCI_NUM_REGIONS;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct pci_dev *pdev = vdev->pdev;
		struct vfio_region_info info;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pdev->cfg_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size) {
				/* Unimplemented BAR: no access at all. */
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			/* Only memory BARs of at least one page can be mmap'd. */
			if ((pci_resource_flags(pdev, info.index) &
			     IORESOURCE_MEM) && info.size >= PAGE_SIZE)
				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
			break;
		case VFIO_PCI_ROM_REGION_INDEX:
		{
			void __iomem *io;
			size_t size;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = 0;

			/* Report the BAR size, not the ROM size */
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size)
				break;

			/* Is it really there? */
			io = pci_map_rom(pdev, &size);
			if (!io || !size) {
				info.size = 0;
				break;
			}
			pci_unmap_rom(pdev, io);

			info.flags = VFIO_REGION_INFO_FLAG_READ;
			break;
		}
		default:
			return -EINVAL;
		}

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = vfio_pci_get_irq_count(vdev, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int ret = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
		    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				  VFIO_IRQ_SET_ACTION_TYPE_MASK))
			return -EINVAL;

		/* Anything other than DATA_NONE carries a per-IRQ payload. */
		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
			size_t size;

			if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
				size = sizeof(uint8_t);
			else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
				size = sizeof(int32_t);
			else
				return -EINVAL;

			/*
			 * The payload must fit in what the caller says it
			 * passed, and may not cover more IRQs than exist.
			 */
			if (hdr.argsz - minsz < hdr.count * size ||
			    hdr.count > vfio_pci_get_irq_count(vdev, hdr.index))
				return -EINVAL;

			data = kmalloc(hdr.count * size, GFP_KERNEL);
			if (!data)
				return -ENOMEM;

			if (copy_from_user(data, (void __user *)(arg + minsz),
					   hdr.count * size)) {
				kfree(data);
				return -EFAULT;
			}
		}

		mutex_lock(&vdev->igate);

		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
					      hdr.start, hdr.count, data);

		mutex_unlock(&vdev->igate);
		kfree(data);

		return ret;

	} else if (cmd == VFIO_DEVICE_RESET) {
		return vdev->reset_works ?
			pci_reset_function(vdev->pdev) : -EINVAL;
	}

	return -ENOTTY;
}
/*
 * vfio_device_ops.read: read from the region selected by the file offset.
 *
 * The region index is decoded from *ppos.  Config space and the ROM have
 * dedicated handlers; the remaining regions are routed by the resource type
 * of the BAR (I/O port or memory).
 *
 * Returns the handler's result, or -EINVAL for an out-of-range index or a
 * BAR that is neither I/O nor memory.
 */
static ssize_t vfio_pci_read(void *device_data, char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	unsigned int region = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	unsigned long res_flags;

	if (region >= VFIO_PCI_NUM_REGIONS)
		return -EINVAL;

	switch (region) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		return vfio_pci_config_readwrite(vdev, buf, count, ppos, false);
	case VFIO_PCI_ROM_REGION_INDEX:
		return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
	default:
		break;
	}

	/* What is left is a BAR: dispatch on its resource type. */
	res_flags = pci_resource_flags(pdev, region);
	if (res_flags & IORESOURCE_IO)
		return vfio_pci_io_readwrite(vdev, buf, count, ppos, false);
	if (res_flags & IORESOURCE_MEM)
		return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);

	return -EINVAL;
}
/*
 * vfio_device_ops.write: write to the region selected by the file offset.
 *
 * Mirrors vfio_pci_read(), except that the ROM region is read-only and
 * always rejected.  The shared read/write helpers take a non-const buffer,
 * hence the cast on @buf.
 *
 * Returns the handler's result, or -EINVAL for an out-of-range index, the
 * ROM region, or a BAR that is neither I/O nor memory.
 */
static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	unsigned int region = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	char __user *ubuf = (char __user *)buf;
	unsigned long res_flags;

	if (region >= VFIO_PCI_NUM_REGIONS)
		return -EINVAL;

	switch (region) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		return vfio_pci_config_readwrite(vdev, ubuf, count, ppos, true);
	case VFIO_PCI_ROM_REGION_INDEX:
		return -EINVAL;
	default:
		break;
	}

	/* What is left is a BAR: dispatch on its resource type. */
	res_flags = pci_resource_flags(pdev, region);
	if (res_flags & IORESOURCE_IO)
		return vfio_pci_io_readwrite(vdev, ubuf, count, ppos, true);
	if (res_flags & IORESOURCE_MEM)
		return vfio_pci_mem_readwrite(vdev, ubuf, count, ppos, true);

	return -EINVAL;
}
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
struct vfio_pci_device *vdev = device_data;
struct pci_dev *pdev = vdev->pdev;
unsigned int index;
u64 phys_len, req_len, pgoff, req_start, phys;
int ret;
index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
if (vma->vm_end < vma->vm_start)
return -EINVAL;
if ((vma->vm_flags & VM_SHARED) == 0)
return -EINVAL;
if (index >= VFIO_PCI_ROM_REGION_INDEX)
return -EINVAL;
if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
return -EINVAL;
phys_len = pci_resource_len(pdev, index);
req_len = vma->vm_end - vma->vm_start;
pgoff = vma->vm_pgoff &
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
req_start = pgoff << PAGE_SHIFT;
if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
return -EINVAL;
if (index == vdev->msix_bar) {
/*
* Disallow mmaps overlapping the MSI-X table; users don't
* get to touch this directly. We could find somewhere
* else to map the overlap, but page granularity is only
* a recommendation, not a requirement, so the user needs
* to know which bits are real. Requiring them to mmap
* around the table makes that clear.
*/
/* If neither entirely above nor below, then it overlaps */
if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
req_start + req_len <= vdev->msix_offset))
return -EINVAL;
}
/*
* Even though we don't make use of the barmap for the mmap,
* we need to request the region and the barmap tracks that.
*/
if (!vdev->barmap[index]) {
ret = pci_request_selected_regions(pdev,
1 << index, "vfio-pci");
if (ret)
return ret;
vdev->barmap[index] = pci_iomap(pdev, index, 0);
}
vma->vm_private_data = vdev;
vma->vm_flags |= (VM_IO | VM_RESERVED);
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
phys = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
return remap_pfn_range(vma, vma->vm_start, phys,
req_len, vma->vm_page_prot);
}
/* Device callbacks handed to the VFIO core by vfio_pci_probe(). */
static const struct vfio_device_ops vfio_pci_ops = {
	.name		= "vfio-pci",
	.open		= vfio_pci_open,
	.release	= vfio_pci_release,
	.ioctl		= vfio_pci_ioctl,
	.read		= vfio_pci_read,
	.write		= vfio_pci_write,
	.mmap		= vfio_pci_mmap,
};
/*
 * PCI driver probe: attach vfio-pci to a device.
 *
 * Only devices with a normal (type 0) configuration header that belong to
 * an IOMMU group are accepted.  On success the per-device state is
 * registered with the VFIO core and the IOMMU group reference is kept until
 * vfio_pci_remove().
 *
 * Returns 0 on success, -EINVAL for an unsupported header type or a device
 * without an IOMMU group, -ENOMEM on allocation failure, or the error from
 * vfio_add_group_dev().
 */
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	/*
	 * Bits 6:0 of the header type register hold the layout; bit 7 is the
	 * multi-function flag.  PCI_HEADER_TYPE is the register *offset*
	 * (0x0e) and must not be used as the mask: doing so lets header
	 * type 1 (PCI-PCI bridge) pass as "normal".
	 */
	const u8 hdr_layout_mask = 0x7f;
	struct vfio_pci_device *vdev;
	struct iommu_group *group;
	u8 type;
	int ret;

	pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
	if ((type & hdr_layout_mask) != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EINVAL;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev) {
		iommu_group_put(group);
		return -ENOMEM;
	}

	vdev->pdev = pdev;
	/* An out-of-range index: no interrupt type is configured yet. */
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);
	atomic_set(&vdev->refcnt, 0);

	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
	if (ret) {
		iommu_group_put(group);
		kfree(vdev);
	}

	return ret;
}
/*
 * PCI driver remove: detach vfio-pci from a device.
 *
 * Unregisters the device from the VFIO core; if that yields our per-device
 * state, the IOMMU group reference is dropped and the state is freed.
 */
static void vfio_pci_remove(struct pci_dev *pdev)
{
	struct vfio_pci_device *vdev = vfio_del_group_dev(&pdev->dev);

	if (vdev) {
		iommu_group_put(pdev->dev.iommu_group);
		kfree(vdev);
	}
}
/* PCI driver registration record; devices bind through dynamic IDs only. */
static struct pci_driver vfio_pci_driver = {
	.name		= "vfio-pci",
	.id_table	= NULL, /* only dynamic ids */
	.probe		= vfio_pci_probe,
	.remove		= vfio_pci_remove,
};
/*
 * Module exit: undo vfio_pci_init() in reverse order - unregister the PCI
 * driver, stop the virqfd handling, then free the shared permission bits.
 */
static void __exit vfio_pci_cleanup(void)
{
	/* Driver first, so no device is still bound while the rest goes away. */
	pci_unregister_driver(&vfio_pci_driver);

	vfio_pci_virqfd_exit();

	vfio_pci_uninit_perm_bits();
}
/*
 * Module init: set up shared state and register the PCI driver.
 *
 * Returns 0 on success or the error of the step that failed.  On failure
 * only the steps that already succeeded are undone, in reverse order:
 * a virqfd init failure frees the permission bits, and a driver
 * registration failure additionally shuts the virqfd handling down.
 */
static int __init vfio_pci_init(void)
{
	int ret;

	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();
	if (ret)
		return ret;

	/* Start the virqfd cleanup handler */
	ret = vfio_pci_virqfd_init();
	if (ret)
		goto out_perm_bits;

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);
	if (ret)
		goto out_virqfd;

	return 0;

out_virqfd:
	vfio_pci_virqfd_exit();
out_perm_bits:
	vfio_pci_uninit_perm_bits();
	return ret;
}
/* Register the module entry/exit points and publish the module metadata. */
module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);