linux_dsm_epyc7002/drivers/pci/p2pdma.c
Logan Gunthorpe 52916982af PCI/P2PDMA: Support peer-to-peer memory
Some PCI devices may have memory mapped in a BAR space that's intended for
use in peer-to-peer transactions.  To enable such transactions the memory
must be registered with ZONE_DEVICE pages so it can be used by DMA
interfaces in existing drivers.

Add an interface for other subsystems to find and allocate chunks of P2P
memory as necessary to facilitate transfers between two PCI peers:

  struct pci_dev *pci_p2pmem_find[_many]();
  int pci_p2pdma_distance[_many]();
  void *pci_alloc_p2pmem();

The new interface requires a driver to collect a list of client devices
involved in the transaction then call pci_p2pmem_find() to obtain any
suitable P2P memory.  Alternatively, if the caller knows a device which
provides P2P memory, they can use pci_p2pdma_distance() to determine if it
is usable.  With a suitable p2pmem device, memory can then be allocated
with pci_alloc_p2pmem() for use in DMA transactions.

Depending on hardware, using peer-to-peer memory may reduce the bandwidth
of the transfer but can significantly reduce pressure on system memory.
This may be desirable in many cases: for example a system could be designed
with a small CPU connected to a PCIe switch by a small number of lanes
which would maximize the number of lanes available to connect to NVMe
devices.

The code is designed to only utilize the p2pmem device if all the devices
involved in a transfer are behind the same PCI bridge.  This is because we
have no way of knowing whether peer-to-peer routing between PCIe Root Ports
is supported (PCIe r4.0, sec 1.3.1).  Additionally, the benefits of P2P
transfers that go through the RC are limited to only reducing DRAM usage
and, in some cases, coding convenience.  The PCI-SIG may be exploring
adding a new capability bit to advertise whether this is possible for
future hardware.

This commit includes significant rework and feedback from Christoph
Hellwig.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
[bhelgaas: fold in fix from Keith Busch <keith.busch@intel.com>:
https://lore.kernel.org/linux-pci/20181012155920.15418-1-keith.busch@intel.com,
to address comment from Dan Carpenter <dan.carpenter@oracle.com>, fold in
https://lore.kernel.org/linux-pci/20181017160510.17926-1-logang@deltatee.com]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2018-10-10 14:00:54 -05:00

627 lines
16 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* PCI Peer 2 Peer DMA support.
*
* Copyright (c) 2016-2018, Logan Gunthorpe
* Copyright (c) 2016-2017, Microsemi Corporation
* Copyright (c) 2017, Christoph Hellwig
* Copyright (c) 2018, Eideticom Inc.
*/
#include <linux/pci-p2pdma.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/genalloc.h>
#include <linux/memremap.h>
#include <linux/percpu-refcount.h>
#include <linux/random.h>
#include <linux/seq_buf.h>
/**
 * struct pci_p2pdma - per-device peer-to-peer DMA state
 * @devmap_ref: percpu reference handed to the device page map as its
 *	reference counter; it is also taken by pci_alloc_p2pmem() and
 *	dropped by pci_free_p2pmem().
 * @devmap_ref_done: completed from the @devmap_ref release callback;
 *	pci_p2pdma_release() waits on it before tearing the state down.
 * @pool: genalloc pool that the registered BAR memory is added to and
 *	that p2pmem allocations are carved from.
 * @p2pmem_published: set by pci_p2pmem_publish(), reported by
 *	pci_has_p2pmem().
 */
struct pci_p2pdma {
struct percpu_ref devmap_ref;
struct completion devmap_ref_done;
struct gen_pool *pool;
bool p2pmem_published;
};
/*
 * Release callback for the devmap percpu_ref: signal devmap_ref_done on the
 * pci_p2pdma that embeds @ref so that waiters on that completion may proceed.
 */
static void pci_p2pdma_percpu_release(struct percpu_ref *ref)
{
	struct pci_p2pdma *owner;

	owner = container_of(ref, struct pci_p2pdma, devmap_ref);
	complete_all(&owner->devmap_ref_done);
}
/*
 * devm action that kills the percpu_ref passed in @data.
 *
 * A driver may call pci_p2pdma_add_resource() several times, and every
 * successful call registers this action again.  Only the first invocation
 * is allowed to kill the ref; once it is already dying there is nothing
 * left to do.
 */
static void pci_p2pdma_percpu_kill(void *data)
{
	struct percpu_ref *devmap_ref = data;

	if (!percpu_ref_is_dying(devmap_ref))
		percpu_ref_kill(devmap_ref);
}
static void pci_p2pdma_release(void *data)
{
struct pci_dev *pdev = data;
if (!pdev->p2pdma)
return;
wait_for_completion(&pdev->p2pdma->devmap_ref_done);
percpu_ref_exit(&pdev->p2pdma->devmap_ref);
gen_pool_destroy(pdev->p2pdma->pool);
pdev->p2pdma = NULL;
}
/*
 * Allocate and initialise the per-device p2pdma state (pool, percpu_ref and
 * completion) and register pci_p2pdma_release() as a devm action.
 *
 * Returns 0 on success, in which case pdev->p2pdma points at the new state,
 * or a negative errno on failure, in which case everything allocated here
 * has been undone and pdev->p2pdma is left unchanged.
 */
static int pci_p2pdma_setup(struct pci_dev *pdev)
{
	int error = -ENOMEM;
	struct pci_p2pdma *p2p;

	p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL);
	if (!p2p)
		return -ENOMEM;

	p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
	if (!p2p->pool)
		goto out;

	init_completion(&p2p->devmap_ref_done);
	error = percpu_ref_init(&p2p->devmap_ref,
				pci_p2pdma_percpu_release, 0, GFP_KERNEL);
	if (error)
		goto out_pool_destroy;

	/*
	 * If registration fails, pci_p2pdma_release() is invoked right away,
	 * but pdev->p2pdma has not been assigned yet so it returns without
	 * cleaning anything up.  The percpu_ref therefore has to be exited
	 * explicitly on this path.
	 */
	error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
	if (error)
		goto out_ref_exit;

	pdev->p2pdma = p2p;

	return 0;

out_ref_exit:
	percpu_ref_exit(&p2p->devmap_ref);
out_pool_destroy:
	gen_pool_destroy(p2p->pool);
out:
	devm_kfree(&pdev->dev, p2p);
	return error;
}
/**
 * pci_p2pdma_add_resource - add memory for use as p2p memory
 * @pdev: the device to add the memory to
 * @bar: PCI BAR to add
 * @size: size of the memory to add, may be zero to use everything from
 *	@offset to the end of the BAR
 * @offset: offset into the PCI BAR
 *
 * The memory will be given ZONE_DEVICE struct pages so that it may
 * be used with any DMA request.
 *
 * Return: 0 on success; -EINVAL if @bar is not a memory BAR or the
 * requested range does not fit inside it; -ENOMEM or another negative
 * errno if setting up the mapping fails.
 */
int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
			    u64 offset)
{
	resource_size_t bar_len;
	struct dev_pagemap *pgmap;
	void *addr;
	int error;

	if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
		return -EINVAL;

	bar_len = pci_resource_len(pdev, bar);
	if (offset >= bar_len)
		return -EINVAL;

	if (!size)
		size = bar_len - offset;

	/*
	 * offset < bar_len was established above, so the subtraction cannot
	 * wrap; comparing this way avoids the overflow "size + offset" could
	 * hit for very large values.
	 */
	if (size > bar_len - offset)
		return -EINVAL;

	if (!pdev->p2pdma) {
		error = pci_p2pdma_setup(pdev);
		if (error)
			return error;
	}

	pgmap = devm_kzalloc(&pdev->dev, sizeof(*pgmap), GFP_KERNEL);
	if (!pgmap)
		return -ENOMEM;

	pgmap->res.start = pci_resource_start(pdev, bar) + offset;
	pgmap->res.end = pgmap->res.start + size - 1;
	pgmap->res.flags = pci_resource_flags(pdev, bar);
	pgmap->ref = &pdev->p2pdma->devmap_ref;
	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;

	addr = devm_memremap_pages(&pdev->dev, pgmap);
	if (IS_ERR(addr)) {
		error = PTR_ERR(addr);
		goto pgmap_free;
	}

	/*
	 * From here on pgmap has been handed to devm_memremap_pages() and
	 * must not be freed by hand: the device-managed teardown of the
	 * mapping is expected to still reference it.  On the error paths
	 * below the mapping and pgmap are left for devres to release with
	 * the device.
	 * NOTE(review): if this tree offers an explicit unmap helper for
	 * devm_memremap_pages(), calling it here would release the mapping
	 * immediately instead of at driver detach - confirm.
	 */
	error = gen_pool_add_virt(pdev->p2pdma->pool, (unsigned long)addr,
				  pci_bus_address(pdev, bar) + offset,
				  resource_size(&pgmap->res),
				  dev_to_node(&pdev->dev));
	if (error)
		return error;

	error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_percpu_kill,
					 &pdev->p2pdma->devmap_ref);
	if (error)
		return error;

	pci_info(pdev, "added peer-to-peer DMA memory %pR\n",
		 &pgmap->res);

	return 0;

pgmap_free:
	devm_kfree(&pdev->dev, pgmap);
	return error;
}
EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource);
/*
 * Walk from @dev up through its parents until a PCI device is found.
 *
 * The PCI device is returned with a reference held; it is the caller's
 * responsibility to drop that reference.  Returns NULL (holding no
 * references) if neither @dev nor any of its ancestors is a PCI device.
 */
static struct pci_dev *find_parent_pci_dev(struct device *dev)
{
	struct device *cur;
	struct device *up;

	for (cur = get_device(dev); cur; cur = up) {
		if (dev_is_pci(cur))
			return to_pci_dev(cur);

		/* Pin the parent before letting go of the child. */
		up = get_device(cur->parent);
		put_device(cur);
	}

	return NULL;
}
/*
 * Report whether any of the ACS redirection bits (PCI_ACS_RR, PCI_ACS_CR
 * or PCI_ACS_EC) is set in the ACS control register of @pdev, i.e. whether
 * P2P TLPs will be redirected upstream.
 *
 * Returns 1 if so, 0 otherwise (including when @pdev has no ACS
 * capability).
 */
static int pci_bridge_has_acs_redir(struct pci_dev *pdev)
{
	const u16 redir_mask = PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC;
	u16 acs_ctrl;
	int acs_cap;

	acs_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ACS);
	if (!acs_cap)
		return 0;

	pci_read_config_word(pdev, acs_cap + PCI_ACS_CTRL, &acs_ctrl);

	return (acs_ctrl & redir_mask) ? 1 : 0;
}
/*
 * Append the PCI name of @pdev followed by a ';' to @buf.  A NULL @buf is
 * accepted and makes this a no-op.
 */
static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev)
{
	if (buf)
		seq_buf_printf(buf, "%s;", pci_name(pdev));
}
/*
 * Compute the distance between two PCI devices, measured through their
 * nearest common upstream bridge.
 *
 * Identical devices have distance 0.
 *
 * Two virtual functions of one device behind the same bridge port have
 * distance 2 (one step down to the PCIe switch, one step back to the same
 * device).
 *
 * Two devices attached to the same PCIe switch have distance 4, matching
 * this PCI tree:
 *
 *     -+  Root Port
 *      \+ Switch Upstream Port
 *       +-+ Switch Downstream Port
 *       + \- Device A
 *       \-+ Switch Downstream Port
 *         \- Device B
 *
 * The path runs from Device A through its downstream port to the common
 * upstream port, then back through the second downstream port to
 * Device B: four hops.
 *
 * Devices without any common upstream bridge yield -1.  This rejects
 * devices on separate PCIe root ports, which is what we want for
 * peer-to-peer: each root port defines its own hierarchy domain and there
 * is no way to tell whether the root complex forwards between them.
 *
 * Devices behind different PCIe switches still get a positive distance as
 * long as the switches eventually share an upstream bridge.  That covers
 * cascading several switches to reach the desired fan-out from a root
 * port; the distance then depends on the number of switches between
 * Device A and Device B.
 *
 * If any bridge on the path has an ACS redirection bit set, -2 is returned
 * so that cases where TLPs get forwarded up into the root complex are
 * rejected.  The addresses of all offending bridges are appended to
 * @acs_list (when it is non-NULL) for printk purposes.
 */
static int upstream_bridge_distance(struct pci_dev *a,
				    struct pci_dev *b,
				    struct seq_buf *acs_list)
{
	struct pci_dev *common;
	struct pci_dev *walk;
	int hops_a = 0;
	int hops_b = 0;
	int redir_bridges = 0;

	/*
	 * No references are taken on the devices returned by
	 * pci_upstream_bridge(): we already hold a reference to a child
	 * device, and that child holds a reference to its upstream bridge.
	 */
	for (common = a; common;
	     common = pci_upstream_bridge(common), hops_a++) {
		if (pci_bridge_has_acs_redir(common)) {
			seq_buf_print_bus_devfn(acs_list, common);
			redir_bridges++;
		}

		/* Is this ancestor of a also b itself or an ancestor of b? */
		hops_b = 0;
		for (walk = b; walk && walk != common;
		     walk = pci_upstream_bridge(walk))
			hops_b++;

		if (walk)
			break;
	}

	/* Ran off the top of a's hierarchy without meeting b's path. */
	if (!common)
		return -1;

	/* Check the b side of the path, up to but excluding the common node. */
	for (walk = b; walk && walk != common;
	     walk = pci_upstream_bridge(walk)) {
		if (pci_bridge_has_acs_redir(walk)) {
			seq_buf_print_bus_devfn(acs_list, walk);
			redir_bridges++;
		}
	}

	return redir_bridges ? -2 : hops_a + hops_b;
}
static int upstream_bridge_distance_warn(struct pci_dev *provider,
struct pci_dev *client)
{
struct seq_buf acs_list;
int ret;
seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
if (!acs_list.buffer)
return -ENOMEM;
ret = upstream_bridge_distance(provider, client, &acs_list);
if (ret == -2) {
pci_warn(client, "cannot be used for peer-to-peer DMA as ACS redirect is set between the client and provider (%s)\n",
pci_name(provider));
/* Drop final semicolon */
acs_list.buffer[acs_list.len-1] = 0;
pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n",
acs_list.buffer);
} else if (ret < 0) {
pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge\n",
pci_name(provider));
}
kfree(acs_list.buffer);
return ret;
}
/**
 * pci_p2pdma_distance_many - Determine the cumulative distance between
 *	a p2pdma provider and the clients in use.
 * @provider: p2pdma provider to check against the client list
 * @clients: array of devices to check
 * @num_clients: number of clients in the array
 * @verbose: if true, print warnings for devices when we return -1
 *
 * Returns -1 if @num_clients is zero or if any of the clients is not
 * compatible with the provider, otherwise returns the sum of the
 * per-client distances, where a lower number is the preferable choice.
 * (If there is one client that is the same as the provider it will
 * return 0, which is the best choice.)
 *
 * For now, "compatible" means the provider and the clients are all behind
 * the same PCI root port. This cuts out cases that may work but is safest
 * for the user. Future work can expand this to white-list root complexes
 * that can safely forward between their ports.
 */
int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients,
			     int num_clients, bool verbose)
{
	struct pci_dev *client_pdev;
	bool unsupported = false;
	int total = 0;
	int dist;
	int idx;

	if (!num_clients)
		return -1;

	for (idx = 0; idx < num_clients; idx++) {
		client_pdev = find_parent_pci_dev(clients[idx]);
		if (!client_pdev) {
			if (verbose)
				dev_warn(clients[idx],
					 "cannot be used for peer-to-peer DMA as it is not a PCI device\n");
			return -1;
		}

		dist = verbose ?
			upstream_bridge_distance_warn(provider, client_pdev) :
			upstream_bridge_distance(provider, client_pdev, NULL);

		pci_dev_put(client_pdev);

		if (dist < 0) {
			unsupported = true;

			/*
			 * Without verbose output there is nothing to gain
			 * from examining the remaining clients.
			 */
			if (!verbose)
				break;
		}

		total += dist;
	}

	return unsupported ? -1 : total;
}
EXPORT_SYMBOL_GPL(pci_p2pdma_distance_many);
/**
* pci_has_p2pmem - check if a given PCI device has published any p2pmem
* @pdev: PCI device to check
*/
bool pci_has_p2pmem(struct pci_dev *pdev)
{
return pdev->p2pdma && pdev->p2pdma->p2pmem_published;
}
EXPORT_SYMBOL_GPL(pci_has_p2pmem);
/**
 * pci_p2pmem_find_many - find a peer-to-peer DMA memory device compatible
 *	with the specified list of clients and shortest distance (as
 *	determined by pci_p2pdma_distance_many())
 * @clients: array of devices to check
 * @num_clients: number of client devices in the list
 *
 * If multiple devices are behind the same switch, the one "closest" to the
 * client devices in use will be chosen first. (So if one of the providers
 * is the same as one of the clients, that provider will be used ahead of
 * any other providers that are unrelated). If multiple providers are an
 * equal distance away, one will be chosen at random.
 *
 * Returns a pointer to the PCI device with a reference taken (use
 * pci_dev_put to return the reference) or NULL if no compatible device is
 * found or the candidate list cannot be allocated.
 */
struct pci_dev *pci_p2pmem_find_many(struct device **clients, int num_clients)
{
	struct pci_dev **candidates;
	struct pci_dev *iter = NULL;
	struct pci_dev *chosen = NULL;
	const int capacity = PAGE_SIZE / sizeof(*candidates);
	int best = INT_MAX;
	int count = 0;
	int dist;
	int idx;

	candidates = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!candidates)
		return NULL;

	while ((iter = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, iter))) {
		if (!pci_has_p2pmem(iter))
			continue;

		dist = pci_p2pdma_distance_many(iter, clients,
						num_clients, false);
		if (dist < 0 || dist > best)
			continue;

		if (dist < best) {
			/* Strictly closer: forget every earlier candidate. */
			for (idx = 0; idx < count; idx++)
				pci_dev_put(candidates[idx]);
			count = 0;
			best = dist;
		} else if (count >= capacity) {
			/* Tie, but the candidate list is already full. */
			continue;
		}

		candidates[count++] = pci_dev_get(iter);
	}

	/* Take the caller's reference before dropping the list's own. */
	if (count)
		chosen = pci_dev_get(candidates[prandom_u32_max(count)]);

	for (idx = 0; idx < count; idx++)
		pci_dev_put(candidates[idx]);

	kfree(candidates);

	return chosen;
}
EXPORT_SYMBOL_GPL(pci_p2pmem_find_many);
/**
 * pci_alloc_p2pmem - allocate peer-to-peer DMA memory
 * @pdev: the device to allocate memory from
 * @size: number of bytes to allocate
 *
 * A reference on the device's devmap ref is held for as long as the
 * allocation is outstanding.
 *
 * Returns the allocated memory or NULL on error.
 */
void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size)
{
	struct pci_p2pdma *p2p = pdev->p2pdma;
	unsigned long chunk;

	if (unlikely(!p2p))
		return NULL;

	if (unlikely(!percpu_ref_tryget_live(&p2p->devmap_ref)))
		return NULL;

	chunk = gen_pool_alloc(p2p->pool, size);

	/* Nothing was handed out, so give the reference back. */
	if (unlikely(!chunk))
		percpu_ref_put(&p2p->devmap_ref);

	return (void *)chunk;
}
EXPORT_SYMBOL_GPL(pci_alloc_p2pmem);
/**
 * pci_free_p2pmem - free peer-to-peer DMA memory
 * @pdev: the device the memory was allocated from
 * @addr: address of the memory that was allocated
 * @size: number of bytes that was allocated
 *
 * Returns the memory to the device's pool and drops one reference on the
 * device's devmap ref.
 */
void pci_free_p2pmem(struct pci_dev *pdev, void *addr, size_t size)
{
	struct pci_p2pdma *p2p = pdev->p2pdma;

	gen_pool_free(p2p->pool, (uintptr_t)addr, size);
	percpu_ref_put(&p2p->devmap_ref);
}
EXPORT_SYMBOL_GPL(pci_free_p2pmem);
/**
 * pci_p2pmem_virt_to_bus - return the PCI bus address for a given virtual
 *	address obtained with pci_alloc_p2pmem()
 * @pdev: the device the memory was allocated from
 * @addr: address of the memory that was allocated
 *
 * Returns 0 if @addr is NULL or @pdev has no p2pdma state.
 */
pci_bus_addr_t pci_p2pmem_virt_to_bus(struct pci_dev *pdev, void *addr)
{
	if (!addr || !pdev->p2pdma)
		return 0;

	/*
	 * The memory was added to the pool with its PCI bus address in the
	 * "physical address" slot, so despite its name
	 * gen_pool_virt_to_phys() hands back the bus address.
	 */
	return gen_pool_virt_to_phys(pdev->p2pdma->pool, (unsigned long)addr);
}
EXPORT_SYMBOL_GPL(pci_p2pmem_virt_to_bus);
/**
 * pci_p2pmem_alloc_sgl - allocate peer-to-peer DMA memory in a scatterlist
 * @pdev: the device to allocate memory from
 * @nents: set to the number of SG entries in the list (always 1) on success;
 *	left untouched on failure
 * @length: number of bytes to allocate
 *
 * Returns a single-entry scatterlist describing the allocated memory, or
 * NULL if either the scatterlist or the p2p memory cannot be allocated.
 */
struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev,
					 unsigned int *nents, u32 length)
{
	struct scatterlist *sgl;
	void *buf;

	sgl = kzalloc(sizeof(*sgl), GFP_KERNEL);
	if (!sgl)
		return NULL;

	sg_init_table(sgl, 1);

	buf = pci_alloc_p2pmem(pdev, length);
	if (!buf) {
		kfree(sgl);
		return NULL;
	}

	sg_set_buf(sgl, buf, length);
	*nents = 1;

	return sgl;
}
EXPORT_SYMBOL_GPL(pci_p2pmem_alloc_sgl);
/**
* pci_p2pmem_free_sgl - free a scatterlist allocated by pci_p2pmem_alloc_sgl()
* @pdev: the device to allocate memory from
* @sgl: the allocated scatterlist
*/
void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl)
{
struct scatterlist *sg;
int count;
for_each_sg(sgl, sg, INT_MAX, count) {
if (!sg)
break;
pci_free_p2pmem(pdev, sg_virt(sg), sg->length);
}
kfree(sgl);
}
EXPORT_SYMBOL_GPL(pci_p2pmem_free_sgl);
/**
 * pci_p2pmem_publish - publish the peer-to-peer DMA memory for use by
 *	other devices with pci_p2pmem_find()
 * @pdev: the device with peer-to-peer DMA memory to publish
 * @publish: set to true to publish the memory, false to unpublish it
 *
 * Published memory can be used by other PCI device drivers for
 * peer-2-peer DMA operations. Non-published memory is reserved for
 * exclusive use of the device driver that registers the peer-to-peer
 * memory.
 *
 * Does nothing if @pdev has no p2pdma state.
 */
void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
{
	struct pci_p2pdma *p2p = pdev->p2pdma;

	if (!p2p)
		return;

	p2p->p2pmem_published = publish;
}
EXPORT_SYMBOL_GPL(pci_p2pmem_publish);