linux_dsm_epyc7002/net/xdp/xsk_buff_pool.c
Magnus Karlsson e5e1a4bc91 xsk: Fix possible memory leak at socket close
Fix a possible memory leak at xsk socket close that is caused by the
refcounting of the umem object being wrong. The reference count of the
umem was decremented only after the pool had been freed. Note that if
the buffer pool is destroyed, it is important that the umem is
destroyed after the pool, otherwise the umem would disappear while the
driver is still running. And as the buffer pool needs to be destroyed
in a work queue, the umem is also (if its refcount reaches zero)
destroyed after the buffer pool in that same work queue.

What was missing is that the refcount also needs to be decremented
when the pool is not freed and when the pool has not even been
created. The first case happens when the refcount of the pool is
higher than 1, i.e. it is still being used by some other socket using
the same device and queue id. In this case, it is safe to decrement
the refcount of the umem outside of the work queue as the umem will
never be freed because the refcount of the umem is always greater than
or equal to the refcount of the buffer pool. The second case is if the
buffer pool has not been created yet, i.e. the socket was closed
before it was bound but after the umem was created. In this case, it
is safe to destroy the umem outside of the work queue, since there is
no pool that can use it by definition.

Fixes: 1c1efc2af1 ("xsk: Create and free buffer pool independently from umem")
Reported-by: syzbot+eb71df123dc2be2c1456@syzkaller.appspotmail.com
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Björn Töpel <bjorn.topel@intel.com>
Link: https://lore.kernel.org/bpf/1603801921-2712-1-git-send-email-magnus.karlsson@gmail.com
2020-10-29 15:19:56 +01:00

560 lines
12 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <net/xsk_buff_pool.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"
/*
 * Link socket @xs into the Tx socket list of @pool.
 *
 * Sockets whose xs->tx is NULL are skipped. The list is updated with
 * list_add_rcu() while xsk_tx_list_lock is held with interrupts saved and
 * disabled.
 */
void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs)
{
	unsigned long irq_flags;

	if (xs->tx) {
		spin_lock_irqsave(&pool->xsk_tx_list_lock, irq_flags);
		list_add_rcu(&xs->tx_list, &pool->xsk_tx_list);
		spin_unlock_irqrestore(&pool->xsk_tx_list_lock, irq_flags);
	}
}
/*
 * Unlink socket @xs from the Tx socket list of @pool.
 *
 * Sockets whose xs->tx is NULL are skipped (xp_add_xsk() never linked them).
 * The entry is removed with list_del_rcu() while xsk_tx_list_lock is held
 * with interrupts saved and disabled.
 */
void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs)
{
	unsigned long irq_flags;

	if (xs->tx) {
		spin_lock_irqsave(&pool->xsk_tx_list_lock, irq_flags);
		list_del_rcu(&xs->tx_list);
		spin_unlock_irqrestore(&pool->xsk_tx_list_lock, irq_flags);
	}
}
/*
 * Free the memory owned by @pool: first the heads array, then the pool
 * structure itself. A NULL @pool is accepted and ignored.
 */
void xp_destroy(struct xsk_buff_pool *pool)
{
	if (pool) {
		kvfree(pool->heads);
		kvfree(pool);
	}
}
/*
 * Allocate a buffer pool sized for @umem and initialise it from the umem's
 * geometry.
 *
 * The pool takes over the temporary fill and completion queues of @xs
 * (xs->fq_tmp / xs->cq_tmp), which are cleared on the socket afterwards.
 * The pool starts with a user count of one and with every head on the
 * free_heads array.
 *
 * Returns the new pool, or NULL if either allocation fails.
 */
struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
						struct xdp_umem *umem)
{
	struct xsk_buff_pool *new_pool;
	struct xdp_buff_xsk *head;
	u32 idx;

	new_pool = kvzalloc(struct_size(new_pool, free_heads, umem->chunks),
			    GFP_KERNEL);
	if (!new_pool)
		return NULL;

	new_pool->heads = kvcalloc(umem->chunks, sizeof(*new_pool->heads),
				   GFP_KERNEL);
	if (!new_pool->heads) {
		xp_destroy(new_pool);
		return NULL;
	}

	/* Geometry copied from the umem. */
	new_pool->chunk_mask = ~((u64)umem->chunk_size - 1);
	new_pool->addrs_cnt = umem->size;
	new_pool->heads_cnt = umem->chunks;
	new_pool->free_heads_cnt = umem->chunks;
	new_pool->headroom = umem->headroom;
	new_pool->chunk_size = umem->chunk_size;
	new_pool->unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
	new_pool->frame_len = umem->chunk_size - umem->headroom -
			      XDP_PACKET_HEADROOM;
	new_pool->umem = umem;
	new_pool->addrs = umem->addrs;

	/* Bookkeeping owned by the pool itself. */
	INIT_LIST_HEAD(&new_pool->free_list);
	INIT_LIST_HEAD(&new_pool->xsk_tx_list);
	spin_lock_init(&new_pool->xsk_tx_list_lock);
	refcount_set(&new_pool->users, 1);

	/* Move the socket's temporary queues into the pool. */
	new_pool->fq = xs->fq_tmp;
	new_pool->cq = xs->cq_tmp;
	xs->fq_tmp = NULL;
	xs->cq_tmp = NULL;

	/* Every head points back at the pool and starts out free. */
	for (idx = 0; idx < new_pool->free_heads_cnt; idx++) {
		head = &new_pool->heads[idx];
		head->pool = new_pool;
		head->xdp.frame_sz = umem->chunk_size - umem->headroom;
		new_pool->free_heads[idx] = head;
	}

	return new_pool;
}
/*
 * Store @rxq in the xdp.rxq field of every head owned by @pool
 * (heads_cnt entries).
 */
void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
{
	u32 idx = 0;

	while (idx < pool->heads_cnt) {
		pool->heads[idx].xdp.rxq = rxq;
		idx++;
	}
}
EXPORT_SYMBOL(xp_set_rxq_info);
/*
 * If the umem of @pool is marked zero-copy, issue an XDP_SETUP_XSK_POOL
 * ndo_bpf command with a NULL pool for the pool's queue id, and warn if the
 * driver reports an error. Asserts that RTNL is held.
 */
static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
{
	struct netdev_bpf bpf;

	ASSERT_RTNL();

	if (!pool->umem->zc)
		return;

	bpf.command = XDP_SETUP_XSK_POOL;
	bpf.xsk.pool = NULL;
	bpf.xsk.queue_id = pool->queue_id;

	if (pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf))
		WARN(1, "Failed to disable zero-copy!\n");
}
/*
 * Attach @pool to queue @queue_id of @netdev.
 *
 * @flags may contain XDP_ZEROCOPY, XDP_COPY and XDP_USE_NEED_WAKEUP.
 * Requesting both XDP_ZEROCOPY and XDP_COPY is rejected. Without XDP_COPY,
 * zero-copy setup is attempted through the driver's ndo_bpf hook; if it
 * fails and XDP_ZEROCOPY was not requested, the pool stays registered and
 * the function reports success (copy-mode fallback).
 *
 * On success a reference on @netdev is held (dev_hold()). On failure the
 * queue registration and the device reference are both undone.
 *
 * Must be called with RTNL held.
 *
 * Returns 0 on success, -EINVAL for conflicting flags or when the driver
 * accepted the pool but did not DMA map it, -EBUSY when the queue already
 * has a pool, -EOPNOTSUPP when the driver lacks the needed hooks, or the
 * error reported by xsk_reg_pool_at_qid() / ndo_bpf.
 */
static int __xp_assign_dev(struct xsk_buff_pool *pool,
			   struct net_device *netdev, u16 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err = 0;

	ASSERT_RTNL();

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	if (xsk_get_pool_from_qid(netdev, queue_id))
		return -EBUSY;

	pool->netdev = netdev;
	pool->queue_id = queue_id;
	err = xsk_reg_pool_at_qid(netdev, pool, queue_id);
	if (err)
		return err;

	if (flags & XDP_USE_NEED_WAKEUP) {
		pool->uses_need_wakeup = true;
		/* Tx needs to be explicitly woken up the first time.
		 * Also for supporting drivers that do not implement this
		 * feature. They will always have to call sendto().
		 */
		pool->cached_need_wakeup = XDP_WAKEUP_TX;
	}

	dev_hold(netdev);

	if (force_copy)
		/* For copy-mode, we are done. */
		return 0;

	if (!netdev->netdev_ops->ndo_bpf ||
	    !netdev->netdev_ops->ndo_xsk_wakeup) {
		err = -EOPNOTSUPP;
		goto err_unreg_pool;
	}

	bpf.command = XDP_SETUP_XSK_POOL;
	bpf.xsk.pool = pool;
	bpf.xsk.queue_id = queue_id;

	err = netdev->netdev_ops->ndo_bpf(netdev, &bpf);
	if (err)
		goto err_unreg_pool;

	if (!pool->dma_pages) {
		WARN(1, "Driver did not DMA map zero-copy buffers");
		/* ndo_bpf succeeded, so err is 0 here; without an explicit
		 * error a forced zero-copy bind would report success.
		 */
		err = -EINVAL;
		goto err_unreg_xsk;
	}
	pool->umem->zc = true;
	return 0;

err_unreg_xsk:
	xp_disable_drv_zc(pool);
err_unreg_pool:
	if (!force_zc)
		err = 0; /* fallback to copy mode */
	if (err) {
		xsk_clear_pool_at_qid(netdev, queue_id);
		/* Balance the dev_hold() above; on failure nobody will call
		 * xp_clear_dev() to drop the reference for us.
		 */
		dev_put(netdev);
	}
	return err;
}
/*
 * Attach @pool to queue @queue_id of @dev using the caller-supplied @flags.
 * Forwards all arguments unchanged to __xp_assign_dev() and returns its
 * result.
 */
int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev,
u16 queue_id, u16 flags)
{
return __xp_assign_dev(pool, dev, queue_id, flags);
}
/*
 * Attach @pool to queue @queue_id of @dev, deriving the bind flags from
 * existing state instead of from the caller: XDP_ZEROCOPY if @umem is
 * already marked zero-copy, XDP_COPY otherwise, plus XDP_USE_NEED_WAKEUP if
 * the pool has uses_need_wakeup set.
 *
 * Returns -EINVAL if the pool lacks a fill or completion queue, otherwise
 * the result of __xp_assign_dev().
 */
int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
			 struct net_device *dev, u16 queue_id)
{
	u16 bind_flags;

	/* One fill and completion ring required for each queue id. */
	if (!pool->fq || !pool->cq)
		return -EINVAL;

	if (umem->zc)
		bind_flags = XDP_ZEROCOPY;
	else
		bind_flags = XDP_COPY;

	if (pool->uses_need_wakeup)
		bind_flags |= XDP_USE_NEED_WAKEUP;

	return __xp_assign_dev(pool, dev, queue_id, bind_flags);
}
/*
 * Detach @pool from its network device, if it has one: disable driver
 * zero-copy, clear the pool from the device queue, drop the device reference
 * with dev_put() and reset pool->netdev to NULL. Does nothing when
 * pool->netdev is already NULL.
 */
void xp_clear_dev(struct xsk_buff_pool *pool)
{
	if (pool->netdev) {
		xp_disable_drv_zc(pool);
		xsk_clear_pool_at_qid(pool->netdev, pool->queue_id);
		dev_put(pool->netdev);
		pool->netdev = NULL;
	}
}
/*
 * Work handler that tears down a pool; queued by xp_put_pool() once the
 * last user is gone.
 *
 * Order of operations: detach from the device under RTNL, destroy the fill
 * and completion queues if present, call xdp_put_umem() on the pool's umem,
 * and finally free the pool itself.
 */
static void xp_release_deferred(struct work_struct *work)
{
/* Recover the pool that embeds this work item. */
struct xsk_buff_pool *pool = container_of(work, struct xsk_buff_pool,
work);
/* xp_clear_dev() ends up in code that asserts RTNL is held. */
rtnl_lock();
xp_clear_dev(pool);
rtnl_unlock();
if (pool->fq) {
xskq_destroy(pool->fq);
pool->fq = NULL;
}
if (pool->cq) {
xskq_destroy(pool->cq);
pool->cq = NULL;
}
/* pool->umem must be read before the pool memory is released below. */
xdp_put_umem(pool->umem);
xp_destroy(pool);
}
/* Take an additional reference on @pool by incrementing pool->users. */
void xp_get_pool(struct xsk_buff_pool *pool)
{
refcount_inc(&pool->users);
}
/*
 * Drop one reference on @pool.
 *
 * When the user count reaches zero, the teardown is queued as a work item
 * running xp_release_deferred().
 *
 * Returns true if this call scheduled the release, false if @pool is NULL
 * or other users remain.
 */
bool xp_put_pool(struct xsk_buff_pool *pool)
{
	if (!pool || !refcount_dec_and_test(&pool->users))
		return false;

	INIT_WORK(&pool->work, xp_release_deferred);
	schedule_work(&pool->work);
	return true;
}
/*
 * Search the umem's xsk_dma_list for the DMA map whose netdev equals the
 * netdev of @pool.
 *
 * Returns the matching map, or NULL if the list holds none.
 */
static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool)
{
	struct xsk_dma_map *entry;

	list_for_each_entry(entry, &pool->umem->xsk_dma_list, list) {
		if (entry->netdev != pool->netdev)
			continue;
		return entry;
	}

	return NULL;
}
/*
 * Allocate a DMA map with room for @nr_pages DMA addresses, initialise it
 * for @dev / @netdev with a user count of one, and add it to the
 * xsk_dma_list of @umem.
 *
 * Returns the new map, or NULL if either allocation fails.
 */
static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_device *netdev,
					     u32 nr_pages, struct xdp_umem *umem)
{
	struct xsk_dma_map *map;

	map = kzalloc(sizeof(*map), GFP_KERNEL);
	if (!map)
		return NULL;

	map->dma_pages = kvcalloc(nr_pages, sizeof(*map->dma_pages), GFP_KERNEL);
	if (!map->dma_pages)
		goto err_free_map;

	map->netdev = netdev;
	map->dev = dev;
	map->dma_need_sync = false;
	map->dma_pages_cnt = nr_pages;
	refcount_set(&map->users, 1);
	list_add(&map->list, &umem->xsk_dma_list);

	return map;

err_free_map:
	kfree(map);
	return NULL;
}
/*
 * Unlink @dma_map from the list it was added to, then free its dma_pages
 * array and the map structure itself.
 */
static void xp_destroy_dma_map(struct xsk_dma_map *dma_map)
{
list_del(&dma_map->list);
kvfree(dma_map->dma_pages);
kfree(dma_map);
}
/*
 * Unmap every non-zero entry of @dma_map (one PAGE_SIZE bidirectional
 * mapping each, using @attrs), zero the entry, and then destroy the map.
 * Entries that are zero are skipped, so a partially filled map is handled.
 */
static void __xp_dma_unmap(struct xsk_dma_map *dma_map, unsigned long attrs)
{
	u32 idx;

	for (idx = 0; idx < dma_map->dma_pages_cnt; idx++) {
		dma_addr_t *slot = &dma_map->dma_pages[idx];

		if (!*slot)
			continue;

		dma_unmap_page_attrs(dma_map->dev, *slot, PAGE_SIZE,
				     DMA_BIDIRECTIONAL, attrs);
		*slot = 0;
	}

	xp_destroy_dma_map(dma_map);
}
/*
 * Release the DMA state of @pool.
 *
 * The DMA map found for the pool's netdev is reference counted; only the
 * call that drops the last reference unmaps the pages and destroys the map.
 * The pool's private dma_pages array (allocated by xp_init_dma_info()) is
 * owned by this pool alone, so it is freed on every call, and the pool's
 * DMA fields are reset.
 *
 * Does nothing if the pool has no DMA pages. Warns and returns if no DMA
 * map exists for the pool's device.
 */
void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
{
	struct xsk_dma_map *dma_map;

	if (pool->dma_pages_cnt == 0)
		return;

	dma_map = xp_find_dma_map(pool);
	if (!dma_map) {
		WARN(1, "Could not find dma_map for device");
		return;
	}

	/* Only the last user tears down the shared mapping. */
	if (refcount_dec_and_test(&dma_map->users))
		__xp_dma_unmap(dma_map, attrs);

	/* The per-pool copy must go regardless of remaining map users,
	 * otherwise it is leaked when the pool is destroyed.
	 */
	kvfree(pool->dma_pages);
	pool->dma_pages = NULL;
	pool->dma_pages_cnt = 0;
	pool->dev = NULL;
}
EXPORT_SYMBOL(xp_dma_unmap);
/*
 * For each page in @dma_map except the last, set XSK_NEXT_PG_CONTIG_MASK in
 * its entry when the next entry is exactly PAGE_SIZE further on, and clear
 * the mask otherwise. The last entry is left untouched.
 *
 * The loop bound is written as "i + 1 < count" so that an empty map (count
 * of zero) performs no iterations instead of wrapping the unsigned
 * "count - 1" and walking far past the array.
 */
static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
{
	u32 i;

	for (i = 0; i + 1 < dma_map->dma_pages_cnt; i++) {
		if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
			dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
		else
			dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
	}
}
/*
 * Give @pool its own copy of the DMA address array of @dma_map and copy the
 * map's device, page count and need-sync flag into the pool.
 *
 * Returns 0 on success or -ENOMEM if the array cannot be allocated.
 */
static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
{
	pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt,
				   sizeof(*pool->dma_pages), GFP_KERNEL);
	if (!pool->dma_pages)
		return -ENOMEM;

	pool->dma_pages_cnt = dma_map->dma_pages_cnt;
	pool->dma_need_sync = dma_map->dma_need_sync;
	pool->dev = dma_map->dev;

	memcpy(pool->dma_pages, dma_map->dma_pages,
	       pool->dma_pages_cnt * sizeof(*pool->dma_pages));

	return 0;
}
/*
 * Set up DMA addresses for @pool.
 *
 * If a DMA map already exists for the pool's netdev, the pool copies it and
 * takes another reference on it. Otherwise a new map is created, each of the
 * @nr_pages entries of @pages is mapped as one PAGE_SIZE bidirectional
 * mapping on @dev with @attrs, contiguity marks are computed for unaligned
 * pools, and the result is copied into the pool.
 *
 * Returns 0 on success or -ENOMEM on allocation or mapping failure; a newly
 * created map is unmapped and destroyed again on failure.
 */
int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
	       unsigned long attrs, struct page **pages, u32 nr_pages)
{
	struct xsk_dma_map *dma_map;
	dma_addr_t mapped;
	int err;
	u32 idx;

	dma_map = xp_find_dma_map(pool);
	if (dma_map) {
		/* Share the existing mapping for this device. */
		err = xp_init_dma_info(pool, dma_map);
		if (!err)
			refcount_inc(&dma_map->users);
		return err;
	}

	dma_map = xp_create_dma_map(dev, pool->netdev, nr_pages, pool->umem);
	if (!dma_map)
		return -ENOMEM;

	for (idx = 0; idx < dma_map->dma_pages_cnt; idx++) {
		mapped = dma_map_page_attrs(dev, pages[idx], 0, PAGE_SIZE,
					    DMA_BIDIRECTIONAL, attrs);
		if (dma_mapping_error(dev, mapped)) {
			err = -ENOMEM;
			goto err_unmap;
		}
		if (dma_need_sync(dev, mapped))
			dma_map->dma_need_sync = true;
		dma_map->dma_pages[idx] = mapped;
	}

	if (pool->unaligned)
		xp_check_dma_contiguity(dma_map);

	err = xp_init_dma_info(pool, dma_map);
	if (err)
		goto err_unmap;

	return 0;

err_unmap:
	__xp_dma_unmap(dma_map, attrs);
	return err;
}
EXPORT_SYMBOL(xp_dma_map);
/*
 * Return the result of xp_desc_crosses_non_contig_pg() for a buffer that
 * starts at @addr and is one full chunk (pool->chunk_size) long.
 */
static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool,
u64 addr)
{
return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size);
}
/*
 * Validate an address for an unaligned-chunk pool.
 *
 * *@addr is replaced by xp_unaligned_extract_addr(*@addr). The result is
 * accepted only if it lies below pool->addrs_cnt, a whole chunk starting
 * there ends at or below pool->addrs_cnt, and the chunk does not cross a
 * non-contiguous page boundary.
 *
 * Returns true if the address is usable, false otherwise.
 */
static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr)
{
	*addr = xp_unaligned_extract_addr(*addr);

	return *addr < pool->addrs_cnt &&
	       *addr + pool->chunk_size <= pool->addrs_cnt &&
	       !xp_addr_crosses_non_contig_pg(pool, *addr);
}
/*
 * Validate an address for an aligned-chunk pool.
 *
 * *@addr is replaced by xp_aligned_extract_addr(pool, *@addr).
 *
 * Returns true if the resulting address lies below pool->addrs_cnt.
 */
static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr)
{
	*addr = xp_aligned_extract_addr(pool, *addr);

	if (*addr < pool->addrs_cnt)
		return true;

	return false;
}
/*
 * Take a head from the pool's free_heads array and pair it with the next
 * valid address from the pool's fq queue.
 *
 * Invalid addresses are counted in fq->invalid_descs, released and skipped.
 * If the queue has no entry, fq->queue_empty_descs is incremented and the
 * head is handed back through xp_release().
 *
 * On success the head's orig_addr and xdp.data_hard_start are filled in
 * and, when the pool has DMA pages, frame_dma and dma as well.
 *
 * Returns the prepared head, or NULL if no head or no address is available.
 */
static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool)
{
	struct xdp_buff_xsk *buf;
	bool valid;
	u64 addr;

	if (!pool->free_heads_cnt)
		return NULL;

	buf = pool->free_heads[--pool->free_heads_cnt];

	do {
		if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) {
			pool->fq->queue_empty_descs++;
			xp_release(buf);
			return NULL;
		}

		if (pool->unaligned)
			valid = xp_check_unaligned(pool, &addr);
		else
			valid = xp_check_aligned(pool, &addr);

		if (!valid)
			pool->fq->invalid_descs++;

		/* The peeked entry is consumed whether it was valid or not. */
		xskq_cons_release(pool->fq);
	} while (!valid);

	buf->orig_addr = addr;
	buf->xdp.data_hard_start = pool->addrs + addr + pool->headroom;

	if (pool->dma_pages_cnt) {
		/* Page DMA address without the contiguity mark, plus the
		 * offset of addr inside its page.
		 */
		buf->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] &
				  ~XSK_NEXT_PG_CONTIG_MASK) +
				 (addr & ~PAGE_MASK);
		buf->dma = buf->frame_dma + pool->headroom +
			   XDP_PACKET_HEADROOM;
	}

	return buf;
}
/*
 * Allocate one buffer from @pool.
 *
 * A buffer on the pool's free_list is reused when free_list_cnt is
 * non-zero; otherwise a fresh one is obtained from __xp_alloc(). The
 * buffer's data and data_meta pointers are reset to data_hard_start +
 * XDP_PACKET_HEADROOM, and if the pool needs DMA syncing, frame_len bytes
 * are synced for the device.
 *
 * Returns the embedded xdp_buff, or NULL if nothing could be allocated.
 */
struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
{
	struct xdp_buff_xsk *buf;

	if (pool->free_list_cnt) {
		pool->free_list_cnt--;
		buf = list_first_entry(&pool->free_list, struct xdp_buff_xsk,
				       free_list_node);
		list_del(&buf->free_list_node);
	} else {
		buf = __xp_alloc(pool);
		if (!buf)
			return NULL;
	}

	buf->xdp.data = buf->xdp.data_hard_start + XDP_PACKET_HEADROOM;
	buf->xdp.data_meta = buf->xdp.data;

	if (pool->dma_need_sync)
		dma_sync_single_range_for_device(pool->dev, buf->dma, 0,
						 pool->frame_len,
						 DMA_BIDIRECTIONAL);

	return &buf->xdp;
}
EXPORT_SYMBOL(xp_alloc);
/*
 * Report whether @count buffers can be allocated from @pool.
 *
 * Buffers on the free list count first; for any shortfall the pool's fq
 * queue is asked whether it holds that many entries.
 */
bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count)
{
	if (count > pool->free_list_cnt)
		return xskq_cons_has_entries(pool->fq,
					     count - pool->free_list_cnt);

	return true;
}
EXPORT_SYMBOL(xp_can_alloc);
void xp_free(struct xdp_buff_xsk *xskb)
{
xskb->pool->free_list_cnt++;
list_add(&xskb->free_list_node, &xskb->pool->free_list);
}
EXPORT_SYMBOL(xp_free);
/*
 * Translate @addr into a pointer inside the pool's address range
 * (pool->addrs). For unaligned pools, @addr is first converted with
 * xp_unaligned_add_offset_to_addr().
 */
void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
{
	if (pool->unaligned)
		addr = xp_unaligned_add_offset_to_addr(addr);

	return pool->addrs + addr;
}
EXPORT_SYMBOL(xp_raw_get_data);
/*
 * Translate @addr into a DMA address: the pool's DMA address for the page
 * containing @addr with XSK_NEXT_PG_CONTIG_MASK cleared, plus the offset of
 * @addr within that page. For unaligned pools, @addr is first converted
 * with xp_unaligned_add_offset_to_addr().
 */
dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
{
	if (pool->unaligned)
		addr = xp_unaligned_add_offset_to_addr(addr);

	return (pool->dma_pages[addr >> PAGE_SHIFT] &
		~XSK_NEXT_PG_CONTIG_MASK) +
	       (addr & ~PAGE_MASK);
}
EXPORT_SYMBOL(xp_raw_get_dma);
void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb)
{
dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0,
xskb->pool->frame_len, DMA_BIDIRECTIONAL);
}
EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow);
/*
 * Sync @size bytes starting at DMA address @dma for device access,
 * bidirectional, on the device recorded in @pool.
 */
void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
size_t size)
{
dma_sync_single_range_for_device(pool->dev, dma, 0,
size, DMA_BIDIRECTIONAL);
}
EXPORT_SYMBOL(xp_dma_sync_for_device_slow);