mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-18 17:58:29 +07:00
8700e3e7c4
Soft RoCE (RXE) - The software RoCE driver ib_rxe implements the RDMA transport and registers to the RDMA core device as a kernel verbs provider. It also implements the packet IO layer. On the other hand ib_rxe registers to the Linux netdev stack as a udp encapsulating protocol, in that case RDMA, for sending and receiving packets over any Ethernet device. This yields a RDMA transport over the UDP/Ethernet network layer forming a RoCEv2 compatible device. The configuration procedure of the Soft RoCE drivers requires binding to any existing Ethernet network device. This is done with /sys interface. A userspace Soft RoCE library (librxe) provides user applications the ability to run with Soft RoCE devices. The use of rxe verbs ins user space requires the inclusion of librxe as a device specifics plug-in to libibverbs. librxe is packaged separately. Architecture: +-----------------------------------------------------------+ | Application | +-----------------------------------------------------------+ +-----------------------------------+ | libibverbs | User +-----------------------------------+ +----------------+ +----------------+ | librxe | | HW RoCE lib | +----------------+ +----------------+ +---------------------------------------------------------------+ +--------------+ +------------+ | Sockets | | RDMA ULP | +--------------+ +------------+ +--------------+ +---------------------+ | TCP/IP | | ib_core | +--------------+ +---------------------+ +------------+ +----------------+ Kernel | ib_rxe | | HW RoCE driver | +------------+ +----------------+ +------------------------------------+ | NIC driver | +------------------------------------+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-----------------------------------------------------------+ | Application | +-----------------------------------------------------------+ +-----------------------------------+ | libibverbs | User +-----------------------------------+ +----------------+ +----------------+ | librxe | | HW RoCE lib | +----------------+ +----------------+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------+ +------------+ | Sockets | | RDMA ULP | +--------------+ +------------+ +--------------+ +---------------------+ | TCP/IP | | ib_core | +--------------+ +---------------------+ +------------+ +----------------+ Kernel | ib_rxe | | HW RoCE driver | +------------+ +----------------+ +------------------------------------+ | NIC driver | +------------------------------------+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Soft RoCE resources: [1[ https://github.com/SoftRoCE/librxe-dev librxe - source code in Github [2] https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home - Soft RoCE Wiki page [3] https://github.com/SoftRoCE/librxe-dev - Soft RoCE userspace library Signed-off-by: Kamal Heib <kamalh@mellanox.com> Signed-off-by: Amir Vadai <amirv@mellanox.com> Signed-off-by: Moni Shoua <monis@mellanox.com> Reviewed-by: Haggai Eran <haggaie@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
503 lines
11 KiB
C
503 lines
11 KiB
C
/*
|
|
* Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
|
|
* Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
|
|
*
|
|
* This software is available to you under a choice of one of two
|
|
* licenses. You may choose to be licensed under the terms of the GNU
|
|
* General Public License (GPL) Version 2, available from the file
|
|
* COPYING in the main directory of this source tree, or the
|
|
* OpenIB.org BSD license below:
|
|
*
|
|
* Redistribution and use in source and binary forms, with or
|
|
* without modification, are permitted provided that the following
|
|
* conditions are met:
|
|
*
|
|
* - Redistributions of source code must retain the above
|
|
* copyright notice, this list of conditions and the following
|
|
* disclaimer.
|
|
*
|
|
* - Redistributions in binary form must reproduce the above
|
|
* copyright notice, this list of conditions and the following
|
|
* disclaimer in the documentation and/or other materials
|
|
* provided with the distribution.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
* SOFTWARE.
|
|
*/
|
|
|
|
#include "rxe.h"
|
|
#include "rxe_loc.h"
|
|
|
|
/* info about object pools
|
|
* note that mr and mw share a single index space
|
|
* so that one can map an lkey to the correct type of object
|
|
*/
|
|
struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
|
|
[RXE_TYPE_UC] = {
|
|
.name = "rxe-uc",
|
|
.size = sizeof(struct rxe_ucontext),
|
|
},
|
|
[RXE_TYPE_PD] = {
|
|
.name = "rxe-pd",
|
|
.size = sizeof(struct rxe_pd),
|
|
},
|
|
[RXE_TYPE_AH] = {
|
|
.name = "rxe-ah",
|
|
.size = sizeof(struct rxe_ah),
|
|
.flags = RXE_POOL_ATOMIC,
|
|
},
|
|
[RXE_TYPE_SRQ] = {
|
|
.name = "rxe-srq",
|
|
.size = sizeof(struct rxe_srq),
|
|
.flags = RXE_POOL_INDEX,
|
|
.min_index = RXE_MIN_SRQ_INDEX,
|
|
.max_index = RXE_MAX_SRQ_INDEX,
|
|
},
|
|
[RXE_TYPE_QP] = {
|
|
.name = "rxe-qp",
|
|
.size = sizeof(struct rxe_qp),
|
|
.cleanup = rxe_qp_cleanup,
|
|
.flags = RXE_POOL_INDEX,
|
|
.min_index = RXE_MIN_QP_INDEX,
|
|
.max_index = RXE_MAX_QP_INDEX,
|
|
},
|
|
[RXE_TYPE_CQ] = {
|
|
.name = "rxe-cq",
|
|
.size = sizeof(struct rxe_cq),
|
|
.cleanup = rxe_cq_cleanup,
|
|
},
|
|
[RXE_TYPE_MR] = {
|
|
.name = "rxe-mr",
|
|
.size = sizeof(struct rxe_mem),
|
|
.cleanup = rxe_mem_cleanup,
|
|
.flags = RXE_POOL_INDEX,
|
|
.max_index = RXE_MAX_MR_INDEX,
|
|
.min_index = RXE_MIN_MR_INDEX,
|
|
},
|
|
[RXE_TYPE_MW] = {
|
|
.name = "rxe-mw",
|
|
.size = sizeof(struct rxe_mem),
|
|
.flags = RXE_POOL_INDEX,
|
|
.max_index = RXE_MAX_MW_INDEX,
|
|
.min_index = RXE_MIN_MW_INDEX,
|
|
},
|
|
[RXE_TYPE_MC_GRP] = {
|
|
.name = "rxe-mc_grp",
|
|
.size = sizeof(struct rxe_mc_grp),
|
|
.cleanup = rxe_mc_cleanup,
|
|
.flags = RXE_POOL_KEY,
|
|
.key_offset = offsetof(struct rxe_mc_grp, mgid),
|
|
.key_size = sizeof(union ib_gid),
|
|
},
|
|
[RXE_TYPE_MC_ELEM] = {
|
|
.name = "rxe-mc_elem",
|
|
.size = sizeof(struct rxe_mc_elem),
|
|
.flags = RXE_POOL_ATOMIC,
|
|
},
|
|
};
|
|
|
|
static inline char *pool_name(struct rxe_pool *pool)
|
|
{
|
|
return rxe_type_info[pool->type].name;
|
|
}
|
|
|
|
static inline struct kmem_cache *pool_cache(struct rxe_pool *pool)
|
|
{
|
|
return rxe_type_info[pool->type].cache;
|
|
}
|
|
|
|
static inline enum rxe_elem_type rxe_type(void *arg)
|
|
{
|
|
struct rxe_pool_entry *elem = arg;
|
|
|
|
return elem->pool->type;
|
|
}
|
|
|
|
int rxe_cache_init(void)
|
|
{
|
|
int err;
|
|
int i;
|
|
size_t size;
|
|
struct rxe_type_info *type;
|
|
|
|
for (i = 0; i < RXE_NUM_TYPES; i++) {
|
|
type = &rxe_type_info[i];
|
|
size = ALIGN(type->size, RXE_POOL_ALIGN);
|
|
type->cache = kmem_cache_create(type->name, size,
|
|
RXE_POOL_ALIGN,
|
|
RXE_POOL_CACHE_FLAGS, NULL);
|
|
if (!type->cache) {
|
|
pr_err("Unable to init kmem cache for %s\n",
|
|
type->name);
|
|
err = -ENOMEM;
|
|
goto err1;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
|
|
err1:
|
|
while (--i >= 0) {
|
|
kmem_cache_destroy(type->cache);
|
|
type->cache = NULL;
|
|
}
|
|
|
|
return err;
|
|
}
|
|
|
|
void rxe_cache_exit(void)
|
|
{
|
|
int i;
|
|
struct rxe_type_info *type;
|
|
|
|
for (i = 0; i < RXE_NUM_TYPES; i++) {
|
|
type = &rxe_type_info[i];
|
|
kmem_cache_destroy(type->cache);
|
|
type->cache = NULL;
|
|
}
|
|
}
|
|
|
|
static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min)
|
|
{
|
|
int err = 0;
|
|
size_t size;
|
|
|
|
if ((max - min + 1) < pool->max_elem) {
|
|
pr_warn("not enough indices for max_elem\n");
|
|
err = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
pool->max_index = max;
|
|
pool->min_index = min;
|
|
|
|
size = BITS_TO_LONGS(max - min + 1) * sizeof(long);
|
|
pool->table = kmalloc(size, GFP_KERNEL);
|
|
if (!pool->table) {
|
|
pr_warn("no memory for bit table\n");
|
|
err = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
pool->table_size = size;
|
|
bitmap_zero(pool->table, max - min + 1);
|
|
|
|
out:
|
|
return err;
|
|
}
|
|
|
|
int rxe_pool_init(
|
|
struct rxe_dev *rxe,
|
|
struct rxe_pool *pool,
|
|
enum rxe_elem_type type,
|
|
unsigned max_elem)
|
|
{
|
|
int err = 0;
|
|
size_t size = rxe_type_info[type].size;
|
|
|
|
memset(pool, 0, sizeof(*pool));
|
|
|
|
pool->rxe = rxe;
|
|
pool->type = type;
|
|
pool->max_elem = max_elem;
|
|
pool->elem_size = ALIGN(size, RXE_POOL_ALIGN);
|
|
pool->flags = rxe_type_info[type].flags;
|
|
pool->tree = RB_ROOT;
|
|
pool->cleanup = rxe_type_info[type].cleanup;
|
|
|
|
atomic_set(&pool->num_elem, 0);
|
|
|
|
kref_init(&pool->ref_cnt);
|
|
|
|
spin_lock_init(&pool->pool_lock);
|
|
|
|
if (rxe_type_info[type].flags & RXE_POOL_INDEX) {
|
|
err = rxe_pool_init_index(pool,
|
|
rxe_type_info[type].max_index,
|
|
rxe_type_info[type].min_index);
|
|
if (err)
|
|
goto out;
|
|
}
|
|
|
|
if (rxe_type_info[type].flags & RXE_POOL_KEY) {
|
|
pool->key_offset = rxe_type_info[type].key_offset;
|
|
pool->key_size = rxe_type_info[type].key_size;
|
|
}
|
|
|
|
pool->state = rxe_pool_valid;
|
|
|
|
out:
|
|
return err;
|
|
}
|
|
|
|
static void rxe_pool_release(struct kref *kref)
|
|
{
|
|
struct rxe_pool *pool = container_of(kref, struct rxe_pool, ref_cnt);
|
|
|
|
pool->state = rxe_pool_invalid;
|
|
kfree(pool->table);
|
|
}
|
|
|
|
static void rxe_pool_put(struct rxe_pool *pool)
|
|
{
|
|
kref_put(&pool->ref_cnt, rxe_pool_release);
|
|
}
|
|
|
|
int rxe_pool_cleanup(struct rxe_pool *pool)
|
|
{
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&pool->pool_lock, flags);
|
|
pool->state = rxe_pool_invalid;
|
|
if (atomic_read(&pool->num_elem) > 0)
|
|
pr_warn("%s pool destroyed with unfree'd elem\n",
|
|
pool_name(pool));
|
|
spin_unlock_irqrestore(&pool->pool_lock, flags);
|
|
|
|
rxe_pool_put(pool);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static u32 alloc_index(struct rxe_pool *pool)
|
|
{
|
|
u32 index;
|
|
u32 range = pool->max_index - pool->min_index + 1;
|
|
|
|
index = find_next_zero_bit(pool->table, range, pool->last);
|
|
if (index >= range)
|
|
index = find_first_zero_bit(pool->table, range);
|
|
|
|
set_bit(index, pool->table);
|
|
pool->last = index;
|
|
return index + pool->min_index;
|
|
}
|
|
|
|
static void insert_index(struct rxe_pool *pool, struct rxe_pool_entry *new)
|
|
{
|
|
struct rb_node **link = &pool->tree.rb_node;
|
|
struct rb_node *parent = NULL;
|
|
struct rxe_pool_entry *elem;
|
|
|
|
while (*link) {
|
|
parent = *link;
|
|
elem = rb_entry(parent, struct rxe_pool_entry, node);
|
|
|
|
if (elem->index == new->index) {
|
|
pr_warn("element already exists!\n");
|
|
goto out;
|
|
}
|
|
|
|
if (elem->index > new->index)
|
|
link = &(*link)->rb_left;
|
|
else
|
|
link = &(*link)->rb_right;
|
|
}
|
|
|
|
rb_link_node(&new->node, parent, link);
|
|
rb_insert_color(&new->node, &pool->tree);
|
|
out:
|
|
return;
|
|
}
|
|
|
|
static void insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new)
|
|
{
|
|
struct rb_node **link = &pool->tree.rb_node;
|
|
struct rb_node *parent = NULL;
|
|
struct rxe_pool_entry *elem;
|
|
int cmp;
|
|
|
|
while (*link) {
|
|
parent = *link;
|
|
elem = rb_entry(parent, struct rxe_pool_entry, node);
|
|
|
|
cmp = memcmp((u8 *)elem + pool->key_offset,
|
|
(u8 *)new + pool->key_offset, pool->key_size);
|
|
|
|
if (cmp == 0) {
|
|
pr_warn("key already exists!\n");
|
|
goto out;
|
|
}
|
|
|
|
if (cmp > 0)
|
|
link = &(*link)->rb_left;
|
|
else
|
|
link = &(*link)->rb_right;
|
|
}
|
|
|
|
rb_link_node(&new->node, parent, link);
|
|
rb_insert_color(&new->node, &pool->tree);
|
|
out:
|
|
return;
|
|
}
|
|
|
|
void rxe_add_key(void *arg, void *key)
|
|
{
|
|
struct rxe_pool_entry *elem = arg;
|
|
struct rxe_pool *pool = elem->pool;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&pool->pool_lock, flags);
|
|
memcpy((u8 *)elem + pool->key_offset, key, pool->key_size);
|
|
insert_key(pool, elem);
|
|
spin_unlock_irqrestore(&pool->pool_lock, flags);
|
|
}
|
|
|
|
void rxe_drop_key(void *arg)
|
|
{
|
|
struct rxe_pool_entry *elem = arg;
|
|
struct rxe_pool *pool = elem->pool;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&pool->pool_lock, flags);
|
|
rb_erase(&elem->node, &pool->tree);
|
|
spin_unlock_irqrestore(&pool->pool_lock, flags);
|
|
}
|
|
|
|
void rxe_add_index(void *arg)
|
|
{
|
|
struct rxe_pool_entry *elem = arg;
|
|
struct rxe_pool *pool = elem->pool;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&pool->pool_lock, flags);
|
|
elem->index = alloc_index(pool);
|
|
insert_index(pool, elem);
|
|
spin_unlock_irqrestore(&pool->pool_lock, flags);
|
|
}
|
|
|
|
void rxe_drop_index(void *arg)
|
|
{
|
|
struct rxe_pool_entry *elem = arg;
|
|
struct rxe_pool *pool = elem->pool;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&pool->pool_lock, flags);
|
|
clear_bit(elem->index - pool->min_index, pool->table);
|
|
rb_erase(&elem->node, &pool->tree);
|
|
spin_unlock_irqrestore(&pool->pool_lock, flags);
|
|
}
|
|
|
|
void *rxe_alloc(struct rxe_pool *pool)
|
|
{
|
|
struct rxe_pool_entry *elem;
|
|
unsigned long flags;
|
|
|
|
might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC));
|
|
|
|
spin_lock_irqsave(&pool->pool_lock, flags);
|
|
if (pool->state != rxe_pool_valid) {
|
|
spin_unlock_irqrestore(&pool->pool_lock, flags);
|
|
return NULL;
|
|
}
|
|
kref_get(&pool->ref_cnt);
|
|
spin_unlock_irqrestore(&pool->pool_lock, flags);
|
|
|
|
kref_get(&pool->rxe->ref_cnt);
|
|
|
|
if (atomic_inc_return(&pool->num_elem) > pool->max_elem) {
|
|
atomic_dec(&pool->num_elem);
|
|
rxe_dev_put(pool->rxe);
|
|
rxe_pool_put(pool);
|
|
return NULL;
|
|
}
|
|
|
|
elem = kmem_cache_zalloc(pool_cache(pool),
|
|
(pool->flags & RXE_POOL_ATOMIC) ?
|
|
GFP_ATOMIC : GFP_KERNEL);
|
|
|
|
elem->pool = pool;
|
|
kref_init(&elem->ref_cnt);
|
|
|
|
return elem;
|
|
}
|
|
|
|
void rxe_elem_release(struct kref *kref)
|
|
{
|
|
struct rxe_pool_entry *elem =
|
|
container_of(kref, struct rxe_pool_entry, ref_cnt);
|
|
struct rxe_pool *pool = elem->pool;
|
|
|
|
if (pool->cleanup)
|
|
pool->cleanup(elem);
|
|
|
|
kmem_cache_free(pool_cache(pool), elem);
|
|
atomic_dec(&pool->num_elem);
|
|
rxe_dev_put(pool->rxe);
|
|
rxe_pool_put(pool);
|
|
}
|
|
|
|
void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
|
|
{
|
|
struct rb_node *node = NULL;
|
|
struct rxe_pool_entry *elem = NULL;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&pool->pool_lock, flags);
|
|
|
|
if (pool->state != rxe_pool_valid)
|
|
goto out;
|
|
|
|
node = pool->tree.rb_node;
|
|
|
|
while (node) {
|
|
elem = rb_entry(node, struct rxe_pool_entry, node);
|
|
|
|
if (elem->index > index)
|
|
node = node->rb_left;
|
|
else if (elem->index < index)
|
|
node = node->rb_right;
|
|
else
|
|
break;
|
|
}
|
|
|
|
if (node)
|
|
kref_get(&elem->ref_cnt);
|
|
|
|
out:
|
|
spin_unlock_irqrestore(&pool->pool_lock, flags);
|
|
return node ? (void *)elem : NULL;
|
|
}
|
|
|
|
void *rxe_pool_get_key(struct rxe_pool *pool, void *key)
|
|
{
|
|
struct rb_node *node = NULL;
|
|
struct rxe_pool_entry *elem = NULL;
|
|
int cmp;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&pool->pool_lock, flags);
|
|
|
|
if (pool->state != rxe_pool_valid)
|
|
goto out;
|
|
|
|
node = pool->tree.rb_node;
|
|
|
|
while (node) {
|
|
elem = rb_entry(node, struct rxe_pool_entry, node);
|
|
|
|
cmp = memcmp((u8 *)elem + pool->key_offset,
|
|
key, pool->key_size);
|
|
|
|
if (cmp > 0)
|
|
node = node->rb_left;
|
|
else if (cmp < 0)
|
|
node = node->rb_right;
|
|
else
|
|
break;
|
|
}
|
|
|
|
if (node)
|
|
kref_get(&elem->ref_cnt);
|
|
|
|
out:
|
|
spin_unlock_irqrestore(&pool->pool_lock, flags);
|
|
return node ? ((void *)elem) : NULL;
|
|
}
|