mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-17 02:46:59 +07:00
Merge branch 'rds-ha-failover-fixes'
Sowmini Varadhan says: ==================== RDS: TCP: HA/Failover fixes This series contains a set of fixes for bugs exposed when we ran the following in a loop between a test machine pair: while (1); do # modprobe rds-tcp on test nodes # run rds-stress in bi-dir mode between test machine pair # modprobe -r rds-tcp on test nodes done rds-stress in bi-dir mode will cause both nodes to initiate RDS-TCP connections at almost the same instant, exposing the bugs fixed in this series. Without the fixes, rds-stress reports sporadic packet drops, and packets arriving out of sequence. After the fixes, we have been able to run the test overnight, without any issues. Each patch has a detailed description of the root-cause fixed by the patch. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
fcd2b0da73
@ -605,10 +605,14 @@ static void rds_exit(void)
|
||||
}
|
||||
module_exit(rds_exit);
|
||||
|
||||
u32 rds_gen_num;
|
||||
|
||||
static int rds_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));
|
||||
|
||||
ret = rds_bind_lock_init();
|
||||
if (ret)
|
||||
goto out;
|
||||
|
@ -269,6 +269,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
|
||||
kmem_cache_free(rds_conn_slab, conn);
|
||||
conn = found;
|
||||
} else {
|
||||
conn->c_my_gen_num = rds_gen_num;
|
||||
conn->c_peer_gen_num = 0;
|
||||
hlist_add_head_rcu(&conn->c_hash_node, head);
|
||||
rds_cong_add_conn(conn);
|
||||
rds_conn_count++;
|
||||
@ -681,6 +683,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
|
||||
!test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
|
||||
queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
|
||||
|
||||
void rds_conn_connect_if_down(struct rds_connection *conn)
|
||||
{
|
||||
|
@ -42,6 +42,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
|
||||
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
|
||||
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
|
||||
[RDS_EXTHDR_NPATHS] = sizeof(u16),
|
||||
[RDS_EXTHDR_GEN_NUM] = sizeof(u32),
|
||||
};
|
||||
|
||||
|
||||
|
@ -151,6 +151,9 @@ struct rds_connection {
|
||||
|
||||
struct rds_conn_path c_path[RDS_MPATH_WORKERS];
|
||||
wait_queue_head_t c_hs_waitq; /* handshake waitq */
|
||||
|
||||
u32 c_my_gen_num;
|
||||
u32 c_peer_gen_num;
|
||||
};
|
||||
|
||||
static inline
|
||||
@ -243,7 +246,8 @@ struct rds_ext_header_rdma_dest {
|
||||
/* Extension header announcing number of paths.
|
||||
* Implicit length = 2 bytes.
|
||||
*/
|
||||
#define RDS_EXTHDR_NPATHS 4
|
||||
#define RDS_EXTHDR_NPATHS 5
|
||||
#define RDS_EXTHDR_GEN_NUM 6
|
||||
|
||||
#define __RDS_EXTHDR_MAX 16 /* for now */
|
||||
|
||||
@ -338,6 +342,7 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
|
||||
#define RDS_MSG_RETRANSMITTED 5
|
||||
#define RDS_MSG_MAPPED 6
|
||||
#define RDS_MSG_PAGEVEC 7
|
||||
#define RDS_MSG_FLUSH 8
|
||||
|
||||
struct rds_message {
|
||||
atomic_t m_refcount;
|
||||
@ -664,6 +669,7 @@ void rds_cong_exit(void);
|
||||
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
|
||||
|
||||
/* conn.c */
|
||||
extern u32 rds_gen_num;
|
||||
int rds_conn_init(void);
|
||||
void rds_conn_exit(void);
|
||||
struct rds_connection *rds_conn_create(struct net *net,
|
||||
|
@ -120,6 +120,36 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
|
||||
/* do nothing if no change in cong state */
|
||||
}
|
||||
|
||||
static void rds_conn_peer_gen_update(struct rds_connection *conn,
|
||||
u32 peer_gen_num)
|
||||
{
|
||||
int i;
|
||||
struct rds_message *rm, *tmp;
|
||||
unsigned long flags;
|
||||
|
||||
WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP);
|
||||
if (peer_gen_num != 0) {
|
||||
if (conn->c_peer_gen_num != 0 &&
|
||||
peer_gen_num != conn->c_peer_gen_num) {
|
||||
for (i = 0; i < RDS_MPATH_WORKERS; i++) {
|
||||
struct rds_conn_path *cp;
|
||||
|
||||
cp = &conn->c_path[i];
|
||||
spin_lock_irqsave(&cp->cp_lock, flags);
|
||||
cp->cp_next_tx_seq = 1;
|
||||
cp->cp_next_rx_seq = 0;
|
||||
list_for_each_entry_safe(rm, tmp,
|
||||
&cp->cp_retrans,
|
||||
m_conn_item) {
|
||||
set_bit(RDS_MSG_FLUSH, &rm->m_flags);
|
||||
}
|
||||
spin_unlock_irqrestore(&cp->cp_lock, flags);
|
||||
}
|
||||
}
|
||||
conn->c_peer_gen_num = peer_gen_num;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Process all extension headers that come with this message.
|
||||
*/
|
||||
@ -163,7 +193,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
|
||||
union {
|
||||
struct rds_ext_header_version version;
|
||||
u16 rds_npaths;
|
||||
u32 rds_gen_num;
|
||||
} buffer;
|
||||
u32 new_peer_gen_num = 0;
|
||||
|
||||
while (1) {
|
||||
len = sizeof(buffer);
|
||||
@ -176,6 +208,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
|
||||
conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
|
||||
buffer.rds_npaths);
|
||||
break;
|
||||
case RDS_EXTHDR_GEN_NUM:
|
||||
new_peer_gen_num = buffer.rds_gen_num;
|
||||
break;
|
||||
default:
|
||||
pr_warn_ratelimited("ignoring unknown exthdr type "
|
||||
"0x%x\n", type);
|
||||
@ -183,6 +218,7 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
|
||||
}
|
||||
/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
|
||||
conn->c_npaths = max_t(int, conn->c_npaths, 1);
|
||||
rds_conn_peer_gen_update(conn, new_peer_gen_num);
|
||||
}
|
||||
|
||||
/* rds_start_mprds() will synchronously start multiple paths when appropriate.
|
||||
|
@ -259,8 +259,9 @@ int rds_send_xmit(struct rds_conn_path *cp)
|
||||
* connection.
|
||||
* Therefore, we never retransmit messages with RDMA ops.
|
||||
*/
|
||||
if (rm->rdma.op_active &&
|
||||
test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
|
||||
if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) ||
|
||||
(rm->rdma.op_active &&
|
||||
test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) {
|
||||
spin_lock_irqsave(&cp->cp_lock, flags);
|
||||
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
|
||||
list_move(&rm->m_conn_item, &to_be_dropped);
|
||||
@ -1209,6 +1210,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
|
||||
rds_message_add_extension(&rm->m_inc.i_hdr,
|
||||
RDS_EXTHDR_NPATHS, &npaths,
|
||||
sizeof(npaths));
|
||||
rds_message_add_extension(&rm->m_inc.i_hdr,
|
||||
RDS_EXTHDR_GEN_NUM,
|
||||
&cp->cp_conn->c_my_gen_num,
|
||||
sizeof(u32));
|
||||
}
|
||||
spin_unlock_irqrestore(&cp->cp_lock, flags);
|
||||
|
||||
|
@ -60,7 +60,19 @@ void rds_tcp_state_change(struct sock *sk)
|
||||
case TCP_SYN_RECV:
|
||||
break;
|
||||
case TCP_ESTABLISHED:
|
||||
rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
|
||||
/* Force the peer to reconnect so that we have the
|
||||
* TCP ports going from <smaller-ip>.<transient> to
|
||||
* <larger-ip>.<RDS_TCP_PORT>. We avoid marking the
|
||||
* RDS connection as RDS_CONN_UP until the reconnect,
|
||||
* to avoid RDS datagram loss.
|
||||
*/
|
||||
if (cp->cp_conn->c_laddr > cp->cp_conn->c_faddr &&
|
||||
rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
|
||||
RDS_CONN_ERROR)) {
|
||||
rds_conn_path_drop(cp);
|
||||
} else {
|
||||
rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
|
||||
}
|
||||
break;
|
||||
case TCP_CLOSE_WAIT:
|
||||
case TCP_CLOSE:
|
||||
|
@ -83,25 +83,20 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
|
||||
{
|
||||
int i;
|
||||
bool peer_is_smaller = (conn->c_faddr < conn->c_laddr);
|
||||
int npaths = conn->c_npaths;
|
||||
int npaths = max_t(int, 1, conn->c_npaths);
|
||||
|
||||
if (npaths <= 1) {
|
||||
struct rds_conn_path *cp = &conn->c_path[0];
|
||||
int ret;
|
||||
|
||||
ret = rds_conn_path_transition(cp, RDS_CONN_DOWN,
|
||||
RDS_CONN_CONNECTING);
|
||||
if (!ret)
|
||||
rds_conn_path_transition(cp, RDS_CONN_ERROR,
|
||||
RDS_CONN_CONNECTING);
|
||||
return cp->cp_transport_data;
|
||||
}
|
||||
|
||||
/* for mprds, paths with cp_index > 0 MUST be initiated by the peer
|
||||
/* for mprds, all paths MUST be initiated by the peer
|
||||
* with the smaller address.
|
||||
*/
|
||||
if (!peer_is_smaller)
|
||||
if (!peer_is_smaller) {
|
||||
/* Make sure we initiate at least one path if this
|
||||
* has not already been done; rds_start_mprds() will
|
||||
* take care of additional paths, if necessary.
|
||||
*/
|
||||
if (npaths == 1)
|
||||
rds_conn_path_connect_if_down(&conn->c_path[0]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for (i = 0; i < npaths; i++) {
|
||||
struct rds_conn_path *cp = &conn->c_path[i];
|
||||
@ -171,8 +166,8 @@ int rds_tcp_accept_one(struct socket *sock)
|
||||
mutex_lock(&rs_tcp->t_conn_path_lock);
|
||||
cp = rs_tcp->t_cpath;
|
||||
conn_state = rds_conn_path_state(cp);
|
||||
if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP &&
|
||||
conn_state != RDS_CONN_ERROR)
|
||||
WARN_ON(conn_state == RDS_CONN_UP);
|
||||
if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR)
|
||||
goto rst_nsk;
|
||||
if (rs_tcp->t_sock) {
|
||||
/* Need to resolve a duelling SYN between peers.
|
||||
|
@ -100,6 +100,9 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
|
||||
set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
|
||||
tc->t_last_expected_una = rm->m_ack_seq + 1;
|
||||
|
||||
if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
|
||||
rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
|
||||
|
||||
rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
|
||||
rm, rds_tcp_snd_nxt(tc),
|
||||
(unsigned long long)rm->m_ack_seq);
|
||||
|
Loading…
Reference in New Issue
Block a user