2010-04-22 05:30:06 +07:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
|
|
|
|
*
|
|
|
|
* This software is available to you under a choice of one of two
|
|
|
|
* licenses. You may choose to be licensed under the terms of the GNU
|
|
|
|
* General Public License (GPL) Version 2, available from the file
|
|
|
|
* COPYING in the main directory of this source tree, or the
|
|
|
|
* OpenIB.org BSD license below:
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or
|
|
|
|
* without modification, are permitted provided that the following
|
|
|
|
* conditions are met:
|
|
|
|
*
|
|
|
|
* - Redistributions of source code must retain the above
|
|
|
|
* copyright notice, this list of conditions and the following
|
|
|
|
* disclaimer.
|
|
|
|
*
|
|
|
|
* - Redistributions in binary form must reproduce the above
|
|
|
|
* copyright notice, this list of conditions and the following
|
|
|
|
* disclaimer in the documentation and/or other materials
|
|
|
|
* provided with the distribution.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
|
|
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
|
|
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
|
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
|
|
* SOFTWARE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "iw_cxgb4.h"
|
|
|
|
|
|
|
|
static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
|
2017-09-27 03:08:08 +07:00
|
|
|
struct c4iw_dev_ucontext *uctx, struct sk_buff *skb,
|
|
|
|
struct c4iw_wr_wait *wr_waitp)
|
2010-04-22 05:30:06 +07:00
|
|
|
{
|
|
|
|
struct fw_ri_res_wr *res_wr;
|
|
|
|
struct fw_ri_res *res;
|
|
|
|
int wr_len;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
wr_len = sizeof *res_wr + sizeof *res;
|
|
|
|
set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
|
|
|
|
|
net: introduce __skb_put_[zero, data, u8]
follow Johannes Berg, semantic patch file as below,
@@
identifier p, p2;
expression len;
expression skb;
type t, t2;
@@
(
-p = __skb_put(skb, len);
+p = __skb_put_zero(skb, len);
|
-p = (t)__skb_put(skb, len);
+p = __skb_put_zero(skb, len);
)
... when != p
(
p2 = (t2)p;
-memset(p2, 0, len);
|
-memset(p, 0, len);
)
@@
identifier p;
expression len;
expression skb;
type t;
@@
(
-t p = __skb_put(skb, len);
+t p = __skb_put_zero(skb, len);
)
... when != p
(
-memset(p, 0, len);
)
@@
type t, t2;
identifier p, p2;
expression skb;
@@
t *p;
...
(
-p = __skb_put(skb, sizeof(t));
+p = __skb_put_zero(skb, sizeof(t));
|
-p = (t *)__skb_put(skb, sizeof(t));
+p = __skb_put_zero(skb, sizeof(t));
)
... when != p
(
p2 = (t2)p;
-memset(p2, 0, sizeof(*p));
|
-memset(p, 0, sizeof(*p));
)
@@
expression skb, len;
@@
-memset(__skb_put(skb, len), 0, len);
+__skb_put_zero(skb, len);
@@
expression skb, len, data;
@@
-memcpy(__skb_put(skb, len), data, len);
+__skb_put_data(skb, data, len);
@@
expression SKB, C, S;
typedef u8;
identifier fn = {__skb_put};
fresh identifier fn2 = fn ## "_u8";
@@
- *(u8 *)fn(SKB, S) = C;
+ fn2(SKB, C);
Signed-off-by: yuan linyu <Linyu.Yuan@alcatel-sbell.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-18 21:48:17 +07:00
|
|
|
res_wr = __skb_put_zero(skb, wr_len);
|
2010-04-22 05:30:06 +07:00
|
|
|
res_wr->op_nres = cpu_to_be32(
|
2014-11-07 11:05:25 +07:00
|
|
|
FW_WR_OP_V(FW_RI_RES_WR) |
|
2015-01-16 10:54:48 +07:00
|
|
|
FW_RI_RES_WR_NRES_V(1) |
|
2014-11-07 11:05:25 +07:00
|
|
|
FW_WR_COMPL_F);
|
2010-04-22 05:30:06 +07:00
|
|
|
res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16));
|
2017-09-27 03:08:08 +07:00
|
|
|
res_wr->cookie = (uintptr_t)wr_waitp;
|
2010-04-22 05:30:06 +07:00
|
|
|
res = res_wr->res;
|
|
|
|
res->u.cq.restype = FW_RI_RES_TYPE_CQ;
|
|
|
|
res->u.cq.op = FW_RI_RES_OP_RESET;
|
|
|
|
res->u.cq.iqid = cpu_to_be32(cq->cqid);
|
|
|
|
|
2017-09-27 03:08:08 +07:00
|
|
|
c4iw_init_wr_wait(wr_waitp);
|
iw_cxgb4: add referencing to wait objects
For messages sent from the host to fw that solicit a reply from fw,
the c4iw_wr_wait struct pointer is passed in the host->fw message, and
included in the fw->host fw6_msg reply. This allows the sender to wait
until the reply is received, and the code processing the ingress reply
to wake up the sender.
If c4iw_wait_for_reply() times out, however, we need to keep the
c4iw_wr_wait object around in case the reply eventually does arrive.
Otherwise we have touch-after-free bugs in the wake_up paths.
This was hit due to a bad kernel driver that blocked ingress processing
of cxgb4 for a long time, causing iw_cxgb4 timeouts, but eventually
resuming ingress processing and thus hitting the touch-after-free bug.
So I want to fix iw_cxgb4 such that we'll at least keep the wait object
around until the reply comes. If it never comes we leak a small amount of
memory, but if it does come late, we won't potentially crash the system.
So add a kref struct in the c4iw_wr_wait struct, and take a reference
before sending a message to FW that will generate a FW6 reply. And remove
the reference (and potentially free the wait object) when the reply
is processed.
The ep code also uses the wr_wait for non FW6 CPL messages and doesn't
embed the c4iw_wr_wait object in the message sent to firmware. So for
those cases we add c4iw_wake_up_noref().
The mr/mw, cq, and qp object create/destroy paths do need this reference
logic. For these paths, c4iw_ref_send_wait() is introduced to take the
wr_wait reference, send the msg to fw, and then wait for the reply.
So going forward, iw_cxgb4 either uses c4iw_ofld_send(),
c4iw_wait_for_reply() and c4iw_wake_up_noref() like is done in the some
of the endpoint logic, or c4iw_ref_send_wait() and c4iw_wake_up_deref()
(formerly c4iw_wake_up()) when sending messages with the c4iw_wr_wait
object pointer embedded in the message and resulting FW6 reply.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-09-27 03:13:17 +07:00
|
|
|
ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__);
|
2010-04-22 05:30:06 +07:00
|
|
|
|
|
|
|
kfree(cq->sw_queue);
|
|
|
|
dma_free_coherent(&(rdev->lldi.pdev->dev),
|
|
|
|
cq->memsize, cq->queue,
|
2010-06-03 12:37:50 +07:00
|
|
|
dma_unmap_addr(cq, mapping));
|
2010-04-22 05:30:06 +07:00
|
|
|
c4iw_put_cqid(rdev, cq->cqid, uctx);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
|
2017-09-27 03:08:08 +07:00
|
|
|
struct c4iw_dev_ucontext *uctx,
|
|
|
|
struct c4iw_wr_wait *wr_waitp)
|
2010-04-22 05:30:06 +07:00
|
|
|
{
|
|
|
|
struct fw_ri_res_wr *res_wr;
|
|
|
|
struct fw_ri_res *res;
|
|
|
|
int wr_len;
|
|
|
|
int user = (uctx != &rdev->uctx);
|
|
|
|
int ret;
|
|
|
|
struct sk_buff *skb;
|
2018-07-05 19:56:01 +07:00
|
|
|
struct c4iw_ucontext *ucontext = NULL;
|
|
|
|
|
|
|
|
if (user)
|
|
|
|
ucontext = container_of(uctx, struct c4iw_ucontext, uctx);
|
2010-04-22 05:30:06 +07:00
|
|
|
|
|
|
|
cq->cqid = c4iw_get_cqid(rdev, uctx);
|
|
|
|
if (!cq->cqid) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto err1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!user) {
|
|
|
|
cq->sw_queue = kzalloc(cq->memsize, GFP_KERNEL);
|
|
|
|
if (!cq->sw_queue) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto err2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
cq->queue = dma_alloc_coherent(&rdev->lldi.pdev->dev, cq->memsize,
|
|
|
|
&cq->dma_addr, GFP_KERNEL);
|
|
|
|
if (!cq->queue) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto err3;
|
|
|
|
}
|
2010-06-03 12:37:50 +07:00
|
|
|
dma_unmap_addr_set(cq, mapping, cq->dma_addr);
|
2010-04-22 05:30:06 +07:00
|
|
|
memset(cq->queue, 0, cq->memsize);
|
|
|
|
|
2018-07-05 19:56:01 +07:00
|
|
|
if (user && ucontext->is_32b_cqe) {
|
|
|
|
cq->qp_errp = &((struct t4_status_page *)
|
|
|
|
((u8 *)cq->queue + (cq->size - 1) *
|
|
|
|
(sizeof(*cq->queue) / 2)))->qp_err;
|
|
|
|
} else {
|
|
|
|
cq->qp_errp = &((struct t4_status_page *)
|
|
|
|
((u8 *)cq->queue + (cq->size - 1) *
|
|
|
|
sizeof(*cq->queue)))->qp_err;
|
|
|
|
}
|
|
|
|
|
2010-04-22 05:30:06 +07:00
|
|
|
/* build fw_ri_res_wr */
|
|
|
|
wr_len = sizeof *res_wr + sizeof *res;
|
|
|
|
|
2010-07-21 09:44:56 +07:00
|
|
|
skb = alloc_skb(wr_len, GFP_KERNEL);
|
2010-04-22 05:30:06 +07:00
|
|
|
if (!skb) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto err4;
|
|
|
|
}
|
|
|
|
set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
|
|
|
|
|
net: introduce __skb_put_[zero, data, u8]
follow Johannes Berg, semantic patch file as below,
@@
identifier p, p2;
expression len;
expression skb;
type t, t2;
@@
(
-p = __skb_put(skb, len);
+p = __skb_put_zero(skb, len);
|
-p = (t)__skb_put(skb, len);
+p = __skb_put_zero(skb, len);
)
... when != p
(
p2 = (t2)p;
-memset(p2, 0, len);
|
-memset(p, 0, len);
)
@@
identifier p;
expression len;
expression skb;
type t;
@@
(
-t p = __skb_put(skb, len);
+t p = __skb_put_zero(skb, len);
)
... when != p
(
-memset(p, 0, len);
)
@@
type t, t2;
identifier p, p2;
expression skb;
@@
t *p;
...
(
-p = __skb_put(skb, sizeof(t));
+p = __skb_put_zero(skb, sizeof(t));
|
-p = (t *)__skb_put(skb, sizeof(t));
+p = __skb_put_zero(skb, sizeof(t));
)
... when != p
(
p2 = (t2)p;
-memset(p2, 0, sizeof(*p));
|
-memset(p, 0, sizeof(*p));
)
@@
expression skb, len;
@@
-memset(__skb_put(skb, len), 0, len);
+__skb_put_zero(skb, len);
@@
expression skb, len, data;
@@
-memcpy(__skb_put(skb, len), data, len);
+__skb_put_data(skb, data, len);
@@
expression SKB, C, S;
typedef u8;
identifier fn = {__skb_put};
fresh identifier fn2 = fn ## "_u8";
@@
- *(u8 *)fn(SKB, S) = C;
+ fn2(SKB, C);
Signed-off-by: yuan linyu <Linyu.Yuan@alcatel-sbell.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-18 21:48:17 +07:00
|
|
|
res_wr = __skb_put_zero(skb, wr_len);
|
2010-04-22 05:30:06 +07:00
|
|
|
res_wr->op_nres = cpu_to_be32(
|
2014-11-07 11:05:25 +07:00
|
|
|
FW_WR_OP_V(FW_RI_RES_WR) |
|
2015-01-16 10:54:48 +07:00
|
|
|
FW_RI_RES_WR_NRES_V(1) |
|
2014-11-07 11:05:25 +07:00
|
|
|
FW_WR_COMPL_F);
|
2010-04-22 05:30:06 +07:00
|
|
|
res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16));
|
2017-09-27 03:08:08 +07:00
|
|
|
res_wr->cookie = (uintptr_t)wr_waitp;
|
2010-04-22 05:30:06 +07:00
|
|
|
res = res_wr->res;
|
|
|
|
res->u.cq.restype = FW_RI_RES_TYPE_CQ;
|
|
|
|
res->u.cq.op = FW_RI_RES_OP_WRITE;
|
|
|
|
res->u.cq.iqid = cpu_to_be32(cq->cqid);
|
|
|
|
res->u.cq.iqandst_to_iqandstindex = cpu_to_be32(
|
2015-01-16 10:54:48 +07:00
|
|
|
FW_RI_RES_WR_IQANUS_V(0) |
|
|
|
|
FW_RI_RES_WR_IQANUD_V(1) |
|
|
|
|
FW_RI_RES_WR_IQANDST_F |
|
|
|
|
FW_RI_RES_WR_IQANDSTINDEX_V(
|
2014-06-06 23:10:42 +07:00
|
|
|
rdev->lldi.ciq_ids[cq->vector]));
|
2010-04-22 05:30:06 +07:00
|
|
|
res->u.cq.iqdroprss_to_iqesize = cpu_to_be16(
|
2015-01-16 10:54:48 +07:00
|
|
|
FW_RI_RES_WR_IQDROPRSS_F |
|
|
|
|
FW_RI_RES_WR_IQPCIECH_V(2) |
|
|
|
|
FW_RI_RES_WR_IQINTCNTTHRESH_V(0) |
|
|
|
|
FW_RI_RES_WR_IQO_F |
|
2018-07-05 19:56:01 +07:00
|
|
|
((user && ucontext->is_32b_cqe) ?
|
|
|
|
FW_RI_RES_WR_IQESIZE_V(1) :
|
|
|
|
FW_RI_RES_WR_IQESIZE_V(2)));
|
2010-04-22 05:30:06 +07:00
|
|
|
res->u.cq.iqsize = cpu_to_be16(cq->size);
|
|
|
|
res->u.cq.iqaddr = cpu_to_be64(cq->dma_addr);
|
|
|
|
|
2017-09-27 03:08:08 +07:00
|
|
|
c4iw_init_wr_wait(wr_waitp);
|
iw_cxgb4: add referencing to wait objects
For messages sent from the host to fw that solicit a reply from fw,
the c4iw_wr_wait struct pointer is passed in the host->fw message, and
included in the fw->host fw6_msg reply. This allows the sender to wait
until the reply is received, and the code processing the ingress reply
to wake up the sender.
If c4iw_wait_for_reply() times out, however, we need to keep the
c4iw_wr_wait object around in case the reply eventually does arrive.
Otherwise we have touch-after-free bugs in the wake_up paths.
This was hit due to a bad kernel driver that blocked ingress processing
of cxgb4 for a long time, causing iw_cxgb4 timeouts, but eventually
resuming ingress processing and thus hitting the touch-after-free bug.
So I want to fix iw_cxgb4 such that we'll at least keep the wait object
around until the reply comes. If it never comes we leak a small amount of
memory, but if it does come late, we won't potentially crash the system.
So add a kref struct in the c4iw_wr_wait struct, and take a reference
before sending a message to FW that will generate a FW6 reply. And remove
the reference (and potentially free the wait object) when the reply
is processed.
The ep code also uses the wr_wait for non FW6 CPL messages and doesn't
embed the c4iw_wr_wait object in the message sent to firmware. So for
those cases we add c4iw_wake_up_noref().
The mr/mw, cq, and qp object create/destroy paths do need this reference
logic. For these paths, c4iw_ref_send_wait() is introduced to take the
wr_wait reference, send the msg to fw, and then wait for the reply.
So going forward, iw_cxgb4 either uses c4iw_ofld_send(),
c4iw_wait_for_reply() and c4iw_wake_up_noref() like is done in the some
of the endpoint logic, or c4iw_ref_send_wait() and c4iw_wake_up_deref()
(formerly c4iw_wake_up()) when sending messages with the c4iw_wr_wait
object pointer embedded in the message and resulting FW6 reply.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-09-27 03:13:17 +07:00
|
|
|
ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__);
|
2010-04-22 05:30:06 +07:00
|
|
|
if (ret)
|
|
|
|
goto err4;
|
|
|
|
|
|
|
|
cq->gen = 1;
|
2015-06-09 19:53:12 +07:00
|
|
|
cq->gts = rdev->lldi.gts_reg;
|
2010-04-22 05:30:06 +07:00
|
|
|
cq->rdev = rdev;
|
2015-04-22 03:15:00 +07:00
|
|
|
|
2015-06-09 19:53:12 +07:00
|
|
|
cq->bar2_va = c4iw_bar2_addrs(rdev, cq->cqid, T4_BAR2_QTYPE_INGRESS,
|
|
|
|
&cq->bar2_qid,
|
|
|
|
user ? &cq->bar2_pa : NULL);
|
2016-04-05 11:53:48 +07:00
|
|
|
if (user && !cq->bar2_pa) {
|
2017-02-10 05:23:50 +07:00
|
|
|
pr_warn("%s: cqid %u not in BAR2 range\n",
|
2015-06-09 19:53:12 +07:00
|
|
|
pci_name(rdev->lldi.pdev), cq->cqid);
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto err4;
|
2010-04-22 05:30:06 +07:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
err4:
|
|
|
|
dma_free_coherent(&rdev->lldi.pdev->dev, cq->memsize, cq->queue,
|
2010-06-03 12:37:50 +07:00
|
|
|
dma_unmap_addr(cq, mapping));
|
2010-04-22 05:30:06 +07:00
|
|
|
err3:
|
|
|
|
kfree(cq->sw_queue);
|
|
|
|
err2:
|
|
|
|
c4iw_put_cqid(rdev, cq->cqid, uctx);
|
|
|
|
err1:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq)
|
|
|
|
{
|
|
|
|
struct t4_cqe cqe;
|
|
|
|
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("wq %p cq %p sw_cidx %u sw_pidx %u\n",
|
2017-02-10 05:23:51 +07:00
|
|
|
wq, cq, cq->sw_cidx, cq->sw_pidx);
|
2010-04-22 05:30:06 +07:00
|
|
|
memset(&cqe, 0, sizeof(cqe));
|
2015-01-16 10:54:47 +07:00
|
|
|
cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) |
|
|
|
|
CQE_OPCODE_V(FW_RI_SEND) |
|
|
|
|
CQE_TYPE_V(0) |
|
|
|
|
CQE_SWCQE_V(1) |
|
|
|
|
CQE_QPID_V(wq->sq.qid));
|
|
|
|
cqe.bits_type_ts = cpu_to_be64(CQE_GENBIT_V((u64)cq->gen));
|
2010-04-22 05:30:06 +07:00
|
|
|
cq->sw_queue[cq->sw_pidx] = cqe;
|
|
|
|
t4_swcq_produce(cq);
|
|
|
|
}
|
|
|
|
|
|
|
|
int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count)
|
|
|
|
{
|
|
|
|
int flushed = 0;
|
|
|
|
int in_use = wq->rq.in_use - count;
|
|
|
|
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("wq %p cq %p rq.in_use %u skip count %u\n",
|
2017-02-10 05:23:51 +07:00
|
|
|
wq, cq, wq->rq.in_use, count);
|
2010-04-22 05:30:06 +07:00
|
|
|
while (in_use--) {
|
|
|
|
insert_recv_cqe(wq, cq);
|
|
|
|
flushed++;
|
|
|
|
}
|
|
|
|
return flushed;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void insert_sq_cqe(struct t4_wq *wq, struct t4_cq *cq,
|
|
|
|
struct t4_swsqe *swcqe)
|
|
|
|
{
|
|
|
|
struct t4_cqe cqe;
|
|
|
|
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("wq %p cq %p sw_cidx %u sw_pidx %u\n",
|
2017-02-10 05:23:51 +07:00
|
|
|
wq, cq, cq->sw_cidx, cq->sw_pidx);
|
2010-04-22 05:30:06 +07:00
|
|
|
memset(&cqe, 0, sizeof(cqe));
|
2015-01-16 10:54:47 +07:00
|
|
|
cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) |
|
|
|
|
CQE_OPCODE_V(swcqe->opcode) |
|
|
|
|
CQE_TYPE_V(1) |
|
|
|
|
CQE_SWCQE_V(1) |
|
|
|
|
CQE_QPID_V(wq->sq.qid));
|
2010-04-22 05:30:06 +07:00
|
|
|
CQE_WRID_SQ_IDX(&cqe) = swcqe->idx;
|
2015-01-16 10:54:47 +07:00
|
|
|
cqe.bits_type_ts = cpu_to_be64(CQE_GENBIT_V((u64)cq->gen));
|
2010-04-22 05:30:06 +07:00
|
|
|
cq->sw_queue[cq->sw_pidx] = cqe;
|
|
|
|
t4_swcq_produce(cq);
|
|
|
|
}
|
|
|
|
|
2013-08-06 22:34:35 +07:00
|
|
|
static void advance_oldest_read(struct t4_wq *wq);
|
|
|
|
|
|
|
|
int c4iw_flush_sq(struct c4iw_qp *qhp)
|
2010-04-22 05:30:06 +07:00
|
|
|
{
|
|
|
|
int flushed = 0;
|
2013-08-06 22:34:35 +07:00
|
|
|
struct t4_wq *wq = &qhp->wq;
|
|
|
|
struct c4iw_cq *chp = to_c4iw_cq(qhp->ibqp.send_cq);
|
|
|
|
struct t4_cq *cq = &chp->cq;
|
|
|
|
int idx;
|
|
|
|
struct t4_swsqe *swsqe;
|
|
|
|
|
|
|
|
if (wq->sq.flush_cidx == -1)
|
|
|
|
wq->sq.flush_cidx = wq->sq.cidx;
|
|
|
|
idx = wq->sq.flush_cidx;
|
|
|
|
while (idx != wq->sq.pidx) {
|
RDMA/cxgb4: SQ flush fix
There is a race when moving a QP from RTS->CLOSING where a SQ work
request could be posted after the FW receives the RDMA_RI/FINI WR.
The SQ work request will never get processed, and should be completed
with FLUSHED status. Function c4iw_flush_sq(), however was dropping
the oldest SQ work request when in CLOSING or IDLE states, instead of
completing the pending work request. If that oldest pending work
request was actually complete and has a CQE in the CQ, then when that
CQE is proceessed in poll_cq, we'll BUG_ON() due to the inconsistent
SQ/CQ state.
This is a very small timing hole and has only been hit once so far.
The fix is two-fold:
1) c4iw_flush_sq() MUST always flush all non-completed WRs with FLUSHED
status regardless of the QP state.
2) In c4iw_modify_rc_qp(), always set the "in error" bit on the queue
before moving the state out of RTS. This ensures that the state
transition will not happen while another thread is in
post_rc_send(), because set_state() and post_rc_send() both aquire
the qp spinlock. Also, once we transition the state out of RTS,
subsequent calls to post_rc_send() will fail because the "in error"
bit is set. I don't think this fully closes the race where the FW
can get a FINI followed a SQ work request being posted (because
they are posted to differente EQs), but the #1 fix will handle the
issue by flushing the SQ work request.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
2014-04-09 21:38:26 +07:00
|
|
|
swsqe = &wq->sq.sw_sq[idx];
|
|
|
|
swsqe->flushed = 1;
|
|
|
|
insert_sq_cqe(wq, cq, swsqe);
|
|
|
|
if (wq->sq.oldest_read == swsqe) {
|
|
|
|
advance_oldest_read(wq);
|
2013-08-06 22:34:35 +07:00
|
|
|
}
|
RDMA/cxgb4: SQ flush fix
There is a race when moving a QP from RTS->CLOSING where a SQ work
request could be posted after the FW receives the RDMA_RI/FINI WR.
The SQ work request will never get processed, and should be completed
with FLUSHED status. Function c4iw_flush_sq(), however was dropping
the oldest SQ work request when in CLOSING or IDLE states, instead of
completing the pending work request. If that oldest pending work
request was actually complete and has a CQE in the CQ, then when that
CQE is proceessed in poll_cq, we'll BUG_ON() due to the inconsistent
SQ/CQ state.
This is a very small timing hole and has only been hit once so far.
The fix is two-fold:
1) c4iw_flush_sq() MUST always flush all non-completed WRs with FLUSHED
status regardless of the QP state.
2) In c4iw_modify_rc_qp(), always set the "in error" bit on the queue
before moving the state out of RTS. This ensures that the state
transition will not happen while another thread is in
post_rc_send(), because set_state() and post_rc_send() both aquire
the qp spinlock. Also, once we transition the state out of RTS,
subsequent calls to post_rc_send() will fail because the "in error"
bit is set. I don't think this fully closes the race where the FW
can get a FINI followed a SQ work request being posted (because
they are posted to differente EQs), but the #1 fix will handle the
issue by flushing the SQ work request.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
2014-04-09 21:38:26 +07:00
|
|
|
flushed++;
|
2013-08-06 22:34:35 +07:00
|
|
|
if (++idx == wq->sq.size)
|
|
|
|
idx = 0;
|
2010-04-22 05:30:06 +07:00
|
|
|
}
|
2013-08-06 22:34:35 +07:00
|
|
|
wq->sq.flush_cidx += flushed;
|
|
|
|
if (wq->sq.flush_cidx >= wq->sq.size)
|
|
|
|
wq->sq.flush_cidx -= wq->sq.size;
|
2010-04-22 05:30:06 +07:00
|
|
|
return flushed;
|
|
|
|
}
|
|
|
|
|
2013-08-06 22:34:35 +07:00
|
|
|
static void flush_completed_wrs(struct t4_wq *wq, struct t4_cq *cq)
|
|
|
|
{
|
|
|
|
struct t4_swsqe *swsqe;
|
|
|
|
int cidx;
|
|
|
|
|
|
|
|
if (wq->sq.flush_cidx == -1)
|
|
|
|
wq->sq.flush_cidx = wq->sq.cidx;
|
|
|
|
cidx = wq->sq.flush_cidx;
|
|
|
|
|
|
|
|
while (cidx != wq->sq.pidx) {
|
|
|
|
swsqe = &wq->sq.sw_sq[cidx];
|
|
|
|
if (!swsqe->signaled) {
|
|
|
|
if (++cidx == wq->sq.size)
|
|
|
|
cidx = 0;
|
|
|
|
} else if (swsqe->complete) {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Insert this completed cqe into the swcq.
|
|
|
|
*/
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("moving cqe into swcq sq idx %u cq idx %u\n",
|
|
|
|
cidx, cq->sw_pidx);
|
2015-01-16 10:54:47 +07:00
|
|
|
swsqe->cqe.header |= htonl(CQE_SWCQE_V(1));
|
2013-08-06 22:34:35 +07:00
|
|
|
cq->sw_queue[cq->sw_pidx] = swsqe->cqe;
|
|
|
|
t4_swcq_produce(cq);
|
|
|
|
swsqe->flushed = 1;
|
|
|
|
if (++cidx == wq->sq.size)
|
|
|
|
cidx = 0;
|
|
|
|
wq->sq.flush_cidx = cidx;
|
|
|
|
} else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void create_read_req_cqe(struct t4_wq *wq, struct t4_cqe *hw_cqe,
|
|
|
|
struct t4_cqe *read_cqe)
|
|
|
|
{
|
|
|
|
read_cqe->u.scqe.cidx = wq->sq.oldest_read->idx;
|
|
|
|
read_cqe->len = htonl(wq->sq.oldest_read->read_len);
|
2015-01-16 10:54:47 +07:00
|
|
|
read_cqe->header = htonl(CQE_QPID_V(CQE_QPID(hw_cqe)) |
|
|
|
|
CQE_SWCQE_V(SW_CQE(hw_cqe)) |
|
|
|
|
CQE_OPCODE_V(FW_RI_READ_REQ) |
|
|
|
|
CQE_TYPE_V(1));
|
2013-08-06 22:34:35 +07:00
|
|
|
read_cqe->bits_type_ts = hw_cqe->bits_type_ts;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void advance_oldest_read(struct t4_wq *wq)
|
|
|
|
{
|
|
|
|
|
|
|
|
u32 rptr = wq->sq.oldest_read - wq->sq.sw_sq + 1;
|
|
|
|
|
|
|
|
if (rptr == wq->sq.size)
|
|
|
|
rptr = 0;
|
|
|
|
while (rptr != wq->sq.pidx) {
|
|
|
|
wq->sq.oldest_read = &wq->sq.sw_sq[rptr];
|
|
|
|
|
|
|
|
if (wq->sq.oldest_read->opcode == FW_RI_READ_REQ)
|
|
|
|
return;
|
|
|
|
if (++rptr == wq->sq.size)
|
|
|
|
rptr = 0;
|
|
|
|
}
|
|
|
|
wq->sq.oldest_read = NULL;
|
|
|
|
}
|
|
|
|
|
2010-04-22 05:30:06 +07:00
|
|
|
/*
|
|
|
|
* Move all CQEs from the HWCQ into the SWCQ.
|
2013-08-06 22:34:35 +07:00
|
|
|
* Deal with out-of-order and/or completions that complete
|
|
|
|
* prior unsignalled WRs.
|
2010-04-22 05:30:06 +07:00
|
|
|
*/
|
2018-04-27 18:11:16 +07:00
|
|
|
void c4iw_flush_hw_cq(struct c4iw_cq *chp, struct c4iw_qp *flush_qhp)
|
2010-04-22 05:30:06 +07:00
|
|
|
{
|
2013-08-06 22:34:35 +07:00
|
|
|
struct t4_cqe *hw_cqe, *swcqe, read_cqe;
|
|
|
|
struct c4iw_qp *qhp;
|
|
|
|
struct t4_swsqe *swsqe;
|
2010-04-22 05:30:06 +07:00
|
|
|
int ret;
|
|
|
|
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("cqid 0x%x\n", chp->cq.cqid);
|
2013-08-06 22:34:35 +07:00
|
|
|
ret = t4_next_hw_cqe(&chp->cq, &hw_cqe);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This logic is similar to poll_cq(), but not quite the same
|
|
|
|
* unfortunately. Need to move pertinent HW CQEs to the SW CQ but
|
|
|
|
* also do any translation magic that poll_cq() normally does.
|
|
|
|
*/
|
2010-04-22 05:30:06 +07:00
|
|
|
while (!ret) {
|
2013-08-06 22:34:35 +07:00
|
|
|
qhp = get_qhp(chp->rhp, CQE_QPID(hw_cqe));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* drop CQEs with no associated QP
|
|
|
|
*/
|
|
|
|
if (qhp == NULL)
|
|
|
|
goto next_cqe;
|
|
|
|
|
2018-04-27 18:11:16 +07:00
|
|
|
if (flush_qhp != qhp) {
|
|
|
|
spin_lock(&qhp->lock);
|
|
|
|
|
|
|
|
if (qhp->wq.flushed == 1)
|
|
|
|
goto next_cqe;
|
|
|
|
}
|
|
|
|
|
2013-08-06 22:34:35 +07:00
|
|
|
if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE)
|
|
|
|
goto next_cqe;
|
|
|
|
|
|
|
|
if (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP) {
|
|
|
|
|
2014-03-21 22:10:32 +07:00
|
|
|
/* If we have reached here because of async
|
|
|
|
* event or other error, and have egress error
|
|
|
|
* then drop
|
|
|
|
*/
|
|
|
|
if (CQE_TYPE(hw_cqe) == 1)
|
|
|
|
goto next_cqe;
|
|
|
|
|
|
|
|
/* drop peer2peer RTR reads.
|
2013-08-06 22:34:35 +07:00
|
|
|
*/
|
|
|
|
if (CQE_WRID_STAG(hw_cqe) == 1)
|
|
|
|
goto next_cqe;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Eat completions for unsignaled read WRs.
|
|
|
|
*/
|
|
|
|
if (!qhp->wq.sq.oldest_read->signaled) {
|
|
|
|
advance_oldest_read(&qhp->wq);
|
|
|
|
goto next_cqe;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't write to the HWCQ, create a new read req CQE
|
|
|
|
* in local memory and move it into the swcq.
|
|
|
|
*/
|
|
|
|
create_read_req_cqe(&qhp->wq, hw_cqe, &read_cqe);
|
|
|
|
hw_cqe = &read_cqe;
|
|
|
|
advance_oldest_read(&qhp->wq);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if its a SQ completion, then do the magic to move all the
|
|
|
|
* unsignaled and now in-order completions into the swcq.
|
|
|
|
*/
|
|
|
|
if (SQ_TYPE(hw_cqe)) {
|
|
|
|
swsqe = &qhp->wq.sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)];
|
|
|
|
swsqe->cqe = *hw_cqe;
|
|
|
|
swsqe->complete = 1;
|
|
|
|
flush_completed_wrs(&qhp->wq, &chp->cq);
|
|
|
|
} else {
|
|
|
|
swcqe = &chp->cq.sw_queue[chp->cq.sw_pidx];
|
|
|
|
*swcqe = *hw_cqe;
|
2015-01-16 10:54:47 +07:00
|
|
|
swcqe->header |= cpu_to_be32(CQE_SWCQE_V(1));
|
2013-08-06 22:34:35 +07:00
|
|
|
t4_swcq_produce(&chp->cq);
|
|
|
|
}
|
|
|
|
next_cqe:
|
|
|
|
t4_hwcq_consume(&chp->cq);
|
|
|
|
ret = t4_next_hw_cqe(&chp->cq, &hw_cqe);
|
2018-04-27 18:11:16 +07:00
|
|
|
if (qhp && flush_qhp != qhp)
|
|
|
|
spin_unlock(&qhp->lock);
|
2010-04-22 05:30:06 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq)
|
|
|
|
{
|
2017-12-20 01:29:25 +07:00
|
|
|
if (DRAIN_CQE(cqe)) {
|
2017-11-28 04:16:32 +07:00
|
|
|
WARN_ONCE(1, "Unexpected DRAIN CQE qp id %u!\n", wq->sq.qid);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-04-22 05:30:06 +07:00
|
|
|
if (CQE_OPCODE(cqe) == FW_RI_TERMINATE)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if ((CQE_OPCODE(cqe) == FW_RI_RDMA_WRITE) && RQ_TYPE(cqe))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if ((CQE_OPCODE(cqe) == FW_RI_READ_RESP) && SQ_TYPE(cqe))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (CQE_SEND_OPCODE(cqe) && RQ_TYPE(cqe) && t4_rq_empty(wq))
|
|
|
|
return 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count)
|
|
|
|
{
|
|
|
|
struct t4_cqe *cqe;
|
|
|
|
u32 ptr;
|
|
|
|
|
|
|
|
*count = 0;
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("count zero %d\n", *count);
|
2010-04-22 05:30:06 +07:00
|
|
|
ptr = cq->sw_cidx;
|
|
|
|
while (ptr != cq->sw_pidx) {
|
|
|
|
cqe = &cq->sw_queue[ptr];
|
|
|
|
if (RQ_TYPE(cqe) && (CQE_OPCODE(cqe) != FW_RI_READ_RESP) &&
|
2011-10-21 01:25:14 +07:00
|
|
|
(CQE_QPID(cqe) == wq->sq.qid) && cqe_completes_wr(cqe, wq))
|
2010-04-22 05:30:06 +07:00
|
|
|
(*count)++;
|
|
|
|
if (++ptr == cq->size)
|
|
|
|
ptr = 0;
|
|
|
|
}
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("cq %p count %d\n", cq, *count);
|
2010-04-22 05:30:06 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* poll_cq
|
|
|
|
*
|
|
|
|
* Caller must:
|
|
|
|
* check the validity of the first CQE,
|
|
|
|
* supply the wq associated with the qpid.
|
|
|
|
*
|
|
|
|
* credit: cq credit to return to sge.
|
|
|
|
* cqe_flushed: 1 iff the CQE is flushed.
|
|
|
|
* cqe: copy of the polled CQE.
|
|
|
|
*
|
|
|
|
* return value:
|
|
|
|
* 0 CQE returned ok.
|
|
|
|
* -EAGAIN CQE skipped, try again.
|
|
|
|
* -EOVERFLOW CQ overflow detected.
|
|
|
|
*/
|
|
|
|
static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
|
|
|
|
u8 *cqe_flushed, u64 *cookie, u32 *credit)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct t4_cqe *hw_cqe, read_cqe;
|
|
|
|
|
|
|
|
*cqe_flushed = 0;
|
|
|
|
*credit = 0;
|
|
|
|
ret = t4_next_cqe(cq, &hw_cqe);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("CQE OVF %u qpid 0x%0x genbit %u type %u status 0x%0x opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
|
|
|
|
CQE_OVFBIT(hw_cqe), CQE_QPID(hw_cqe),
|
2017-02-10 05:23:51 +07:00
|
|
|
CQE_GENBIT(hw_cqe), CQE_TYPE(hw_cqe), CQE_STATUS(hw_cqe),
|
|
|
|
CQE_OPCODE(hw_cqe), CQE_LEN(hw_cqe), CQE_WRID_HI(hw_cqe),
|
|
|
|
CQE_WRID_LOW(hw_cqe));
|
2010-04-22 05:30:06 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* skip cqe's not affiliated with a QP.
|
|
|
|
*/
|
|
|
|
if (wq == NULL) {
|
|
|
|
ret = -EAGAIN;
|
|
|
|
goto skip_cqe;
|
|
|
|
}
|
|
|
|
|
2013-08-06 22:34:35 +07:00
|
|
|
/*
|
|
|
|
* skip hw cqe's if the wq is flushed.
|
|
|
|
*/
|
|
|
|
if (wq->flushed && !SW_CQE(hw_cqe)) {
|
|
|
|
ret = -EAGAIN;
|
|
|
|
goto skip_cqe;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* skip TERMINATE cqes...
|
|
|
|
*/
|
|
|
|
if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) {
|
|
|
|
ret = -EAGAIN;
|
|
|
|
goto skip_cqe;
|
|
|
|
}
|
|
|
|
|
iw_cxgb4: refactor sq/rq drain logic
With the addition of the IB/Core drain API, iw_cxgb4 supported drain
by watching the CQs when the QP was out of RTS and signalling "drain
complete" when the last CQE is polled. This, however, doesn't fully
support the drain semantics. Namely, the drain logic is supposed to signal
"drain complete" only when the application has _processed_ the last CQE,
not just removed them from the CQ. Thus a small timing hole exists that
can cause touch after free type bugs in applications using the drain API
(nvmf, iSER, for example). So iw_cxgb4 needs a better solution.
The iWARP Verbs spec mandates that "_at some point_ after the QP is
moved to ERROR", the iWARP driver MUST synchronously fail post_send and
post_recv calls. iw_cxgb4 was currently not allowing any posts once the
QP is in ERROR. This was in part due to the fact that the HW queues for
the QP in ERROR state are disabled at this point, so there wasn't much
else to do but fail the post operation synchronously. This restriction
is what drove the first drain implementation in iw_cxgb4 that has the
above mentioned flaw.
This patch changes iw_cxgb4 to allow post_send and post_recv WRs after
the QP is moved to ERROR state for kernel mode users, thus still adhering
to the Verbs spec for user mode users, but allowing flush WRs for kernel
users. Since the HW queues are disabled, we just synthesize a CQE for
this post, queue it to the SW CQ, and then call the CQ event handler.
This enables proper drain operations for the various storage applications.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-12-22 22:04:59 +07:00
|
|
|
/*
|
|
|
|
* Special cqe for drain WR completions...
|
|
|
|
*/
|
2017-12-20 01:29:25 +07:00
|
|
|
if (DRAIN_CQE(hw_cqe)) {
|
iw_cxgb4: refactor sq/rq drain logic
With the addition of the IB/Core drain API, iw_cxgb4 supported drain
by watching the CQs when the QP was out of RTS and signalling "drain
complete" when the last CQE is polled. This, however, doesn't fully
support the drain semantics. Namely, the drain logic is supposed to signal
"drain complete" only when the application has _processed_ the last CQE,
not just removed them from the CQ. Thus a small timing hole exists that
can cause touch after free type bugs in applications using the drain API
(nvmf, iSER, for example). So iw_cxgb4 needs a better solution.
The iWARP Verbs spec mandates that "_at some point_ after the QP is
moved to ERROR", the iWARP driver MUST synchronously fail post_send and
post_recv calls. iw_cxgb4 was currently not allowing any posts once the
QP is in ERROR. This was in part due to the fact that the HW queues for
the QP in ERROR state are disabled at this point, so there wasn't much
else to do but fail the post operation synchronously. This restriction
is what drove the first drain implementation in iw_cxgb4 that has the
above mentioned flaw.
This patch changes iw_cxgb4 to allow post_send and post_recv WRs after
the QP is moved to ERROR state for kernel mode users, thus still adhering
to the Verbs spec for user mode users, but allowing flush WRs for kernel
users. Since the HW queues are disabled, we just synthesize a CQE for
this post, queue it to the SW CQ, and then call the CQ event handler.
This enables proper drain operations for the various storage applications.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-12-22 22:04:59 +07:00
|
|
|
*cookie = CQE_DRAIN_COOKIE(hw_cqe);
|
|
|
|
*cqe = *hw_cqe;
|
|
|
|
goto skip_cqe;
|
|
|
|
}
|
|
|
|
|
2010-04-22 05:30:06 +07:00
|
|
|
/*
|
|
|
|
* Gotta tweak READ completions:
|
|
|
|
* 1) the cqe doesn't contain the sq_wptr from the wr.
|
|
|
|
* 2) opcode not reflected from the wr.
|
|
|
|
* 3) read_len not reflected from the wr.
|
|
|
|
* 4) cq_type is RQ_TYPE not SQ_TYPE.
|
|
|
|
*/
|
|
|
|
if (RQ_TYPE(hw_cqe) && (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP)) {
|
|
|
|
|
2014-03-21 22:10:32 +07:00
|
|
|
/* If we have reached here because of async
|
|
|
|
* event or other error, and have egress error
|
|
|
|
* then drop
|
|
|
|
*/
|
|
|
|
if (CQE_TYPE(hw_cqe) == 1) {
|
|
|
|
if (CQE_STATUS(hw_cqe))
|
|
|
|
t4_set_wq_in_error(wq);
|
|
|
|
ret = -EAGAIN;
|
|
|
|
goto skip_cqe;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If this is an unsolicited read response, then the read
|
2010-04-22 05:30:06 +07:00
|
|
|
* was generated by the kernel driver as part of peer-2-peer
|
|
|
|
* connection setup. So ignore the completion.
|
|
|
|
*/
|
2013-08-06 22:34:35 +07:00
|
|
|
if (CQE_WRID_STAG(hw_cqe) == 1) {
|
2010-04-22 05:30:06 +07:00
|
|
|
if (CQE_STATUS(hw_cqe))
|
|
|
|
t4_set_wq_in_error(wq);
|
|
|
|
ret = -EAGAIN;
|
|
|
|
goto skip_cqe;
|
|
|
|
}
|
|
|
|
|
2013-08-06 22:34:35 +07:00
|
|
|
/*
|
|
|
|
* Eat completions for unsignaled read WRs.
|
|
|
|
*/
|
|
|
|
if (!wq->sq.oldest_read->signaled) {
|
|
|
|
advance_oldest_read(wq);
|
|
|
|
ret = -EAGAIN;
|
|
|
|
goto skip_cqe;
|
|
|
|
}
|
|
|
|
|
2010-04-22 05:30:06 +07:00
|
|
|
/*
|
|
|
|
* Don't write to the HWCQ, so create a new read req CQE
|
|
|
|
* in local memory.
|
|
|
|
*/
|
|
|
|
create_read_req_cqe(wq, hw_cqe, &read_cqe);
|
|
|
|
hw_cqe = &read_cqe;
|
|
|
|
advance_oldest_read(wq);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (CQE_STATUS(hw_cqe) || t4_wq_in_error(wq)) {
|
2013-08-06 22:34:35 +07:00
|
|
|
*cqe_flushed = (CQE_STATUS(hw_cqe) == T4_ERR_SWFLUSH);
|
2010-04-22 05:30:06 +07:00
|
|
|
t4_set_wq_in_error(wq);
|
2010-09-10 23:15:04 +07:00
|
|
|
}
|
|
|
|
|
2010-04-22 05:30:06 +07:00
|
|
|
/*
|
|
|
|
* RECV completion.
|
|
|
|
*/
|
|
|
|
if (RQ_TYPE(hw_cqe)) {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* HW only validates 4 bits of MSN. So we must validate that
|
|
|
|
* the MSN in the SEND is the next expected MSN. If its not,
|
|
|
|
* then we complete this with T4_ERR_MSN and mark the wq in
|
|
|
|
* error.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (t4_rq_empty(wq)) {
|
|
|
|
t4_set_wq_in_error(wq);
|
|
|
|
ret = -EAGAIN;
|
|
|
|
goto skip_cqe;
|
|
|
|
}
|
2017-12-19 04:10:00 +07:00
|
|
|
if (unlikely(!CQE_STATUS(hw_cqe) &&
|
|
|
|
CQE_WRID_MSN(hw_cqe) != wq->rq.msn)) {
|
2010-04-22 05:30:06 +07:00
|
|
|
t4_set_wq_in_error(wq);
|
2017-12-19 04:10:00 +07:00
|
|
|
hw_cqe->header |= cpu_to_be32(CQE_STATUS_V(T4_ERR_MSN));
|
2010-04-22 05:30:06 +07:00
|
|
|
}
|
|
|
|
goto proc_cqe;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we get here its a send completion.
|
|
|
|
*
|
|
|
|
* Handle out of order completion. These get stuffed
|
|
|
|
* in the SW SQ. Then the SW SQ is walked to move any
|
|
|
|
* now in-order completions into the SW CQ. This handles
|
|
|
|
* 2 cases:
|
|
|
|
* 1) reaping unsignaled WRs when the first subsequent
|
|
|
|
* signaled WR is completed.
|
|
|
|
* 2) out of order read completions.
|
|
|
|
*/
|
|
|
|
if (!SW_CQE(hw_cqe) && (CQE_WRID_SQ_IDX(hw_cqe) != wq->sq.cidx)) {
|
|
|
|
struct t4_swsqe *swsqe;
|
|
|
|
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("out of order completion going in sw_sq at idx %u\n",
|
|
|
|
CQE_WRID_SQ_IDX(hw_cqe));
|
2010-04-22 05:30:06 +07:00
|
|
|
swsqe = &wq->sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)];
|
|
|
|
swsqe->cqe = *hw_cqe;
|
|
|
|
swsqe->complete = 1;
|
|
|
|
ret = -EAGAIN;
|
|
|
|
goto flush_wq;
|
|
|
|
}
|
|
|
|
|
|
|
|
proc_cqe:
|
|
|
|
*cqe = *hw_cqe;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reap the associated WR(s) that are freed up with this
|
|
|
|
* completion.
|
|
|
|
*/
|
|
|
|
if (SQ_TYPE(hw_cqe)) {
|
2013-08-06 22:34:35 +07:00
|
|
|
int idx = CQE_WRID_SQ_IDX(hw_cqe);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Account for any unsignaled completions completed by
|
|
|
|
* this signaled completion. In this case, cidx points
|
|
|
|
* to the first unsignaled one, and idx points to the
|
|
|
|
* signaled one. So adjust in_use based on this delta.
|
|
|
|
* if this is not completing any unsigned wrs, then the
|
2013-08-06 22:34:36 +07:00
|
|
|
* delta will be 0. Handle wrapping also!
|
2013-08-06 22:34:35 +07:00
|
|
|
*/
|
2013-08-06 22:34:36 +07:00
|
|
|
if (idx < wq->sq.cidx)
|
|
|
|
wq->sq.in_use -= wq->sq.size + idx - wq->sq.cidx;
|
|
|
|
else
|
|
|
|
wq->sq.in_use -= idx - wq->sq.cidx;
|
2013-08-06 22:34:35 +07:00
|
|
|
|
|
|
|
wq->sq.cidx = (uint16_t)idx;
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("completing sq idx %u\n", wq->sq.cidx);
|
2010-04-22 05:30:06 +07:00
|
|
|
*cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id;
|
2014-07-14 23:04:54 +07:00
|
|
|
if (c4iw_wr_log)
|
|
|
|
c4iw_log_wr_stats(wq, hw_cqe);
|
2010-04-22 05:30:06 +07:00
|
|
|
t4_sq_consume(wq);
|
|
|
|
} else {
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("completing rq idx %u\n", wq->rq.cidx);
|
2010-04-22 05:30:06 +07:00
|
|
|
*cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id;
|
2014-07-14 23:04:54 +07:00
|
|
|
if (c4iw_wr_log)
|
|
|
|
c4iw_log_wr_stats(wq, hw_cqe);
|
2010-04-22 05:30:06 +07:00
|
|
|
t4_rq_consume(wq);
|
2013-08-06 22:34:35 +07:00
|
|
|
goto skip_cqe;
|
2010-04-22 05:30:06 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
flush_wq:
|
|
|
|
/*
|
|
|
|
* Flush any completed cqes that are now in-order.
|
|
|
|
*/
|
|
|
|
flush_completed_wrs(wq, cq);
|
|
|
|
|
|
|
|
skip_cqe:
|
|
|
|
if (SW_CQE(hw_cqe)) {
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("cq %p cqid 0x%x skip sw cqe cidx %u\n",
|
|
|
|
cq, cq->cqid, cq->sw_cidx);
|
2010-04-22 05:30:06 +07:00
|
|
|
t4_swcq_consume(cq);
|
|
|
|
} else {
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("cq %p cqid 0x%x skip hw cqe cidx %u\n",
|
|
|
|
cq, cq->cqid, cq->cidx);
|
2010-04-22 05:30:06 +07:00
|
|
|
t4_hwcq_consume(cq);
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-07 00:28:27 +07:00
|
|
|
static int __c4iw_poll_cq_one(struct c4iw_cq *chp, struct c4iw_qp *qhp,
|
|
|
|
struct ib_wc *wc)
|
2010-04-22 05:30:06 +07:00
|
|
|
{
|
2018-07-11 02:03:16 +07:00
|
|
|
struct t4_cqe uninitialized_var(cqe);
|
2018-07-07 00:28:27 +07:00
|
|
|
struct t4_wq *wq = qhp ? &qhp->wq : NULL;
|
2010-04-22 05:30:06 +07:00
|
|
|
u32 credit = 0;
|
|
|
|
u8 cqe_flushed;
|
|
|
|
u64 cookie = 0;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
wc->wr_id = cookie;
|
2018-07-07 00:28:27 +07:00
|
|
|
wc->qp = qhp ? &qhp->ibqp : NULL;
|
2010-04-22 05:30:06 +07:00
|
|
|
wc->vendor_err = CQE_STATUS(&cqe);
|
|
|
|
wc->wc_flags = 0;
|
|
|
|
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("qpid 0x%x type %d opcode %d status 0x%x len %u wrid hi 0x%x lo 0x%x cookie 0x%llx\n",
|
|
|
|
CQE_QPID(&cqe),
|
2017-02-10 05:23:51 +07:00
|
|
|
CQE_TYPE(&cqe), CQE_OPCODE(&cqe),
|
|
|
|
CQE_STATUS(&cqe), CQE_LEN(&cqe),
|
|
|
|
CQE_WRID_HI(&cqe), CQE_WRID_LOW(&cqe),
|
|
|
|
(unsigned long long)cookie);
|
2010-04-22 05:30:06 +07:00
|
|
|
|
|
|
|
if (CQE_TYPE(&cqe) == 0) {
|
|
|
|
if (!CQE_STATUS(&cqe))
|
|
|
|
wc->byte_len = CQE_LEN(&cqe);
|
|
|
|
else
|
|
|
|
wc->byte_len = 0;
|
|
|
|
wc->opcode = IB_WC_RECV;
|
|
|
|
if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV ||
|
|
|
|
CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) {
|
|
|
|
wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe);
|
|
|
|
wc->wc_flags |= IB_WC_WITH_INVALIDATE;
|
2016-11-04 02:09:38 +07:00
|
|
|
c4iw_invalidate_mr(qhp->rhp, wc->ex.invalidate_rkey);
|
2010-04-22 05:30:06 +07:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
switch (CQE_OPCODE(&cqe)) {
|
|
|
|
case FW_RI_RDMA_WRITE:
|
|
|
|
wc->opcode = IB_WC_RDMA_WRITE;
|
|
|
|
break;
|
|
|
|
case FW_RI_READ_REQ:
|
|
|
|
wc->opcode = IB_WC_RDMA_READ;
|
|
|
|
wc->byte_len = CQE_LEN(&cqe);
|
|
|
|
break;
|
|
|
|
case FW_RI_SEND_WITH_INV:
|
|
|
|
case FW_RI_SEND_WITH_SE_INV:
|
|
|
|
wc->opcode = IB_WC_SEND;
|
|
|
|
wc->wc_flags |= IB_WC_WITH_INVALIDATE;
|
|
|
|
break;
|
|
|
|
case FW_RI_SEND:
|
|
|
|
case FW_RI_SEND_WITH_SE:
|
|
|
|
wc->opcode = IB_WC_SEND;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case FW_RI_LOCAL_INV:
|
|
|
|
wc->opcode = IB_WC_LOCAL_INV;
|
|
|
|
break;
|
|
|
|
case FW_RI_FAST_REGISTER:
|
2015-10-13 23:11:46 +07:00
|
|
|
wc->opcode = IB_WC_REG_MR;
|
2016-09-16 21:54:52 +07:00
|
|
|
|
|
|
|
/* Invalidate the MR if the fastreg failed */
|
|
|
|
if (CQE_STATUS(&cqe) != T4_ERR_SUCCESS)
|
2016-11-04 02:09:38 +07:00
|
|
|
c4iw_invalidate_mr(qhp->rhp,
|
|
|
|
CQE_WRID_FR_STAG(&cqe));
|
2010-04-22 05:30:06 +07:00
|
|
|
break;
|
|
|
|
default:
|
2017-02-10 05:23:50 +07:00
|
|
|
pr_err("Unexpected opcode %d in the CQE received for QPID=0x%0x\n",
|
2010-04-22 05:30:06 +07:00
|
|
|
CQE_OPCODE(&cqe), CQE_QPID(&cqe));
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cqe_flushed)
|
|
|
|
wc->status = IB_WC_WR_FLUSH_ERR;
|
|
|
|
else {
|
|
|
|
|
|
|
|
switch (CQE_STATUS(&cqe)) {
|
|
|
|
case T4_ERR_SUCCESS:
|
|
|
|
wc->status = IB_WC_SUCCESS;
|
|
|
|
break;
|
|
|
|
case T4_ERR_STAG:
|
|
|
|
wc->status = IB_WC_LOC_ACCESS_ERR;
|
|
|
|
break;
|
|
|
|
case T4_ERR_PDID:
|
|
|
|
wc->status = IB_WC_LOC_PROT_ERR;
|
|
|
|
break;
|
|
|
|
case T4_ERR_QPID:
|
|
|
|
case T4_ERR_ACCESS:
|
|
|
|
wc->status = IB_WC_LOC_ACCESS_ERR;
|
|
|
|
break;
|
|
|
|
case T4_ERR_WRAP:
|
|
|
|
wc->status = IB_WC_GENERAL_ERR;
|
|
|
|
break;
|
|
|
|
case T4_ERR_BOUND:
|
|
|
|
wc->status = IB_WC_LOC_LEN_ERR;
|
|
|
|
break;
|
|
|
|
case T4_ERR_INVALIDATE_SHARED_MR:
|
|
|
|
case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND:
|
|
|
|
wc->status = IB_WC_MW_BIND_ERR;
|
|
|
|
break;
|
|
|
|
case T4_ERR_CRC:
|
|
|
|
case T4_ERR_MARKER:
|
|
|
|
case T4_ERR_PDU_LEN_ERR:
|
|
|
|
case T4_ERR_OUT_OF_RQE:
|
|
|
|
case T4_ERR_DDP_VERSION:
|
|
|
|
case T4_ERR_RDMA_VERSION:
|
|
|
|
case T4_ERR_DDP_QUEUE_NUM:
|
|
|
|
case T4_ERR_MSN:
|
|
|
|
case T4_ERR_TBIT:
|
|
|
|
case T4_ERR_MO:
|
|
|
|
case T4_ERR_MSN_RANGE:
|
|
|
|
case T4_ERR_IRD_OVERFLOW:
|
|
|
|
case T4_ERR_OPCODE:
|
2010-09-10 23:15:04 +07:00
|
|
|
case T4_ERR_INTERNAL_ERR:
|
2010-04-22 05:30:06 +07:00
|
|
|
wc->status = IB_WC_FATAL_ERR;
|
|
|
|
break;
|
|
|
|
case T4_ERR_SWFLUSH:
|
|
|
|
wc->status = IB_WC_WR_FLUSH_ERR;
|
|
|
|
break;
|
|
|
|
default:
|
2017-02-10 05:23:50 +07:00
|
|
|
pr_err("Unexpected cqe_status 0x%x for QPID=0x%0x\n",
|
2010-04-22 05:30:06 +07:00
|
|
|
CQE_STATUS(&cqe), CQE_QPID(&cqe));
|
2015-07-27 15:38:14 +07:00
|
|
|
wc->status = IB_WC_FATAL_ERR;
|
2010-04-22 05:30:06 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
2018-07-07 00:28:27 +07:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get one cq entry from c4iw and map it to openib.
|
|
|
|
*
|
|
|
|
* Returns:
|
|
|
|
* 0 cqe returned
|
|
|
|
* -ENODATA EMPTY;
|
|
|
|
* -EAGAIN caller must try again
|
|
|
|
* any other -errno fatal error
|
|
|
|
*/
|
|
|
|
static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc)
|
|
|
|
{
|
|
|
|
struct c4iw_qp *qhp = NULL;
|
|
|
|
struct t4_cqe *rd_cqe;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = t4_next_cqe(&chp->cq, &rd_cqe);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
qhp = get_qhp(chp->rhp, CQE_QPID(rd_cqe));
|
|
|
|
if (qhp) {
|
|
|
|
spin_lock(&qhp->lock);
|
|
|
|
ret = __c4iw_poll_cq_one(chp, qhp, wc);
|
2010-04-22 05:30:06 +07:00
|
|
|
spin_unlock(&qhp->lock);
|
2018-07-07 00:28:27 +07:00
|
|
|
} else {
|
|
|
|
ret = __c4iw_poll_cq_one(chp, NULL, wc);
|
|
|
|
}
|
2010-04-22 05:30:06 +07:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
|
|
|
|
{
|
|
|
|
struct c4iw_cq *chp;
|
|
|
|
unsigned long flags;
|
|
|
|
int npolled;
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
chp = to_c4iw_cq(ibcq);
|
|
|
|
|
|
|
|
spin_lock_irqsave(&chp->lock, flags);
|
|
|
|
for (npolled = 0; npolled < num_entries; ++npolled) {
|
|
|
|
do {
|
|
|
|
err = c4iw_poll_cq_one(chp, wc + npolled);
|
|
|
|
} while (err == -EAGAIN);
|
|
|
|
if (err)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
spin_unlock_irqrestore(&chp->lock, flags);
|
|
|
|
return !err || err == -ENODATA ? npolled : err;
|
|
|
|
}
|
|
|
|
|
|
|
|
int c4iw_destroy_cq(struct ib_cq *ib_cq)
|
|
|
|
{
|
|
|
|
struct c4iw_cq *chp;
|
|
|
|
struct c4iw_ucontext *ucontext;
|
|
|
|
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("ib_cq %p\n", ib_cq);
|
2010-04-22 05:30:06 +07:00
|
|
|
chp = to_c4iw_cq(ib_cq);
|
|
|
|
|
|
|
|
remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid);
|
|
|
|
atomic_dec(&chp->refcnt);
|
|
|
|
wait_event(chp->wait, !atomic_read(&chp->refcnt));
|
|
|
|
|
|
|
|
ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context)
|
|
|
|
: NULL;
|
|
|
|
destroy_cq(&chp->rhp->rdev, &chp->cq,
|
2016-06-10 02:35:17 +07:00
|
|
|
ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx,
|
2017-09-27 03:08:08 +07:00
|
|
|
chp->destroy_skb, chp->wr_waitp);
|
iw_cxgb4: add referencing to wait objects
For messages sent from the host to fw that solicit a reply from fw,
the c4iw_wr_wait struct pointer is passed in the host->fw message, and
included in the fw->host fw6_msg reply. This allows the sender to wait
until the reply is received, and the code processing the ingress reply
to wake up the sender.
If c4iw_wait_for_reply() times out, however, we need to keep the
c4iw_wr_wait object around in case the reply eventually does arrive.
Otherwise we have touch-after-free bugs in the wake_up paths.
This was hit due to a bad kernel driver that blocked ingress processing
of cxgb4 for a long time, causing iw_cxgb4 timeouts, but eventually
resuming ingress processing and thus hitting the touch-after-free bug.
So I want to fix iw_cxgb4 such that we'll at least keep the wait object
around until the reply comes. If it never comes we leak a small amount of
memory, but if it does come late, we won't potentially crash the system.
So add a kref struct in the c4iw_wr_wait struct, and take a reference
before sending a message to FW that will generate a FW6 reply. And remove
the reference (and potentially free the wait object) when the reply
is processed.
The ep code also uses the wr_wait for non FW6 CPL messages and doesn't
embed the c4iw_wr_wait object in the message sent to firmware. So for
those cases we add c4iw_wake_up_noref().
The mr/mw, cq, and qp object create/destroy paths do need this reference
logic. For these paths, c4iw_ref_send_wait() is introduced to take the
wr_wait reference, send the msg to fw, and then wait for the reply.
So going forward, iw_cxgb4 either uses c4iw_ofld_send(),
c4iw_wait_for_reply() and c4iw_wake_up_noref() like is done in the some
of the endpoint logic, or c4iw_ref_send_wait() and c4iw_wake_up_deref()
(formerly c4iw_wake_up()) when sending messages with the c4iw_wr_wait
object pointer embedded in the message and resulting FW6 reply.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-09-27 03:13:17 +07:00
|
|
|
c4iw_put_wr_wait(chp->wr_waitp);
|
2010-04-22 05:30:06 +07:00
|
|
|
kfree(chp);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-06-11 20:35:20 +07:00
|
|
|
struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
|
|
|
|
const struct ib_cq_init_attr *attr,
|
|
|
|
struct ib_ucontext *ib_context,
|
2010-04-22 05:30:06 +07:00
|
|
|
struct ib_udata *udata)
|
|
|
|
{
|
2015-06-11 20:35:20 +07:00
|
|
|
int entries = attr->cqe;
|
|
|
|
int vector = attr->comp_vector;
|
2010-04-22 05:30:06 +07:00
|
|
|
struct c4iw_dev *rhp;
|
|
|
|
struct c4iw_cq *chp;
|
2018-07-05 19:56:01 +07:00
|
|
|
struct c4iw_create_cq ucmd;
|
2010-04-22 05:30:06 +07:00
|
|
|
struct c4iw_create_cq_resp uresp;
|
|
|
|
struct c4iw_ucontext *ucontext = NULL;
|
2016-06-10 02:35:17 +07:00
|
|
|
int ret, wr_len;
|
2010-06-11 02:03:06 +07:00
|
|
|
size_t memsize, hwentries;
|
2010-04-22 05:30:06 +07:00
|
|
|
struct c4iw_mm_entry *mm, *mm2;
|
|
|
|
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("ib_dev %p entries %d\n", ibdev, entries);
|
2015-06-11 20:35:20 +07:00
|
|
|
if (attr->flags)
|
|
|
|
return ERR_PTR(-EINVAL);
|
2010-04-22 05:30:06 +07:00
|
|
|
|
|
|
|
rhp = to_c4iw_dev(ibdev);
|
|
|
|
|
2014-06-06 23:10:42 +07:00
|
|
|
if (vector >= rhp->rdev.lldi.nciq)
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
2018-07-05 19:56:01 +07:00
|
|
|
if (ib_context) {
|
|
|
|
ucontext = to_c4iw_ucontext(ib_context);
|
|
|
|
if (udata->inlen < sizeof(ucmd))
|
|
|
|
ucontext->is_32b_cqe = 1;
|
|
|
|
}
|
|
|
|
|
2010-04-22 05:30:06 +07:00
|
|
|
chp = kzalloc(sizeof(*chp), GFP_KERNEL);
|
|
|
|
if (!chp)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
2018-07-05 19:56:01 +07:00
|
|
|
|
iw_cxgb4: add referencing to wait objects
For messages sent from the host to fw that solicit a reply from fw,
the c4iw_wr_wait struct pointer is passed in the host->fw message, and
included in the fw->host fw6_msg reply. This allows the sender to wait
until the reply is received, and the code processing the ingress reply
to wake up the sender.
If c4iw_wait_for_reply() times out, however, we need to keep the
c4iw_wr_wait object around in case the reply eventually does arrive.
Otherwise we have touch-after-free bugs in the wake_up paths.
This was hit due to a bad kernel driver that blocked ingress processing
of cxgb4 for a long time, causing iw_cxgb4 timeouts, but eventually
resuming ingress processing and thus hitting the touch-after-free bug.
So I want to fix iw_cxgb4 such that we'll at least keep the wait object
around until the reply comes. If it never comes we leak a small amount of
memory, but if it does come late, we won't potentially crash the system.
So add a kref struct in the c4iw_wr_wait struct, and take a reference
before sending a message to FW that will generate a FW6 reply. And remove
the reference (and potentially free the wait object) when the reply
is processed.
The ep code also uses the wr_wait for non FW6 CPL messages and doesn't
embed the c4iw_wr_wait object in the message sent to firmware. So for
those cases we add c4iw_wake_up_noref().
The mr/mw, cq, and qp object create/destroy paths do need this reference
logic. For these paths, c4iw_ref_send_wait() is introduced to take the
wr_wait reference, send the msg to fw, and then wait for the reply.
So going forward, iw_cxgb4 either uses c4iw_ofld_send(),
c4iw_wait_for_reply() and c4iw_wake_up_noref() like is done in the some
of the endpoint logic, or c4iw_ref_send_wait() and c4iw_wake_up_deref()
(formerly c4iw_wake_up()) when sending messages with the c4iw_wr_wait
object pointer embedded in the message and resulting FW6 reply.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-09-27 03:13:17 +07:00
|
|
|
chp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL);
|
2017-09-27 03:08:08 +07:00
|
|
|
if (!chp->wr_waitp) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto err_free_chp;
|
|
|
|
}
|
|
|
|
c4iw_init_wr_wait(chp->wr_waitp);
|
2010-04-22 05:30:06 +07:00
|
|
|
|
2016-06-10 02:35:17 +07:00
|
|
|
wr_len = sizeof(struct fw_ri_res_wr) + sizeof(struct fw_ri_res);
|
|
|
|
chp->destroy_skb = alloc_skb(wr_len, GFP_KERNEL);
|
|
|
|
if (!chp->destroy_skb) {
|
|
|
|
ret = -ENOMEM;
|
2017-09-27 03:08:08 +07:00
|
|
|
goto err_free_wr_wait;
|
2016-06-10 02:35:17 +07:00
|
|
|
}
|
|
|
|
|
2010-04-22 05:30:06 +07:00
|
|
|
/* account for the status page. */
|
|
|
|
entries++;
|
|
|
|
|
2010-05-21 04:57:38 +07:00
|
|
|
/* IQ needs one extra entry to differentiate full vs empty. */
|
|
|
|
entries++;
|
|
|
|
|
2010-04-22 05:30:06 +07:00
|
|
|
/*
|
|
|
|
* entries must be multiple of 16 for HW.
|
|
|
|
*/
|
|
|
|
entries = roundup(entries, 16);
|
2010-06-11 02:03:06 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Make actual HW queue 2x to avoid cdix_inc overflows.
|
|
|
|
*/
|
2014-07-14 23:04:51 +07:00
|
|
|
hwentries = min(entries * 2, rhp->rdev.hw_queue.t4_max_iq_size);
|
2010-06-11 02:03:06 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Make HW queue at least 64 entries so GTS updates aren't too
|
|
|
|
* frequent.
|
|
|
|
*/
|
|
|
|
if (hwentries < 64)
|
|
|
|
hwentries = 64;
|
|
|
|
|
2018-07-05 19:56:01 +07:00
|
|
|
memsize = hwentries * ((ucontext && ucontext->is_32b_cqe) ?
|
|
|
|
(sizeof(*chp->cq.queue) / 2) : sizeof(*chp->cq.queue));
|
2010-04-22 05:30:06 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* memsize must be a multiple of the page size if its a user cq.
|
|
|
|
*/
|
2014-07-21 22:25:15 +07:00
|
|
|
if (ucontext)
|
2010-04-22 05:30:06 +07:00
|
|
|
memsize = roundup(memsize, PAGE_SIZE);
|
2018-07-05 19:56:01 +07:00
|
|
|
|
2010-06-11 02:03:06 +07:00
|
|
|
chp->cq.size = hwentries;
|
2010-04-22 05:30:06 +07:00
|
|
|
chp->cq.memsize = memsize;
|
2014-06-06 23:10:42 +07:00
|
|
|
chp->cq.vector = vector;
|
2010-04-22 05:30:06 +07:00
|
|
|
|
|
|
|
ret = create_cq(&rhp->rdev, &chp->cq,
|
2017-09-27 03:08:08 +07:00
|
|
|
ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
|
|
|
|
chp->wr_waitp);
|
2010-04-22 05:30:06 +07:00
|
|
|
if (ret)
|
2017-09-27 03:08:08 +07:00
|
|
|
goto err_free_skb;
|
2010-04-22 05:30:06 +07:00
|
|
|
|
|
|
|
chp->rhp = rhp;
|
|
|
|
chp->cq.size--; /* status page */
|
2010-06-11 02:03:06 +07:00
|
|
|
chp->ibcq.cqe = entries - 2;
|
2010-04-22 05:30:06 +07:00
|
|
|
spin_lock_init(&chp->lock);
|
2011-10-24 22:50:21 +07:00
|
|
|
spin_lock_init(&chp->comp_handler_lock);
|
2010-04-22 05:30:06 +07:00
|
|
|
atomic_set(&chp->refcnt, 1);
|
|
|
|
init_waitqueue_head(&chp->wait);
|
|
|
|
ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid);
|
|
|
|
if (ret)
|
2017-09-27 03:08:08 +07:00
|
|
|
goto err_destroy_cq;
|
2010-04-22 05:30:06 +07:00
|
|
|
|
|
|
|
if (ucontext) {
|
2017-07-13 14:47:40 +07:00
|
|
|
ret = -ENOMEM;
|
2010-04-22 05:30:06 +07:00
|
|
|
mm = kmalloc(sizeof *mm, GFP_KERNEL);
|
|
|
|
if (!mm)
|
2017-09-27 03:08:08 +07:00
|
|
|
goto err_remove_handle;
|
2010-04-22 05:30:06 +07:00
|
|
|
mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
|
|
|
|
if (!mm2)
|
2017-09-27 03:08:08 +07:00
|
|
|
goto err_free_mm;
|
2010-04-22 05:30:06 +07:00
|
|
|
|
2018-07-05 19:56:01 +07:00
|
|
|
memset(&uresp, 0, sizeof(uresp));
|
2010-04-22 05:30:06 +07:00
|
|
|
uresp.qid_mask = rhp->rdev.cqmask;
|
|
|
|
uresp.cqid = chp->cq.cqid;
|
|
|
|
uresp.size = chp->cq.size;
|
|
|
|
uresp.memsize = chp->cq.memsize;
|
|
|
|
spin_lock(&ucontext->mmap_lock);
|
|
|
|
uresp.key = ucontext->key;
|
|
|
|
ucontext->key += PAGE_SIZE;
|
|
|
|
uresp.gts_key = ucontext->key;
|
|
|
|
ucontext->key += PAGE_SIZE;
|
2018-07-05 19:56:01 +07:00
|
|
|
/* communicate to the userspace that
|
|
|
|
* kernel driver supports 64B CQE
|
|
|
|
*/
|
|
|
|
uresp.flags |= C4IW_64B_CQE;
|
|
|
|
|
2010-04-22 05:30:06 +07:00
|
|
|
spin_unlock(&ucontext->mmap_lock);
|
RDMA/cxgb4: Add missing padding at end of struct c4iw_create_cq_resp
The i386 ABI disagrees with most other ABIs regarding alignment of
data types larger than 4 bytes: on most ABIs a padding must be added
at end of the structures, while it is not required on i386.
So for most ABI struct c4iw_create_cq_resp gets implicitly padded
to be aligned on a 8 bytes multiple, while for i386, such padding
is not added.
The tool pahole can be used to find such implicit padding:
$ pahole --anon_include \
--nested_anon_include \
--recursive \
--class_name c4iw_create_cq_resp \
drivers/infiniband/hw/cxgb4/iw_cxgb4.o
Then, structure layout can be compared between i386 and x86_64:
+++ obj-i386/drivers/infiniband/hw/cxgb4/iw_cxgb4.o.pahole.txt 2014-03-28 11:43:05.547432195 +0100
--- obj-x86_64/drivers/infiniband/hw/cxgb4/iw_cxgb4.o.pahole.txt 2014-03-28 10:55:10.990133017 +0100
@@ -14,9 +13,8 @@ struct c4iw_create_cq_resp {
__u32 size; /* 28 4 */
__u32 qid_mask; /* 32 4 */
- /* size: 36, cachelines: 1, members: 6 */
- /* last cacheline: 36 bytes */
+ /* size: 40, cachelines: 1, members: 6 */
+ /* padding: 4 */
+ /* last cacheline: 40 bytes */
};
This ABI disagreement will make an x86_64 kernel try to write past the
buffer provided by an i386 binary.
When boundary check will be implemented, the x86_64 kernel will refuse
to write past the i386 userspace provided buffer and the uverbs will
fail.
If the structure is on a page boundary and the next page is not
mapped, ib_copy_to_udata() will fail and the uverb will fail.
This patch adds an explicit padding at end of structure
c4iw_create_cq_resp, and, like 92b0ca7cb149 ("IB/mlx5: Fix stack info
leak in mlx5_ib_alloc_ucontext()"), makes function c4iw_create_cq()
not writting this padding field to userspace. This way, x86_64 kernel
will be able to write struct c4iw_create_cq_resp as expected by
unpatched and patched i386 libcxgb4.
Link: http://marc.info/?i=cover.1399309513.git.ydroneaud@opteya.com
Cc: <stable@vger.kernel.org>
Fixes: cfdda9d764362 ("RDMA/cxgb4: Add driver for Chelsio T4 RNIC")
Fixes: e24a72a3302a6 ("RDMA/cxgb4: Fix four byte info leak in c4iw_create_cq()")
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Yann Droneaud <ydroneaud@opteya.com>
Acked-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
2014-05-06 00:33:23 +07:00
|
|
|
ret = ib_copy_to_udata(udata, &uresp,
|
2018-07-05 19:56:01 +07:00
|
|
|
ucontext->is_32b_cqe ?
|
|
|
|
sizeof(uresp) - sizeof(uresp.flags) :
|
|
|
|
sizeof(uresp));
|
2010-04-22 05:30:06 +07:00
|
|
|
if (ret)
|
2017-09-27 03:08:08 +07:00
|
|
|
goto err_free_mm2;
|
2010-04-22 05:30:06 +07:00
|
|
|
|
|
|
|
mm->key = uresp.key;
|
|
|
|
mm->addr = virt_to_phys(chp->cq.queue);
|
|
|
|
mm->len = chp->cq.memsize;
|
|
|
|
insert_mmap(ucontext, mm);
|
|
|
|
|
|
|
|
mm2->key = uresp.gts_key;
|
2015-06-09 19:53:12 +07:00
|
|
|
mm2->addr = chp->cq.bar2_pa;
|
2010-04-22 05:30:06 +07:00
|
|
|
mm2->len = PAGE_SIZE;
|
|
|
|
insert_mmap(ucontext, mm2);
|
|
|
|
}
|
2017-09-27 14:35:49 +07:00
|
|
|
pr_debug("cqid 0x%0x chp %p size %u memsize %zu, dma_addr 0x%0llx\n",
|
|
|
|
chp->cq.cqid, chp, chp->cq.size,
|
2017-02-10 05:23:51 +07:00
|
|
|
chp->cq.memsize, (unsigned long long)chp->cq.dma_addr);
|
2010-04-22 05:30:06 +07:00
|
|
|
return &chp->ibcq;
|
2017-09-27 03:08:08 +07:00
|
|
|
err_free_mm2:
|
2010-04-22 05:30:06 +07:00
|
|
|
kfree(mm2);
|
2017-09-27 03:08:08 +07:00
|
|
|
err_free_mm:
|
2010-04-22 05:30:06 +07:00
|
|
|
kfree(mm);
|
2017-09-27 03:08:08 +07:00
|
|
|
err_remove_handle:
|
2010-04-22 05:30:06 +07:00
|
|
|
remove_handle(rhp, &rhp->cqidr, chp->cq.cqid);
|
2017-09-27 03:08:08 +07:00
|
|
|
err_destroy_cq:
|
2010-04-22 05:30:06 +07:00
|
|
|
destroy_cq(&chp->rhp->rdev, &chp->cq,
|
2016-06-10 02:35:17 +07:00
|
|
|
ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
|
2017-09-27 03:08:08 +07:00
|
|
|
chp->destroy_skb, chp->wr_waitp);
|
|
|
|
err_free_skb:
|
2016-06-10 02:35:17 +07:00
|
|
|
kfree_skb(chp->destroy_skb);
|
2017-09-27 03:08:08 +07:00
|
|
|
err_free_wr_wait:
|
iw_cxgb4: add referencing to wait objects
For messages sent from the host to fw that solicit a reply from fw,
the c4iw_wr_wait struct pointer is passed in the host->fw message, and
included in the fw->host fw6_msg reply. This allows the sender to wait
until the reply is received, and the code processing the ingress reply
to wake up the sender.
If c4iw_wait_for_reply() times out, however, we need to keep the
c4iw_wr_wait object around in case the reply eventually does arrive.
Otherwise we have touch-after-free bugs in the wake_up paths.
This was hit due to a bad kernel driver that blocked ingress processing
of cxgb4 for a long time, causing iw_cxgb4 timeouts, but eventually
resuming ingress processing and thus hitting the touch-after-free bug.
So I want to fix iw_cxgb4 such that we'll at least keep the wait object
around until the reply comes. If it never comes we leak a small amount of
memory, but if it does come late, we won't potentially crash the system.
So add a kref struct in the c4iw_wr_wait struct, and take a reference
before sending a message to FW that will generate a FW6 reply. And remove
the reference (and potentially free the wait object) when the reply
is processed.
The ep code also uses the wr_wait for non FW6 CPL messages and doesn't
embed the c4iw_wr_wait object in the message sent to firmware. So for
those cases we add c4iw_wake_up_noref().
The mr/mw, cq, and qp object create/destroy paths do need this reference
logic. For these paths, c4iw_ref_send_wait() is introduced to take the
wr_wait reference, send the msg to fw, and then wait for the reply.
So going forward, iw_cxgb4 either uses c4iw_ofld_send(),
c4iw_wait_for_reply() and c4iw_wake_up_noref() like is done in the some
of the endpoint logic, or c4iw_ref_send_wait() and c4iw_wake_up_deref()
(formerly c4iw_wake_up()) when sending messages with the c4iw_wr_wait
object pointer embedded in the message and resulting FW6 reply.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-09-27 03:13:17 +07:00
|
|
|
c4iw_put_wr_wait(chp->wr_waitp);
|
2017-09-27 03:08:08 +07:00
|
|
|
err_free_chp:
|
2010-04-22 05:30:06 +07:00
|
|
|
kfree(chp);
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
|
|
|
|
{
|
|
|
|
return -ENOSYS;
|
|
|
|
}
|
|
|
|
|
|
|
|
int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
|
|
|
|
{
|
|
|
|
struct c4iw_cq *chp;
|
2016-08-23 21:57:33 +07:00
|
|
|
int ret = 0;
|
2010-04-22 05:30:06 +07:00
|
|
|
unsigned long flag;
|
|
|
|
|
|
|
|
chp = to_c4iw_cq(ibcq);
|
|
|
|
spin_lock_irqsave(&chp->lock, flag);
|
2016-08-23 21:57:33 +07:00
|
|
|
t4_arm_cq(&chp->cq,
|
|
|
|
(flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED);
|
|
|
|
if (flags & IB_CQ_REPORT_MISSED_EVENTS)
|
|
|
|
ret = t4_cq_notempty(&chp->cq);
|
2010-04-22 05:30:06 +07:00
|
|
|
spin_unlock_irqrestore(&chp->lock, flag);
|
|
|
|
return ret;
|
|
|
|
}
|