linux_dsm_epyc7002/net/tipc/msg.c

723 lines
19 KiB
C
Raw Normal View History

/*
* net/tipc/msg.c: TIPC message header routines
*
* Copyright (c) 2000-2006, 2014-2015, Ericsson AB
* Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the names of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* Alternatively, this software may be distributed under the terms of the
* GNU General Public License ("GPL") version 2 as published by the Free
* Software Foundation.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <net/sock.h>
#include "core.h"
#include "msg.h"
#include "addr.h"
#include "name_table.h"
#define MAX_FORWARD_SIZE 1024
#define BUF_HEADROOM (LL_MAX_HEADER + 48)
#define BUF_TAILROOM 16
static unsigned int align(unsigned int i)
{
return (i + 3) & ~3u;
}
/**
* tipc_buf_acquire - creates a TIPC message buffer
* @size: message size (including TIPC header)
*
* Returns a new buffer with data pointers set to the specified size.
*
* NOTE: Headroom is reserved to allow prepending of a data link header.
* There may also be unrequested tailroom present at the buffer's end.
*/
struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp)
{
struct sk_buff *skb;
unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u;
skb = alloc_skb_fclone(buf_size, gfp);
if (skb) {
skb_reserve(skb, BUF_HEADROOM);
skb_put(skb, size);
skb->next = NULL;
}
return skb;
}
2015-02-05 20:36:36 +07:00
void tipc_msg_init(u32 own_node, struct tipc_msg *m, u32 user, u32 type,
u32 hsize, u32 dnode)
{
memset(m, 0, hsize);
msg_set_version(m);
msg_set_user(m, user);
msg_set_hdr_sz(m, hsize);
msg_set_size(m, hsize);
2015-02-05 20:36:36 +07:00
msg_set_prevnode(m, own_node);
msg_set_type(m, type);
if (hsize > SHORT_H_SIZE) {
2015-02-05 20:36:36 +07:00
msg_set_orignode(m, own_node);
msg_set_destnode(m, dnode);
}
}
2015-02-05 20:36:36 +07:00
struct sk_buff *tipc_msg_create(uint user, uint type,
uint hdr_sz, uint data_sz, u32 dnode,
u32 onode, u32 dport, u32 oport, int errcode)
{
struct tipc_msg *msg;
struct sk_buff *buf;
buf = tipc_buf_acquire(hdr_sz + data_sz, GFP_ATOMIC);
if (unlikely(!buf))
return NULL;
msg = buf_msg(buf);
2015-02-05 20:36:36 +07:00
tipc_msg_init(onode, msg, user, type, hdr_sz, dnode);
msg_set_size(msg, hdr_sz + data_sz);
msg_set_origport(msg, oport);
msg_set_destport(msg, dport);
msg_set_errcode(msg, errcode);
if (hdr_sz > SHORT_H_SIZE) {
msg_set_orignode(msg, onode);
msg_set_destnode(msg, dnode);
}
return buf;
}
/* tipc_buf_append(): Append a buffer to the fragment list of another buffer
* @*headbuf: in: NULL for first frag, otherwise value returned from prev call
* out: set when successful non-complete reassembly, otherwise NULL
* @*buf: in: the buffer to append. Always defined
* out: head buf after successful complete reassembly, otherwise NULL
* Returns 1 when reassembly complete, otherwise 0
*/
int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
{
struct sk_buff *head = *headbuf;
struct sk_buff *frag = *buf;
struct sk_buff *tail = NULL;
struct tipc_msg *msg;
u32 fragid;
int delta;
bool headstolen;
if (!frag)
goto err;
msg = buf_msg(frag);
fragid = msg_type(msg);
frag->next = NULL;
skb_pull(frag, msg_hdr_sz(msg));
if (fragid == FIRST_FRAGMENT) {
if (unlikely(head))
goto err;
if (unlikely(skb_unclone(frag, GFP_ATOMIC)))
goto err;
head = *headbuf = frag;
*buf = NULL;
TIPC_SKB_CB(head)->tail = NULL;
if (skb_is_nonlinear(head)) {
skb_walk_frags(head, tail) {
TIPC_SKB_CB(head)->tail = tail;
}
} else {
skb_frag_list_init(head);
}
return 0;
}
if (!head)
goto err;
if (skb_try_coalesce(head, frag, &headstolen, &delta)) {
kfree_skb_partial(frag, headstolen);
} else {
tail = TIPC_SKB_CB(head)->tail;
if (!skb_has_frag_list(head))
skb_shinfo(head)->frag_list = frag;
else
tail->next = frag;
head->truesize += frag->truesize;
head->data_len += frag->len;
head->len += frag->len;
TIPC_SKB_CB(head)->tail = frag;
}
if (fragid == LAST_FRAGMENT) {
TIPC_SKB_CB(head)->validated = false;
if (unlikely(!tipc_msg_validate(&head)))
goto err;
*buf = head;
TIPC_SKB_CB(head)->tail = NULL;
*headbuf = NULL;
return 1;
}
*buf = NULL;
return 0;
err:
kfree_skb(*buf);
kfree_skb(*headbuf);
*buf = *headbuf = NULL;
return 0;
}
/* tipc_msg_validate - validate basic format of received message
*
* This routine ensures a TIPC message has an acceptable header, and at least
* as much data as the header indicates it should. The routine also ensures
* that the entire message header is stored in the main fragment of the message
* buffer, to simplify future access to message header fields.
*
* Note: Having extra info present in the message header or data areas is OK.
* TIPC will ignore the excess, under the assumption that it is optional info
* introduced by a later release of the protocol.
*/
bool tipc_msg_validate(struct sk_buff **_skb)
{
struct sk_buff *skb = *_skb;
struct tipc_msg *hdr;
int msz, hsz;
/* Ensure that flow control ratio condition is satisfied */
if (unlikely(skb->truesize / buf_roundup_len(skb) >= 4)) {
skb = skb_copy_expand(skb, BUF_HEADROOM, 0, GFP_ATOMIC);
if (!skb)
return false;
kfree_skb(*_skb);
*_skb = skb;
}
if (unlikely(TIPC_SKB_CB(skb)->validated))
return true;
if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE)))
return false;
hsz = msg_hdr_sz(buf_msg(skb));
if (unlikely(hsz < MIN_H_SIZE) || (hsz > MAX_H_SIZE))
return false;
if (unlikely(!pskb_may_pull(skb, hsz)))
return false;
hdr = buf_msg(skb);
if (unlikely(msg_version(hdr) != TIPC_VERSION))
return false;
msz = msg_size(hdr);
if (unlikely(msz < hsz))
return false;
if (unlikely((msz - hsz) > TIPC_MAX_USER_MSG_SIZE))
return false;
if (unlikely(skb->len < msz))
return false;
TIPC_SKB_CB(skb)->validated = true;
return true;
}
/**
* tipc_msg_build - create buffer chain containing specified header and data
* @mhdr: Message header, to be prepended to data
* @m: User message
* @dsz: Total length of user data
* @pktmax: Max packet size that can be used
* @list: Buffer or chain of buffers to be returned to caller
*
* Note that the recursive call we are making here is safe, since it can
* logically go only one further level down.
*
* Returns message data size or errno: -ENOMEM, -EFAULT
*/
int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset,
int dsz, int pktmax, struct sk_buff_head *list)
{
int mhsz = msg_hdr_sz(mhdr);
struct tipc_msg pkthdr;
int msz = mhsz + dsz;
int pktrem = pktmax;
struct sk_buff *skb;
int drem = dsz;
int pktno = 1;
char *pktpos;
int pktsz;
int rc;
msg_set_size(mhdr, msz);
/* No fragmentation needed? */
if (likely(msz <= pktmax)) {
skb = tipc_buf_acquire(msz, GFP_KERNEL);
/* Fall back to smaller MTU if node local message */
if (unlikely(!skb)) {
if (pktmax != MAX_MSG_SIZE)
return -ENOMEM;
rc = tipc_msg_build(mhdr, m, offset, dsz, FB_MTU, list);
if (rc != dsz)
return rc;
if (tipc_msg_assemble(list))
return dsz;
return -ENOMEM;
}
skb_orphan(skb);
__skb_queue_tail(list, skb);
skb_copy_to_linear_data(skb, mhdr, mhsz);
pktpos = skb->data + mhsz;
if (copy_from_iter_full(pktpos, dsz, &m->msg_iter))
return dsz;
rc = -EFAULT;
goto error;
}
/* Prepare reusable fragment header */
2015-02-05 20:36:36 +07:00
tipc_msg_init(msg_prevnode(mhdr), &pkthdr, MSG_FRAGMENTER,
FIRST_FRAGMENT, INT_H_SIZE, msg_destnode(mhdr));
msg_set_size(&pkthdr, pktmax);
msg_set_fragm_no(&pkthdr, pktno);
tipc: clean up handling of message priorities Messages transferred by TIPC are assigned an "importance priority", -an integer value indicating how to treat the message when there is link or destination socket congestion. There is no separate header field for this value. Instead, the message user values have been chosen in ascending order according to perceived importance, so that the message user field can be used for this. This is not a good solution. First, we have many more users than the needed priority levels, so we end up with treating more priority levels than necessary. Second, the user field cannot always accurately reflect the priority of the message. E.g., a message fragment packet should really have the priority of the enveloped user data message, and not the priority of the MSG_FRAGMENTER user. Until now, we have been working around this problem in different ways, but it is now time to implement a consistent way of handling such priorities, although still within the constraint that we cannot allocate any more bits in the regular data message header for this. In this commit, we define a new priority level, TIPC_SYSTEM_IMPORTANCE, that will be the only one used apart from the four (lower) user data levels. All non-data messages map down to this priority. Furthermore, we take some free bits from the MSG_FRAGMENTER header and allocate them to store the priority of the enveloped message. We then adjust the functions msg_importance()/msg_set_importance() so that they read/set the correct header fields depending on user type. This small protocol change is fully compatible, because the code at the receiving end of a link currently reads the importance level only from user data messages, where there is no change. Reviewed-by: Erik Hugne <erik.hugne@ericsson.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-14 03:08:11 +07:00
msg_set_importance(&pkthdr, msg_importance(mhdr));
/* Prepare first fragment */
skb = tipc_buf_acquire(pktmax, GFP_KERNEL);
if (!skb)
return -ENOMEM;
skb_orphan(skb);
__skb_queue_tail(list, skb);
pktpos = skb->data;
skb_copy_to_linear_data(skb, &pkthdr, INT_H_SIZE);
pktpos += INT_H_SIZE;
pktrem -= INT_H_SIZE;
skb_copy_to_linear_data_offset(skb, INT_H_SIZE, mhdr, mhsz);
pktpos += mhsz;
pktrem -= mhsz;
do {
if (drem < pktrem)
pktrem = drem;
if (!copy_from_iter_full(pktpos, pktrem, &m->msg_iter)) {
rc = -EFAULT;
goto error;
}
drem -= pktrem;
if (!drem)
break;
/* Prepare new fragment: */
if (drem < (pktmax - INT_H_SIZE))
pktsz = drem + INT_H_SIZE;
else
pktsz = pktmax;
skb = tipc_buf_acquire(pktsz, GFP_KERNEL);
if (!skb) {
rc = -ENOMEM;
goto error;
}
skb_orphan(skb);
__skb_queue_tail(list, skb);
msg_set_type(&pkthdr, FRAGMENT);
msg_set_size(&pkthdr, pktsz);
msg_set_fragm_no(&pkthdr, ++pktno);
skb_copy_to_linear_data(skb, &pkthdr, INT_H_SIZE);
pktpos = skb->data + INT_H_SIZE;
pktrem = pktsz - INT_H_SIZE;
} while (1);
msg_set_type(buf_msg(skb), LAST_FRAGMENT);
return dsz;
error:
__skb_queue_purge(list);
__skb_queue_head_init(list);
return rc;
}
/**
* tipc_msg_bundle(): Append contents of a buffer to tail of an existing one
* @skb: the buffer to append to ("bundle")
* @msg: message to be appended
* @mtu: max allowable size for the bundle buffer
* Consumes buffer if successful
* Returns true if bundling could be performed, otherwise false
*/
bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu)
{
struct tipc_msg *bmsg;
unsigned int bsz;
unsigned int msz = msg_size(msg);
u32 start, pad;
u32 max = mtu - INT_H_SIZE;
if (likely(msg_user(msg) == MSG_FRAGMENTER))
return false;
if (!skb)
return false;
bmsg = buf_msg(skb);
bsz = msg_size(bmsg);
start = align(bsz);
pad = start - bsz;
if (unlikely(msg_user(msg) == TUNNEL_PROTOCOL))
return false;
if (unlikely(msg_user(msg) == BCAST_PROTOCOL))
return false;
if (unlikely(msg_user(bmsg) != MSG_BUNDLER))
return false;
if (unlikely(skb_tailroom(skb) < (pad + msz)))
return false;
if (unlikely(max < (start + msz)))
return false;
tipc: improve link congestion algorithm The link congestion algorithm used until now implies two problems. - It is too generous towards lower-level messages in situations of high load by giving "absolute" bandwidth guarantees to the different priority levels. LOW traffic is guaranteed 10%, MEDIUM is guaranted 20%, HIGH is guaranteed 30%, and CRITICAL is guaranteed 40% of the available bandwidth. But, in the absence of higher level traffic, the ratio between two distinct levels becomes unreasonable. E.g. if there is only LOW and MEDIUM traffic on a system, the former is guaranteed 1/3 of the bandwidth, and the latter 2/3. This again means that if there is e.g. one LOW user and 10 MEDIUM users, the former will have 33.3% of the bandwidth, and the others will have to compete for the remainder, i.e. each will end up with 6.7% of the capacity. - Packets of type MSG_BUNDLER are created at SYSTEM importance level, but only after the packets bundled into it have passed the congestion test for their own respective levels. Since bundled packets don't result in incrementing the level counter for their own importance, only occasionally for the SYSTEM level counter, they do in practice obtain SYSTEM level importance. Hence, the current implementation provides a gap in the congestion algorithm that in the worst case may lead to a link reset. We now refine the congestion algorithm as follows: - A message is accepted to the link backlog only if its own level counter, and all superior level counters, permit it. - The importance of a created bundle packet is set according to its contents. A bundle packet created from messges at levels LOW to CRITICAL is given importance level CRITICAL, while a bundle created from a SYSTEM level message is given importance SYSTEM. In the latter case only subsequent SYSTEM level messages are allowed to be bundled into it. This solves the first problem described above, by making the bandwidth guarantee relative to the total number of users at all levels; only the upper limit for each level remains absolute. In the example described above, the single LOW user would use 1/11th of the bandwidth, the same as each of the ten MEDIUM users, but he still has the same guarantee against starvation as the latter ones. The fix also solves the second problem. If the CRITICAL level is filled up by bundle packets of that level, no lower level packets will be accepted any more. Suggested-by: Gergely Kiss <gergely.kiss@ericsson.com> Reviewed-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-14 21:46:17 +07:00
if ((msg_importance(msg) < TIPC_SYSTEM_IMPORTANCE) &&
(msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE))
return false;
skb_put(skb, pad + msz);
skb_copy_to_linear_data_offset(skb, start, msg, msz);
msg_set_size(bmsg, start + msz);
msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1);
return true;
}
tipc: resolve race problem at unicast message reception TIPC handles message cardinality and sequencing at the link layer, before passing messages upwards to the destination sockets. During the upcall from link to socket no locks are held. It is therefore possible, and we see it happen occasionally, that messages arriving in different threads and delivered in sequence still bypass each other before they reach the destination socket. This must not happen, since it violates the sequentiality guarantee. We solve this by adding a new input buffer queue to the link structure. Arriving messages are added safely to the tail of that queue by the link, while the head of the queue is consumed, also safely, by the receiving socket. Sequentiality is secured per socket by only allowing buffers to be dequeued inside the socket lock. Since there may be multiple simultaneous readers of the queue, we use a 'filter' parameter to reduce the risk that they peek the same buffer from the queue, hence also reducing the risk of contention on the receiving socket locks. This solves the sequentiality problem, and seems to cause no measurable performance degradation. A nice side effect of this change is that lock handling in the functions tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that will enable future simplifications of those functions. Reviewed-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 20:36:41 +07:00
/**
* tipc_msg_extract(): extract bundled inner packet from buffer
* @skb: buffer to be extracted from.
tipc: resolve race problem at unicast message reception TIPC handles message cardinality and sequencing at the link layer, before passing messages upwards to the destination sockets. During the upcall from link to socket no locks are held. It is therefore possible, and we see it happen occasionally, that messages arriving in different threads and delivered in sequence still bypass each other before they reach the destination socket. This must not happen, since it violates the sequentiality guarantee. We solve this by adding a new input buffer queue to the link structure. Arriving messages are added safely to the tail of that queue by the link, while the head of the queue is consumed, also safely, by the receiving socket. Sequentiality is secured per socket by only allowing buffers to be dequeued inside the socket lock. Since there may be multiple simultaneous readers of the queue, we use a 'filter' parameter to reduce the risk that they peek the same buffer from the queue, hence also reducing the risk of contention on the receiving socket locks. This solves the sequentiality problem, and seems to cause no measurable performance degradation. A nice side effect of this change is that lock handling in the functions tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that will enable future simplifications of those functions. Reviewed-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 20:36:41 +07:00
* @iskb: extracted inner buffer, to be returned
* @pos: position in outer message of msg to be extracted.
* Returns position of next msg
tipc: resolve race problem at unicast message reception TIPC handles message cardinality and sequencing at the link layer, before passing messages upwards to the destination sockets. During the upcall from link to socket no locks are held. It is therefore possible, and we see it happen occasionally, that messages arriving in different threads and delivered in sequence still bypass each other before they reach the destination socket. This must not happen, since it violates the sequentiality guarantee. We solve this by adding a new input buffer queue to the link structure. Arriving messages are added safely to the tail of that queue by the link, while the head of the queue is consumed, also safely, by the receiving socket. Sequentiality is secured per socket by only allowing buffers to be dequeued inside the socket lock. Since there may be multiple simultaneous readers of the queue, we use a 'filter' parameter to reduce the risk that they peek the same buffer from the queue, hence also reducing the risk of contention on the receiving socket locks. This solves the sequentiality problem, and seems to cause no measurable performance degradation. A nice side effect of this change is that lock handling in the functions tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that will enable future simplifications of those functions. Reviewed-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 20:36:41 +07:00
* Consumes outer buffer when last packet extracted
* Returns true when when there is an extracted buffer, otherwise false
*/
bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos)
{
struct tipc_msg *msg;
int imsz, offset;
tipc: resolve race problem at unicast message reception TIPC handles message cardinality and sequencing at the link layer, before passing messages upwards to the destination sockets. During the upcall from link to socket no locks are held. It is therefore possible, and we see it happen occasionally, that messages arriving in different threads and delivered in sequence still bypass each other before they reach the destination socket. This must not happen, since it violates the sequentiality guarantee. We solve this by adding a new input buffer queue to the link structure. Arriving messages are added safely to the tail of that queue by the link, while the head of the queue is consumed, also safely, by the receiving socket. Sequentiality is secured per socket by only allowing buffers to be dequeued inside the socket lock. Since there may be multiple simultaneous readers of the queue, we use a 'filter' parameter to reduce the risk that they peek the same buffer from the queue, hence also reducing the risk of contention on the receiving socket locks. This solves the sequentiality problem, and seems to cause no measurable performance degradation. A nice side effect of this change is that lock handling in the functions tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that will enable future simplifications of those functions. Reviewed-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 20:36:41 +07:00
*iskb = NULL;
if (unlikely(skb_linearize(skb)))
goto none;
msg = buf_msg(skb);
offset = msg_hdr_sz(msg) + *pos;
if (unlikely(offset > (msg_size(msg) - MIN_H_SIZE)))
tipc: resolve race problem at unicast message reception TIPC handles message cardinality and sequencing at the link layer, before passing messages upwards to the destination sockets. During the upcall from link to socket no locks are held. It is therefore possible, and we see it happen occasionally, that messages arriving in different threads and delivered in sequence still bypass each other before they reach the destination socket. This must not happen, since it violates the sequentiality guarantee. We solve this by adding a new input buffer queue to the link structure. Arriving messages are added safely to the tail of that queue by the link, while the head of the queue is consumed, also safely, by the receiving socket. Sequentiality is secured per socket by only allowing buffers to be dequeued inside the socket lock. Since there may be multiple simultaneous readers of the queue, we use a 'filter' parameter to reduce the risk that they peek the same buffer from the queue, hence also reducing the risk of contention on the receiving socket locks. This solves the sequentiality problem, and seems to cause no measurable performance degradation. A nice side effect of this change is that lock handling in the functions tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that will enable future simplifications of those functions. Reviewed-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 20:36:41 +07:00
goto none;
*iskb = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!*iskb))
tipc: resolve race problem at unicast message reception TIPC handles message cardinality and sequencing at the link layer, before passing messages upwards to the destination sockets. During the upcall from link to socket no locks are held. It is therefore possible, and we see it happen occasionally, that messages arriving in different threads and delivered in sequence still bypass each other before they reach the destination socket. This must not happen, since it violates the sequentiality guarantee. We solve this by adding a new input buffer queue to the link structure. Arriving messages are added safely to the tail of that queue by the link, while the head of the queue is consumed, also safely, by the receiving socket. Sequentiality is secured per socket by only allowing buffers to be dequeued inside the socket lock. Since there may be multiple simultaneous readers of the queue, we use a 'filter' parameter to reduce the risk that they peek the same buffer from the queue, hence also reducing the risk of contention on the receiving socket locks. This solves the sequentiality problem, and seems to cause no measurable performance degradation. A nice side effect of this change is that lock handling in the functions tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that will enable future simplifications of those functions. Reviewed-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 20:36:41 +07:00
goto none;
skb_pull(*iskb, offset);
imsz = msg_size(buf_msg(*iskb));
skb_trim(*iskb, imsz);
if (unlikely(!tipc_msg_validate(iskb)))
tipc: resolve race problem at unicast message reception TIPC handles message cardinality and sequencing at the link layer, before passing messages upwards to the destination sockets. During the upcall from link to socket no locks are held. It is therefore possible, and we see it happen occasionally, that messages arriving in different threads and delivered in sequence still bypass each other before they reach the destination socket. This must not happen, since it violates the sequentiality guarantee. We solve this by adding a new input buffer queue to the link structure. Arriving messages are added safely to the tail of that queue by the link, while the head of the queue is consumed, also safely, by the receiving socket. Sequentiality is secured per socket by only allowing buffers to be dequeued inside the socket lock. Since there may be multiple simultaneous readers of the queue, we use a 'filter' parameter to reduce the risk that they peek the same buffer from the queue, hence also reducing the risk of contention on the receiving socket locks. This solves the sequentiality problem, and seems to cause no measurable performance degradation. A nice side effect of this change is that lock handling in the functions tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that will enable future simplifications of those functions. Reviewed-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 20:36:41 +07:00
goto none;
*pos += align(imsz);
return true;
none:
kfree_skb(skb);
kfree_skb(*iskb);
tipc: resolve race problem at unicast message reception TIPC handles message cardinality and sequencing at the link layer, before passing messages upwards to the destination sockets. During the upcall from link to socket no locks are held. It is therefore possible, and we see it happen occasionally, that messages arriving in different threads and delivered in sequence still bypass each other before they reach the destination socket. This must not happen, since it violates the sequentiality guarantee. We solve this by adding a new input buffer queue to the link structure. Arriving messages are added safely to the tail of that queue by the link, while the head of the queue is consumed, also safely, by the receiving socket. Sequentiality is secured per socket by only allowing buffers to be dequeued inside the socket lock. Since there may be multiple simultaneous readers of the queue, we use a 'filter' parameter to reduce the risk that they peek the same buffer from the queue, hence also reducing the risk of contention on the receiving socket locks. This solves the sequentiality problem, and seems to cause no measurable performance degradation. A nice side effect of this change is that lock handling in the functions tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that will enable future simplifications of those functions. Reviewed-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 20:36:41 +07:00
*iskb = NULL;
return false;
}
/**
* tipc_msg_make_bundle(): Create bundle buf and append message to its tail
* @list: the buffer chain, where head is the buffer to replace/append
* @skb: buffer to be created, appended to and returned in case of success
* @msg: message to be appended
* @mtu: max allowable size for the bundle buffer, inclusive header
* @dnode: destination node for message. (Not always present in header)
* Returns true if success, otherwise false
*/
bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
u32 mtu, u32 dnode)
{
struct sk_buff *_skb;
struct tipc_msg *bmsg;
u32 msz = msg_size(msg);
u32 max = mtu - INT_H_SIZE;
if (msg_user(msg) == MSG_FRAGMENTER)
return false;
if (msg_user(msg) == TUNNEL_PROTOCOL)
return false;
if (msg_user(msg) == BCAST_PROTOCOL)
return false;
if (msz > (max / 2))
return false;
_skb = tipc_buf_acquire(max, GFP_ATOMIC);
if (!_skb)
return false;
skb_trim(_skb, INT_H_SIZE);
bmsg = buf_msg(_skb);
2015-02-05 20:36:36 +07:00
tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0,
INT_H_SIZE, dnode);
tipc: improve link congestion algorithm The link congestion algorithm used until now implies two problems. - It is too generous towards lower-level messages in situations of high load by giving "absolute" bandwidth guarantees to the different priority levels. LOW traffic is guaranteed 10%, MEDIUM is guaranted 20%, HIGH is guaranteed 30%, and CRITICAL is guaranteed 40% of the available bandwidth. But, in the absence of higher level traffic, the ratio between two distinct levels becomes unreasonable. E.g. if there is only LOW and MEDIUM traffic on a system, the former is guaranteed 1/3 of the bandwidth, and the latter 2/3. This again means that if there is e.g. one LOW user and 10 MEDIUM users, the former will have 33.3% of the bandwidth, and the others will have to compete for the remainder, i.e. each will end up with 6.7% of the capacity. - Packets of type MSG_BUNDLER are created at SYSTEM importance level, but only after the packets bundled into it have passed the congestion test for their own respective levels. Since bundled packets don't result in incrementing the level counter for their own importance, only occasionally for the SYSTEM level counter, they do in practice obtain SYSTEM level importance. Hence, the current implementation provides a gap in the congestion algorithm that in the worst case may lead to a link reset. We now refine the congestion algorithm as follows: - A message is accepted to the link backlog only if its own level counter, and all superior level counters, permit it. - The importance of a created bundle packet is set according to its contents. A bundle packet created from messges at levels LOW to CRITICAL is given importance level CRITICAL, while a bundle created from a SYSTEM level message is given importance SYSTEM. In the latter case only subsequent SYSTEM level messages are allowed to be bundled into it. This solves the first problem described above, by making the bandwidth guarantee relative to the total number of users at all levels; only the upper limit for each level remains absolute. In the example described above, the single LOW user would use 1/11th of the bandwidth, the same as each of the ten MEDIUM users, but he still has the same guarantee against starvation as the latter ones. The fix also solves the second problem. If the CRITICAL level is filled up by bundle packets of that level, no lower level packets will be accepted any more. Suggested-by: Gergely Kiss <gergely.kiss@ericsson.com> Reviewed-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-14 21:46:17 +07:00
if (msg_isdata(msg))
msg_set_importance(bmsg, TIPC_CRITICAL_IMPORTANCE);
else
msg_set_importance(bmsg, TIPC_SYSTEM_IMPORTANCE);
msg_set_seqno(bmsg, msg_seqno(msg));
msg_set_ack(bmsg, msg_ack(msg));
msg_set_bcast_ack(bmsg, msg_bcast_ack(msg));
tipc_msg_bundle(_skb, msg, mtu);
*skb = _skb;
return true;
}
/**
* tipc_msg_reverse(): swap source and destination addresses and add error code
* @own_node: originating node id for reversed message
* @skb: buffer containing message to be reversed; may be replaced.
* @err: error code to be set in message, if any
* Consumes buffer at failure
* Returns true if success, otherwise false
*/
bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
{
struct sk_buff *_skb = *skb;
struct tipc_msg *hdr;
struct tipc_msg ohdr;
int dlen;
if (skb_linearize(_skb))
goto exit;
hdr = buf_msg(_skb);
dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE);
if (msg_dest_droppable(hdr))
goto exit;
if (msg_errcode(hdr))
goto exit;
/* Take a copy of original header before altering message */
memcpy(&ohdr, hdr, msg_hdr_sz(hdr));
/* Never return SHORT header; expand by replacing buffer if necessary */
if (msg_short(hdr)) {
*skb = tipc_buf_acquire(BASIC_H_SIZE + dlen, GFP_ATOMIC);
if (!*skb)
goto exit;
memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen);
kfree_skb(_skb);
_skb = *skb;
hdr = buf_msg(_skb);
memcpy(hdr, &ohdr, BASIC_H_SIZE);
msg_set_hdr_sz(hdr, BASIC_H_SIZE);
}
if (skb_cloned(_skb) &&
pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
goto exit;
/* reassign after skb header modifications */
hdr = buf_msg(_skb);
/* Now reverse the concerned fields */
msg_set_errcode(hdr, err);
msg_set_non_seq(hdr, 0);
msg_set_origport(hdr, msg_destport(&ohdr));
msg_set_destport(hdr, msg_origport(&ohdr));
msg_set_destnode(hdr, msg_prevnode(&ohdr));
msg_set_prevnode(hdr, own_node);
msg_set_orignode(hdr, own_node);
msg_set_size(hdr, msg_hdr_sz(hdr) + dlen);
skb_trim(_skb, msg_size(hdr));
skb_orphan(_skb);
return true;
exit:
kfree_skb(_skb);
*skb = NULL;
return false;
}
/**
* tipc_msg_lookup_dest(): try to find new destination for named message
* @skb: the buffer containing the message.
* @err: error code to be used by caller if lookup fails
* Does not consume buffer
* Returns true if a destination is found, false otherwise
*/
bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
{
struct tipc_msg *msg = buf_msg(skb);
u32 dport, dnode;
u32 onode = tipc_own_addr(net);
if (!msg_isdata(msg))
return false;
if (!msg_named(msg))
return false;
if (msg_errcode(msg))
return false;
*err = TIPC_ERR_NO_NAME;
if (skb_linearize(skb))
return false;
msg = buf_msg(skb);
if (msg_reroute_cnt(msg))
return false;
dnode = addr_domain(net, msg_lookup_scope(msg));
dport = tipc_nametbl_translate(net, msg_nametype(msg),
msg_nameinst(msg), &dnode);
if (!dport)
return false;
msg_incr_reroute_cnt(msg);
if (dnode != onode)
msg_set_prevnode(msg, onode);
msg_set_destnode(msg, dnode);
msg_set_destport(msg, dport);
*err = TIPC_OK;
if (!skb_cloned(skb))
return true;
/* Unclone buffer in case it was bundled */
if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
return false;
return true;
}
/* tipc_msg_assemble() - assemble chain of fragments into one message
*/
bool tipc_msg_assemble(struct sk_buff_head *list)
{
struct sk_buff *skb, *tmp = NULL;
if (skb_queue_len(list) == 1)
return true;
while ((skb = __skb_dequeue(list))) {
skb->next = NULL;
if (tipc_buf_append(&tmp, &skb)) {
__skb_queue_tail(list, skb);
return true;
}
if (!tmp)
break;
}
__skb_queue_purge(list);
__skb_queue_head_init(list);
pr_warn("Failed do assemble buffer\n");
return false;
}
/* tipc_msg_reassemble() - clone a buffer chain of fragments and
* reassemble the clones into one message
*/
bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq)
{
struct sk_buff *skb, *_skb;
struct sk_buff *frag = NULL;
struct sk_buff *head = NULL;
int hdr_len;
/* Copy header if single buffer */
if (skb_queue_len(list) == 1) {
skb = skb_peek(list);
hdr_len = skb_headroom(skb) + msg_hdr_sz(buf_msg(skb));
_skb = __pskb_copy(skb, hdr_len, GFP_ATOMIC);
if (!_skb)
return false;
__skb_queue_tail(rcvq, _skb);
return true;
}
/* Clone all fragments and reassemble */
skb_queue_walk(list, skb) {
frag = skb_clone(skb, GFP_ATOMIC);
if (!frag)
goto error;
frag->next = NULL;
if (tipc_buf_append(&head, &frag))
break;
if (!head)
goto error;
}
__skb_queue_tail(rcvq, frag);
return true;
error:
pr_warn("Failed do clone local mcast rcv buffer\n");
kfree_skb(head);
return false;
}
bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
struct sk_buff_head *cpy)
{
struct sk_buff *skb, *_skb;
skb_queue_walk(msg, skb) {
_skb = pskb_copy(skb, GFP_ATOMIC);
if (!_skb) {
__skb_queue_purge(cpy);
return false;
}
msg_set_destnode(buf_msg(_skb), dst);
__skb_queue_tail(cpy, _skb);
}
return true;
}
/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number
* @list: list to be appended to
* @seqno: sequence number of buffer to add
* @skb: buffer to add
*/
void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno,
struct sk_buff *skb)
{
struct sk_buff *_skb, *tmp;
if (skb_queue_empty(list) || less(seqno, buf_seqno(skb_peek(list)))) {
__skb_queue_head(list, skb);
return;
}
if (more(seqno, buf_seqno(skb_peek_tail(list)))) {
__skb_queue_tail(list, skb);
return;
}
skb_queue_walk_safe(list, _skb, tmp) {
if (more(seqno, buf_seqno(_skb)))
continue;
if (seqno == buf_seqno(_skb))
break;
__skb_queue_before(list, _skb, skb);
return;
}
kfree_skb(skb);
}
void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb,
struct sk_buff_head *xmitq)
{
if (tipc_msg_reverse(tipc_own_addr(net), &skb, err))
__skb_queue_tail(xmitq, skb);
}