mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-11-25 15:30:54 +07:00
3d9a0d2f82
While doing high throughput test on a BQL enabled NIC, I found a very high cost in ndo_start_xmit() when accessing BQL data. It turned out the problem was caused by compiler trying to be smart, but involving a bad MESI transaction : 0.05 │ mov 0xc0(%rax),%edi // LOAD dql->num_queued 0.48 │ mov %edx,0xc8(%rax) // STORE dql->last_obj_cnt = count 58.23 │ add %edx,%edi 0.58 │ cmp %edi,0xc4(%rax) 0.76 │ mov %edi,0xc0(%rax) // STORE dql->num_queued += count 0.72 │ js bd8 I got an incredible 10 % gain [1] by making sure cpu do not attempt to get the cache line in Shared mode, but directly requests for ownership. New code : mov %edx,0xc8(%rax) // STORE dql->last_obj_cnt = count add %edx,0xc0(%rax) // RMW dql->num_queued += count mov 0xc4(%rax),%ecx // LOAD dql->adj_limit mov 0xc0(%rax),%edx // LOAD dql->num_queued cmp %edx,%ecx The TX completion was running from another cpu, with high interrupts rate. Note that I am using barrier() as a soft hint, as mb() here could be too heavy cost. [1] This was a netperf TCP_STREAM with TSO disabled, but GSO enabled. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Jesper Dangaard Brouer <brouer@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
106 lines
3.7 KiB
C
106 lines
3.7 KiB
C
/*
|
|
* Dynamic queue limits (dql) - Definitions
|
|
*
|
|
* Copyright (c) 2011, Tom Herbert <therbert@google.com>
|
|
*
|
|
* This header file contains the definitions for dynamic queue limits (dql).
|
|
* dql would be used in conjunction with a producer/consumer type queue
|
|
* (possibly a HW queue). Such a queue would have these general properties:
|
|
*
|
|
* 1) Objects are queued up to some limit specified as number of objects.
|
|
* 2) Periodically a completion process executes which retires consumed
|
|
* objects.
|
|
* 3) Starvation occurs when limit has been reached, all queued data has
|
|
* actually been consumed, but completion processing has not yet run
|
|
* so queuing new data is blocked.
|
|
* 4) Minimizing the amount of queued data is desirable.
|
|
*
|
|
* The goal of dql is to calculate the limit as the minimum number of objects
|
|
* needed to prevent starvation.
|
|
*
|
|
* The primary functions of dql are:
|
|
* dql_queued - called when objects are enqueued to record number of objects
|
|
* dql_avail - returns how many objects are available to be queued based
|
|
* on the object limit and how many objects are already enqueued
|
|
* dql_completed - called at completion time to indicate how many objects
|
|
* were retired from the queue
|
|
*
|
|
* The dql implementation does not implement any locking for the dql data
|
|
* structures, the higher layer should provide this. dql_queued should
|
|
* be serialized to prevent concurrent execution of the function; this
|
|
* is also true for dql_completed. However, dql_queued and dlq_completed can
|
|
* be executed concurrently (i.e. they can be protected by different locks).
|
|
*/
|
|
|
|
#ifndef _LINUX_DQL_H
|
|
#define _LINUX_DQL_H
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
struct dql {
|
|
/* Fields accessed in enqueue path (dql_queued) */
|
|
unsigned int num_queued; /* Total ever queued */
|
|
unsigned int adj_limit; /* limit + num_completed */
|
|
unsigned int last_obj_cnt; /* Count at last queuing */
|
|
|
|
/* Fields accessed only by completion path (dql_completed) */
|
|
|
|
unsigned int limit ____cacheline_aligned_in_smp; /* Current limit */
|
|
unsigned int num_completed; /* Total ever completed */
|
|
|
|
unsigned int prev_ovlimit; /* Previous over limit */
|
|
unsigned int prev_num_queued; /* Previous queue total */
|
|
unsigned int prev_last_obj_cnt; /* Previous queuing cnt */
|
|
|
|
unsigned int lowest_slack; /* Lowest slack found */
|
|
unsigned long slack_start_time; /* Time slacks seen */
|
|
|
|
/* Configuration */
|
|
unsigned int max_limit; /* Max limit */
|
|
unsigned int min_limit; /* Minimum limit */
|
|
unsigned int slack_hold_time; /* Time to measure slack */
|
|
};
|
|
|
|
/* Set some static maximums */
|
|
#define DQL_MAX_OBJECT (UINT_MAX / 16)
|
|
#define DQL_MAX_LIMIT ((UINT_MAX / 2) - DQL_MAX_OBJECT)
|
|
|
|
/*
|
|
* Record number of objects queued. Assumes that caller has already checked
|
|
* availability in the queue with dql_avail.
|
|
*/
|
|
static inline void dql_queued(struct dql *dql, unsigned int count)
|
|
{
|
|
BUG_ON(count > DQL_MAX_OBJECT);
|
|
|
|
dql->last_obj_cnt = count;
|
|
|
|
/* We want to force a write first, so that cpu do not attempt
|
|
* to get cache line containing last_obj_cnt, num_queued, adj_limit
|
|
* in Shared state, but directly does a Request For Ownership
|
|
* It is only a hint, we use barrier() only.
|
|
*/
|
|
barrier();
|
|
|
|
dql->num_queued += count;
|
|
}
|
|
|
|
/* Returns how many objects can be queued, < 0 indicates over limit. */
|
|
static inline int dql_avail(const struct dql *dql)
|
|
{
|
|
return ACCESS_ONCE(dql->adj_limit) - ACCESS_ONCE(dql->num_queued);
|
|
}
|
|
|
|
/* Record number of completed objects and recalculate the limit. */
|
|
void dql_completed(struct dql *dql, unsigned int count);
|
|
|
|
/* Reset dql state */
|
|
void dql_reset(struct dql *dql);
|
|
|
|
/* Initialize dql state */
|
|
int dql_init(struct dql *dql, unsigned hold_time);
|
|
|
|
#endif /* _KERNEL_ */
|
|
|
|
#endif /* _LINUX_DQL_H */
|