2009-10-07 01:31:11 +07:00
|
|
|
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
# include <linux/string.h>
|
|
|
|
# include <linux/slab.h>
|
|
|
|
# include <linux/bug.h>
|
|
|
|
# include <linux/kernel.h>
|
|
|
|
# ifndef dprintk
|
|
|
|
# define dprintk(args...)
|
|
|
|
# endif
|
|
|
|
#else
|
|
|
|
# include <string.h>
|
|
|
|
# include <stdio.h>
|
|
|
|
# include <stdlib.h>
|
|
|
|
# include <assert.h>
|
|
|
|
# define BUG_ON(x) assert(!(x))
|
|
|
|
# define dprintk(args...) /* printf(args) */
|
|
|
|
# define kmalloc(x, f) malloc(x)
|
|
|
|
# define kfree(x) free(x)
|
|
|
|
#endif
|
|
|
|
|
2010-04-07 05:14:15 +07:00
|
|
|
#include <linux/crush/crush.h>
|
|
|
|
#include <linux/crush/hash.h>
|
2012-04-24 21:38:37 +07:00
|
|
|
#include <linux/crush/mapper.h>
|
2009-10-07 01:31:11 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Implement the core CRUSH mapping algorithm.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
|
|
|
|
* @map: the crush_map
|
|
|
|
* @ruleset: the storage ruleset id (user defined)
|
|
|
|
* @type: storage ruleset type (user defined)
|
|
|
|
* @size: output set size
|
|
|
|
*/
|
2012-05-08 05:38:35 +07:00
|
|
|
int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
|
2009-10-07 01:31:11 +07:00
|
|
|
{
|
2012-05-08 05:38:35 +07:00
|
|
|
__u32 i;
|
2009-10-07 01:31:11 +07:00
|
|
|
|
|
|
|
for (i = 0; i < map->max_rules; i++) {
|
|
|
|
if (map->rules[i] &&
|
|
|
|
map->rules[i]->mask.ruleset == ruleset &&
|
|
|
|
map->rules[i]->mask.type == type &&
|
|
|
|
map->rules[i]->mask.min_size <= size &&
|
|
|
|
map->rules[i]->mask.max_size >= size)
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* bucket choose methods
|
|
|
|
*
|
|
|
|
* For each bucket algorithm, we have a "choose" method that, given a
|
|
|
|
* crush input @x and replica position (usually, position in output set) @r,
|
|
|
|
* will produce an item in the bucket.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Choose based on a random permutation of the bucket.
|
|
|
|
*
|
|
|
|
* We used to use some prime number arithmetic to do this, but it
|
|
|
|
* wasn't very random, and had some other bad behaviors. Instead, we
|
|
|
|
* calculate an actual random permutation of the bucket members.
|
|
|
|
* Since this is expensive, we optimize for the r=0 case, which
|
|
|
|
* captures the vast majority of calls.
|
|
|
|
*/
|
|
|
|
static int bucket_perm_choose(struct crush_bucket *bucket,
|
|
|
|
int x, int r)
|
|
|
|
{
|
2012-04-15 12:58:06 +07:00
|
|
|
unsigned int pr = r % bucket->size;
|
|
|
|
unsigned int i, s;
|
2009-10-07 01:31:11 +07:00
|
|
|
|
|
|
|
/* start a new permutation if @x has changed */
|
2012-05-08 05:38:35 +07:00
|
|
|
if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
|
2009-10-07 01:31:11 +07:00
|
|
|
dprintk("bucket %d new x=%d\n", bucket->id, x);
|
|
|
|
bucket->perm_x = x;
|
|
|
|
|
|
|
|
/* optimize common r=0 case */
|
|
|
|
if (pr == 0) {
|
2009-11-08 11:18:22 +07:00
|
|
|
s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
|
2009-10-07 01:31:11 +07:00
|
|
|
bucket->size;
|
|
|
|
bucket->perm[0] = s;
|
|
|
|
bucket->perm_n = 0xffff; /* magic value, see below */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < bucket->size; i++)
|
|
|
|
bucket->perm[i] = i;
|
|
|
|
bucket->perm_n = 0;
|
|
|
|
} else if (bucket->perm_n == 0xffff) {
|
|
|
|
/* clean up after the r=0 case above */
|
|
|
|
for (i = 1; i < bucket->size; i++)
|
|
|
|
bucket->perm[i] = i;
|
|
|
|
bucket->perm[bucket->perm[0]] = 0;
|
|
|
|
bucket->perm_n = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* calculate permutation up to pr */
|
|
|
|
for (i = 0; i < bucket->perm_n; i++)
|
|
|
|
dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
|
|
|
|
while (bucket->perm_n <= pr) {
|
2012-04-15 12:58:06 +07:00
|
|
|
unsigned int p = bucket->perm_n;
|
2009-10-07 01:31:11 +07:00
|
|
|
/* no point in swapping the final entry */
|
|
|
|
if (p < bucket->size - 1) {
|
2009-11-08 11:18:22 +07:00
|
|
|
i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
|
2009-10-07 01:31:11 +07:00
|
|
|
(bucket->size - p);
|
|
|
|
if (i) {
|
2012-04-15 12:58:06 +07:00
|
|
|
unsigned int t = bucket->perm[p + i];
|
2009-10-07 01:31:11 +07:00
|
|
|
bucket->perm[p + i] = bucket->perm[p];
|
|
|
|
bucket->perm[p] = t;
|
|
|
|
}
|
|
|
|
dprintk(" perm_choose swap %d with %d\n", p, p+i);
|
|
|
|
}
|
|
|
|
bucket->perm_n++;
|
|
|
|
}
|
|
|
|
for (i = 0; i < bucket->size; i++)
|
|
|
|
dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
|
|
|
|
|
|
|
|
s = bucket->perm[pr];
|
|
|
|
out:
|
|
|
|
dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
|
|
|
|
bucket->size, x, r, pr, s);
|
|
|
|
return bucket->items[s];
|
|
|
|
}
|
|
|
|
|
|
|
|
/* uniform */
|
|
|
|
static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
|
|
|
|
int x, int r)
|
|
|
|
{
|
|
|
|
return bucket_perm_choose(&bucket->h, x, r);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* list */
|
|
|
|
static int bucket_list_choose(struct crush_bucket_list *bucket,
|
|
|
|
int x, int r)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = bucket->h.size-1; i >= 0; i--) {
|
2009-11-08 11:18:22 +07:00
|
|
|
__u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
|
|
|
|
r, bucket->h.id);
|
2009-10-07 01:31:11 +07:00
|
|
|
w &= 0xffff;
|
|
|
|
dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
|
|
|
|
"sw %x rand %llx",
|
|
|
|
i, x, r, bucket->h.items[i], bucket->item_weights[i],
|
|
|
|
bucket->sum_weights[i], w);
|
|
|
|
w *= bucket->sum_weights[i];
|
|
|
|
w = w >> 16;
|
|
|
|
/*dprintk(" scaled %llx\n", w);*/
|
|
|
|
if (w < bucket->item_weights[i])
|
|
|
|
return bucket->h.items[i];
|
|
|
|
}
|
|
|
|
|
2012-05-08 05:35:24 +07:00
|
|
|
dprintk("bad list sums for bucket %d\n", bucket->h.id);
|
|
|
|
return bucket->h.items[0];
|
2009-10-07 01:31:11 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* (binary) tree */
|
|
|
|
/*
 * Number of trailing zero bits in @n, used as the height of tree node @n
 * (odd node numbers have height 0).
 *
 * @n must be nonzero: a zero argument never exposes a set bit and the
 * loop would not terminate.
 */
static int height(int n)
{
	int levels;

	for (levels = 0; !(n & 1); n >>= 1)
		levels++;
	return levels;
}
|
|
|
|
|
|
|
|
/*
 * Left child of interior tree node @x: step down by half of the node's
 * span, i.e. 2^(height-1).
 *
 * Only meaningful for nodes of height >= 1 (even, nonzero @x); for a
 * height-0 node the shift count below would be negative.
 */
static int left(int x)
{
	const int half_span = 1 << (height(x) - 1);

	return x - half_span;
}
|
|
|
|
|
|
|
|
/*
 * Right child of interior tree node @x: step up by half of the node's
 * span, i.e. 2^(height-1).
 *
 * Only meaningful for nodes of height >= 1 (even, nonzero @x); for a
 * height-0 node the shift count below would be negative.
 */
static int right(int x)
{
	const int half_span = 1 << (height(x) - 1);

	return x + half_span;
}
|
|
|
|
|
|
|
|
/* Nonzero (1) when tree node @x is odd-numbered, i.e. a leaf; 0 otherwise. */
static int terminal(int x)
{
	return (x & 1) != 0;
}
|
|
|
|
|
|
|
|
static int bucket_tree_choose(struct crush_bucket_tree *bucket,
|
|
|
|
int x, int r)
|
|
|
|
{
|
|
|
|
int n, l;
|
|
|
|
__u32 w;
|
|
|
|
__u64 t;
|
|
|
|
|
|
|
|
/* start at root */
|
|
|
|
n = bucket->num_nodes >> 1;
|
|
|
|
|
|
|
|
while (!terminal(n)) {
|
|
|
|
/* pick point in [0, w) */
|
|
|
|
w = bucket->node_weights[n];
|
2009-11-08 11:18:22 +07:00
|
|
|
t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
|
|
|
|
bucket->h.id) * (__u64)w;
|
2009-10-07 01:31:11 +07:00
|
|
|
t = t >> 32;
|
|
|
|
|
|
|
|
/* descend to the left or right? */
|
|
|
|
l = left(n);
|
|
|
|
if (t < bucket->node_weights[l])
|
|
|
|
n = l;
|
|
|
|
else
|
|
|
|
n = right(n);
|
|
|
|
}
|
|
|
|
|
|
|
|
return bucket->h.items[n >> 1];
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* straw */
|
|
|
|
|
|
|
|
static int bucket_straw_choose(struct crush_bucket_straw *bucket,
|
|
|
|
int x, int r)
|
|
|
|
{
|
2012-05-08 05:38:35 +07:00
|
|
|
__u32 i;
|
2009-10-07 01:31:11 +07:00
|
|
|
int high = 0;
|
|
|
|
__u64 high_draw = 0;
|
|
|
|
__u64 draw;
|
|
|
|
|
|
|
|
for (i = 0; i < bucket->h.size; i++) {
|
2009-11-08 11:18:22 +07:00
|
|
|
draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
|
2009-10-07 01:31:11 +07:00
|
|
|
draw &= 0xffff;
|
|
|
|
draw *= bucket->straws[i];
|
|
|
|
if (i == 0 || draw > high_draw) {
|
|
|
|
high = i;
|
|
|
|
high_draw = draw;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return bucket->h.items[high];
|
|
|
|
}
|
|
|
|
|
|
|
|
static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
|
|
|
|
{
|
2010-06-25 02:58:14 +07:00
|
|
|
dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
|
2012-05-08 05:35:24 +07:00
|
|
|
BUG_ON(in->size == 0);
|
2009-10-07 01:31:11 +07:00
|
|
|
switch (in->alg) {
|
|
|
|
case CRUSH_BUCKET_UNIFORM:
|
|
|
|
return bucket_uniform_choose((struct crush_bucket_uniform *)in,
|
|
|
|
x, r);
|
|
|
|
case CRUSH_BUCKET_LIST:
|
|
|
|
return bucket_list_choose((struct crush_bucket_list *)in,
|
|
|
|
x, r);
|
|
|
|
case CRUSH_BUCKET_TREE:
|
|
|
|
return bucket_tree_choose((struct crush_bucket_tree *)in,
|
|
|
|
x, r);
|
|
|
|
case CRUSH_BUCKET_STRAW:
|
|
|
|
return bucket_straw_choose((struct crush_bucket_straw *)in,
|
|
|
|
x, r);
|
|
|
|
default:
|
2012-05-08 05:35:24 +07:00
|
|
|
dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
|
2009-12-02 05:12:07 +07:00
|
|
|
return in->items[0];
|
2009-10-07 01:31:11 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* true if device is marked "out" (failed, fully offloaded)
|
|
|
|
* of the cluster
|
|
|
|
*/
|
2012-05-08 05:38:35 +07:00
|
|
|
static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x)
|
2009-10-07 01:31:11 +07:00
|
|
|
{
|
2010-07-05 23:44:17 +07:00
|
|
|
if (weight[item] >= 0x10000)
|
2009-10-07 01:31:11 +07:00
|
|
|
return 0;
|
|
|
|
if (weight[item] == 0)
|
|
|
|
return 1;
|
2009-11-08 11:18:22 +07:00
|
|
|
if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
|
|
|
|
< weight[item])
|
2009-10-07 01:31:11 +07:00
|
|
|
return 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* crush_choose - choose numrep distinct items of given type
|
|
|
|
* @map: the crush_map
|
|
|
|
* @bucket: the bucket we are choose an item from
|
|
|
|
* @x: crush input value
|
|
|
|
* @numrep: the number of items to choose
|
|
|
|
* @type: the type of item to choose
|
|
|
|
* @out: pointer to output vector
|
|
|
|
* @outpos: our position in that vector
|
|
|
|
* @firstn: true if choosing "first n" items, false if choosing "indep"
|
|
|
|
* @recurse_to_leaf: true if we want one device under each item of given type
|
libceph: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed
Add libceph support for a new CRUSH tunable recently added to Ceph servers.
Consider the CRUSH rule
step chooseleaf firstn 0 type <node_type>
This rule means that <n> replicas will be chosen in a manner such that
each chosen leaf's branch will contain a unique instance of <node_type>.
When an object is re-replicated after a leaf failure, if the CRUSH map uses
a chooseleaf rule the remapped replica ends up under the <node_type> bucket
that held the failed leaf. This causes uneven data distribution across the
storage cluster, to the point that when all the leaves but one fail under a
particular <node_type> bucket, that remaining leaf holds all the data from
its failed peers.
This behavior also limits the number of peers that can participate in the
re-replication of the data held by the failed leaf, which increases the
time required to re-replicate after a failure.
For a chooseleaf CRUSH rule, the tree descent has two steps: call them the
inner and outer descents.
If the tree descent down to <node_type> is the outer descent, and the descent
from <node_type> down to a leaf is the inner descent, the issue is that a
down leaf is detected on the inner descent, so only the inner descent is
retried.
In order to disperse re-replicated data as widely as possible across a
storage cluster after a failure, we want to retry the outer descent. So,
fix up crush_choose() to allow the inner descent to return immediately on
choosing a failed leaf. Wire this up as a new CRUSH tunable.
Note that after this change, for a chooseleaf rule, if the primary OSD
in a placement group has failed, choosing a replacement may result in
one of the other OSDs in the PG colliding with the new primary. This
requires that OSD's data for that PG to need moving as well. This
seems unavoidable but should be relatively rare.
This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc.
Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-11-30 23:15:25 +07:00
|
|
|
* @descend_once: true if we should only try one descent before giving up
|
2009-10-07 01:31:11 +07:00
|
|
|
* @out2: second output vector for leaf items (if @recurse_to_leaf)
|
|
|
|
*/
|
2012-05-08 05:38:35 +07:00
|
|
|
static int crush_choose(const struct crush_map *map,
|
2009-10-07 01:31:11 +07:00
|
|
|
struct crush_bucket *bucket,
|
2012-05-08 05:38:35 +07:00
|
|
|
const __u32 *weight,
|
2009-10-07 01:31:11 +07:00
|
|
|
int x, int numrep, int type,
|
|
|
|
int *out, int outpos,
|
|
|
|
int firstn, int recurse_to_leaf,
|
libceph: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed
Add libceph support for a new CRUSH tunable recently added to Ceph servers.
Consider the CRUSH rule
step chooseleaf firstn 0 type <node_type>
This rule means that <n> replicas will be chosen in a manner such that
each chosen leaf's branch will contain a unique instance of <node_type>.
When an object is re-replicated after a leaf failure, if the CRUSH map uses
a chooseleaf rule the remapped replica ends up under the <node_type> bucket
that held the failed leaf. This causes uneven data distribution across the
storage cluster, to the point that when all the leaves but one fail under a
particular <node_type> bucket, that remaining leaf holds all the data from
its failed peers.
This behavior also limits the number of peers that can participate in the
re-replication of the data held by the failed leaf, which increases the
time required to re-replicate after a failure.
For a chooseleaf CRUSH rule, the tree descent has two steps: call them the
inner and outer descents.
If the tree descent down to <node_type> is the outer descent, and the descent
from <node_type> down to a leaf is the inner descent, the issue is that a
down leaf is detected on the inner descent, so only the inner descent is
retried.
In order to disperse re-replicated data as widely as possible across a
storage cluster after a failure, we want to retry the outer descent. So,
fix up crush_choose() to allow the inner descent to return immediately on
choosing a failed leaf. Wire this up as a new CRUSH tunable.
Note that after this change, for a chooseleaf rule, if the primary OSD
in a placement group has failed, choosing a replacement may result in
one of the other OSDs in the PG colliding with the new primary. This
requires that OSD's data for that PG to need moving as well. This
seems unavoidable but should be relatively rare.
This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc.
Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-11-30 23:15:25 +07:00
|
|
|
int descend_once, int *out2)
|
2009-10-07 01:31:11 +07:00
|
|
|
{
|
|
|
|
int rep;
|
2012-05-08 05:38:35 +07:00
|
|
|
unsigned int ftotal, flocal;
|
2009-10-07 01:31:11 +07:00
|
|
|
int retry_descent, retry_bucket, skip_rep;
|
|
|
|
struct crush_bucket *in = bucket;
|
|
|
|
int r;
|
|
|
|
int i;
|
2009-10-08 00:59:34 +07:00
|
|
|
int item = 0;
|
2009-10-07 01:31:11 +07:00
|
|
|
int itemtype;
|
|
|
|
int collide, reject;
|
2010-06-25 02:58:14 +07:00
|
|
|
|
|
|
|
dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
|
|
|
|
bucket->id, x, outpos, numrep);
|
2009-10-07 01:31:11 +07:00
|
|
|
|
|
|
|
for (rep = outpos; rep < numrep; rep++) {
|
|
|
|
/* keep trying until we get a non-out, non-colliding item */
|
|
|
|
ftotal = 0;
|
|
|
|
skip_rep = 0;
|
|
|
|
do {
|
|
|
|
retry_descent = 0;
|
|
|
|
in = bucket; /* initial bucket */
|
|
|
|
|
|
|
|
/* choose through intervening buckets */
|
|
|
|
flocal = 0;
|
|
|
|
do {
|
2009-10-08 00:59:34 +07:00
|
|
|
collide = 0;
|
2009-10-07 01:31:11 +07:00
|
|
|
retry_bucket = 0;
|
|
|
|
r = rep;
|
|
|
|
if (in->alg == CRUSH_BUCKET_UNIFORM) {
|
|
|
|
/* be careful */
|
2012-05-08 05:38:35 +07:00
|
|
|
if (firstn || (__u32)numrep >= in->size)
|
2009-10-07 01:31:11 +07:00
|
|
|
/* r' = r + f_total */
|
|
|
|
r += ftotal;
|
|
|
|
else if (in->size % numrep == 0)
|
|
|
|
/* r'=r+(n+1)*f_local */
|
|
|
|
r += (numrep+1) *
|
|
|
|
(flocal+ftotal);
|
|
|
|
else
|
|
|
|
/* r' = r + n*f_local */
|
|
|
|
r += numrep * (flocal+ftotal);
|
|
|
|
} else {
|
|
|
|
if (firstn)
|
|
|
|
/* r' = r + f_total */
|
|
|
|
r += ftotal;
|
|
|
|
else
|
|
|
|
/* r' = r + n*f_local */
|
|
|
|
r += numrep * (flocal+ftotal);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* bucket choose */
|
2009-10-08 00:59:34 +07:00
|
|
|
if (in->size == 0) {
|
|
|
|
reject = 1;
|
|
|
|
goto reject;
|
|
|
|
}
|
2012-07-31 08:15:23 +07:00
|
|
|
if (map->choose_local_fallback_tries > 0 &&
|
|
|
|
flocal >= (in->size>>1) &&
|
|
|
|
flocal > map->choose_local_fallback_tries)
|
2009-10-07 01:31:11 +07:00
|
|
|
item = bucket_perm_choose(in, x, r);
|
|
|
|
else
|
|
|
|
item = crush_bucket_choose(in, x, r);
|
2012-05-08 05:35:24 +07:00
|
|
|
if (item >= map->max_devices) {
|
|
|
|
dprintk(" bad item %d\n", item);
|
|
|
|
skip_rep = 1;
|
|
|
|
break;
|
|
|
|
}
|
2009-10-07 01:31:11 +07:00
|
|
|
|
|
|
|
/* desired type? */
|
|
|
|
if (item < 0)
|
|
|
|
itemtype = map->buckets[-1-item]->type;
|
|
|
|
else
|
|
|
|
itemtype = 0;
|
|
|
|
dprintk(" item %d type %d\n", item, itemtype);
|
|
|
|
|
|
|
|
/* keep going? */
|
|
|
|
if (itemtype != type) {
|
2012-05-08 05:35:24 +07:00
|
|
|
if (item >= 0 ||
|
|
|
|
(-1-item) >= map->max_buckets) {
|
|
|
|
dprintk(" bad item type %d\n", type);
|
|
|
|
skip_rep = 1;
|
|
|
|
break;
|
|
|
|
}
|
2009-10-07 01:31:11 +07:00
|
|
|
in = map->buckets[-1-item];
|
2010-06-25 02:55:48 +07:00
|
|
|
retry_bucket = 1;
|
2009-10-07 01:31:11 +07:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* collision? */
|
|
|
|
for (i = 0; i < outpos; i++) {
|
|
|
|
if (out[i] == item) {
|
|
|
|
collide = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-06-25 02:58:14 +07:00
|
|
|
reject = 0;
|
|
|
|
if (recurse_to_leaf) {
|
|
|
|
if (item < 0) {
|
|
|
|
if (crush_choose(map,
|
|
|
|
map->buckets[-1-item],
|
|
|
|
weight,
|
|
|
|
x, outpos+1, 0,
|
|
|
|
out2, outpos,
|
|
|
|
firstn, 0,
|
libceph: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed
Add libceph support for a new CRUSH tunable recently added to Ceph servers.
Consider the CRUSH rule
step chooseleaf firstn 0 type <node_type>
This rule means that <n> replicas will be chosen in a manner such that
each chosen leaf's branch will contain a unique instance of <node_type>.
When an object is re-replicated after a leaf failure, if the CRUSH map uses
a chooseleaf rule the remapped replica ends up under the <node_type> bucket
that held the failed leaf. This causes uneven data distribution across the
storage cluster, to the point that when all the leaves but one fail under a
particular <node_type> bucket, that remaining leaf holds all the data from
its failed peers.
This behavior also limits the number of peers that can participate in the
re-replication of the data held by the failed leaf, which increases the
time required to re-replicate after a failure.
For a chooseleaf CRUSH rule, the tree descent has two steps: call them the
inner and outer descents.
If the tree descent down to <node_type> is the outer descent, and the descent
from <node_type> down to a leaf is the inner descent, the issue is that a
down leaf is detected on the inner descent, so only the inner descent is
retried.
In order to disperse re-replicated data as widely as possible across a
storage cluster after a failure, we want to retry the outer descent. So,
fix up crush_choose() to allow the inner descent to return immediately on
choosing a failed leaf. Wire this up as a new CRUSH tunable.
Note that after this change, for a chooseleaf rule, if the primary OSD
in a placement group has failed, choosing a replacement may result in
one of the other OSDs in the PG colliding with the new primary. This
requires that OSD's data for that PG to need moving as well. This
seems unavoidable but should be relatively rare.
This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc.
Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-11-30 23:15:25 +07:00
|
|
|
map->chooseleaf_descend_once,
|
2010-06-25 02:58:14 +07:00
|
|
|
NULL) <= outpos)
|
|
|
|
/* didn't get leaf */
|
|
|
|
reject = 1;
|
|
|
|
} else {
|
|
|
|
/* we already have a leaf! */
|
|
|
|
out2[outpos] = item;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!reject) {
|
2009-10-07 01:31:11 +07:00
|
|
|
/* out? */
|
|
|
|
if (itemtype == 0)
|
|
|
|
reject = is_out(map, weight,
|
|
|
|
item, x);
|
|
|
|
else
|
|
|
|
reject = 0;
|
|
|
|
}
|
|
|
|
|
2009-10-08 00:59:34 +07:00
|
|
|
reject:
|
2009-10-07 01:31:11 +07:00
|
|
|
if (reject || collide) {
|
|
|
|
ftotal++;
|
|
|
|
flocal++;
|
|
|
|
|
libceph: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed
Add libceph support for a new CRUSH tunable recently added to Ceph servers.
Consider the CRUSH rule
step chooseleaf firstn 0 type <node_type>
This rule means that <n> replicas will be chosen in a manner such that
each chosen leaf's branch will contain a unique instance of <node_type>.
When an object is re-replicated after a leaf failure, if the CRUSH map uses
a chooseleaf rule the remapped replica ends up under the <node_type> bucket
that held the failed leaf. This causes uneven data distribution across the
storage cluster, to the point that when all the leaves but one fail under a
particular <node_type> bucket, that remaining leaf holds all the data from
its failed peers.
This behavior also limits the number of peers that can participate in the
re-replication of the data held by the failed leaf, which increases the
time required to re-replicate after a failure.
For a chooseleaf CRUSH rule, the tree descent has two steps: call them the
inner and outer descents.
If the tree descent down to <node_type> is the outer descent, and the descent
from <node_type> down to a leaf is the inner descent, the issue is that a
down leaf is detected on the inner descent, so only the inner descent is
retried.
In order to disperse re-replicated data as widely as possible across a
storage cluster after a failure, we want to retry the outer descent. So,
fix up crush_choose() to allow the inner descent to return immediately on
choosing a failed leaf. Wire this up as a new CRUSH tunable.
Note that after this change, for a chooseleaf rule, if the primary OSD
in a placement group has failed, choosing a replacement may result in
one of the other OSDs in the PG colliding with the new primary. This
requires that OSD's data for that PG to need moving as well. This
seems unavoidable but should be relatively rare.
This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc.
Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-11-30 23:15:25 +07:00
|
|
|
if (reject && descend_once)
|
|
|
|
/* let outer call try again */
|
|
|
|
skip_rep = 1;
|
|
|
|
else if (collide && flocal <= map->choose_local_tries)
|
2009-10-07 01:31:11 +07:00
|
|
|
/* retry locally a few times */
|
|
|
|
retry_bucket = 1;
|
2012-07-31 08:15:23 +07:00
|
|
|
else if (map->choose_local_fallback_tries > 0 &&
|
|
|
|
flocal <= in->size + map->choose_local_fallback_tries)
|
2009-10-07 01:31:11 +07:00
|
|
|
/* exhaustive bucket search */
|
|
|
|
retry_bucket = 1;
|
2012-07-31 08:15:23 +07:00
|
|
|
else if (ftotal <= map->choose_total_tries)
|
2009-10-07 01:31:11 +07:00
|
|
|
/* then retry descent */
|
|
|
|
retry_descent = 1;
|
|
|
|
else
|
|
|
|
/* else give up */
|
|
|
|
skip_rep = 1;
|
|
|
|
dprintk(" reject %d collide %d "
|
2012-05-08 05:38:35 +07:00
|
|
|
"ftotal %u flocal %u\n",
|
2009-10-07 01:31:11 +07:00
|
|
|
reject, collide, ftotal,
|
|
|
|
flocal);
|
|
|
|
}
|
|
|
|
} while (retry_bucket);
|
|
|
|
} while (retry_descent);
|
|
|
|
|
|
|
|
if (skip_rep) {
|
|
|
|
dprintk("skip rep\n");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2010-06-25 02:58:14 +07:00
|
|
|
dprintk("CHOOSE got %d\n", item);
|
2009-10-07 01:31:11 +07:00
|
|
|
out[outpos] = item;
|
|
|
|
outpos++;
|
|
|
|
}
|
|
|
|
|
2010-06-25 02:58:14 +07:00
|
|
|
dprintk("CHOOSE returns %d\n", outpos);
|
2009-10-07 01:31:11 +07:00
|
|
|
return outpos;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* crush_do_rule - calculate a mapping with the given input and rule
|
|
|
|
* @map: the crush_map
|
|
|
|
* @ruleno: the rule id
|
|
|
|
* @x: hash input
|
|
|
|
* @result: pointer to result vector
|
|
|
|
* @result_max: maximum result size
|
|
|
|
*/
|
2012-05-08 05:38:35 +07:00
|
|
|
int crush_do_rule(const struct crush_map *map,
|
2009-10-07 01:31:11 +07:00
|
|
|
int ruleno, int x, int *result, int result_max,
|
2012-05-08 05:39:29 +07:00
|
|
|
const __u32 *weight)
|
2009-10-07 01:31:11 +07:00
|
|
|
{
|
|
|
|
int result_len;
|
|
|
|
int a[CRUSH_MAX_SET];
|
|
|
|
int b[CRUSH_MAX_SET];
|
|
|
|
int c[CRUSH_MAX_SET];
|
|
|
|
int recurse_to_leaf;
|
|
|
|
int *w;
|
|
|
|
int wsize = 0;
|
|
|
|
int *o;
|
|
|
|
int osize;
|
|
|
|
int *tmp;
|
|
|
|
struct crush_rule *rule;
|
2012-05-08 05:38:35 +07:00
|
|
|
__u32 step;
|
2009-10-07 01:31:11 +07:00
|
|
|
int i, j;
|
|
|
|
int numrep;
|
|
|
|
int firstn;
|
libceph: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed
Add libceph support for a new CRUSH tunable recently added to Ceph servers.
Consider the CRUSH rule
step chooseleaf firstn 0 type <node_type>
This rule means that <n> replicas will be chosen in a manner such that
each chosen leaf's branch will contain a unique instance of <node_type>.
When an object is re-replicated after a leaf failure, if the CRUSH map uses
a chooseleaf rule the remapped replica ends up under the <node_type> bucket
that held the failed leaf. This causes uneven data distribution across the
storage cluster, to the point that when all the leaves but one fail under a
particular <node_type> bucket, that remaining leaf holds all the data from
its failed peers.
This behavior also limits the number of peers that can participate in the
re-replication of the data held by the failed leaf, which increases the
time required to re-replicate after a failure.
For a chooseleaf CRUSH rule, the tree descent has two steps: call them the
inner and outer descents.
If the tree descent down to <node_type> is the outer descent, and the descent
from <node_type> down to a leaf is the inner descent, the issue is that a
down leaf is detected on the inner descent, so only the inner descent is
retried.
In order to disperse re-replicated data as widely as possible across a
storage cluster after a failure, we want to retry the outer descent. So,
fix up crush_choose() to allow the inner descent to return immediately on
choosing a failed leaf. Wire this up as a new CRUSH tunable.
Note that after this change, for a chooseleaf rule, if the primary OSD
in a placement group has failed, choosing a replacement may result in
one of the other OSDs in the PG colliding with the new primary. This
requires that OSD's data for that PG to need moving as well. This
seems unavoidable but should be relatively rare.
This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc.
Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-11-30 23:15:25 +07:00
|
|
|
const int descend_once = 0;
|
2009-10-07 01:31:11 +07:00
|
|
|
|
2012-05-08 05:35:24 +07:00
|
|
|
if ((__u32)ruleno >= map->max_rules) {
|
|
|
|
dprintk(" bad ruleno %d\n", ruleno);
|
|
|
|
return 0;
|
|
|
|
}
|
2009-10-07 01:31:11 +07:00
|
|
|
|
|
|
|
rule = map->rules[ruleno];
|
|
|
|
result_len = 0;
|
|
|
|
w = a;
|
|
|
|
o = b;
|
|
|
|
|
|
|
|
for (step = 0; step < rule->len; step++) {
|
2012-05-08 05:35:48 +07:00
|
|
|
struct crush_rule_step *curstep = &rule->steps[step];
|
|
|
|
|
2009-10-07 01:31:11 +07:00
|
|
|
firstn = 0;
|
2012-05-08 05:35:48 +07:00
|
|
|
switch (curstep->op) {
|
2009-10-07 01:31:11 +07:00
|
|
|
case CRUSH_RULE_TAKE:
|
2012-05-08 05:35:48 +07:00
|
|
|
w[0] = curstep->arg1;
|
2009-10-07 01:31:11 +07:00
|
|
|
wsize = 1;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
|
|
|
|
case CRUSH_RULE_CHOOSE_FIRSTN:
|
|
|
|
firstn = 1;
|
2012-05-08 05:35:48 +07:00
|
|
|
/* fall through */
|
2009-10-07 01:31:11 +07:00
|
|
|
case CRUSH_RULE_CHOOSE_LEAF_INDEP:
|
|
|
|
case CRUSH_RULE_CHOOSE_INDEP:
|
2012-05-08 05:35:24 +07:00
|
|
|
if (wsize == 0)
|
|
|
|
break;
|
2009-10-07 01:31:11 +07:00
|
|
|
|
|
|
|
recurse_to_leaf =
|
2012-05-08 05:35:48 +07:00
|
|
|
curstep->op ==
|
2009-10-07 01:31:11 +07:00
|
|
|
CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
|
2012-05-08 05:35:48 +07:00
|
|
|
curstep->op ==
|
2009-10-07 01:31:11 +07:00
|
|
|
CRUSH_RULE_CHOOSE_LEAF_INDEP;
|
|
|
|
|
|
|
|
/* reset output */
|
|
|
|
osize = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < wsize; i++) {
|
|
|
|
/*
|
|
|
|
* see CRUSH_N, CRUSH_N_MINUS macros.
|
|
|
|
* basically, numrep <= 0 means relative to
|
|
|
|
* the provided result_max
|
|
|
|
*/
|
2012-05-08 05:35:48 +07:00
|
|
|
numrep = curstep->arg1;
|
2009-10-07 01:31:11 +07:00
|
|
|
if (numrep <= 0) {
|
|
|
|
numrep += result_max;
|
|
|
|
if (numrep <= 0)
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
j = 0;
|
|
|
|
osize += crush_choose(map,
|
|
|
|
map->buckets[-1-w[i]],
|
|
|
|
weight,
|
|
|
|
x, numrep,
|
2012-05-08 05:35:48 +07:00
|
|
|
curstep->arg2,
|
2009-10-07 01:31:11 +07:00
|
|
|
o+osize, j,
|
|
|
|
firstn,
|
libceph: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed
Add libceph support for a new CRUSH tunable recently added to Ceph servers.
Consider the CRUSH rule
step chooseleaf firstn 0 type <node_type>
This rule means that <n> replicas will be chosen in a manner such that
each chosen leaf's branch will contain a unique instance of <node_type>.
When an object is re-replicated after a leaf failure, if the CRUSH map uses
a chooseleaf rule the remapped replica ends up under the <node_type> bucket
that held the failed leaf. This causes uneven data distribution across the
storage cluster, to the point that when all the leaves but one fail under a
particular <node_type> bucket, that remaining leaf holds all the data from
its failed peers.
This behavior also limits the number of peers that can participate in the
re-replication of the data held by the failed leaf, which increases the
time required to re-replicate after a failure.
For a chooseleaf CRUSH rule, the tree descent has two steps: call them the
inner and outer descents.
If the tree descent down to <node_type> is the outer descent, and the descent
from <node_type> down to a leaf is the inner descent, the issue is that a
down leaf is detected on the inner descent, so only the inner descent is
retried.
In order to disperse re-replicated data as widely as possible across a
storage cluster after a failure, we want to retry the outer descent. So,
fix up crush_choose() to allow the inner descent to return immediately on
choosing a failed leaf. Wire this up as a new CRUSH tunable.
Note that after this change, for a chooseleaf rule, if the primary OSD
in a placement group has failed, choosing a replacement may result in
one of the other OSDs in the PG colliding with the new primary. This
requires that OSD's data for that PG to need moving as well. This
seems unavoidable but should be relatively rare.
This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc.
Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-11-30 23:15:25 +07:00
|
|
|
recurse_to_leaf,
|
|
|
|
descend_once, c+osize);
|
2009-10-07 01:31:11 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
if (recurse_to_leaf)
|
|
|
|
/* copy final _leaf_ values to output set */
|
|
|
|
memcpy(o, c, osize*sizeof(*o));
|
|
|
|
|
|
|
|
/* swap t and w arrays */
|
|
|
|
tmp = o;
|
|
|
|
o = w;
|
|
|
|
w = tmp;
|
|
|
|
wsize = osize;
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
case CRUSH_RULE_EMIT:
|
|
|
|
for (i = 0; i < wsize && result_len < result_max; i++) {
|
|
|
|
result[result_len] = w[i];
|
|
|
|
result_len++;
|
|
|
|
}
|
|
|
|
wsize = 0;
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
2012-05-08 05:35:24 +07:00
|
|
|
dprintk(" unknown op %d at step %d\n",
|
|
|
|
curstep->op, step);
|
|
|
|
break;
|
2009-10-07 01:31:11 +07:00
|
|
|
}
|
|
|
|
}
|
2011-12-08 00:10:26 +07:00
|
|
|
return result_len;
|
2009-10-07 01:31:11 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
|