diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index dca82d7c83d8..5481c8ba3412 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX @@ -30,8 +30,6 @@ page_migration - description of page migration in NUMA systems. pagemap.txt - pagemap, from the userspace perspective -slabinfo.c - - source code for a tool to get reports about slabs. slub.txt - a short users guide for SLUB. unevictable-lru.txt diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 29971a589ff2..c93d00a6e95d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -79,9 +79,21 @@ struct page { }; /* Third double word block */ - struct list_head lru; /* Pageout list, eg. active_list + union { + struct list_head lru; /* Pageout list, eg. active_list * protected by zone->lru_lock ! */ + struct { /* slub per cpu partial pages */ + struct page *next; /* Next partial slab */ +#ifdef CONFIG_64BIT + int pages; /* Nr of partial slabs left */ + int pobjects; /* Approximate # of objects */ +#else + short int pages; + short int pobjects; +#endif + }; + }; /* Remainder is not double word aligned */ union { diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index f58d6413d230..a32bcfdc7834 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -36,12 +36,15 @@ enum stat_item { ORDER_FALLBACK, /* Number of times fallback was necessary */ CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ + CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ + CPU_PARTIAL_FREE, /* Used cpu partial on free */ NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ struct page *page; /* The slab from which we are allocating */ + struct page *partial; /* Partially allocated frozen slabs */ int node; /* The node of the page (or -1 for debug) */
#ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; @@ -79,6 +82,7 @@ struct kmem_cache { int size; /* The size of an object including meta data */ int objsize; /* The size of an object without meta data */ int offset; /* Free pointer offset. */ + int cpu_partial; /* Number of per cpu partial objects to keep around */ struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ diff --git a/mm/slab.c b/mm/slab.c index 6d90a091fdca..708efe886154 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1851,15 +1851,15 @@ static void dump_line(char *data, int offset, int limit) unsigned char error = 0; int bad_count = 0; - printk(KERN_ERR "%03x:", offset); + printk(KERN_ERR "%03x: ", offset); for (i = 0; i < limit; i++) { if (data[offset + i] != POISON_FREE) { error = data[offset + i]; bad_count++; } - printk(" %02x", (unsigned char)data[offset + i]); } - printk("\n"); + print_hex_dump(KERN_CONT, "", 0, 16, 1, + &data[offset], limit, 1); if (bad_count == 1) { error ^= POISON_FREE; @@ -3039,14 +3039,9 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) printk(KERN_ERR "slab: Internal list corruption detected in " "cache '%s'(%d), slabp %p(%d). 
Hexdump:\n", cachep->name, cachep->num, slabp, slabp->inuse); - for (i = 0; - i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); - i++) { - if (i % 16 == 0) - printk("\n%03x:", i); - printk(" %02x", ((unsigned char *)slabp)[i]); - } - printk("\n"); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, + sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), + 1); BUG(); } } @@ -4584,7 +4579,7 @@ static const struct file_operations proc_slabstats_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); + proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); #endif diff --git a/mm/slub.c b/mm/slub.c index 7c54fe83a90c..95215aa6a75e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -467,34 +467,8 @@ static int disable_higher_order_debug; */ static void print_section(char *text, u8 *addr, unsigned int length) { - int i, offset; - int newline = 1; - char ascii[17]; - - ascii[16] = 0; - - for (i = 0; i < length; i++) { - if (newline) { - printk(KERN_ERR "%8s 0x%p: ", text, addr + i); - newline = 0; - } - printk(KERN_CONT " %02x", addr[i]); - offset = i % 16; - ascii[offset] = isgraph(addr[i]) ? 
addr[i] : '.'; - if (offset == 15) { - printk(KERN_CONT " %s\n", ascii); - newline = 1; - } - } - if (!newline) { - i %= 16; - while (i < 16) { - printk(KERN_CONT " "); - ascii[i] = ' '; - i++; - } - printk(KERN_CONT " %s\n", ascii); - } + print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, + length, 1); } static struct track *get_track(struct kmem_cache *s, void *object, @@ -625,12 +599,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) p, p - addr, get_freepointer(s, p)); if (p > addr + 16) - print_section("Bytes b4", p - 16, 16); - - print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); + print_section("Bytes b4 ", p - 16, 16); + print_section("Object ", p, min_t(unsigned long, s->objsize, + PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone", p + s->objsize, + print_section("Redzone ", p + s->objsize, s->inuse - s->objsize); if (s->offset) @@ -643,7 +617,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (off != s->size) /* Beginning of the filler is the free pointer */ - print_section("Padding", p + off, s->size - off); + print_section("Padding ", p + off, s->size - off); dump_stack(); } @@ -838,7 +812,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) end--; slab_err(s, page, "Padding overwritten. 
0x%p-0x%p", fault, end - 1); - print_section("Padding", end - remainder, remainder); + print_section("Padding ", end - remainder, remainder); restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; @@ -987,7 +961,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, page->freelist); if (!alloc) - print_section("Object", (void *)object, s->objsize); + print_section("Object ", (void *)object, s->objsize); dump_stack(); } @@ -1447,7 +1421,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, last, NULL); page->freelist = start; - page->inuse = 0; + page->inuse = page->objects; page->frozen = 1; out: return page; @@ -1534,7 +1508,7 @@ static inline void add_partial(struct kmem_cache_node *n, struct page *page, int tail) { n->nr_partial++; - if (tail) + if (tail == DEACTIVATE_TO_TAIL) list_add_tail(&page->lru, &n->partial); else list_add(&page->lru, &n->partial); @@ -1554,10 +1528,13 @@ static inline void remove_partial(struct kmem_cache_node *n, * Lock slab, remove from the partial list and put the object into the * per cpu freelist. * + * Returns a list of objects or NULL if it fails. + * * Must hold list_lock. 
*/ -static inline int acquire_slab(struct kmem_cache *s, - struct kmem_cache_node *n, struct page *page) +static inline void *acquire_slab(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page, + int mode) { void *freelist; unsigned long counters; @@ -1572,7 +1549,8 @@ static inline int acquire_slab(struct kmem_cache *s, freelist = page->freelist; counters = page->counters; new.counters = counters; - new.inuse = page->objects; + if (mode) + new.inuse = page->objects; VM_BUG_ON(new.frozen); new.frozen = 1; @@ -1583,32 +1561,19 @@ static inline int acquire_slab(struct kmem_cache *s, "lock and freeze")); remove_partial(n, page); - - if (freelist) { - /* Populate the per cpu freelist */ - this_cpu_write(s->cpu_slab->freelist, freelist); - this_cpu_write(s->cpu_slab->page, page); - this_cpu_write(s->cpu_slab->node, page_to_nid(page)); - return 1; - } else { - /* - * Slab page came from the wrong list. No object to allocate - * from. Put it onto the correct list and continue partial - * scan. - */ - printk(KERN_ERR "SLUB: %s : Page without available objects on" - " partial list\n", s->name); - return 0; - } + return freelist; } +static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); + /* * Try to allocate a partial slab from a specific node. */ -static struct page *get_partial_node(struct kmem_cache *s, - struct kmem_cache_node *n) +static void *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n, struct kmem_cache_cpu *c) { - struct page *page; + struct page *page, *page2; + void *object = NULL; /* * Racy check. 
If we mistakenly see no partial slabs then we @@ -1620,26 +1585,43 @@ static struct page *get_partial_node(struct kmem_cache *s, return NULL; spin_lock(&n->list_lock); - list_for_each_entry(page, &n->partial, lru) - if (acquire_slab(s, n, page)) - goto out; - page = NULL; -out: + list_for_each_entry_safe(page, page2, &n->partial, lru) { + void *t = acquire_slab(s, n, page, object == NULL); + int available; + + if (!t) + break; + + if (!object) { + c->page = page; + c->node = page_to_nid(page); + stat(s, ALLOC_FROM_PARTIAL); + object = t; + available = page->objects - page->inuse; + } else { + page->freelist = t; + available = put_cpu_partial(s, page, 0); + } + if (kmem_cache_debug(s) || available > s->cpu_partial / 2) + break; + + } spin_unlock(&n->list_lock); - return page; + return object; } /* * Get a page from somewhere. Search in increasing NUMA distances. */ -static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, + struct kmem_cache_cpu *c) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); - struct page *page; + void *object; /* * The defrag ratio allows a configuration of the tradeoffs between @@ -1672,10 +1654,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { - page = get_partial_node(s, n); - if (page) { + object = get_partial_node(s, n, c); + if (object) { put_mems_allowed(); - return page; + return object; } } } @@ -1687,16 +1669,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) /* * Get a partial page, lock it and return it. 
*/ -static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, + struct kmem_cache_cpu *c) { - struct page *page; + void *object; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - page = get_partial_node(s, get_node(s, searchnode)); - if (page || node != NUMA_NO_NODE) - return page; + object = get_partial_node(s, get_node(s, searchnode), c); + if (object || node != NUMA_NO_NODE) + return object; - return get_any_partial(s, flags); + return get_any_partial(s, flags, c); } #ifdef CONFIG_PREEMPT @@ -1765,9 +1748,6 @@ void init_kmem_cache_cpus(struct kmem_cache *s) for_each_possible_cpu(cpu) per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); } -/* - * Remove the cpu slab - */ /* * Remove the cpu slab @@ -1781,13 +1761,13 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) enum slab_modes l = M_NONE, m = M_NONE; void *freelist; void *nextfree; - int tail = 0; + int tail = DEACTIVATE_TO_HEAD; struct page new; struct page old; if (page->freelist) { stat(s, DEACTIVATE_REMOTE_FREES); - tail = 1; + tail = DEACTIVATE_TO_TAIL; } c->tid = next_tid(c->tid); @@ -1893,7 +1873,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) if (m == M_PARTIAL) { add_partial(n, page, tail); - stat(s, tail ? 
DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + stat(s, tail); } else if (m == M_FULL) { @@ -1920,6 +1900,123 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) } } +/* Unfreeze all the cpu partial slabs */ +static void unfreeze_partials(struct kmem_cache *s) +{ + struct kmem_cache_node *n = NULL; + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); + struct page *page; + + while ((page = c->partial)) { + enum slab_modes { M_PARTIAL, M_FREE }; + enum slab_modes l, m; + struct page new; + struct page old; + + c->partial = page->next; + l = M_FREE; + + do { + + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); + + new.counters = old.counters; + new.freelist = old.freelist; + + new.frozen = 0; + + if (!new.inuse && (!n || n->nr_partial > s->min_partial)) + m = M_FREE; + else { + struct kmem_cache_node *n2 = get_node(s, + page_to_nid(page)); + + m = M_PARTIAL; + if (n != n2) { + if (n) + spin_unlock(&n->list_lock); + + n = n2; + spin_lock(&n->list_lock); + } + } + + if (l != m) { + if (l == M_PARTIAL) + remove_partial(n, page); + else + add_partial(n, page, 1); + + l = m; + } + + } while (!cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")); + + if (m == M_FREE) { + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } + } + + if (n) + spin_unlock(&n->list_lock); +} + +/* + * Put a page that was just frozen (in __slab_free) into a partial page + * slot if available. This is done without interrupts disabled and without + * preemption disabled. The cmpxchg is racy and may put the partial page + * onto a random cpus partial slot. + * + * If we did not find a slot then simply move all the partials to the + * per node partial list. 
+ */ +int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +{ + struct page *oldpage; + int pages; + int pobjects; + + do { + pages = 0; + pobjects = 0; + oldpage = this_cpu_read(s->cpu_slab->partial); + + if (oldpage) { + pobjects = oldpage->pobjects; + pages = oldpage->pages; + if (drain && pobjects > s->cpu_partial) { + unsigned long flags; + /* + * partial array is full. Move the existing + * set to the per node partial list. + */ + local_irq_save(flags); + unfreeze_partials(s); + local_irq_restore(flags); + pobjects = 0; + pages = 0; + } + } + + pages++; + pobjects += page->objects - page->inuse; + + page->pages = pages; + page->pobjects = pobjects; + page->next = oldpage; + + } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + stat(s, CPU_PARTIAL_FREE); + return pobjects; +} + static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); @@ -1935,8 +2032,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - if (likely(c && c->page)) - flush_slab(s, c); + if (likely(c)) { + if (c->page) + flush_slab(s, c); + + unfreeze_partials(s); + } } static void flush_cpu_slab(void *d) @@ -2027,12 +2128,39 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) } } +static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, + int node, struct kmem_cache_cpu **pc) +{ + void *object; + struct kmem_cache_cpu *c; + struct page *page = new_slab(s, flags, node); + + if (page) { + c = __this_cpu_ptr(s->cpu_slab); + if (c->page) + flush_slab(s, c); + + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg + */ + object = page->freelist; + page->freelist = NULL; + + stat(s, ALLOC_SLAB); + c->node = page_to_nid(page); + c->page = page; + *pc = c; + } else + object = NULL; + + return object; +} + /* * Slow path. 
The lockless freelist is empty or we need to perform * debugging duties. * - * Interrupts are disabled. - * * Processing is still very fast if new objects have been freed to the * regular freelist. In that case we simply take over the regular freelist * as the lockless freelist and zap the regular freelist. @@ -2049,7 +2177,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void **object; - struct page *page; unsigned long flags; struct page new; unsigned long counters; @@ -2064,13 +2191,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = this_cpu_ptr(s->cpu_slab); #endif - /* We handle __GFP_ZERO in the caller */ - gfpflags &= ~__GFP_ZERO; - - page = c->page; - if (!page) + if (!c->page) goto new_slab; - +redo: if (unlikely(!node_match(c, node))) { stat(s, ALLOC_NODE_MISMATCH); deactivate_slab(s, c); @@ -2080,8 +2203,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_SLOWPATH); do { - object = page->freelist; - counters = page->counters; + object = c->page->freelist; + counters = c->page->counters; new.counters = counters; VM_BUG_ON(!new.frozen); @@ -2093,17 +2216,17 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, * * If there are objects left then we retrieve them * and use them to refill the per cpu queue. 
- */ + */ - new.inuse = page->objects; + new.inuse = c->page->objects; new.frozen = object != NULL; - } while (!__cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, c->page, object, counters, NULL, new.counters, "__slab_alloc")); - if (unlikely(!object)) { + if (!object) { c->page = NULL; stat(s, DEACTIVATE_BYPASS); goto new_slab; @@ -2112,58 +2235,47 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_REFILL); load_freelist: - VM_BUG_ON(!page->frozen); c->freelist = get_freepointer(s, object); c->tid = next_tid(c->tid); local_irq_restore(flags); return object; new_slab: - page = get_partial(s, gfpflags, node); - if (page) { - stat(s, ALLOC_FROM_PARTIAL); - object = c->freelist; - if (kmem_cache_debug(s)) - goto debug; - goto load_freelist; + if (c->partial) { + c->page = c->partial; + c->partial = c->page->next; + c->node = page_to_nid(c->page); + stat(s, CPU_PARTIAL_ALLOC); + c->freelist = NULL; + goto redo; } - page = new_slab(s, gfpflags, node); + /* Then do expensive stuff like retrieving pages from the partial lists */ + object = get_partial(s, gfpflags, node, c); - if (page) { - c = __this_cpu_ptr(s->cpu_slab); - if (c->page) - flush_slab(s, c); + if (unlikely(!object)) { - /* - * No other reference to the page yet so we can - * muck around with it freely without cmpxchg - */ - object = page->freelist; - page->freelist = NULL; - page->inuse = page->objects; + object = new_slab_objects(s, gfpflags, node, &c); - stat(s, ALLOC_SLAB); - c->node = page_to_nid(page); - c->page = page; + if (unlikely(!object)) { + if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(s, gfpflags, node); - if (kmem_cache_debug(s)) - goto debug; - goto load_freelist; + local_irq_restore(flags); + return NULL; + } } - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - local_irq_restore(flags); - return NULL; -debug: - if (!object || !alloc_debug_processing(s, page, 
object, addr)) - goto new_slab; + if (likely(!kmem_cache_debug(s))) + goto load_freelist; + + /* Only entered in the debug case */ + if (!alloc_debug_processing(s, c->page, object, addr)) + goto new_slab; /* Slab failed checks. Next slab needed */ c->freelist = get_freepointer(s, object); deactivate_slab(s, c); - c->page = NULL; c->node = NUMA_NO_NODE; local_irq_restore(flags); return object; @@ -2333,16 +2445,29 @@ static void __slab_free(struct kmem_cache *s, struct page *page, was_frozen = new.frozen; new.inuse--; if ((!new.inuse || !prior) && !was_frozen && !n) { - n = get_node(s, page_to_nid(page)); - /* - * Speculatively acquire the list_lock. - * If the cmpxchg does not succeed then we may - * drop the list_lock without any processing. - * - * Otherwise the list_lock will synchronize with - * other processors updating the list of slabs. - */ - spin_lock_irqsave(&n->list_lock, flags); + + if (!kmem_cache_debug(s) && !prior) + + /* + * Slab was on no list before and will be partially empty + * We can defer the list move and instead freeze it. + */ + new.frozen = 1; + + else { /* Needs to be taken off a list */ + + n = get_node(s, page_to_nid(page)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); + + } } inuse = new.inuse; @@ -2352,7 +2477,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page, "__slab_free")); if (likely(!n)) { - /* + + /* + * If we just froze the page then put it onto the + * per cpu partial list. + */ + if (new.frozen && !was_frozen) + put_cpu_partial(s, page, 1); + + /* * The list lock was not taken therefore no list * activity can be necessary. 
*/ @@ -2377,7 +2510,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ if (unlikely(!prior)) { remove_full(s, page); - add_partial(n, page, 1); + add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } } @@ -2421,7 +2554,6 @@ static __always_inline void slab_free(struct kmem_cache *s, slab_free_hook(s, x); redo: - /* * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since @@ -2685,7 +2817,7 @@ static void early_kmem_cache_node_alloc(int node) n = page->freelist; BUG_ON(!n); page->freelist = get_freepointer(kmem_cache_node, n); - page->inuse++; + page->inuse = 1; page->frozen = 0; kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG @@ -2695,7 +2827,7 @@ static void early_kmem_cache_node_alloc(int node) init_kmem_cache_node(n, kmem_cache_node); inc_slabs_node(kmem_cache_node, node, page->objects); - add_partial(n, page, 0); + add_partial(n, page, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2911,7 +3043,34 @@ static int kmem_cache_open(struct kmem_cache *s, * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. */ - set_min_partial(s, ilog2(s->size)); + set_min_partial(s, ilog2(s->size) / 2); + + /* + * cpu_partial determines the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * This setting also determines + * + * A) The number of objects from per cpu partial slabs dumped to the + * per node list when we reach the limit. + * B) The number of objects in cpu partial slabs to extract from the + * per node list when we run out of per cpu objects.
We only fetch 50% + * to keep some capacity around for frees. + */ + if (s->size >= PAGE_SIZE) + s->cpu_partial = 2; + else if (s->size >= 1024) + s->cpu_partial = 6; + else if (s->size >= 256) + s->cpu_partial = 13; + else + s->cpu_partial = 30; + s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -2970,13 +3129,13 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, /* * Attempt to free all partial slabs on a node. + * This is called from kmem_cache_close(). We must be the last thread + * using the cache and therefore we do not need to lock anymore. */ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - unsigned long flags; struct page *page, *h; - spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { remove_partial(n, page); @@ -2986,7 +3145,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) "Objects remaining on kmem_cache_close()"); } } - spin_unlock_irqrestore(&n->list_lock, flags); } /* @@ -3020,6 +3178,7 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); + up_write(&slub_lock); if (kmem_cache_close(s)) { printk(KERN_ERR "SLUB %s: %s called for cache that " "still has objects.\n", s->name, __func__); @@ -3028,8 +3187,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); sysfs_slab_remove(s); - } - up_write(&slub_lock); + } else + up_write(&slub_lock); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -3347,23 +3506,23 @@ int kmem_cache_shrink(struct kmem_cache *s) * list_lock. page->inuse here is the upper limit. 
*/ list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse) { - remove_partial(n, page); - discard_slab(s, page); - } else { - list_move(&page->lru, - slabs_by_inuse + page->inuse); - } + list_move(&page->lru, slabs_by_inuse + page->inuse); + if (!page->inuse) + n->nr_partial--; } /* * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. */ - for (i = objects - 1; i >= 0; i--) + for (i = objects - 1; i > 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); spin_unlock_irqrestore(&n->list_lock, flags); + + /* Release empty slabs */ + list_for_each_entry_safe(page, t, slabs_by_inuse, lru) + discard_slab(s, page); } kfree(slabs_by_inuse); @@ -4319,6 +4478,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, for_each_possible_cpu(cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct page *page; if (!c || c->node < 0) continue; @@ -4334,6 +4494,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s, total += x; nodes[c->node] += x; } + page = c->partial; + + if (page) { + x = page->pobjects; + total += x; + nodes[c->node] += x; + } per_cpu[c->node]++; } } @@ -4412,11 +4579,12 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = __ATTR_RO(_name) + static struct slab_attribute _name##_attr = \ + __ATTR(_name, 0400, _name##_show, NULL) #define SLAB_ATTR(_name) \ static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) + __ATTR(_name, 0600, _name##_show, _name##_store) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { @@ -4485,6 +4653,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, } SLAB_ATTR(min_partial); +static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%u\n", s->cpu_partial); +} + +static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long 
objects; + int err; + + err = strict_strtoul(buf, 10, &objects); + if (err) + return err; + + s->cpu_partial = objects; + flush_all(s); + return length; +} +SLAB_ATTR(cpu_partial); + static ssize_t ctor_show(struct kmem_cache *s, char *buf) { if (!s->ctor) @@ -4523,6 +4712,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objects_partial); +static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) +{ + int objects = 0; + int pages = 0; + int cpu; + int len; + + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + + if (page) { + pages += page->pages; + objects += page->pobjects; + } + } + + len = sprintf(buf, "%d(%d)", objects, pages); + +#ifdef CONFIG_SMP + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + + if (page && len < PAGE_SIZE - 20) + len += sprintf(buf + len, " C%d=%d(%d)", cpu, + page->pobjects, page->pages); + } +#endif + return len + sprintf(buf + len, "\n"); +} +SLAB_ATTR_RO(slabs_cpu_partial); + static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); @@ -4845,6 +5065,8 @@ STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); +STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); +STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); #endif static struct attribute *slab_attrs[] = { @@ -4853,6 +5075,7 @@ static struct attribute *slab_attrs[] = { &objs_per_slab_attr.attr, &order_attr.attr, &min_partial_attr.attr, + &cpu_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, &partial_attr.attr, @@ -4865,6 +5088,7 @@ static struct attribute *slab_attrs[] = { &destroy_by_rcu_attr.attr, &shrink_attr.attr, &reserved_attr.attr, + &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG 
&total_objects_attr.attr, &slabs_attr.attr, @@ -4906,6 +5130,8 @@ static struct attribute *slab_attrs[] = { &order_fallback_attr.attr, &cmpxchg_double_fail_attr.attr, &cmpxchg_double_cpu_fail_attr.attr, + &cpu_partial_alloc_attr.attr, + &cpu_partial_free_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, @@ -5257,7 +5483,7 @@ static const struct file_operations proc_slabinfo_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); + proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); return 0; } module_init(slab_proc_init); diff --git a/tools/slub/slabinfo.c b/tools/slub/slabinfo.c index 868cc93f7ac2..164cbcf61106 100644 --- a/tools/slub/slabinfo.c +++ b/tools/slub/slabinfo.c @@ -42,6 +42,7 @@ struct slabinfo { unsigned long deactivate_remote_frees, order_fallback; unsigned long cmpxchg_double_cpu_fail, cmpxchg_double_fail; unsigned long alloc_node_mismatch, deactivate_bypass; + unsigned long cpu_partial_alloc, cpu_partial_free; int numa[MAX_NODES]; int numa_partial[MAX_NODES]; } slabinfo[MAX_SLABS]; @@ -455,6 +456,11 @@ static void slab_stats(struct slabinfo *s) s->alloc_from_partial * 100 / total_alloc, s->free_remove_partial * 100 / total_free); + printf("Cpu partial list %8lu %8lu %3lu %3lu\n", + s->cpu_partial_alloc, s->cpu_partial_free, + s->cpu_partial_alloc * 100 / total_alloc, + s->cpu_partial_free * 100 / total_free); + printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n", s->deactivate_remote_frees, s->free_frozen, s->deactivate_remote_frees * 100 / total_alloc, @@ -1145,7 +1151,7 @@ static void read_slab_dir(void) switch (de->d_type) { case DT_LNK: alias->name = strdup(de->d_name); - count = readlink(de->d_name, buffer, sizeof(buffer)); + count = readlink(de->d_name, buffer, sizeof(buffer)-1); if (count < 0) fatal("Cannot read symlink %s\n", de->d_name); @@ -1209,6 +1215,8 @@ static void read_slab_dir(void) slab->order_fallback = get_obj("order_fallback"); 
slab->cmpxchg_double_cpu_fail = get_obj("cmpxchg_double_cpu_fail"); slab->cmpxchg_double_fail = get_obj("cmpxchg_double_fail"); + slab->cpu_partial_alloc = get_obj("cpu_partial_alloc"); + slab->cpu_partial_free = get_obj("cpu_partial_free"); slab->alloc_node_mismatch = get_obj("alloc_node_mismatch"); slab->deactivate_bypass = get_obj("deactivate_bypass"); chdir("..");