
Commit e74eb94

JoonsooKim authored and gregkh committed
slab: fix oops when reading /proc/slab_allocators
commit 0378730 upstream.

Commit b1cb098 ("change the management method of free objects of the slab") introduced a bug in the slab leak detector ('/proc/slab_allocators'). The detector works as follows:

1. Traverse all objects on all slabs.
2. Determine whether each object is active or not.
3. If it is active, print who allocated it.

That commit changed how free objects are managed, so the logic that determines whether an object is active also changed. Before, an object in a cpu cache was regarded as inactive; with that commit, an object in a cpu cache is mistakenly regarded as active. This introduces a kernel oops if DEBUG_PAGEALLOC is enabled.

If DEBUG_PAGEALLOC is enabled, kernel_map_pages() is used to detect who corrupts free memory in the slab: it unmaps the page table mapping when an object is freed and maps it again when the object becomes active. When the slab leak detector checks an object in a cpu cache, it mistakenly thinks the object is active and tries to access the object's memory to retrieve the caller of the allocation. At that point no page table mapping to the object exists, so an oops occurs.

Following is the oops message reported by Dave. It blew up when something tried to read /proc/slab_allocators (just cat it, and you should see the oops below):

  Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
  Modules linked in: [snip...]
  CPU: 1 PID: 9386 Comm: trinity-c33 Not tainted 3.14.0-rc5+ #131
  task: ffff8801aa46e890 ti: ffff880076924000 task.ti: ffff880076924000
  RIP: 0010:[<ffffffffaa1a8f4a>]  [<ffffffffaa1a8f4a>] handle_slab+0x8a/0x180
  RSP: 0018:ffff880076925de0  EFLAGS: 00010002
  RAX: 0000000000001000 RBX: 0000000000000000 RCX: 000000005ce85ce7
  RDX: ffffea00079be100 RSI: 0000000000001000 RDI: ffff880107458000
  RBP: ffff880076925e18 R08: 0000000000000001 R09: 0000000000000000
  R10: 0000000000000000 R11: 000000000000000f R12: ffff8801e6f84000
  R13: ffffea00079be100 R14: ffff880107458000 R15: ffff88022bb8d2c0
  FS:  00007fb769e45740(0000) GS:ffff88024d040000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: ffff8801e6f84ff8 CR3: 00000000a22db000 CR4: 00000000001407e0
  DR0: 0000000002695000 DR1: 0000000002695000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000070602
  Call Trace:
    leaks_show+0xce/0x240
    seq_read+0x28e/0x490
    proc_reg_read+0x3d/0x80
    vfs_read+0x9b/0x160
    SyS_read+0x58/0xb0
    tracesys+0xd4/0xd9
  Code: f5 00 00 00 0f 1f 44 00 00 48 63 c8 44 3b 0c 8a 0f 84 e3 00 00 00 83 c0 01 44 39 c0 72 eb 41 f6 47 1a 01 0f 84 e9 00 00 00 89 f0 <4d> 8b 4c 04 f8 4d 85 c9 0f 84 88 00 00 00 49 8b 7e 08 4d 8d 46
  RIP  handle_slab+0x8a/0x180

To fix the problem, this patch introduces an object status buffer on each slab. With it, object status can be tracked precisely, so the slab leak detector only accesses genuinely active objects and no kernel oops occurs. The memory overhead of this fix is only imposed under CONFIG_DEBUG_SLAB_LEAK, which is mainly used for debugging, so it isn't a big problem.

Signed-off-by: Joonsoo Kim <[email protected]>
Reported-by: Dave Jones <[email protected]>
Reported-by: Tetsuo Handa <[email protected]>
Reviewed-by: Vladimir Davydov <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Pekka Enberg <[email protected]>
Cc: David Rientjes <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
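To make the misclassification concrete, here is a minimal userspace model of the pre-fix check (a hypothetical illustration, not kernel code; old_check_is_active is an invented name). An object parked in a per-cpu cache never appears on the slab's freelist, so a freelist scan concludes it is active even though it is free and, under DEBUG_PAGEALLOC, unmapped:

    #include <stdbool.h>
    #include <stdio.h>

    #define NUM_OBJS 4

    /* Model of one slab: entries freelist[active..NUM_OBJS-1] name the
     * free on-slab objects. Freeing into a per-cpu array_cache touches
     * neither this array nor the active count. */
    static unsigned char freelist[NUM_OBJS];
    static int active = NUM_OBJS;   /* all objects counted as allocated,
                                       but object 0 was freed into a
                                       cpu cache */

    /* The pre-fix handle_slab() logic: "not on the freelist" == active. */
    static bool old_check_is_active(int idx)
    {
            for (int j = active; j < NUM_OBJS; j++)
                    if (freelist[j] == idx)
                            return false;
            return true;
    }

    int main(void)
    {
            /* Prints 1 (active) for object 0, which is really free; the
             * detector would then dereference an unmapped object. */
            printf("object 0 active? %d\n", old_check_is_active(0));
            return 0;
    }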
1 parent 264b866 commit e74eb94

File tree

1 file changed: +71 -19 lines changed

mm/slab.c

Lines changed: 71 additions & 19 deletions
@@ -386,6 +386,39 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 
 #endif
 
+#define OBJECT_FREE (0)
+#define OBJECT_ACTIVE (1)
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+
+static void set_obj_status(struct page *page, int idx, int val)
+{
+        int freelist_size;
+        char *status;
+        struct kmem_cache *cachep = page->slab_cache;
+
+        freelist_size = cachep->num * sizeof(freelist_idx_t);
+        status = (char *)page->freelist + freelist_size;
+        status[idx] = val;
+}
+
+static inline unsigned int get_obj_status(struct page *page, int idx)
+{
+        int freelist_size;
+        char *status;
+        struct kmem_cache *cachep = page->slab_cache;
+
+        freelist_size = cachep->num * sizeof(freelist_idx_t);
+        status = (char *)page->freelist + freelist_size;
+
+        return status[idx];
+}
+
+#else
+static inline void set_obj_status(struct page *page, int idx, int val) {}
+
+#endif
+
 /*
  * Do not go above this order unless 0 objects fit into the slab or
  * overridden on the command line.
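
The two helpers rely on one layout invariant: the status bytes start immediately after the num freelist indices that page->freelist points to. The following is a minimal userspace sketch of that layout, with freelist_idx_t re-declared as a stand-in and plain heap memory in place of a slab page:

    #include <stdio.h>
    #include <stdlib.h>

    typedef unsigned char freelist_idx_t;   /* stand-in for the kernel typedef */

    #define OBJECT_FREE   (0)
    #define OBJECT_ACTIVE (1)
    #define NUM_OBJS      8

    int main(void)
    {
            /* One management buffer per slab: NUM_OBJS freelist indices
             * followed by NUM_OBJS status bytes, the same layout that
             * set_obj_status()/get_obj_status() compute by pointer math. */
            size_t freelist_size = NUM_OBJS * sizeof(freelist_idx_t);
            char *freelist = calloc(1, freelist_size + NUM_OBJS * sizeof(char));
            char *status = freelist + freelist_size;

            status[3] = OBJECT_ACTIVE;   /* ~ set_obj_status(page, 3, OBJECT_ACTIVE) */
            printf("obj 3: %d, obj 4: %d\n", status[3], status[4]);   /* 1, 0 */

            free(freelist);
            return 0;
    }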
@@ -576,12 +609,30 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
         return cachep->array[smp_processor_id()];
 }
 
+static size_t calculate_freelist_size(int nr_objs, size_t align)
+{
+        size_t freelist_size;
+
+        freelist_size = nr_objs * sizeof(freelist_idx_t);
+        if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+                freelist_size += nr_objs * sizeof(char);
+
+        if (align)
+                freelist_size = ALIGN(freelist_size, align);
+
+        return freelist_size;
+}
+
 static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
                              size_t idx_size, size_t align)
 {
         int nr_objs;
+        size_t remained_size;
         size_t freelist_size;
+        int extra_space = 0;
 
+        if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+                extra_space = sizeof(char);
         /*
          * Ignore padding for the initial guess. The padding
          * is at most @align-1 bytes, and @buffer_size is at
@@ -590,14 +641,15 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
          * into the memory allocation when taking the padding
          * into account.
          */
-        nr_objs = slab_size / (buffer_size + idx_size);
+        nr_objs = slab_size / (buffer_size + idx_size + extra_space);
 
         /*
          * This calculated number will be either the right
          * amount, or one greater than what we want.
          */
-        freelist_size = slab_size - nr_objs * buffer_size;
-        if (freelist_size < ALIGN(nr_objs * idx_size, align))
+        remained_size = slab_size - nr_objs * buffer_size;
+        freelist_size = calculate_freelist_size(nr_objs, align);
+        if (remained_size < freelist_size)
                 nr_objs--;
 
         return nr_objs;
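
As a worked example of the sizing logic, assume hypothetical numbers (not taken from the patch): a 4096-byte slab, 64-byte objects, 2-byte freelist indices, 8-byte alignment, CONFIG_DEBUG_SLAB_LEAK enabled. The status byte enters both the initial per-object guess and the aligned management size that is checked against the leftover space:

    #include <stdio.h>

    #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    /* Mirrors calculate_freelist_size() with CONFIG_DEBUG_SLAB_LEAK on:
     * one freelist index plus one status byte per object. */
    static size_t freelist_size(size_t nr_objs, size_t idx_size, size_t align)
    {
            size_t size = nr_objs * (idx_size + 1);
            return align ? ALIGN(size, align) : size;
    }

    int main(void)
    {
            size_t slab = 4096, obj = 64, idx = 2, align = 8;

            /* Initial guess, ignoring padding: object + index + status byte. */
            size_t nr = slab / (obj + idx + 1);              /* 61 */
            size_t remained = slab - nr * obj;               /* 192 */

            if (remained < freelist_size(nr, idx, align))    /* 184 fits in 192 */
                    nr--;

            printf("%zu objects, %zu management bytes\n",
                   nr, freelist_size(nr, idx, align));       /* 61, 184 */
            return 0;
    }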
@@ -635,7 +687,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
         } else {
                 nr_objs = calculate_nr_objs(slab_size, buffer_size,
                                             sizeof(freelist_idx_t), align);
-                mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align);
+                mgmt_size = calculate_freelist_size(nr_objs, align);
         }
         *num = nr_objs;
         *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -2032,13 +2084,16 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
                         break;
 
                 if (flags & CFLGS_OFF_SLAB) {
+                        size_t freelist_size_per_obj = sizeof(freelist_idx_t);
                         /*
                          * Max number of objs-per-slab for caches which
                          * use off-slab slabs. Needed to avoid a possible
                          * looping condition in cache_grow().
                          */
+                        if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+                                freelist_size_per_obj += sizeof(char);
                         offslab_limit = size;
-                        offslab_limit /= sizeof(freelist_idx_t);
+                        offslab_limit /= freelist_size_per_obj;
 
                         if (num > offslab_limit)
                                 break;
@@ -2285,8 +2340,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
         if (!cachep->num)
                 return -E2BIG;
 
-        freelist_size =
-                ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align);
+        freelist_size = calculate_freelist_size(cachep->num, cachep->align);
 
         /*
          * If the slab has been placed off-slab, and we have enough space then
@@ -2299,7 +2353,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 
         if (flags & CFLGS_OFF_SLAB) {
                 /* really off slab. No need for manual alignment */
-                freelist_size = cachep->num * sizeof(freelist_idx_t);
+                freelist_size = calculate_freelist_size(cachep->num, 0);
 
 #ifdef CONFIG_PAGE_POISONING
         /* If we're going to use the generic kernel_map_pages()
@@ -2625,6 +2679,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
                 if (cachep->ctor)
                         cachep->ctor(objp);
 #endif
+                set_obj_status(page, i, OBJECT_FREE);
                 set_free_obj(page, i, i);
         }
 }
@@ -2833,6 +2888,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
         BUG_ON(objnr >= cachep->num);
         BUG_ON(objp != index_to_obj(cachep, page, objnr));
 
+        set_obj_status(page, objnr, OBJECT_FREE);
         if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
                 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2966,6 +3022,8 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
                                 gfp_t flags, void *objp, unsigned long caller)
 {
+        struct page *page;
+
         if (!objp)
                 return objp;
         if (cachep->flags & SLAB_POISON) {
@@ -2996,6 +3054,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
                 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
                 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
         }
+
+        page = virt_to_head_page(objp);
+        set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
         objp += obj_offset(cachep);
         if (cachep->ctor && cachep->flags & SLAB_POISON)
                 cachep->ctor(objp);
@@ -4232,21 +4293,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
                                                 struct page *page)
 {
         void *p;
-        int i, j;
+        int i;
 
         if (n[0] == n[1])
                 return;
         for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
-                bool active = true;
-
-                for (j = page->active; j < c->num; j++) {
-                        /* Skip freed item */
-                        if (get_free_obj(page, j) == i) {
-                                active = false;
-                                break;
-                        }
-                }
-                if (!active)
+                if (get_obj_status(page, i) != OBJECT_ACTIVE)
                         continue;
 
                 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
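
With the status buffer maintained at init, allocation, and free time, the detector's activity test collapses to a one-byte lookup. A condensed, hypothetical model of that lifecycle (a plain array standing in for the per-slab buffer):

    #include <stdio.h>

    #define OBJECT_FREE   (0)
    #define OBJECT_ACTIVE (1)
    #define NUM_OBJS      4

    static char status[NUM_OBJS];   /* stands in for the per-slab status buffer */

    int main(void)
    {
            /* cache_init_objs(): every object starts out free */
            for (int i = 0; i < NUM_OBJS; i++)
                    status[i] = OBJECT_FREE;

            status[1] = OBJECT_ACTIVE;  /* cache_alloc_debugcheck_after(): in use */
            status[2] = OBJECT_ACTIVE;  /* allocated ... */
            status[2] = OBJECT_FREE;    /* ... then freed: cache_free_debugcheck()
                                           runs before the object parks in a cpu
                                           cache, so the byte stays accurate */

            /* handle_slab(): only genuinely active objects get dereferenced */
            for (int i = 0; i < NUM_OBJS; i++)
                    if (status[i] == OBJECT_ACTIVE)
                            printf("object %d is active\n", i);   /* only 1 */
            return 0;
    }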
