The basic architecture of slab was covered in an earlier article. Each kind of object (a particular struct, file descriptors, and so on) gets its own slab descriptor, represented in the code by struct kmem_cache. Every slab descriptor is made up of slabs, and each slab consists of one or more pages.
The slabs are kept on three lists, full, partial and free, to keep management overhead low. A slab holds objects, which are the actual structure instances!
On top of that, the slab mechanism builds multi-level caching pools, trading space for time: each CPU has a local object cache to reduce lock contention on multi-core systems,
and each memory node has a shared object cache.
Since slab is this neat, we obviously want to see how it is implemented~
One thing to note about creating and destroying slab descriptors: if CONFIG_SLAB_MERGE_DEFAULT=y is enabled,
a suitable existing slab descriptor may be found and reused, even if its name differs from the one requested.
Let's first look at the structure behind a slab descriptor:
struct kmem_cache {
struct array_cache __percpu *cpu_cache;
/* 1) Cache tunables. Protected by slab_mutex */
/* cpu_cache above: each CPU has its own local object cache */
/* batchcount: number of objects migrated at a time; when the current CPU's
 * local array_cache is empty, this many free objects are moved from the
 * shared object cache or from slabs on the slabs_partial/slabs_free lists
 * into the local object cache
 */
unsigned int batchcount;
/* if the number of free objects in the local cache exceeds limit, batchcount of them are actively released */
unsigned int limit;
/* per-node shared object cache (tunable: number of objects) */
unsigned int shared;
/* object size, including alignment padding */
unsigned int size;
/* optimization for dividing by this constant: unsigned division via reciprocal multiplication */
struct reciprocal_value reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */
/* cache flags (not an allocation mask; that is allocflags below) */
slab_flags_t flags; /* constant flags */
/* max number of objects in one slab; a slab is usually made of one or a few pages */
unsigned int num; /* # of objs per slab */
/* 3) cache_grow/shrink */
/* order of pages per slab (2^n): each slab gets 2^gfporder pages */
unsigned int gfporder;
/* forced GFP flags, e.g. GFP_DMA: the allocation mask used when allocating pages for a slab */
gfp_t allocflags;
/* number of distinct cache-line offsets available in one slab, used for colouring */
size_t colour; /* cache colouring range*/
/* length of one colour area, equal to the L1 cache line size */
unsigned int colour_off; /* colour offset */
/* cache used when extra memory is needed to hold the slab management (freelist) area off-slab */
struct kmem_cache *freelist_cache;
/* size of the freelist management area; each object takes one byte (index) in the freelist */
unsigned int freelist_size;
/* constructor function */
void (*ctor)(void *obj);
/* 4) cache creation/removal */
const char *name; // name of the slab descriptor
struct list_head list; // list node linking this descriptor into the global slab_caches list
int refcount; // reference count; bumped when another slab descriptor is created that reuses this one
int object_size; // actual object size
int align; // alignment
/* 5) statistics */
#ifdef CONFIG_DEBUG_SLAB
unsigned long num_active;
unsigned long num_allocations;
unsigned long high_mark;
unsigned long grown;
unsigned long reaped;
unsigned long errors;
unsigned long max_freeable;
unsigned long node_allocs;
unsigned long node_frees;
unsigned long node_overflow;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
atomic_t freemiss;
#ifdef CONFIG_DEBUG_SLAB_LEAK
atomic_t store_user_clean;
#endif
/*
* If debugging is enabled, then the allocator can add additional
* fields and/or padding to every object. 'size' contains the total
* object size including these internal fields, while 'obj_offset'
* and 'object_size' contain the offset to the user object and its
* size.
*/
int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG
struct memcg_cache_params memcg_params;
#endif
#ifdef CONFIG_KASAN
struct kasan_cache kasan_info;
#endif
#ifdef CONFIG_SLAB_FREELIST_RANDOM
unsigned int *random_seq;
#endif
unsigned int useroffset; /* Usercopy region offset */
unsigned int usersize; /* Usercopy region size */
/* slab nodes: one kmem_cache_node per memory (NUMA) node */
struct kmem_cache_node *node[MAX_NUMNODES];
};
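A quick aside on the reciprocal_buffer_size field above: hot paths need to divide an offset by the constant object size (for example when mapping an object's address back to its index inside a slab, which obj_to_index() does in mm/slab.c), and the precomputed reciprocal lets that division be done as a multiply-and-shift. The helper below is only a sketch of that usage; my_obj_index and slab_base are made-up names for illustration.
#include <linux/reciprocal_div.h>
/* Sketch: compute an object's index within its slab without a division
 * instruction, roughly the way obj_to_index() does in mm/slab.c. */
static inline unsigned int my_obj_index(const struct kmem_cache *cache,
					void *slab_base, void *obj)
{
	u32 offset = obj - slab_base;	/* byte offset inside the slab */

	/* equivalent to offset / cache->size, but via multiply-and-shift */
	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}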
The array_cache structure below describes either a local (per-CPU) object cache or a shared object cache:
struct array_cache {
unsigned int avail;
unsigned int limit;
unsigned int batchcount; // usually half of limit
unsigned int touched; // set when this object cache has been used recently
void *entry[]; /*
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
*/
/* flexible array of object pointers (the address of each object, not the object itself), at most limit entries */
};
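To make entry[] concrete: the array cache behaves like a LIFO stack of object pointers, so the most recently freed (and therefore cache-hot) object is handed out first. Below is a tiny user-space model of that behaviour; toy_array_cache and the toy_* helpers are made-up names, a sketch of the idea rather than kernel code.
#include <stdio.h>

#define AC_LIMIT 8

struct toy_array_cache {
	unsigned int avail;
	unsigned int limit;
	void *entry[AC_LIMIT];		/* pointers to cached objects */
};

/* fast-path alloc: pop the most recently freed object */
static void *toy_ac_alloc(struct toy_array_cache *ac)
{
	if (ac->avail == 0)
		return NULL;		/* the kernel would call cache_alloc_refill() here */
	return ac->entry[--ac->avail];
}

/* fast-path free: push the object back for quick reuse */
static void toy_ac_free(struct toy_array_cache *ac, void *obj)
{
	if (ac->avail >= ac->limit)
		return;			/* the kernel would call cache_flusharray() here */
	ac->entry[ac->avail++] = obj;
}

int main(void)
{
	struct toy_array_cache ac = { .avail = 0, .limit = AC_LIMIT };
	int objects[2];

	toy_ac_free(&ac, &objects[0]);
	toy_ac_free(&ac, &objects[1]);
	/* LIFO: the last object freed is the first one handed back out */
	printf("%d\n", toy_ac_alloc(&ac) == (void *)&objects[1]);
	return 0;
}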
Creating a slab descriptor
struct kmem_cache *
kmem_cache_create(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *)) // ctor: object constructor
{
return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
ctor);
}
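Before digging into the implementation, here is roughly how a driver would use this API end to end; struct foo, foo_cachep and the foo_* functions are made-up names for illustration, not anything from the kernel source.
#include <linux/slab.h>
#include <linux/errno.h>

struct foo {
	int id;
	char name[32];
};

static struct kmem_cache *foo_cachep;

static int foo_cache_init(void)
{
	/* align objects to hardware cache lines; no constructor */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cachep)
		return -ENOMEM;
	return 0;
}

static struct foo *foo_alloc(void)
{
	/* __GFP_ZERO makes slab_alloc() memset the object, as we will see below */
	return kmem_cache_alloc(foo_cachep, GFP_KERNEL | __GFP_ZERO);
}

static void foo_free(struct foo *f)
{
	kmem_cache_free(foo_cachep, f);
}

static void foo_cache_exit(void)
{
	kmem_cache_destroy(foo_cachep);
}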
Going one level deeper:
/*
* kmem_cache_create_usercopy - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @useroffset: Usercopy region offset
 * @usersize: Usercopy region size
 * @ctor: A constructor for the objects.
*
* Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
* The @ctor is run when new pages are allocated by the cache.
*
* The flags are
*
* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
* to catch references to uninitialised memory.
*
* %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
* for buffer overruns.
*
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*/
struct kmem_cache *
kmem_cache_create_usercopy(const char *name,
unsigned int size, unsigned int align,
slab_flags_t flags,
unsigned int useroffset, unsigned int usersize,
void (*ctor)(void *))
{
struct kmem_cache *s = NULL; // pointer that will be returned on success
const char *cache_name;
int err;
get_online_cpus();
get_online_mems();
memcg_get_cache_ids();
/* take the slab_mutex to protect cache creation */
mutex_lock(&slab_mutex);
/* without CONFIG_DEBUG_VM this simply returns 0 */
err = kmem_cache_sanity_check(name, size);
if (err) {
goto out_unlock;
}
/* Refuse requests with allocator specific flags */
if (flags & ~SLAB_FLAGS_PERMITTED) {
err = -EINVAL;
goto out_unlock;
}
/*
* Some allocators will constraint the set of valid flags to a subset
* of all flags. We expect them to define CACHE_CREATE_MASK in this
* case, and we'll just provide them with a sanitized version of the
* passed flags.
*/
flags &= CACHE_CREATE_MASK;
/* Fail closed on bad usersize of useroffset values. */
if (WARN_ON(!usersize && useroffset) ||
WARN_ON(size < usersize || size - usersize < useroffset))
usersize = useroffset = 0;
if (!usersize)
/* if an existing, compatible slab descriptor can be reused, return it right away */
s = __kmem_cache_alias(name, size, align, flags, ctor);
if (s)
goto out_unlock;
/* copy name into a fresh allocation; kstrdup_const() skips the copy when name lives in read-only data */
cache_name = kstrdup_const(name, GFP_KERNEL);
if (!cache_name) {
err = -ENOMEM;
goto out_unlock;
}
/* actually create the descriptor */
s = create_cache(cache_name, size,
calculate_alignment(flags, align, size),
flags, useroffset, usersize, ctor, NULL, NULL);
if (IS_ERR(s)) {
err = PTR_ERR(s);
kfree_const(cache_name);
}
out_unlock:
mutex_unlock(&slab_mutex);
memcg_put_cache_ids();
put_online_mems();
put_online_cpus();
if (err) {
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
else {
pr_warn("kmem_cache_create(%s) failed with error %d\n",
name, err);
dump_stack();
}
return NULL;
}
return s;
}
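calculate_alignment(), called above, decides the final object alignment. The function below is only a simplified sketch of its logic (modelled on mm/slab_common.c and relying on the kernel's cache_line_size(), ARCH_SLAB_MINALIGN, max() and ALIGN() helpers): with SLAB_HWCACHE_ALIGN, the alignment starts at the L1 cache line size but is halved for small objects so each one is not padded out to a full line.
/* Simplified sketch of calculate_alignment() in mm/slab_common.c. */
static unsigned int sketch_calculate_alignment(slab_flags_t flags,
					       unsigned int align,
					       unsigned int size)
{
	if (flags & SLAB_HWCACHE_ALIGN) {
		unsigned int ralign = cache_line_size();

		/* small objects: don't burn a whole cache line on each one */
		while (size <= ralign / 2)
			ralign /= 2;
		align = max(align, ralign);
	}

	if (align < ARCH_SLAB_MINALIGN)
		align = ARCH_SLAB_MINALIGN;

	/* keep the alignment a multiple of the pointer size */
	return ALIGN(align, sizeof(void *));
}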
Now for the real creation of the slab descriptor:
static struct kmem_cache *create_cache(const char *name,
unsigned int object_size, unsigned int align,
slab_flags_t flags, unsigned int useroffset,
unsigned int usersize, void (*ctor)(void *),
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
struct kmem_cache *s;
int err;
if (WARN_ON(useroffset + usersize > object_size))
useroffset = usersize = 0;
err = -ENOMEM;
/* allocate a kmem_cache structure */
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
if (!s)
goto out;
/* fill in the structure */
s->name = name;
s->size = s->object_size = object_size;
s->align = align;
s->ctor = ctor;
s->useroffset = useroffset;
s->usersize = usersize;
err = init_memcg_params(s, memcg, root_cache);
if (err)
goto out_free_cache;
/* allocator-specific creation; worth a closer look */
err = __kmem_cache_create(s, flags);
if (err)
goto out_free_cache;
s->refcount = 1;
/* add the new slab descriptor to the global slab_caches list */
list_add(&s->list, &slab_caches);
memcg_link_cache(s);
out:
if (err)
return ERR_PTR(err);
return s;
out_free_cache:
destroy_memcg_params(s);
kmem_cache_free(kmem_cache, s);
goto out;
}
The Linux kernel implements three slab allocators (SLAB, SLUB and SLOB); phones nowadays use SLUB, whose __kmem_cache_create looks like this:
int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
{
int err;
err = kmem_cache_open(s, flags);
if (err)
return err;
/* Mutex is not taken during early boot */
if (slab_state <= UP)
return 0;
memcg_propagate_slab_attrs(s);
err = sysfs_slab_add(s);
if (err)
__kmem_cache_release(s);
return err;
}
Once the slab descriptor has been created, slab objects can be allocated,
via kmem_cache_alloc -> slab_alloc:
static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
unsigned long save_flags;
void *objp;
flags &= gfp_allowed_mask;
cachep = slab_pre_alloc_hook(cachep, flags);
if (unlikely(!cachep))
return NULL;
cache_alloc_debugcheck_before(cachep, flags);
/* disable local interrupts */
local_irq_save(save_flags);
/* the core function: grab a slab object */
objp = __do_cache_alloc(cachep, flags);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
prefetchw(objp);
/* if __GFP_ZERO was requested (or init-on-alloc is enabled), zero the object */
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
memset(objp, 0, cachep->object_size);
slab_post_alloc_hook(cachep, flags, 1, &objp);
return objp;
}
With the NUMA config off, this variant is used. One might wonder whether the parameters are too few (how does it know how many pages to allocate?), but details such as gfporder were fixed when the descriptor was created and are carried in cachep itself.
static __always_inline void *
__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
return ____cache_alloc(cachep, flags);
}
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac; // local object cache
check_irq_off(); // make sure interrupts are off
ac = cpu_cache_get(cachep); // get this descriptor's local per-CPU object cache
/* if ac has free objects, mark it recently used and pop one off the array into objp */
if (likely(ac->avail)) {
ac->touched = 1;
objp = ac->entry[--ac->avail];
STATS_INC_ALLOCHIT(cachep);
goto out;
}
STATS_INC_ALLOCMISS(cachep);
/* slow path: the first allocation, or an empty local cache, ends up here, where the cache gets refilled and avail set */
objp = cache_alloc_refill(cachep, flags);
/*
* the 'ac' may be updated by cache_alloc_refill(),
* and kmemleak_erase() requires its correct value.
*/
ac = cpu_cache_get(cachep);
out:
/*
* To avoid a false negative, if an object that is in one of the
* per-CPU caches is leaked, we need to make sure kmemleak doesn't
* treat the array pointers as a reference to the object.
*/
if (objp)
kmemleak_erase(&ac->entry[ac->avail]);
return objp;
}
cache_alloc_refill handles the first allocation, or the case where the local cache has run out of free objects:
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
int batchcount;
struct kmem_cache_node *n; // only one on a non-NUMA system
struct array_cache *ac, *shared; // one local, one shared
int node;
void *list = NULL;
struct page *page;
check_irq_off(); // make sure interrupts are off
node = numa_mem_id(); // there is just the one node here anyway
ac = cpu_cache_get(cachep); // get the local per-CPU cache
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
/*
* If there was little recent activity on this cache, then
* perform only a partial refill. Otherwise we could generate
* refill bouncing.
*/
batchcount = BATCHREFILL_LIMIT; // BATCHREFILL_LIMIT is 16
}
n = get_node(cachep, node); // again, only one slab node
BUG_ON(ac->avail > 0 || !n);
shared = READ_ONCE(n->shared); // the kmem_cache_node holds the shared cache in ->shared
/* the slab node has no free objects and neither does the shared cache */
if (!n->free_objects && (!shared || !shared->avail))
goto direct_grow;
spin_lock(&n->list_lock);
shared = READ_ONCE(n->shared);
/* See if we can refill from the shared array:
 * transfer_objects() moves up to batchcount objects from the shared cache
 * into the local object cache (see the sketch after this function)
 */
if (shared && transfer_objects(ac, shared, batchcount)) {
shared->touched = 1;
goto alloc_done; // refill succeeded
}
/* nothing free in the shared cache either, so fall back to the slab lists */
while (batchcount > 0) {
/* Get slab alloc is to come from.
 * Scan the slabs_partial list first, then slabs_free, and return the page
 * structure of the first physical page of the first slab found
 */
page = get_first_slab(n, false);
if (!page)
goto must_grow;
check_spinlock_acquired(cachep);
/* migrate up to batchcount free objects from this slab page into the local object cache */
batchcount = alloc_block(cachep, ac, page, batchcount);
fixup_slab_list(cachep, n, page, &list);
}
must_grow:
n->free_objects -= ac->avail; // subtract what went to the local CPU cache from the node's free_objects
alloc_done:
spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
direct_grow:
if (unlikely(!ac->avail)) {
/* Check if we can use obj in pfmemalloc slab */
if (sk_memalloc_socks()) {
void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
if (obj)
return obj;
}
/* the key step: allocate 2^cachep->gfporder physical pages */
page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
/*
* cache_grow_begin() can reenable interrupts,
* then ac could change.
*/
ac = cpu_cache_get(cachep);
if (!ac->avail && page)
/* migrate free objects from the freshly grown slab */
alloc_block(cachep, ac, page, batchcount);
cache_grow_end(cachep, page);
if (!ac->avail)
return NULL;
}
ac->touched = 1;
/* same as the fast path: hand out the last entry */
return ac->entry[--ac->avail];
}
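transfer_objects(), used above to pull objects out of the shared cache, is itself small; the function below is a simplified sketch of what it does (modelled on mm/slab.c): copy the topmost, most recently freed pointers from one array_cache into another and fix up both avail counters.
/* Sketch of transfer_objects(): move up to max object pointers from one
 * array_cache to another (shared -> local on refill, for example). */
static int sketch_transfer_objects(struct array_cache *to,
				   struct array_cache *from, unsigned int max)
{
	/* bounded by what 'from' holds and by the free room left in 'to' */
	int nr = min3(from->avail, max, to->limit - to->avail);

	if (!nr)
		return 0;

	/* copy the topmost (most recently freed, cache-hot) pointers */
	memcpy(to->entry + to->avail, from->entry + from->avail - nr,
	       sizeof(void *) * nr);

	from->avail -= nr;
	to->avail += nr;
	return nr;
}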
Freeing a slab object
What gets allocated must sooner or later be freed~
void ___cache_free(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
struct array_cache *ac = cpu_cache_get(cachep); // get the local per-CPU object cache
check_irq_off(); // make sure interrupts are off
if (unlikely(slab_want_init_on_free(cachep)))
memset(objp, 0, cachep->object_size);
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
/*
* Skip calling cache_free_alien() when the platform is not numa.
* This will avoid cache misses that happen while accessing slabp (which
* is per page memory reference) to get nodeid. Instead use a global
* variable to skip the call, which is mostly likely to be present in
* the cache.
*/
if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
return;
if (ac->avail < ac->limit) {
STATS_INC_FREEHIT(cachep);
} else {
STATS_INC_FREEMISS(cachep);
/* avail has hit limit, so try to flush free objects back (limit is computed
 * in enable_cpucache, see the sketch below); this is mainly how whole slabs
 * end up being reclaimed
 */
cache_flusharray(cachep, ac);
}
if (sk_memalloc_socks()) {
struct page *page = virt_to_head_page(objp);
if (unlikely(PageSlabPfmemalloc(page))) {
cache_free_pfmemalloc(cachep, page, objp);
return;
}
}
/* push the object back into the local object cache ac */
ac->entry[ac->avail++] = objp;
}
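As the comment above says, limit comes from enable_cpucache(): larger objects get a smaller per-CPU cache, and batchcount then defaults to roughly half of limit. The helper below only sketches that heuristic (the thresholds are taken from mm/slab.c; sketch_cpucache_limit is a made-up name).
/* Sketch of the per-CPU cache sizing heuristic in enable_cpucache(). */
static unsigned int sketch_cpucache_limit(const struct kmem_cache *cachep)
{
	unsigned int limit;

	if (cachep->size > 131072)
		limit = 1;
	else if (cachep->size > PAGE_SIZE)
		limit = 8;
	else if (cachep->size > 1024)
		limit = 24;
	else if (cachep->size > 256)
		limit = 54;
	else
		limit = 120;

	return limit;		/* batchcount is then (limit + 1) / 2 */
}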
cache_flusharray is mainly about reclaiming whole slabs rather than individual objects:
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
int batchcount;
struct kmem_cache_node *n;
int node = numa_mem_id();
LIST_HEAD(list);
batchcount = ac->batchcount;
check_irq_off(); // interrupts must already be off
n = get_node(cachep, node);
spin_lock(&n->list_lock);
/* is there a shared cache? */
if (n->shared) {
struct array_cache *shared_array = n->shared;
/* remaining room in the shared cache; max > 0 means it has not reached its limit yet */
int max = shared_array->limit - shared_array->avail;
if (max) {
if (batchcount > max)
batchcount = max;
/* copy objects from the local cache ac into the shared cache, topping it up towards its limit */
memcpy(&(shared_array->entry[shared_array->avail]),
ac->entry, sizeof(void *) * batchcount);
shared_array->avail += batchcount;
goto free_done;
}
}
/* the shared cache is full (or absent): actively free batchcount objects back to their slabs */
free_block(cachep, ac->entry, batchcount, node, &list);
free_done:
#if STATS
{
int i = 0;
struct page *page;
list_for_each_entry(page, &n->slabs_free, lru) {
BUG_ON(page->active);
i++;
}
STATS_SET_FREEABLE(cachep, i);
}
#endif
spin_unlock(&n->list_lock);
/* destroy the slabs collected on the list, then drop the flushed objects from the local cache */
slabs_destroy(cachep, &list);
ac->avail -= batchcount;
/* move the remaining free objects in ac to the front of the array */
memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
}
free_block frees objects back to their slabs when the shared cache cannot take any more. Its parameters are:
1. the slab descriptor cachep
2. the local cache's object array ac->entry
3. batchcount, the number of objects to free
4. the node id
5. a list that collects slabs to be destroyed
static void free_block(struct kmem_cache *cachep, void **objpp,
int nr_objects, int node, struct list_head *list)
{
int i;
struct kmem_cache_node *n = get_node(cachep, node); // this structure holds the node's three slab lists
struct page *page;
n->free_objects += nr_objects; // account the returned objects into free_objects
for (i = 0; i < nr_objects; i++) {
void *objp; // the object being freed
struct page *page; // the slab page it belongs to
objp = objpp[i]; // walk the entry array, one object at a time
page = virt_to_head_page(objp); // find the page this object lives in
list_del(&page->lru); // unlink the page from whichever slab list it is on
check_spinlock_acquired_node(cachep, node);
slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
/* fixup slab chains:
 * if the slab has no active objects left, move it onto slabs_free
 */
if (page->active == 0) {
/* when a page is used by the slab allocator, its lru field links it into the
 * node's slab lists; for ordinary file or anonymous pages the same field
 * links it into the (in)active_anon/file LRU lists
 */
list_add(&page->lru, &n->slabs_free);
n->free_slabs++; // free_slabs is counted per slab (one head page each)
} else {
/* Unconditionally move a slab to the end of the
 * partial list on free - maximum time for the
 * other objects to be freed, too.
 * (this slab still has active objects)
 */
list_add_tail(&page->lru, &n->slabs_partial);
}
}
/* if the node's free objects exceed free_limit, peel whole slabs off slabs_free and queue them on list for destruction */
while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
n->free_objects -= cachep->num; // one slab holds num objects
page = list_last_entry(&n->slabs_free, struct page, lru);
list_move(&page->lru, list);
n->free_slabs--;
n->total_slabs--;
}
}
A slab is ultimately just pages, so how are the pages obtained from the buddy system?
As mentioned earlier, the path is cache_grow_begin -> kmem_getpages -> __alloc_pages_node -> __alloc_pages:
static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
struct page *page;
int nr_pages;
flags |= cachep->allocflags; // constrains which zone to allocate from (e.g. GFP_DMA)
/* the actual page allocation */
page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
if (!page) { // allocation failed: report slab OOM
slab_out_of_memory(cachep, flags, nodeid);
return NULL;
}
if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) {
__free_pages(page, cachep->gfporder);
return NULL;
}
/* 2^gfporder is the number of pages just allocated */
nr_pages = (1 << cachep->gfporder);
/* bump the per-node statistics according to the descriptor's flags, distinguishing reclaimable from unreclaimable slabs */
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages);
else
mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages);
__SetPageSlab(page);
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (sk_memalloc_socks() && page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
return page;
}
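Finally, how many objects actually fit into those 2^gfporder pages? That was settled back at creation time by cache_estimate(). The function below is a simplified sketch of it (freelist_idx_t is the one-byte index type defined in mm/slab.c; sketch_cache_estimate and the bool parameter are made up for illustration): when the freelist lives off-slab the whole area is divided by the object size, otherwise each object also costs one freelist index entry, and whatever is left over is the space that cache colouring shifts around.
/* Sketch of cache_estimate() in mm/slab.c: how many objects of size
 * buffer_size fit into a slab of 2^gfporder pages, and how much is left. */
static unsigned int sketch_cache_estimate(unsigned long gfporder,
					  size_t buffer_size,
					  bool freelist_off_slab,
					  size_t *left_over)
{
	size_t slab_size = PAGE_SIZE << gfporder;
	unsigned int num;

	if (freelist_off_slab) {
		num = slab_size / buffer_size;
		*left_over = slab_size % buffer_size;
	} else {
		/* each object also needs one freelist index entry in the slab */
		num = slab_size / (buffer_size + sizeof(freelist_idx_t));
		*left_over = slab_size % (buffer_size + sizeof(freelist_idx_t));
	}
	return num;
}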