slab的基本架构之前的文章已经介绍过了。内核为每一种需要频繁分配的对象(例如某个结构体或者文件描述符等)建立一个缓存,称作slab描述符,
在代码中对应的结构体是kmem_cache。每个slab描述符由若干个slab组成,每个slab由一个或多个page组成。
slab按使用情况分成三类:full(对象全部已分配)、partial(部分已分配)和free(全部空闲),用于节省查找开销。slab中存放的是objects,也就是实际被分配出去的一个个结构体实例!
同时slab机制创建了多层的缓冲池,空间换时间。每个CPU都有本地对象缓冲池,减少多核锁争用问题。
每个内存节点都有共享对象内存池。
既然slab机制这么重要,下面就来学习一下slab是如何实现的~
创建slab描述符时需要注意:如果打开了配置项CONFIG_SLAB_MERGE_DEFAULT=y,
内核可能不会新建描述符,而是找到一个合适的、现有的slab描述符直接复用(见下文kmem_cache_create_usercopy中对__kmem_cache_alias的调用),即使slab描述符名不一样也能复用。
先看下slab的结构体:
/*
 * struct kmem_cache - a slab descriptor.
 *
 * One descriptor exists per object cache.  Member order follows the
 * numbered sections below and must not be changed.
 */
struct kmem_cache {
    /* Local object pool; every CPU has its own array_cache. */
    struct array_cache __percpu *cpu_cache;

/* 1) Cache tunables. Protected by slab_mutex */
    /*
     * Number of objects moved per transfer: when the current CPU's local
     * pool is empty, this many free objects are migrated into it from the
     * shared pool or from slabs on the partial/free lists.
     */
    unsigned int batchcount;
    /*
     * Once the local pool holds this many free objects, batchcount of
     * them are released on the next free.
     */
    unsigned int limit;
    /*
     * Tunable for the shared object pool.
     * NOTE(review): being an unsigned int this is presumably a size or
     * count, not the pool itself -- confirm.
     */
    unsigned int shared;
    /* Object length with alignment padding already included. */
    unsigned int size;
    /* Precomputed reciprocal used to turn an unsigned division by a
     * constant into a cheaper operation. */
    struct reciprocal_value reciprocal_buffer_size;

/* 2) touched by every alloc & free from the backend */
    /* Cache flags (slab_flags_t); constant after creation. */
    slab_flags_t flags;     /* constant flags */
    /* Maximum number of objects in one slab; a slab usually spans one or
     * a few pages. */
    unsigned int num;       /* # of objs per slab */

/* 3) cache_grow/shrink */
    /* order of pgs per slab (2^n): each slab is 2^gfporder pages */
    unsigned int gfporder;
    /* force GFP flags, e.g. GFP_DMA; used when allocating slab pages */
    gfp_t allocflags;
    /* Number of distinct cache-line offsets available for colouring. */
    size_t colour;          /* cache colouring range*/
    /* Length of one colour step.
     * NOTE(review): original note says this equals the L1 cache line
     * size -- confirm. */
    unsigned int colour_off;    /* colour offset */
    /* Cache supplying extra memory for the slab management area when it
     * is kept outside the slab. */
    struct kmem_cache *freelist_cache;
    /* Size of the freelist management area.
     * NOTE(review): original note says one byte per object -- confirm
     * against the freelist index type. */
    unsigned int freelist_size;
    /* constructor func */
    void (*ctor)(void *obj);

/* 4) cache creation/removal */
    const char *name;   /* name of this slab descriptor */
    struct list_head list; /* links the descriptor into the global slab_caches list */
    int refcount;   /* reference count; raised when another descriptor reuses this one */
    int object_size;    /* real object size */
    int align;      /* alignment in bytes */

/* 5) statistics */
#ifdef CONFIG_DEBUG_SLAB
    unsigned long num_active;
    unsigned long num_allocations;
    unsigned long high_mark;
    unsigned long grown;
    unsigned long reaped;
    unsigned long errors;
    unsigned long max_freeable;
    unsigned long node_allocs;
    unsigned long node_frees;
    unsigned long node_overflow;
    atomic_t allochit;
    atomic_t allocmiss;
    atomic_t freehit;
    atomic_t freemiss;
#ifdef CONFIG_DEBUG_SLAB_LEAK
    atomic_t store_user_clean;
#endif
    /*
     * If debugging is enabled, then the allocator can add additional
     * fields and/or padding to every object. 'size' contains the total
     * object size including these internal fields, while 'obj_offset'
     * and 'object_size' contain the offset to the user object and its
     * size.
     */
    int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG
    struct memcg_cache_params memcg_params;
#endif
#ifdef CONFIG_KASAN
    struct kasan_cache kasan_info;
#endif
#ifdef CONFIG_SLAB_FREELIST_RANDOM
    unsigned int *random_seq;
#endif
    unsigned int useroffset;    /* Usercopy region offset */
    unsigned int usersize;      /* Usercopy region size */
    /* Slab nodes: one kmem_cache_node per memory node. */
    struct kmem_cache_node *node[MAX_NUMNODES];
};
其中array_cache的结构体如下,它既可以描述本地对象缓冲池,也可以描述共享对象缓冲池:
/*
 * struct array_cache - an array-backed pool of free object pointers.
 *
 * Used both for the per-CPU local pool (kmem_cache.cpu_cache) and for the
 * per-node shared pool (kmem_cache_node.shared).
 */
struct array_cache {
    unsigned int avail; // number of object pointers currently held in entry[]
    unsigned int limit; // fill threshold; the free path flushes once avail reaches it
    unsigned int batchcount; // objects moved per refill/flush; original note: usually half of limit -- confirm
    unsigned int touched; // set to 1 when the pool has been used recently
    void *entry[];  /*
             * Must have this definition in here for the proper
             * alignment of array_cache. Also simplifies accessing
             * the entries.
             */
    /*
     * entry[] is a flexible array of object pointers: each slot stores the
     * address of an object, not the object itself.  Original note: it
     * holds at most limit entries.
     */
};
创建slab描述符
/*
 * kmem_cache_create - create a slab descriptor with no usercopy region.
 * @name: name of the descriptor
 * @size: size of the objects to be created in this cache
 * @align: required alignment for the objects
 * @flags: SLAB flags
 * @ctor: constructor for the objects
 *
 * Thin wrapper around kmem_cache_create_usercopy() that passes 0 for both
 * the usercopy offset and the usercopy size.  Returns whatever that
 * function returns.
 */
struct kmem_cache *
kmem_cache_create(const char *name, unsigned int size, unsigned int align,
        slab_flags_t flags, void (*ctor)(void *))
{
    return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
                      ctor);
}
进一步:
/*
 * kmem_cache_create_usercopy - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache
 *        (the name of the descriptor).
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects, in bytes.
 * @flags: SLAB flags
 * @useroffset: Usercopy region offset
 * @usersize: Usercopy region size
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * On failure the function panics if SLAB_PANIC is set in the (masked)
 * @flags; otherwise it prints a warning, dumps the stack and returns NULL.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache *
kmem_cache_create_usercopy(const char *name,
          unsigned int size, unsigned int align,
          slab_flags_t flags,
          unsigned int useroffset, unsigned int usersize,
          void (*ctor)(void *))
{
    struct kmem_cache *s = NULL;  // descriptor handed back to the caller on success
    const char *cache_name;
    int err;
    get_online_cpus();
    get_online_mems();
    memcg_get_cache_ids();
/* Cache creation is serialised by slab_mutex. */
    mutex_lock(&slab_mutex);
/* NOTE(review): the original note says this check simply returns 0 in the
 * author's configuration -- confirm against the kernel config in use. */
    err = kmem_cache_sanity_check(name, size);
    if (err) {
        goto out_unlock;
    }
    /* Refuse requests with allocator specific flags */
    if (flags & ~SLAB_FLAGS_PERMITTED) {
        err = -EINVAL;
        goto out_unlock;
    }
    /*
     * Some allocators will constraint the set of valid flags to a subset
     * of all flags. We expect them to define CACHE_CREATE_MASK in this
     * case, and we'll just provide them with a sanitized version of the
     * passed flags.
     */
    flags &= CACHE_CREATE_MASK;
    /* Fail closed on bad usersize or useroffset values. */
    if (WARN_ON(!usersize && useroffset) ||
        WARN_ON(size < usersize || size - usersize < useroffset))
        usersize = useroffset = 0;
    if (!usersize)
    /* Only caches without a usercopy region are candidates for reusing an
     * existing descriptor; if one is found it is returned as is (err is 0). */
        s = __kmem_cache_alias(name, size, align, flags, ctor);
    if (s)
        goto out_unlock;
    /* Take a private copy of the name for the new descriptor; it is
     * released with kfree_const() below if creation fails. */
    cache_name = kstrdup_const(name, GFP_KERNEL);
    if (!cache_name) {
        err = -ENOMEM;
        goto out_unlock;
    }
    /* Actually create the descriptor. */
    s = create_cache(cache_name, size,
             calculate_alignment(flags, align, size),
             flags, useroffset, usersize, ctor, NULL, NULL);
    if (IS_ERR(s)) {
        err = PTR_ERR(s);
        kfree_const(cache_name);
    }
out_unlock:
    /* Release in the reverse order of acquisition. */
    mutex_unlock(&slab_mutex);
    memcg_put_cache_ids();
    put_online_mems();
    put_online_cpus();
    if (err) {
        if (flags & SLAB_PANIC)
            panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
                name, err);
        else {
            pr_warn("kmem_cache_create(%s) failed with error %d\n",
                name, err);
            dump_stack();
        }
        return NULL;
    }
    return s;
}
真正创建slab描述符的函数是create_cache:
/*
 * create_cache - allocate, fill in and register a new slab descriptor.
 *
 * Returns the new descriptor on success or an ERR_PTR() encoded errno on
 * failure.  A usercopy window that does not fit inside the object triggers
 * a warning and is reset to empty.
 */
static struct kmem_cache *create_cache(const char *name,
        unsigned int object_size, unsigned int align,
        slab_flags_t flags, unsigned int useroffset,
        unsigned int usersize, void (*ctor)(void *),
        struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
    struct kmem_cache *s;
    int err;

    if (WARN_ON(useroffset + usersize > object_size))
        useroffset = usersize = 0;

    /* The descriptor itself comes from the kmem_cache cache, zeroed. */
    s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
    if (!s)
        return ERR_PTR(-ENOMEM);

    /* Record the caller supplied parameters. */
    s->name = name;
    s->size = s->object_size = object_size;
    s->align = align;
    s->ctor = ctor;
    s->useroffset = useroffset;
    s->usersize = usersize;

    err = init_memcg_params(s, memcg, root_cache);
    if (err)
        goto out_free_cache;

    /* Allocator specific part of the set-up. */
    err = __kmem_cache_create(s, flags);
    if (err)
        goto out_free_cache;

    /* Publish the descriptor on the global slab_caches list. */
    s->refcount = 1;
    list_add(&s->list, &slab_caches);
    memcg_link_cache(s);
    return s;

out_free_cache:
    destroy_memcg_params(s);
    kmem_cache_free(kmem_cache, s);
    return ERR_PTR(err);
}
Linux内核实现了3种slab分配器(slab、slub和slob),手机上现在用的是slub。下面这个__kmem_cache_create是slub的版本(mm/slub.c),而后文分配、释放对象的代码则来自slab(mm/slab.c):
/*
 * __kmem_cache_create - allocator specific part of descriptor creation.
 *
 * Opens the cache and, once slab_state has advanced past UP, propagates
 * the memcg attributes and adds the sysfs entry.  If the sysfs step fails
 * the cache is released again.  Returns 0 on success or the error code of
 * the failing step.
 */
int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
{
    int ret = kmem_cache_open(s, flags);

    if (ret)
        return ret;

    /* Mutex is not taken during early boot */
    if (slab_state <= UP)
        return 0;

    memcg_propagate_slab_attrs(s);

    ret = sysfs_slab_add(s);
    if (ret)
        __kmem_cache_release(s);

    return ret;
}
分配对象的调用路径是 kmem_cache_alloc -> slab_alloc:
/*
 * slab_alloc - allocate one object from @cachep.
 * @cachep: slab descriptor to allocate from
 * @flags: GFP flags, masked with gfp_allowed_mask before use
 * @caller: caller address handed to the debug check
 *
 * Returns the object, or NULL if the pre-alloc hook yields no cache or no
 * object could be obtained.
 */
static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
    unsigned long irq_state;
    void *obj;

    flags &= gfp_allowed_mask;

    cachep = slab_pre_alloc_hook(cachep, flags);
    if (unlikely(!cachep))
        return NULL;

    cache_alloc_debugcheck_before(cachep, flags);

    /* The actual allocation runs with local interrupts disabled. */
    local_irq_save(irq_state);
    obj = __do_cache_alloc(cachep, flags);
    local_irq_restore(irq_state);

    obj = cache_alloc_debugcheck_after(cachep, flags, obj, caller);
    prefetchw(obj);

    /*
     * Zero the object when slab_want_init_on_alloc() asks for it (the
     * original note mentions __GFP_ZERO as the trigger).
     */
    if (unlikely(slab_want_init_on_alloc(flags, cachep)) && obj)
        memset(obj, 0, cachep->object_size);

    slab_post_alloc_hook(cachep, flags, 1, &obj);

    return obj;
}
没有打开NUMA的宏时,执行的是下面这个版本。乍一看参数似乎太少了(没有指定要分配几个页面),其实这里分配的是对象而不是页面:每个slab占用多少页面(gfporder)、对象大小等信息都已经保存在slab描述符cachep里了。
/*
 * __do_cache_alloc - allocation entry used by slab_alloc().
 *
 * This variant simply forwards to ____cache_alloc(); everything needed
 * for the allocation is taken from @cachep.
 */
static __always_inline void *
__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *obj = ____cache_alloc(cachep, flags);

    return obj;
}
/*
 * ____cache_alloc - take one object from this CPU's local pool.
 *
 * Fast path: pop the last entry of the local array_cache.  Slow path: the
 * pool is empty, so cache_alloc_refill() both refills it and returns an
 * object.  Returns NULL when no object could be obtained.
 */
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    struct array_cache *ac;  /* this CPU's local object pool */
    void *objp;

    check_irq_off();
    ac = cpu_cache_get(cachep);

    if (likely(ac->avail)) {
        /* Hit: mark the pool as recently used and pop the top entry. */
        ac->touched = 1;
        objp = ac->entry[--ac->avail];
        STATS_INC_ALLOCHIT(cachep);
    } else {
        STATS_INC_ALLOCMISS(cachep);
        objp = cache_alloc_refill(cachep, flags);
        /*
         * the 'ac' may be updated by cache_alloc_refill(),
         * and kmemleak_erase() requires its correct value.
         */
        ac = cpu_cache_get(cachep);
    }

    /*
     * To avoid a false negative, if an object that is in one of the
     * per-CPU caches is leaked, we need to make sure kmemleak doesn't
     * treat the array pointers as a reference to the object.
     */
    if (objp)
        kmemleak_erase(&ac->entry[ac->avail]);

    return objp;
}
本地对象缓冲池为空(avail为0)时走到这里,第一次从该缓冲池分配对象时也属于这种情况:
/*
 * cache_alloc_refill - refill this CPU's empty local pool and return an object.
 * @cachep: slab descriptor being allocated from
 * @flags: GFP flags of the allocation
 *
 * Sources are tried in this order: the node's shared pool, slabs already
 * on the node's lists, and finally a newly grown slab.  Must be entered
 * with the local pool empty (see the BUG_ON below).  Returns an object, or
 * NULL if the pool is still empty after trying to grow the cache.
 */
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
    int batchcount;
    struct kmem_cache_node *n;  // slab node of the current memory node
    struct array_cache *ac, *shared;  // local pool and the node's shared pool
    int node;
    void *list = NULL;
    struct page *page;
    check_irq_off();  // per its name, checks that interrupts are off
    node = numa_mem_id(); // NOTE(review): original note says the author's system has a single node
    ac = cpu_cache_get(cachep);  // this CPU's local pool
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill.  Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;   // original note: 16 -- confirm
    }
    n = get_node(cachep, node); // slab node for this memory node
    BUG_ON(ac->avail > 0 || !n);
    shared = READ_ONCE(n->shared);  // the node's shared pool, may be NULL
    
    /* Nothing free on the node and the shared pool is absent or empty:
     * skip the locked section and grow the cache directly. */
    if (!n->free_objects && (!shared || !shared->avail))
        goto direct_grow;
    spin_lock(&n->list_lock);
    /* Re-read now that the list lock is held. */
    shared = READ_ONCE(n->shared);
    /* See if we can refill from the shared array 
     * transfer_objects() is asked to move up to batchcount objects from
     * the shared pool into the local pool.
     */
    if (shared && transfer_objects(ac, shared, batchcount)) {
        shared->touched = 1;
        goto alloc_done;  // local pool refilled from the shared pool
    }
    /* The shared pool could not help; take objects from the node's slabs. */
    while (batchcount > 0) {
        /* Get slab alloc is to come from. 
         * Original note: get_first_slab() looks at slabs_partial first and
         * slabs_free second and returns the first page of the first slab
         * found.
         */
        page = get_first_slab(n, false);
        if (!page)
            goto must_grow;
        check_spinlock_acquired(cachep);
        /* Move free objects from this slab into the local pool; the return
         * value is what is still wanted. */
        batchcount = alloc_block(cachep, ac, page, batchcount); 
        fixup_slab_list(cachep, n, page, &list);
    }
must_grow:
    n->free_objects -= ac->avail;   // objects handed to the local pool no longer count as free on the node
alloc_done:
    spin_unlock(&n->list_lock);
    fixup_objfreelist_debug(cachep, &list);
direct_grow:
    if (unlikely(!ac->avail)) {
        /* Check if we can use obj in pfmemalloc slab */
        if (sk_memalloc_socks()) {
            void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
            if (obj)
                return obj;
        }
        /* Key step: grow the cache by one slab (original note: this
         * allocates 2^cachep->gfporder physical pages). */
        page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
        /*
         * cache_grow_begin() can reenable interrupts,
         * then ac could change.
         */
        ac = cpu_cache_get(cachep);
        if (!ac->avail && page)
        /* Move free objects from the new slab into the local pool. */
            alloc_block(cachep, ac, page, batchcount);
        cache_grow_end(cachep, page);
        if (!ac->avail)
            return NULL;
    }
    ac->touched = 1;
    /* Same as the fast path: hand out the last entry of entry[]. */
    return ac->entry[--ac->avail];
}
有申请就有释放~ 释放对象时最终会走到___cache_free:
/*
 * ___cache_free - give an object back to this CPU's local pool.
 * @cachep: slab descriptor the object belongs to
 * @objp: object being freed
 * @caller: caller address handed to the debug check
 *
 * If the local pool has already reached its limit it is flushed first.
 * Objects handled by cache_free_alien(), and objects living on a
 * pfmemalloc slab, take their own paths and never enter the local pool.
 */
void ___cache_free(struct kmem_cache *cachep, void *objp,
        unsigned long caller)
{
    struct array_cache *local_pool = cpu_cache_get(cachep);

    check_irq_off();

    if (unlikely(slab_want_init_on_free(cachep)))
        memset(objp, 0, cachep->object_size);

    kmemleak_free_recursive(objp, cachep->flags);
    objp = cache_free_debugcheck(cachep, objp, caller);

    /*
     * Skip calling cache_free_alien() when the platform is not numa.
     * This will avoid cache misses that happen while accessing slabp (which
     * is per page memory  reference) to get nodeid. Instead use a global
     * variable to skip the call, which is mostly likely to be present in
     * the cache.
     */
    if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
        return;

    if (local_pool->avail >= local_pool->limit) {
        STATS_INC_FREEMISS(cachep);
        /*
         * The pool is full: flush part of it before storing the new
         * object (original note: limit is computed in enable_cpucache).
         */
        cache_flusharray(cachep, local_pool);
    } else {
        STATS_INC_FREEHIT(cachep);
    }

    if (sk_memalloc_socks()) {
        struct page *slab_page = virt_to_head_page(objp);

        if (unlikely(PageSlabPfmemalloc(slab_page))) {
            cache_free_pfmemalloc(cachep, slab_page, objp);
            return;
        }
    }

    /* Store the object at the top of the local pool. */
    local_pool->entry[local_pool->avail++] = objp;
}
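cache_flusharray在本地对象缓冲池已满时被调用:它把其中batchcount个对象转移到共享对象缓冲池,或者通过free_block还给对象所属的slab;只有被free_block收集到list链表里的空闲slab才会被整个回收,而不是每次都回收整个slab。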
/*
 * cache_flusharray - make room in a local object pool.
 * @cachep: slab descriptor the pool belongs to
 * @ac: the local pool to drain
 *
 * Removes up to ac->batchcount entries from the front of ac->entry.  They
 * are copied into the node's shared pool if it exists and has room;
 * otherwise they are returned to their slabs via free_block().  Slabs that
 * free_block() puts on the local list are destroyed after the node lock
 * has been dropped.
 */
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
    int batchcount;
    struct kmem_cache_node *n;
    int node = numa_mem_id();
    LIST_HEAD(list);
    batchcount = ac->batchcount;
    check_irq_off();   // per its name, checks that interrupts are off
    n = get_node(cachep, node);
    spin_lock(&n->list_lock);
    
    /* Is there a shared pool on this node? */
    if (n->shared) {
        struct array_cache *shared_array = n->shared;
        /* Room left in the shared pool; non-zero means it is not full yet. */
        int max = shared_array->limit - shared_array->avail;  
        if (max) {
            if (batchcount > max)
                batchcount = max;  
            /* Copy the first batchcount entries of the local pool to the
             * end of the shared pool, without exceeding its limit. */
            memcpy(&(shared_array->entry[shared_array->avail]),
                   ac->entry, sizeof(void *) * batchcount);
            shared_array->avail += batchcount;
            goto free_done;
        }
    }
    /* No shared pool, or it is already full: give batchcount objects back
     * to their slabs instead. */
    free_block(cachep, ac->entry, batchcount, node, &list);
free_done:
#if STATS
    {
        int i = 0;
        struct page *page;
        /* Count the slabs currently on the free list for the statistics. */
        list_for_each_entry(page, &n->slabs_free, lru) {
            BUG_ON(page->active);
            i++;
        }
        STATS_SET_FREEABLE(cachep, i);
    }
#endif
    spin_unlock(&n->list_lock);
    /* Destroy the slabs free_block() collected on the list (empty when the
     * shared-pool path was taken), then account for the entries removed. */
    slabs_destroy(cachep, &list);
    ac->avail -= batchcount;
        /* Slide the remaining entries down to the start of the array. */
    memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
}
没有共享对象缓冲池、或者共享池已满时,通过free_block把对象还给它们所属的slab。参数依次为:
1、slab描述符
2、待释放对象的指针数组(这里传入的是本地缓冲池的ac->entry)
3、要释放的对象个数(这里传入的是batchcount)
4、node
5、list,用于收集需要销毁的空闲slab
/*
 * free_block - return a batch of objects to the slabs they came from.
 * @cachep: slab descriptor
 * @objpp: array of object pointers to release
 * @nr_objects: number of entries in @objpp
 * @node: memory node whose kmem_cache_node is updated
 * @list: receives slabs removed from the node's free list for the caller
 *        to dispose of
 *
 * Each object's slab is re-queued on the node's free or partial list
 * according to whether it still has active objects.  Afterwards, while the
 * node holds more free objects than free_limit, whole free slabs are moved
 * onto @list.
 */
static void free_block(struct kmem_cache *cachep, void **objpp,
            int nr_objects, int node, struct list_head *list)
{
    int i;
    struct kmem_cache_node *n = get_node(cachep, node); // holds the node's slab lists and counters
    struct page *page;
    n->free_objects += nr_objects;  // the returned objects count as free on the node again
    for (i = 0; i < nr_objects; i++) {
        void *objp; // object being returned
        struct page *page;   // NOTE(review): shadows the function-scope 'page' used by the loop below
        objp = objpp[i];    // walk the pointer array one object at a time
        page = virt_to_head_page(objp); // head page of the slab holding this object
        list_del(&page->lru);   // unlink the slab from whichever list it is on
        check_spinlock_acquired_node(cachep, node);
        slab_put_obj(cachep, page, objp);
        STATS_DEC_ACTIVE(cachep);
        /* fixup slab chains 
         * No active objects left: the slab goes onto slabs_free.
         */
        if (page->active == 0) {
        /* For a page that backs a slab, page->lru links it into the node's
         * slab lists (original note: for ordinary file/anonymous pages it
         * links into the active/inactive LRU lists instead).
         */
            list_add(&page->lru, &n->slabs_free);
            n->free_slabs++; // counts slabs: the loop below drops cachep->num objects per free_slabs--
        } else {
            /* Unconditionally move a slab to the end of the
             * partial list on free - maximum time for the
             * other objects to be freed, too.
             * The slab still has active objects.
             */
            list_add_tail(&page->lru, &n->slabs_partial);
        }
    }
    /* Too many free objects on the node: detach free slabs one at a time
     * onto the caller's list for later destruction. */
    while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
        n->free_objects -= cachep->num; // one slab holds num objects
        page = list_last_entry(&n->slabs_free, struct page, lru);
        list_move(&page->lru, list);
        n->free_slabs--;
        n->total_slabs--;
    }
}
slab占用的内存其实就是page页面,那么如何从buddy系统中获取page呢?
前面提到的cache_grow_begin最终会走到buddy系统,调用路径是 cache_grow_begin -> kmem_getpages -> __alloc_pages_node -> __alloc_pages:
/*
 * kmem_getpages - obtain the pages backing one new slab.
 * @cachep: slab descriptor the slab is for
 * @flags: GFP flags; cachep->allocflags is OR-ed in
 * @nodeid: node to allocate from
 *
 * Allocates 2^cachep->gfporder pages, charges them to the memcg, updates
 * the reclaimable/unreclaimable slab statistics and marks the page as a
 * slab page.  Returns the first page, or NULL if the allocation or the
 * memcg charge fails.
 */
static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
                                int nodeid)
{
    struct page *slab_page;
    int page_count;

    flags |= cachep->allocflags;

    /* The allocation itself. */
    slab_page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
    if (!slab_page) {
        slab_out_of_memory(cachep, flags, nodeid);
        return NULL;
    }

    if (memcg_charge_slab(slab_page, flags, cachep->gfporder, cachep)) {
        __free_pages(slab_page, cachep->gfporder);
        return NULL;
    }

    /* Number of pages in the slab: 2^gfporder. */
    page_count = (1 << cachep->gfporder);

    /* Account the pages as reclaimable or unreclaimable slab memory,
     * depending on the descriptor's flags. */
    mod_lruvec_page_state(slab_page,
                  (cachep->flags & SLAB_RECLAIM_ACCOUNT) ?
                  NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
                  page_count);

    __SetPageSlab(slab_page);

    /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
    if (sk_memalloc_socks() && page_is_pfmemalloc(slab_page))
        SetPageSlabPfmemalloc(slab_page);

    return slab_page;
}
