slab_alloc_node in detail: the core of kmem_cache_alloc
kmem_cache_alloc() is the entry point for allocating a SLUB object, and its core is slab_alloc_node(). The function touches a wide range of kernel machinery, including cgroups, the scheduler, memory management and CPU preemption, so it rewards a careful, line-by-line reading.
Code version: kernel-3.10
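Before diving into the implementation, a minimal usage sketch may help frame what we are dissecting. The cache name, object type and init function below are purely illustrative; the point is only that every kmem_cache_alloc() call on a SLUB kernel ends up in slab_alloc_node():
#include <linux/module.h>
#include <linux/slab.h>

struct my_obj {				/* hypothetical object type */
	int id;
};

static struct kmem_cache *my_cache;	/* hypothetical dedicated cache */

static int __init my_init(void)
{
	struct my_obj *obj;

	my_cache = kmem_cache_create("my_cache", sizeof(struct my_obj),
				     0, SLAB_HWCACHE_ALIGN, NULL);
	if (!my_cache)
		return -ENOMEM;

	/* this call is the entry point analysed in the rest of the article */
	obj = kmem_cache_alloc(my_cache, GFP_KERNEL | __GFP_ZERO);
	if (!obj) {
		kmem_cache_destroy(my_cache);
		return -ENOMEM;
	}

	kmem_cache_free(my_cache, obj);
	return 0;
}

static void __exit my_exit(void)
{
	kmem_cache_destroy(my_cache);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");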
(I) Overview of the whole function
/*
* Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
* have the fastpath folded into their functions. So no function call
* overhead for requests that can be satisfied on the fastpath.
*
* The fastpath works by first checking if the lockless freelist can be used.
* If not then __slab_alloc is called for slow processing.
*
* Otherwise we can simply pick the next object from the lockless free list.
*/
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr)
{
void **object;
struct kmem_cache_cpu *c;
struct page *page;
unsigned long tid;
s = slab_pre_alloc_hook(s, gfpflags);// (1) pre-allocation hook: involves the process's kmem cgroup and returns the kmem_cache actually used for this allocation
if (!s)
return NULL;
redo:
/*
* Must read kmem_cache cpu data via this cpu ptr. Preemption is
* enabled. We may switch back and forth between cpus while
* reading from one cpu area. That does not matter as long
* as we end up on the original cpu again when doing the cmpxchg.
*
* Preemption is disabled for the retrieval of the tid because that
* must occur from the current processor. We cannot allow rescheduling
* on a different processor between the determination of the pointer
* and the retrieval of the tid.
*/
preempt_disable();// disable preemption so that tid and kmem_cache_cpu are read on the same CPU
c = this_cpu_ptr(s->cpu_slab);// per-CPU kmem_cache_cpu of the current CPU
/*
* The transaction ids are globally unique per cpu and per operation on
* a per cpu queue. Thus they can be guarantee that the cmpxchg_double
* occurs on the right processor and that there was no operation on the
* linked list in between.
*/
tid = c->tid;// snapshot this kmem_cache_cpu's transaction id
preempt_enable();// preemption may be re-enabled now
object = c->freelist;// lockless per-CPU freelist
page = c->page;// slab page currently backing this CPU
if (unlikely(!object || !node_match(page, node))) { // freelist empty, or the CPU slab page is not on the requested node: take the slow path
object = __slab_alloc(s, gfpflags, node, addr, c);// (2) slow-path allocation
stat(s, ALLOC_SLOWPATH);// bump the per-CPU ALLOC_SLOWPATH statistics counter (a new CPU slab had to be set up)
} else {
void *next_object = get_freepointer_safe(s, object);// read the next-free pointer stored inside the object, at object + s->offset
/*
* The cmpxchg will only match if there was no additional
* operation and if we are on the right processor.
*
* The cmpxchg does the following atomically (without lock
* semantics!)
* 1. Relocate first pointer to the current per cpu area.
* 2. Verify that tid and freelist have not been changed
* 3. If they were not changed replace tid and freelist
*
* Since this is without lock semantics the protection is only
* against code executing on this cpu *not* from access by
* other cpus.
*/
if (unlikely(!this_cpu_cmpxchg_double(// (3) atomically take the object off the lockless freelist
s->cpu_slab->freelist, s->cpu_slab->tid,
object, tid,
next_object, next_tid(tid)))) {
note_cmpxchg_failure("slab_alloc", s, tid);// the cmpxchg failed (we raced or were migrated): go back to redo and retry
goto redo;
}
prefetch_freepointer(s, next_object);// on success, prefetch the next object's free pointer to warm the cache for the next allocation
stat(s, ALLOC_FASTPATH);// bump the per-CPU ALLOC_FASTPATH statistics counter (object taken straight from the CPU slab)
}
if (unlikely(gfpflags & __GFP_ZERO) && object)
memset(object, 0, s->object_size);// zero the object when __GFP_ZERO was requested
slab_post_alloc_hook(s, gfpflags, object);// post-allocation hook (kmemcheck/kmemleak and friends)
return object;
}
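One detail worth spelling out: c->freelist is not a separate list structure. Each free object stores the address of the next free object inside itself, at offset s->offset, so advancing the freelist is a single pointer dereference. A simplified sketch of the helper involved (get_freepointer_safe() is the same idea plus a guarded read under CONFIG_DEBUG_PAGEALLOC):
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
	/* the "next free" pointer lives inside the free object itself */
	return *(void **)(object + s->offset);
}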
(II) Section-by-section walkthrough
(1) slab_pre_alloc_hook
static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
gfp_t flags)
{
flags &= gfp_allowed_mask;
lockdep_trace_alloc(flags);
might_sleep_if(flags & __GFP_WAIT);
if (should_failslab(s->object_size, flags, s->flags))
return NULL;
return memcg_kmem_get_cache(s, flags); // pick the correct per-memcg cache to allocate from
}
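For symmetry, the post-allocation hook called at the end of slab_alloc_node() is much simpler; in this kernel era it looks roughly like the following (a sketch, the exact kmemcheck/kmemleak calls depend on the configuration):
static inline void slab_post_alloc_hook(struct kmem_cache *s,
					gfp_t flags, void *object)
{
	flags &= gfp_allowed_mask;
	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
}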
/**
* memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
 * @cachep: the original, global kmem cache
 * @gfp: allocation flags.
 */
static __always_inline struct kmem_cache *
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
if (!memcg_kmem_enabled())// in any of the following four cases, return the original cache right away
return cachep;
if (gfp & __GFP_NOFAIL)
return cachep;
if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
return cachep;
if (unlikely(fatal_signal_pending(current)))
return cachep;
return __memcg_kmem_get_cache(cachep, gfp);// otherwise return the per-memcg kmem_cache to allocate from (analysed below)
}
/*
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
*
 * If the cache does not exist yet, if we are the first user of it,
 * we either create it immediately, if possible, or create it asynchronously
 * in a workqueue.
 * In the latter case, we will let the current allocation go through with
 * the original cache.
 *
 * Can't be called in interrupt context or from kernel threads.
 * This function needs to be called with rcu_read_lock() held.
*/
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
gfp_t gfp)
{
struct mem_cgroup *memcg;// the memory controller structure of the cgroup: it accounts both page cache and RSS and exposes the statistics an administrator uses for tuning
int idx;
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
if (!current->mm || current->memcg_kmem_skip_account)
return cachep;
rcu_read_lock();// take the RCU read lock
memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));/* memcg of the current task */
if (!memcg_can_account_kmem(memcg))
goto out;
idx = memcg_cache_id(memcg);// index of this memcg in the per-root-cache memcg_caches array
/*
 * barrier to make sure we're always seeing the up to date value. The
* code updating memcg_caches will issue a write barrier to match this.
*/
read_barrier_depends();// read barrier: pairs with the write barrier issued by the code updating memcg_caches
if (likely(cachep->memcg_params->memcg_caches[idx])) {
cachep = cachep->memcg_params->memcg_caches[idx];// switch from the root kmem_cache to this memcg's own kmem_cache
goto out;
}
/* The corresponding put will be done in the workqueue. */
if (!css_tryget(&memcg->css))
goto out;
rcu_read_unlock();
memcg_create_cache_enqueue(memcg, cachep);// queue the work item that creates the per-memcg kmem_cache (detailed below)
return cachep;
out:
rcu_read_unlock();
return cachep;
}
static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
{
memcg_stop_kmem_account();// stop kmem accounting for this task: the allocations made while the cache is being created must not recurse back into __memcg_kmem_get_cache
__memcg_create_cache_enqueue(memcg, cachep);// put the create_work that creates the per-memcg kmem_cache on the global workqueue
memcg_resume_kmem_account();// resume accounting
}
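memcg_stop_kmem_account()/memcg_resume_kmem_account() just bump a per-task counter; it is the same memcg_kmem_skip_account flag that __memcg_kmem_get_cache() tests on entry, which is what breaks the recursion. Roughly (a sketch of the memcontrol.c helpers of this era):
static inline void memcg_stop_kmem_account(void)
{
	VM_BUG_ON(!current->mm);
	current->memcg_kmem_skip_account++;	/* skip memcg accounting from here on */
}

static inline void memcg_resume_kmem_account(void)
{
	VM_BUG_ON(!current->mm);
	current->memcg_kmem_skip_account--;	/* re-enable memcg accounting */
}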
(2) Slow path: __slab_alloc, taken when the per-CPU freelist is empty or the CPU slab page does not belong to the requested node
/*
* Another one that disabled interrupt and compensates for possible
* cpu changes by refetching the per cpu area pointer.
*/
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void *p; /* object that will be returned */
unsigned long flags;
local_irq_save(flags);// disable local interrupts
#ifdef CONFIG_PREEMPT
/*
* We may have been preempted and rescheduled on a different
* cpu before disabling interrupts. Need to reload cpu area
 * pointer.
*/
c = this_cpu_ptr(s->cpu_slab);
#endif
p = ___slab_alloc(s, gfpflags, node, addr, c);
local_irq_restore(flags);
return p;
}
If the active slab has no free objects, or the CPU slab page is not on the requested node, the real work is done by ___slab_alloc(), which __slab_alloc() calls with interrupts disabled:
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void *freelist;
struct page *page;
page = c->page;// no active CPU slab yet: jump to new_slab to get one
if (!page)
goto new_slab;
redo:
if (unlikely(!node_match(page, node))) {// the CPU slab page is not on the requested node
int searchnode = node;
if (node != NUMA_NO_NODE && !node_present_pages(node))
searchnode = node_to_mem_node(node);// the requested node has no memory: fall back to the nearest node that does
if (unlikely(!node_match(page, searchnode))) {// still no match: deactivate the CPU slab and go get a new one
stat(s, ALLOC_NODE_MISMATCH);
deactivate_slab(s, page, c->freelist);// deactivate the CPU slab (flush the per-CPU freelist back to the page)
// In SLUB a slab cached on a local CPU is said to be frozen; a slab sitting on the node lists is unfrozen.
/* deactivate_slab does two things: 1. return every object on the per-CPU freelist to page->freelist (the page is still frozen, so no list_lock is needed);
2. unfreeze the page so that its real object counts are visible to the rest of the allocator */
c->page = NULL;
c->freelist = NULL;
goto new_slab;
}
}
/*
* By rights, we should be searching for a slab page that was
* PFMEMALLOC but right now, we are losing the pfmemalloc
* information when the page leaves the per-cpu allocator
*/
if (unlikely(!pfmemalloc_match(page, gfpflags))) {// a PFMEMALLOC page may only serve allocations allowed to dip into the reserves; on a mismatch the CPU slab is deactivated as well
deactivate_slab(s, page, c->freelist);
c->page = NULL;
c->freelist = NULL;
goto new_slab;
}
/* must check again c->freelist in case of cpu migration or IRQ */
freelist = c->freelist;// re-check c->freelist: a CPU migration or an interrupt before local IRQs were disabled may have refilled it
if (freelist)
goto load_freelist;// non-empty: use it directly
stat(s, ALLOC_SLOWPATH);
freelist = get_freelist(s, page);// empty: atomically take page->freelist of the CPU slab page (objects freed back to it by other CPUs)
if (!freelist) {// nothing there either: drop the page and go get a new slab; otherwise fall through to load_freelist to take an object and return
c->page = NULL;
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
stat(s, ALLOC_REFILL);
load_freelist:
/*
 * freelist is pointing to the list of objects to be used.
 * page is pointing to the page from which the objects are obtained.
 * That page must be frozen for per cpu allocations to work.
*/
VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);// hand out the first object; c->freelist now points at the next free one
c->tid = next_tid(c->tid);
return freelist;
new_slab:
if (c->partial) {// a per-CPU partial slab is available: make it the new CPU slab and retry from redo
page = c->page = c->partial;
c->partial = page->next;
stat(s, CPU_PARTIAL_ALLOC);
c->freelist = NULL;
goto redo;
}
freelist = new_slab_objects(s, gfpflags, node, &c);// no per-CPU partial slab left: refill from the node partial lists or allocate a brand-new slab via new_slab_objects()
if (unlikely(!freelist)) {
if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())/* allocation failed: warn unless __GFP_NOWARN was set, and only within the printk rate limit */
slab_out_of_memory(s, gfpflags, node);// log the OOM; NULL is returned and __slab_alloc re-enables interrupts
return NULL;
}
page = c->page;
if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
goto load_freelist;
/* Only entered in the debug case */
if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
goto new_slab; /* Slab failed checks. Next slab needed */
deactivate_slab(s, page, get_freepointer(s, freelist));
c->page = NULL;
c->freelist = NULL;
return freelist;
}
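Two helpers used above deserve a closer look. node_match() is the NUMA check that decides whether the CPU slab page can satisfy a node-specific request, and get_freelist() is how the slow path picks up objects that other CPUs have freed back to the frozen CPU slab page (remote frees go to page->freelist, not to c->freelist). Simplified sketches that follow the structure of the real functions (the real node_match() also handles a NULL page):
static inline int node_match(struct page *page, int node)
{
#ifdef CONFIG_NUMA
	if (node != NUMA_NO_NODE && page_to_nid(page) != node)
		return 0;
#endif
	return 1;
}

/*
 * Atomically detach page->freelist while the page stays frozen;
 * the page is unfrozen only if there is nothing left to take.
 */
static inline void *get_freelist(struct kmem_cache *s, struct page *page)
{
	struct page new;
	unsigned long counters;
	void *freelist;

	do {
		freelist = page->freelist;
		counters = page->counters;
		new.counters = counters;
		new.inuse = page->objects;
		new.frozen = freelist != NULL;
	} while (!__cmpxchg_double_slab(s, page,
			freelist, counters,
			NULL, new.counters,
			"get_freelist"));

	return freelist;
}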
(3) Fast path: this_cpu_cmpxchg_double(s->cpu_slab->freelist, s->cpu_slab->tid, object, tid, next_object, next_tid(tid))
What it does:
a. operates on the freelist and tid fields of the current CPU's per-CPU area;
b. verifies that tid and freelist have not changed since they were sampled (if they have, the allocation is retried);
c. on success, hands out the first free object and advances the freelist to the next one, bumping the tid at the same time.
This is a single atomic operation, so no lock is needed; it behaves roughly like the following:
cmpxchg_double(p1, p2, o1, o2, n1, n2):
if (*p1 == o1 && *p2 == o2) {
	*p1 = n1;
	*p2 = n2;
	return 1;	/* success */
} else {
	return 0;	/* p1/p2 changed underneath us: the caller retries */
}
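The reason this is safe without a lock is the tid. Every successful allocation or free on a CPU advances the per-CPU tid, and with CONFIG_PREEMPT the tid ranges of different CPUs are kept disjoint, so a cmpxchg issued after a migration, or after a concurrent operation on the freelist, is guaranteed to see a mismatching tid and fall back to redo. A sketch of the helpers involved (following the usual SLUB definitions):
#ifdef CONFIG_PREEMPT
/* keep the tids of different CPUs in disjoint ranges */
#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
#else
#define TID_STEP  1
#endif

static inline unsigned long init_tid(int cpu)
{
	return cpu;		/* each CPU starts in its own range */
}

static inline unsigned long next_tid(unsigned long tid)
{
	return tid + TID_STEP;	/* advanced on every per-CPU freelist operation */
}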