欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

kmem_cache_alloc核心函数slab_alloc_node的实现详解

程序员文章站 2022-04-18 23:14:09
...

       kmem_cache_alloc()是申请slub对象的入口函数,它的核心实现就是slab_alloc_node函数,此函数涉及的面很广,包括cgroup、进程调度、内存管理、cpu抢占等细节,需要仔细推敲深究;
代码版本:kernel-3.10

(一)全函数概要

 /*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
 */
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr)
{
	void **object;
	struct kmem_cache_cpu *c;
	struct page *page;
	unsigned long tid;

	s = slab_pre_alloc_hook(s, gfpflags);//(1) pre-allocation hook: handles cgroup accounting and returns the kmem_cache to actually allocate from
	if (!s)
		return NULL;
redo:
	/*
	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
	 * enabled. We may switch back and forth between cpus while
	 * reading from one cpu area. That does not matter as long
	 * as we end up on the original cpu again when doing the cmpxchg.
	 *
	 * Preemption is disabled for the retrieval of the tid because that
	 * must occur from the current processor. We cannot allow rescheduling
	 * on a different processor between the determination of the pointer
	 * and the retrieval of the tid.
	 */
	preempt_disable();//disable preemption so tid and kmem_cache_cpu are fetched on the same CPU
	c = this_cpu_ptr(s->cpu_slab);//this CPU's kmem_cache_cpu structure

	/*
	 * The transaction ids are globally unique per cpu and per operation on
	 * a per cpu queue. Thus they guarantee that the cmpxchg_double
	 * occurs on the right processor and that there was no operation on the
	 * linked list in between.
	 */
	tid = c->tid;//snapshot the per-cpu transaction id
	preempt_enable();//re-enable preemption; the cmpxchg below detects any migration/race

	object = c->freelist;//current CPU's lockless free-object list
	page = c->page;//slab page currently cached on this CPU
	if (unlikely(!object || !node_match(page, node))) { //freelist empty, or the cached page is not on the requested NUMA node: take the slow path
		object = __slab_alloc(s, gfpflags, node, addr, c);//(2) slow-path allocation
		stat(s, ALLOC_SLOWPATH);//bump the slow-path statistics counter
	} else {
		void *next_object = get_freepointer_safe(s, object);//read the next free object (stored at object + s->offset)

		/*
		 * The cmpxchg will only match if there was no additional
		 * operation and if we are on the right processor.
		 *
		 * The cmpxchg does the following atomically (without lock
		 * semantics!)
		 * 1. Relocate first pointer to the current per cpu area.
		 * 2. Verify that tid and freelist have not been changed
		 * 3. If they were not changed replace tid and freelist
		 *
		 * Since this is without lock semantics the protection is only
		 * against code executing on this cpu *not* from access by
		 * other cpus.
		 */
		if (unlikely(!this_cpu_cmpxchg_double(//(3) atomically claim the object with no lock taken
				s->cpu_slab->freelist, s->cpu_slab->tid,
				object, tid,
				next_object, next_tid(tid)))) {

			note_cmpxchg_failure("slab_alloc", s, tid);//raced with another operation (or moved CPUs): retry from redo
			goto redo;
		}
		prefetch_freepointer(s, next_object);//warm the cache line of the next free object for the next allocation
		stat(s, ALLOC_FASTPATH);//bump the fast-path statistics counter
	}

	if (unlikely(gfpflags & __GFP_ZERO) && object)
		memset(object, 0, s->object_size);//caller asked for zeroed memory

	slab_post_alloc_hook(s, gfpflags, object);//post-allocation hook
	return object;
}

(二)分段详解
(1)slab_pre_alloc_hook函数

/*
 * Pre-allocation processing for a slub object: sanitize the gfp flags,
 * run the debugging/fault-injection hooks, and pick the kmem_cache the
 * allocation should be charged to.  Returns NULL to deny the allocation.
 */
static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
						     gfp_t flags)
{
	flags &= gfp_allowed_mask;	/* mask off currently disallowed GFP flags */
	lockdep_trace_alloc(flags);	/* lockdep annotation for this allocation */
	might_sleep_if(flags & __GFP_WAIT);	/* a waiting allocation may sleep */

	/* failslab hook — presumably fault injection for testing; returns NULL to force failure */
	if (should_failslab(s->object_size, flags, s->flags))
		return NULL;

	return memcg_kmem_get_cache(s, flags); /* pick the correct per-memcg cache to allocate from */
}

/**
 * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
 * @cachep: the original global kmem cache //全局的缓存链表节点
 * @gfp: allocation flags.// 分配的标识
 static __always_inline struct kmem_cache *
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
	if (!memcg_kmem_enabled())//出现以下四种情况时,立刻返回
		return cachep;
	if (gfp & __GFP_NOFAIL)
		return cachep;
	if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
		return cachep;
	if (unlikely(fatal_signal_pending(current)))
		return cachep;

	return __memcg_kmem_get_cache(cachep, gfp);//返回kmem_cache ,用它来分配slab对象(下文详细解析)
}

/*
 * Return the kmem_cache we're supposed to use for a slab allocation.
 * We try to use the current memcg's version of the cache.
 *
 * If the cache does not exist yet, and we are the first user of it,
 * we either create it immediately, if possible, or create it asynchronously
 * in a workqueue.
 * In the latter case, we will let the current allocation go through with
 * the original cache.
 *
 * Can't be called in interrupt context or from kernel threads.
 * This function needs to be called with rcu_read_lock() held.
 */
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
					  gfp_t gfp)
{
	struct mem_cgroup *memcg;	/* memory control group of the current task */
	int idx;

	/* sanity: must be called on a root cache with memcg parameters set up */
	VM_BUG_ON(!cachep->memcg_params);
	VM_BUG_ON(!cachep->memcg_params->is_root_cache);

	if (!current->mm || current->memcg_kmem_skip_account)
		return cachep;

	rcu_read_lock();	/* protects the memcg lookup below */
	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));/* memcg owning the current task's mm */

	if (!memcg_can_account_kmem(memcg))
		goto out;

	idx = memcg_cache_id(memcg);	/* index of this memcg in the root cache's memcg_caches[] array */

	/*
	 * barrier to make sure we're always seeing the up to date value.  The
	 * code updating memcg_caches will issue a write barrier to match this.
	 */
	read_barrier_depends();
	if (likely(cachep->memcg_params->memcg_caches[idx])) {
		cachep = cachep->memcg_params->memcg_caches[idx];/* per-memcg cache already exists: use it */
		goto out;
	}

	/* The corresponding put will be done in the workqueue. */
	if (!css_tryget(&memcg->css))
		goto out;
	rcu_read_unlock();

	memcg_create_cache_enqueue(memcg, cachep);/* queue asynchronous creation of the per-memcg cache (detailed below) */
	return cachep;	/* this allocation falls back to the root cache */
out:
	rcu_read_unlock();
	return cachep;
}

static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
				       struct kmem_cache *cachep)
{
	/*
	 * Suspend kmem accounting around the enqueue: allocations made while
	 * scheduling the create_work must not themselves be charged, or the
	 * first allocation inside __memcg_create_cache_enqueue() would
	 * recurse into cache creation before the cache exists.
	 */
	memcg_stop_kmem_account();
	__memcg_create_cache_enqueue(memcg, cachep);/* put the per-memcg cache create_work on the global workqueue (detailed below) */
	memcg_resume_kmem_account();
}

(2)慢分配:__slab_alloc函数:当前CPU的slab空闲列表为空或者当前slab使用内存页面与管理节点不匹配时,需要重新分配slub对象

/*
 * Another one that disabled interrupt and compensates for possible
 * cpu changes by refetching the per cpu area pointer.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void *p; /* the object to return */
	unsigned long flags;

	local_irq_save(flags);//run the slow path with local interrupts disabled
#ifdef CONFIG_PREEMPT
	/*
	 * We may have been preempted and rescheduled on a different
	 * cpu before disabling interrupts. Need to reload cpu area
	 * pointer.
	 */
	c = this_cpu_ptr(s->cpu_slab);
#endif

	p = ___slab_alloc(s, gfpflags, node, addr, c);
	local_irq_restore(flags);
	return p;
}
假如当前活动 slab 没有空闲对象,或本处理器所在节点与指定节点不一致,__slab_alloc 会在关中断后调用 ___slab_alloc 函数来实际分配对象:
/*
 * Slow path of the SLUB allocator, entered with local interrupts disabled.
 * Tries, in order: the current per-cpu slab, the per-cpu partial list,
 * and finally a freshly allocated slab.  Returns the object or NULL.
 */
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void *freelist;
	struct page *page;

	page = c->page;//no active per-cpu slab: go acquire one
	if (!page)
		goto new_slab;
redo:

	if (unlikely(!node_match(page, node))) {//cached slab page is not on the requested NUMA node
		int searchnode = node;

		if (node != NUMA_NO_NODE && !node_present_pages(node))
			searchnode = node_to_mem_node(node);//requested node has no memory: fall back to a node that does

		if (unlikely(!node_match(page, searchnode))) {//still mismatched: drop this cpu slab and get a new one
			stat(s, ALLOC_NODE_MISMATCH);
			deactivate_slab(s, page, c->freelist);//detach the cpu slab (flush the per-cpu freelist back to the page)
			//In SLUB, a slab cached on the local CPU is "frozen"; one sitting on a slab list is "unfrozen".
			/*deactivate_slab() does two things: 1. returns every per-cpu free object to the page's
							freelist (no list_lock needed while the page is frozen)
						         2. makes the object counts correct for the unfrozen state*/
			c->page = NULL;
			c->freelist = NULL;
			goto new_slab;
		}
	}

	/*
	 * By rights, we should be searching for a slab page that was
	 * PFMEMALLOC but right now, we are losing the pfmemalloc
	 * information when the page leaves the per-cpu allocator
	 */
	if (unlikely(!pfmemalloc_match(page, gfpflags))) {//page's pfmemalloc status does not match the request: drop the cpu slab as well
		deactivate_slab(s, page, c->freelist);
		c->page = NULL;
		c->freelist = NULL;
		goto new_slab;
	}

	/* must check again c->freelist in case of cpu migration or IRQ */
	freelist = c->freelist;//re-read: a migration or IRQ before irqs were disabled may have refilled it
	if (freelist)
		goto load_freelist;//non-empty after all: take an object from it

	stat(s, ALLOC_SLOWPATH);

	freelist = get_freelist(s, page);//per-cpu list empty: try to take the attached page's own freelist

	if (!freelist) {//the page had no free objects either: a brand new slab is needed
		c->page = NULL;
		stat(s, DEACTIVATE_BYPASS);
		goto new_slab;
	}

	stat(s, ALLOC_REFILL);

load_freelist:
	/*
	 * freelist is pointing to the list of objects to be used.
	 * page is pointing to the page from which the objects are obtained.
	 * That page must be frozen for per cpu allocations to work
	 * (i.e. cached on this CPU).
	 */
	VM_BUG_ON(!c->page->frozen);
	c->freelist = get_freepointer(s, freelist);//advance the per-cpu freelist past the object being handed out
	c->tid = next_tid(c->tid);
	return freelist;

new_slab:

	if (c->partial) {//the per-cpu partial list has a slab: make it the active one and retry
		page = c->page = c->partial;
		c->partial = page->next;
		stat(s, CPU_PARTIAL_ALLOC);
		c->freelist = NULL;
		goto redo;
	}

	freelist = new_slab_objects(s, gfpflags, node, &c);//no partial slabs left: every slab is fully used, so allocate a fresh one

	if (unlikely(!freelist)) {
		if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())/*warn unless the caller asked for silence, and only within the printk rate limit*/
			slab_out_of_memory(s, gfpflags, node);//log the out-of-memory condition; NULL below reports the failure
		return NULL;
	}

	page = c->page;
	if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
		goto load_freelist;//common case: new slab is usable, hand out an object

	/* Only entered in the debug case */
	if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
		goto new_slab;	/* Slab failed checks. Next slab needed */

	deactivate_slab(s, page, get_freepointer(s, freelist));//debug path: do not keep the slab cached on this CPU
	c->page = NULL;
	c->freelist = NULL;
	return freelist;
}

(3)快分配:this_cpu_cmpxchg_double(s->cpu_slab->freelist, s->cpu_slab->tid, object, tid, next_object, next_tid(tid)) 函数
作用:

    a.重新获取当前每cpu域的指向下一个空闲对象的指针
    b.确保tid和freelist没有发生变化,如果发生了变化,就需要重新分配
    c.获得第一个空闲对象的指针,然后更新指针使其指向下一个空闲对象
这是个原子操作,可以避免上锁,此函数类似以下伪代码:
cmpxchg_double(p1, p2, o1, o2, n1, n2)
{
	if (p1 == o1 && p2 == o2) {
		p1 = n1;
		p2 = n2;
		return 1;
	} else {
		return 0;
	}
}