欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

kmem_cache_alloc核心函数slab_alloc_node的实现详解

程序员文章站 2022-04-18 23:14:09
...

       kmem_cache_alloc()是申请slub对象的入口函数,它的核心实现就是slab_alloc_node函数,此函数涉及的面很广,包括cgroup、进程调度、内存管理、cpu抢占等细节,需要仔细推敲深究;
代码版本:kernel-3.10

(一)全函数概要

 /*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
 */
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr)
{
	void **object;
	struct kmem_cache_cpu *c;
	struct page *page;
	unsigned long tid;

	s = slab_pre_alloc_hook(s, gfpflags);//(1) pre-allocation hook: handles cgroup accounting and returns the kmem_cache to actually allocate from
	if (!s)
		return NULL;
redo:
	/*
	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
	 * enabled. We may switch back and forth between cpus while
	 * reading from one cpu area. That does not matter as long
	 * as we end up on the original cpu again when doing the cmpxchg.
	 *
	 * Preemption is disabled for the retrieval of the tid because that
	 * must occur from the current processor. We cannot allow rescheduling
	 * on a different processor between the determination of the pointer
	 * and the retrieval of the tid.
	 */
	preempt_disable();//disable preemption so tid and kmem_cache_cpu are fetched on the same CPU
	c = this_cpu_ptr(s->cpu_slab);//this CPU's kmem_cache_cpu structure

	/*
	 * The transaction ids are globally unique per cpu and per operation on
	 * a per cpu queue. Thus they guarantee that the cmpxchg_double
	 * occurs on the right processor and that there was no operation on the
	 * linked list in between.
	 */
	tid = c->tid;//snapshot the per-cpu transaction id
	preempt_enable();//re-enable preemption; the cmpxchg below detects any migration/race

	object = c->freelist;//current CPU's lockless free-object list
	page = c->page;//slab page currently cached on this CPU
	if (unlikely(!object || !node_match(page, node))) { //freelist empty, or the cached page is not on the requested NUMA node: take the slow path
		object = __slab_alloc(s, gfpflags, node, addr, c);//(2) slow-path allocation
		stat(s, ALLOC_SLOWPATH);//bump the slow-path statistics counter
	} else {
		void *next_object = get_freepointer_safe(s, object);//read the next free object (stored at object + s->offset)

		/*
		 * The cmpxchg will only match if there was no additional
		 * operation and if we are on the right processor.
		 *
		 * The cmpxchg does the following atomically (without lock
		 * semantics!)
		 * 1. Relocate first pointer to the current per cpu area.
		 * 2. Verify that tid and freelist have not been changed
		 * 3. If they were not changed replace tid and freelist
		 *
		 * Since this is without lock semantics the protection is only
		 * against code executing on this cpu *not* from access by
		 * other cpus.
		 */
		if (unlikely(!this_cpu_cmpxchg_double(//(3) atomically claim the object with no lock taken
				s->cpu_slab->freelist, s->cpu_slab->tid,
				object, tid,
				next_object, next_tid(tid)))) {

			note_cmpxchg_failure("slab_alloc", s, tid);//raced with another operation (or moved CPUs): retry from redo
			goto redo;
		}
		prefetch_freepointer(s, next_object);//warm the cache line of the next free object for the next allocation
		stat(s, ALLOC_FASTPATH);//bump the fast-path statistics counter
	}

	if (unlikely(gfpflags & __GFP_ZERO) && object)
		memset(object, 0, s->object_size);//caller asked for zeroed memory

	slab_post_alloc_hook(s, gfpflags, object);//post-allocation hook
	return object;
}

(二)分段详解
(1)slab_pre_alloc_hook函数

/*
 * Pre-allocation processing for a slub object: sanitize the gfp flags,
 * run the debugging/fault-injection hooks, and pick the kmem_cache the
 * allocation should be charged to.  Returns NULL to deny the allocation.
 */
static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
						     gfp_t flags)
{
	flags &= gfp_allowed_mask;	/* mask off currently disallowed GFP flags */
	lockdep_trace_alloc(flags);	/* lockdep annotation for this allocation */
	might_sleep_if(flags & __GFP_WAIT);	/* a waiting allocation may sleep */

	/* failslab hook — presumably fault injection for testing; returns NULL to force failure */
	if (should_failslab(s->object_size, flags, s->flags))
		return NULL;

	return memcg_kmem_get_cache(s, flags); /* pick the correct per-memcg cache to allocate from */
}

/**
 * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
 * @cachep: the original global kmem cache //全局的缓存链表节点
 * @gfp: allocation flags.// 分配的标识
 static __always_inline struct kmem_cache *
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
	if (!memcg_kmem_enabled())//出现以下四种情况时,立刻返回
		return cachep;
	if (gfp & __GFP_NOFAIL)
		return cachep;
	if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
		return cachep;
	if (unlikely(fatal_signal_pending(current)))
		return cachep;

	return __memcg_kmem_get_cache(cachep, gfp);//返回kmem_cache ,用它来分配slab对象(下文详细解析)
}

/*
 * Return the kmem_cache we're supposed to use for a slab allocation.
 * We try to use the current memcg's version of the cache.
 *
 * If the cache does not exist yet, and we are the first user of it,
 * we either create it immediately, if possible, or create it asynchronously
 * in a workqueue.
 * In the latter case, we will let the current allocation go through with
 * the original cache.
 *
 * Can't be called in interrupt context or from kernel threads.
 * This function needs to be called with rcu_read_lock() held.
 */
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
					  gfp_t gfp)
{
	struct mem_cgroup *memcg;	/* memory control group of the current task */
	int idx;

	/* sanity: must be called on a root cache with memcg parameters set up */
	VM_BUG_ON(!cachep->memcg_params);
	VM_BUG_ON(!cachep->memcg_params->is_root_cache);

	if (!current->mm || current->memcg_kmem_skip_account)
		return cachep;

	rcu_read_lock();	/* protects the memcg lookup below */
	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));/* memcg owning the current task's mm */

	if (!memcg_can_account_kmem(memcg))
		goto out;

	idx = memcg_cache_id(memcg);	/* index of this memcg in the root cache's memcg_caches[] array */

	/*
	 * barrier to make sure we're always seeing the up to date value.  The
	 * code updating memcg_caches will issue a write barrier to match this.
	 */
	read_barrier_depends();
	if (likely(cachep->memcg_params->memcg_caches[idx])) {
		cachep = cachep->memcg_params->memcg_caches[idx];/* per-memcg cache already exists: use it */
		goto out;
	}

	/* The corresponding put will be done in the workqueue. */
	if (!css_tryget(&memcg->css))
		goto out;
	rcu_read_unlock();

	memcg_create_cache_enqueue(memcg, cachep);/* queue asynchronous creation of the per-memcg cache (detailed below) */
	return cachep;	/* this allocation falls back to the root cache */
out:
	rcu_read_unlock();
	return cachep;
}

static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
				       struct kmem_cache *cachep)
{
	/*
	 * Suspend kmem accounting around the enqueue: allocations made while
	 * scheduling the create_work must not themselves be charged, or the
	 * first allocation inside __memcg_create_cache_enqueue() would
	 * recurse into cache creation before the cache exists.
	 */
	memcg_stop_kmem_account();
	__memcg_create_cache_enqueue(memcg, cachep);/* put the per-memcg cache create_work on the global workqueue (detailed below) */
	memcg_resume_kmem_account();
}

(2)慢分配:__slab_alloc函数:当前CPU的slab空闲列表为空或者当前slab使用内存页面与管理节点不匹配时,需要重新分配slub对象

/*
 * Another one that disabled interrupt and compensates for possible
 * cpu changes by refetching the per cpu area pointer.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void *p; /* the object to return */
	unsigned long flags;

	local_irq_save(flags);//run the slow path with local interrupts disabled
#ifdef CONFIG_PREEMPT
	/*
	 * We may have been preempted and rescheduled on a different
	 * cpu before disabling interrupts. Need to reload cpu area
	 * pointer.
	 */
	c = this_cpu_ptr(s->cpu_slab);
#endif

	p = ___slab_alloc(s, gfpflags, node, addr, c);
	local_irq_restore(flags);
	return p;
}
假如当前活动 slab 没有空闲对象,或本处理器所在节点与指定节点不一致,__slab_alloc 会在关中断后调用 ___slab_alloc 函数来实际分配对象:
/*
 * Slow path of the SLUB allocator, entered with local interrupts disabled.
 * Tries, in order: the current per-cpu slab, the per-cpu partial list,
 * and finally a freshly allocated slab.  Returns the object or NULL.
 */
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void *freelist;
	struct page *page;

	page = c->page;//no active per-cpu slab: go acquire one
	if (!page)
		goto new_slab;
redo:

	if (unlikely(!node_match(page, node))) {//cached slab page is not on the requested NUMA node
		int searchnode = node;

		if (node != NUMA_NO_NODE && !node_present_pages(node))
			searchnode = node_to_mem_node(node);//requested node has no memory: fall back to a node that does

		if (unlikely(!node_match(page, searchnode))) {//still mismatched: drop this cpu slab and get a new one
			stat(s, ALLOC_NODE_MISMATCH);
			deactivate_slab(s, page, c->freelist);//detach the cpu slab (flush the per-cpu freelist back to the page)
			//In SLUB, a slab cached on the local CPU is "frozen"; one sitting on a slab list is "unfrozen".
			/*deactivate_slab() does two things: 1. returns every per-cpu free object to the page's
							freelist (no list_lock needed while the page is frozen)
						         2. makes the object counts correct for the unfrozen state*/
			c->page = NULL;
			c->freelist = NULL;
			goto new_slab;
		}
	}

	/*
	 * By rights, we should be searching for a slab page that was
	 * PFMEMALLOC but right now, we are losing the pfmemalloc
	 * information when the page leaves the per-cpu allocator
	 */
	if (unlikely(!pfmemalloc_match(page, gfpflags))) {//page's pfmemalloc status does not match the request: drop the cpu slab as well
		deactivate_slab(s, page, c->freelist);
		c->page = NULL;
		c->freelist = NULL;
		goto new_slab;
	}

	/* must check again c->freelist in case of cpu migration or IRQ */
	freelist = c->freelist;//re-read: a migration or IRQ before irqs were disabled may have refilled it
	if (freelist)
		goto load_freelist;//non-empty after all: take an object from it

	stat(s, ALLOC_SLOWPATH);

	freelist = get_freelist(s, page);//per-cpu list empty: try to take the attached page's own freelist

	if (!freelist) {//the page had no free objects either: a brand new slab is needed
		c->page = NULL;
		stat(s, DEACTIVATE_BYPASS);
		goto new_slab;
	}

	stat(s, ALLOC_REFILL);

load_freelist:
	/*
	 * freelist is pointing to the list of objects to be used.
	 * page is pointing to the page from which the objects are obtained.
	 * That page must be frozen for per cpu allocations to work
	 * (i.e. cached on this CPU).
	 */
	VM_BUG_ON(!c->page->frozen);
	c->freelist = get_freepointer(s, freelist);//advance the per-cpu freelist past the object being handed out
	c->tid = next_tid(c->tid);
	return freelist;

new_slab:

	if (c->partial) {//the per-cpu partial list has a slab: make it the active one and retry
		page = c->page = c->partial;
		c->partial = page->next;
		stat(s, CPU_PARTIAL_ALLOC);
		c->freelist = NULL;
		goto redo;
	}

	freelist = new_slab_objects(s, gfpflags, node, &c);//no partial slabs left: every slab is fully used, so allocate a fresh one

	if (unlikely(!freelist)) {
		if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())/*warn unless the caller asked for silence, and only within the printk rate limit*/
			slab_out_of_memory(s, gfpflags, node);//log the out-of-memory condition; NULL below reports the failure
		return NULL;
	}

	page = c->page;
	if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
		goto load_freelist;//common case: new slab is usable, hand out an object

	/* Only entered in the debug case */
	if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
		goto new_slab;	/* Slab failed checks. Next slab needed */

	deactivate_slab(s, page, get_freepointer(s, freelist));//debug path: do not keep the slab cached on this CPU
	c->page = NULL;
	c->freelist = NULL;
	return freelist;
}

(3)快分配:this_cpu_cmpxchg_double(s->cpu_slab->freelist, s->cpu_slab->tid, object, tid, next_object, next_tid(tid)) 函数
作用:

    a.重新获取当前每cpu域的指向下一个空闲对象的指针
    b.确保tid和freelist没有发生变化,如果发生了变化,就需要重新分配
    c.获得第一个空闲对象的指针,然后更新指针使其指向下一个空闲对象
这是个原子操作,可以避免上锁,此函数类似以下伪代码:
cmpxchg_double(p1, p2, o1, o2, n1, n2)
{
	if (p1 == o1 && p2 == o2) {
		p1 = n1;
		p2 = n2;
		return 1;
	} else {
		return 0;
	}
}