Linux内存初始化(2)——paging_init初始化
Linux内存初始化
说明
Kernel版本:4.14.111
ARM处理器,Cortex-A7,四核(arm32)
在上文中,跳转到start_kernel
前,已知,内核已创建了kernel、dtb的线性映射,即内核可以访问自己代码段等区域,并能访问dtb所在内存区域的虚拟地址。
随后在内存管理系统未初始化时,尽管物理内存已经通过memblock_add
添加进了系统,但是这部分物理内存到虚拟内存的映射还没有建立,虽然可以通过memblock_alloc
分配一段内存,但是还不能访问。
一切都要等待paging_init
执行,建立完页表,就可以通过虚拟地址去访问最终的物理地址了。
paging_init
start_kernel->setup_arch->paging_init
/*
 * paging_init() sets up the page tables, initialises the zone memory
 * maps, and sets up the zero page, bad page and bad page tables.
 */
void __init paging_init(const struct machine_desc *mdesc)
{
void *zero_page;
/* (1) clear the boot-time section mappings that will be rebuilt below */
prepare_page_table(); (1)
/* (2) create the linear (direct) mapping for all lowmem memblock banks */
map_lowmem(); (2)
/* (3) from now on memblock may only allocate from mapped lowmem */
memblock_set_current_limit(arm_lowmem_limit); (3)
/* (4) remap the contiguous regions reserved for DMA */
dma_contiguous_remap(); (4)
/* (5) fixmap handover — NOTE(review): appears to re-establish the
 * page-granular fixmap mappings in the final tables; confirm */
early_fixmap_shutdown(); (5)
devicemaps_init(mdesc);
kmap_init();
tcm_init();
top_pmd = pmd_off_k(0xffff0000);
/* allocate the zero page. */
zero_page = early_alloc(PAGE_SIZE); (6)
/* (7) bootmem/zone initialisation (pglist_data, zones, ...) */
bootmem_init(); (7)
empty_zero_page = virt_to_page(zero_page);
__flush_dcache_page(NULL, empty_zero_page);
/* Compute the virt/idmap offset, mostly for the sake of KVM */
kimage_voffset = (unsigned long)&kimage_voffset - virt_to_idmap(&kimage_voffset);
}
1)清除部分段页表
2)低端内存创建映射
3)设置memblock的限制为低端内存结束地址
4)为dma预留的连续内存区域创建页表映射
5)fixmap相关,看了下函数实现,大概是也是建立映射关系(page为单位)
6)申请zero page
7)bootmem_init,zone初始化等。
下面对各个模块进行进一步分析:
prepare_page_table
start_kernel->setup_arch->paging_init->prepare_page_table
/*
 * Clear (pmd_clear) the stale boot-time mappings in three virtual
 * ranges: 0..MODULES_VADDR, MODULES_VADDR..PAGE_OFFSET, and the
 * kernel range above the first lowmem bank up to VMALLOC_START.
 */
static inline void prepare_page_table(void)
{
unsigned long addr;
phys_addr_t end;
/*
* Clear out all the mappings below the kernel image.
*/
for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
#ifdef CONFIG_XIP_KERNEL
/* The XIP kernel is mapped in the module area -- skip over it */
addr = ((unsigned long)_exiprom + PMD_SIZE - 1) & PMD_MASK;
#endif
/* clear the module area up to the start of the linear mapping */
for ( ; addr < PAGE_OFFSET; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
/*
* Find the end of the first block of lowmem.
*/
end = memblock.memory.regions[0].base + memblock.memory.regions[0].size;
if (end >= arm_lowmem_limit)
end = arm_lowmem_limit;
/*
* Clear out all the kernel space mappings, except for the first
* memory bank, up to the vmalloc region.
*/
for (addr = __phys_to_virt(end);
addr < VMALLOC_START; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
}
上述代码,清除了三段虚拟地址区间的页表。
1)0~MODULES_VADDR
内核启动打印:
modules : 0xbf000000 - 0xbfe00000 ( 14 MB)
2)MODULES_VADDR~PAGE_OFFSET
0xbfe00000~0xc0000000
3) arm_lowmem_limit~VMALLOC_START
vmalloc : 0xf0800000 - 0xff800000 ( 240 MB)
lowmem : 0xc0000000 - 0xf0000000 ( 768 MB)
即清除的区域为:0xf0000000 ~ 0xf0800000
map_lowmem
start_kernel->setup_arch->paging_init->map_lowmem
/*
 * Create the linear mapping for every lowmem memblock bank.
 * The section-aligned range covering the kernel image (KERNEL_START
 * to __init_end) is mapped MT_MEMORY_RWX (executable); the rest of
 * lowmem is mapped MT_MEMORY_RW (XN bit set, non-executable).
 */
static void __init map_lowmem(void)
{
struct memblock_region *reg;
/* (1) round the kernel image start down to a 1MB section boundary */
phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START), SECTION_SIZE); (1)
/* (2) round the end of the init section up to a 1MB section boundary */
phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE); (2)
/* Map all the lowmem memory banks. */
for_each_memblock(memory, reg) { (3)
phys_addr_t start = reg->base;
phys_addr_t end = start + reg->size;
struct map_desc map;
if (memblock_is_nomap(reg))
continue;
/* only map up to the end of lowmem */
if (end > arm_lowmem_limit)
end = arm_lowmem_limit;
if (start >= end)
break;
if (end < kernel_x_start) {
/* bank entirely below the kernel image: map executable */
map.pfn = __phys_to_pfn(start);
map.virtual = __phys_to_virt(start);
map.length = end - start;
map.type = MT_MEMORY_RWX;
create_mapping(&map);
} else if (start >= kernel_x_end) {
/* bank entirely above the kernel image: map non-executable */
map.pfn = __phys_to_pfn(start);
map.virtual = __phys_to_virt(start);
map.length = end - start;
map.type = MT_MEMORY_RW;
create_mapping(&map);
} else {
/* This better cover the entire kernel */
if (start < kernel_x_start) {
/* part of the bank below the kernel: RW */
map.pfn = __phys_to_pfn(start);
map.virtual = __phys_to_virt(start);
map.length = kernel_x_start - start;
map.type = MT_MEMORY_RW;
create_mapping(&map);
}
/* the kernel image itself: RWX */
map.pfn = __phys_to_pfn(kernel_x_start);
map.virtual = __phys_to_virt(kernel_x_start);
map.length = kernel_x_end - kernel_x_start;
map.type = MT_MEMORY_RWX;
create_mapping(&map);
if (kernel_x_end < end) {
/* remainder of the bank above the kernel: RW */
map.pfn = __phys_to_pfn(kernel_x_end);
map.virtual = __phys_to_virt(kernel_x_end);
map.length = end - kernel_x_end;
map.type = MT_MEMORY_RW;
create_mapping(&map);
}
}
}
}
1)内核开始区域内存向下以1M对齐
2)内核结束区域内存向上以1M对齐,这时得到的kernel_x_start 、kernel_x_end地址以段内存对齐。
3)遍历全局变量memblock中memory节点,并逐一创建内存映射。这里我的设备只有一个节点,内核启动打印可以看出,memory.cnt = 0x01。
MEMBLOCK configuration:
memory size = 0x0000000040000000 reserved size = 0x00000000052d5910
memory.cnt = 0x1
memory[0x0] [0x0000000040000000-0x000000007fffffff], 0x0000000040000000 bytes flags: 0x0
reserved.cnt = 0x5
reserved[0x0] [0x0000000040003000-0x0000000040007fff], 0x0000000000005000 bytes flags: 0x0
reserved[0x1] [0x0000000040200000-0x00000000412b688f], 0x00000000010b6890 bytes flags: 0x0
reserved[0x2] [0x0000000049de3000-0x0000000049deb07f], 0x0000000000008080 bytes flags: 0x0
reserved[0x3] [0x0000000049dee000-0x0000000049ffffff], 0x0000000000212000 bytes flags: 0x0
reserved[0x4] [0x000000007c000000-0x000000007fffffff], 0x0000000004000000 bytes flags: 0x0
MT_MEMORY_RWX和MT_MEMORY_RW区别在于ARM页表项有一个XN比特位,XN比特位置1,表示这段内存区域不允许执行。
设备物理地址为0x40000000-0x7fffffff
,则在这里创建的映射为:
0x40000000~kernel_x_start, MT_MEMORY_RW
kernel_x_start~kernel_x_end, MT_MEMORY_RWX
kernel_x_end~arm_lowmem_limit, MT_MEMORY_RW
注意:
if (end > arm_lowmem_limit)
end = arm_lowmem_limit;
因为内存超过arm_lowmem_limit(我这里是768MB),所以只映射到低端内存结束地址,根据函数名`map_lowmem`也可以看出这层意思。
create_mapping函数创建的映射就是物理内存直接映射,或者叫做线性映射。就是在原有物理地址上,加上一个偏移地址,使之成为内核可以访问到的虚拟地址。
create_mapping
概念说明
1)如果采用单层的段映射,32位处理器最大寻址地址4GB,所以需要有4096个页表项,每个表项大小4bytes,则总共需要16KB地址。内核启动段映射页表一般在0xc0004000~0xc0008000。
当cpu访问内存时,32位虚拟地址的高12位(bit31-bit20)作为段映射表的索引,找到对应页表项的序号,每个页表项提供了一个12位的物理地址段,将这个12位和虚拟地址的低20位拼在一起,就是32位物理地址。
2)如果采用页表映射的方式,段映射表变成一级映射表,其表项提供的不再是物理地址段,而是二级页表的基地址。32位地址的高12位作为访问一级页表的索引值,找到相应的表项,每个表项指向一个二级页表。以虚拟地址的次8位(bit19-bit12)作为访问二级页表的索引值,得到相应的页表项,从这个页表项中找到20位的物理页面地址,最后将这20位物理页面地址和虚拟地址的低12位拼凑在一起,得到最终的32位物理地址,这个过程由MMU硬件完成。
重要数据结构
/* Describes one region to be mapped by create_mapping(). */
struct map_desc {
unsigned long virtual; // start of the virtual address range
unsigned long pfn; // page frame number of the physical start address
unsigned long length; // size of the region in bytes
unsigned int type; // mapping type (e.g. MT_MEMORY_RW / MT_MEMORY_RWX)
};
/* Per-mapping-type protection bits, indexed into by mem_types[]. */
struct mem_type {
pteval_t prot_pte; // control/flag bits for second-level (PTE) entries
pteval_t prot_pte_s2; // stage-2 PTE bits (virtualization)
pmdval_t prot_l1; // bits for first-level table-type entries
pmdval_t prot_sect; // bits for first-level section-type entries
unsigned int domain; // ARM access domain (DOMAIN_KERNEL/USER/IO/VECTORS)
};
/* ARM memory access domain definitions */
#ifndef CONFIG_IO_36
#define DOMAIN_KERNEL 0
#define DOMAIN_USER 1
#define DOMAIN_IO 2
#else
#define DOMAIN_KERNEL 2
#define DOMAIN_USER 1
#define DOMAIN_IO 0
#endif
#define DOMAIN_VECTORS 3
DOMAIN_KERNEL属于系统空间,DOMAIN_IO表示I/O设备空间,DOMAIN_USER表示用户空间。
prot_pte成员用于页面表项的控制位和标志位
prot_l1成员表示一级页表项的控制位和标志位
系统中定义了一个全局的mem_type[]数组来描述所有的内存区间定义。arm结构如下:
/* ARM mem_types[]: protection bits and domain for each mapping type.
 * Note MT_MEMORY_RW carries L_PTE_XN (no-execute) while MT_MEMORY_RWX
 * does not — that is the only difference between the two. */
static struct mem_type mem_types[] __ro_after_init = {
[MT_DEVICE] = { /* Strongly ordered / ARMv6 shared device */
.prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
L_PTE_SHARED,
.prot_pte_s2 = s2_policy(PROT_PTE_S2_DEVICE) |
s2_policy(L_PTE_S2_MT_DEV_SHARED) |
L_PTE_SHARED,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PROT_SECT_DEVICE | PMD_SECT_S,
.domain = DOMAIN_IO,
},
[MT_DEVICE_NONSHARED] = { /* ARMv6 non-shared device */
.prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_NONSHARED,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PROT_SECT_DEVICE,
.domain = DOMAIN_IO,
},
[MT_DEVICE_CACHED] = { /* ioremap_cached */
.prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_CACHED,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PROT_SECT_DEVICE | PMD_SECT_WB,
.domain = DOMAIN_IO,
},
[MT_DEVICE_WC] = { /* ioremap_wc */
.prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_WC,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PROT_SECT_DEVICE,
.domain = DOMAIN_IO,
},
[MT_UNCACHED] = {
.prot_pte = PROT_PTE_DEVICE,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
.domain = DOMAIN_IO,
},
[MT_CACHECLEAN] = {
.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
.domain = DOMAIN_KERNEL,
},
#ifndef CONFIG_ARM_LPAE
[MT_MINICLEAN] = {
.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN | PMD_SECT_MINICACHE,
.domain = DOMAIN_KERNEL,
},
#endif
[MT_LOW_VECTORS] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_RDONLY,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_VECTORS,
},
[MT_HIGH_VECTORS] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_USER | L_PTE_RDONLY,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_VECTORS,
},
[MT_MEMORY_RWX] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_RW] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_XN,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
.domain = DOMAIN_KERNEL,
},
[MT_ROM] = {
.prot_sect = PMD_TYPE_SECT,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_RWX_NONCACHED] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_MT_BUFFERABLE,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_RW_DTCM] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_XN,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_RWX_ITCM] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_RW_SO] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_MT_UNCACHED | L_PTE_XN,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_S |
PMD_SECT_UNCACHED | PMD_SECT_XN,
.domain = DOMAIN_KERNEL,
},
[MT_MEMORY_DMA_READY] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_XN,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_KERNEL,
},
};
create_mapping代码实现
create_mapping->__create_mapping
/*
 * Build page-table entries for the region described by @md, walking
 * the first-level (PGD) entries of @mm and filling in lower levels;
 * @alloc supplies memory for any second-level tables needed.
 */
static void __init __create_mapping(struct mm_struct *mm, struct map_desc *md,
void *(*alloc)(unsigned long sz),
bool ng)
{
unsigned long addr, length, end;
phys_addr_t phys;
const struct mem_type *type;
pgd_t *pgd;
/* (1) look up the protection bits / domain for this mapping type */
type = &mem_types[md->type]; (1)
#ifndef CONFIG_ARM_LPAE
/*
* Catch 36-bit addresses
*/
if (md->pfn >= 0x100000) {
create_36bit_mapping(mm, md, type, ng);
return;
}
#endif
addr = md->virtual & PAGE_MASK;
phys = __pfn_to_phys(md->pfn);
length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
/* a type without prot_l1 cannot be page-mapped: reject unaligned spans */
if (type->prot_l1 == 0 && ((addr | phys | length) & ~SECTION_MASK)) {
pr_warn("BUG: map for 0x%08llx at 0x%08lx can not be mapped using pages, ignoring.\n",
(long long)__pfn_to_phys(md->pfn), addr);
return;
}
/* (2) first-level entry for the start address (swapper_pg_dir via init_mm) */
pgd = pgd_offset(mm, addr); (2)
end = addr + length;
do {
unsigned long next = pgd_addr_end(addr, end);
/* (3) descend; on 2-level ARM this reaches alloc_init_pmd() */
alloc_init_pud(pgd, addr, next, phys, type, alloc, ng); (3)
phys += next - addr;
addr = next;
} while (pgd++, addr != end);
}
1)首先通过md->type获取描述内存区域属性的mem_type数据结构。
2)通过pgd_offset函数获取所属的页面目录项PGD,内核的一级页表存放在swapper_pg_dir地址中,可以通过init_mm数据结构来获取。
3)alloc_init_pud,arm只有两级页表,所以一路调用如下:create_mapping->alloc_init_pud->alloc_init_pmd
/*
 * Fill PMD entries for [addr, end): use a section mapping when the
 * virtual start, next boundary and physical address are all
 * section-aligned, otherwise fall back to a second-level PTE table.
 */
static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
unsigned long end, phys_addr_t phys,
const struct mem_type *type,
void *(*alloc)(unsigned long sz), bool ng)
{
pmd_t *pmd = pmd_offset(pud, addr);
unsigned long next;
do {
/*
* With LPAE, we must loop over to map
* all the pmds for the given range.
*/
next = pmd_addr_end(addr, end);
/*
* Try a section mapping - addr, next and phys must all be
* aligned to a section boundary.
*/
if (type->prot_sect &&
((addr | next | phys) & ~SECTION_MASK) == 0) {
__map_init_section(pmd, addr, next, phys, type, ng); (1)
} else {
/* not section-aligned: build page-granular PTEs instead */
alloc_init_pte(pmd, addr, next, (2)
__phys_to_pfn(phys), type, alloc, ng);
}
phys += next - addr;
} while (pmd++, addr = next, addr != end);
}
1)尝试段映射,如果映射地址、下一个映射地址、且物理地址都是段对齐,则进行段映射
2)如果不满足段映射条件,则初始化二级pte页表。需要申请pte页表内存,调用传参的alloc函数申请内存, 这里是static void __init *early_alloc(unsigned long sz)
下面来看alloc_init_pte实现
/*
 * Ensure a PTE table exists under @pmd, then write one PTE per page
 * over [addr, end), starting at page frame @pfn, with the protection
 * bits taken from @type (plus the non-global bit when @ng is set).
 */
static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
unsigned long end, unsigned long pfn,
const struct mem_type *type,
void *(*alloc)(unsigned long sz),
bool ng)
{
pte_t *pte = arm_pte_alloc(pmd, addr, type->prot_l1, alloc);
do {
/* writes both the Linux and the hardware page-table entry */
set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)),
ng ? PTE_EXT_NG : 0);
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
}
/*
 * Return the PTE slot for @addr. If the PMD entry is still empty,
 * first allocate a new PTE table (Linux half + hardware half,
 * PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE bytes) and install it.
 */
static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr,
unsigned long prot,
void *(*alloc)(unsigned long sz))
{
if (pmd_none(*pmd)) {
pte_t *pte = alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);
__pmd_populate(pmd, __pa(pte), prot);
}
BUG_ON(pmd_bad(*pmd));
return pte_offset_kernel(pmd, addr);
}
pmd_none检查这个参数对应的PMD页表项的内容,如果为0,说明PTE页表还没建立,需要去建立新的页表项。这里会分配(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE)大小的内存,即会分配512+512个页表项,共4KB,按物理页面对齐。
物理mmu默认一级页表偏移从20位开始,分4096段。内核从21位开始,分2048段。
在arm linux中,一个PGD页表项,映射2M内存,4KB页面则需要512个表项。
在真实页面中,一个PGD,映射1M,4Kb页面则需要256个表项。
相当于一次映射2个相邻的一级页表项,两个相邻的二级页表都存放在一个page中。前512个pte表项,内核使用,后512个表项,硬件使用(对应2M内存寻址,即2个硬件一级页表项)
然后把这个PTE页面表的基地址,通过__pmd_populate设置到PMD页表项中。
/*
 * Point a pair of adjacent first-level (PMD) entries at the hardware
 * half of a freshly allocated PTE table, then flush the entries.
 */
static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte,
pmdval_t prot)
{
/* (1) hardware PTEs start PTE_HWTABLE_OFF bytes into the table */
pmdval_t pmdval = (pte + PTE_HWTABLE_OFF) | prot; (1)
/* (2) first 1MB hardware L1 entry */
pmdp[0] = __pmd(pmdval); (2)
#ifndef CONFIG_ARM_LPAE
/* (3) second 1MB entry points at the next 256 hardware PTEs */
pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t)); (3)
#endif
flush_pmd_entry(pmdp);
}
1)pte传参是pte页表的基地址,刚分配的1024个PTE页面表,硬件页表偏移地址在后512个表项,所以设置到硬件pmd中的pte地址需要偏移512*4字节,并或上prot标志位,写入上一级页表项PMD中
2)上文提到物理mmu默认一级页表偏移从20位开始,分4096段。内核从21位开始,分2048段。在申请pte页表时,申请了2段硬件pmd的pte。需要分别设置2段pmd的pte页表地址。这里设置第一段。
3)设置第二段pmd的pte页表地址。
回到alloc_init_pte
函数,
最后通过set_pte_ext完成对硬件页表项的设置。把20位物理地址偏移、标志位写入Linux页表、硬件页表。
重要说明
注意:低端内存基本都是段映射。
相关文章
Linux内存初始化(1)——memblock初始化
Linux内存初始化(2)——paging_init初始化
Linux内存初始化(3)——pglist_data/zone初始化