目录
- 引言
- 汇编启动!!!
- 细节剖析
引言
之前在研究Linux内核源码的时候总是找不到关于这部分源码的相关剖析,要么也是模棱两可的,也有一些比较专业的代码分析,不过比较分散,感觉大家都不太喜欢这部分代码,正好今天周末,这段时间也在学习Arm64汇编,以这部分为研究对象来解析
源码版本:Linux 5.0
架构信息
- 芯片架构:
ARM64
- 内存架构:
UMA
CONFIG_ARM64_VA_BITS:39
CONFIG_ARM64_PAGE_SHIFT:12
CONFIG_PGTABLE_LEVELS:3
之前写的一些相关文章:
Linux内存管理:Bootmem的率先登场
Linux内存管理:Buddy System姗姗来迟
Linux内存管理:Slab闪亮登场
Linux内存管理:内存分配和内存回收原理
Linux CFS调度器:原理和实现
汇编启动!!!
Linux内核代码从哪里执行的?从链接脚本看
# arch/arm64/kernel/vmlinux.lds.SSECTIONS
{. = KIMAGE_VADDR + TEXT_OFFSET;.head.text : {_text = .;HEAD_TEXT}
// include/linux/init.h#define __HEAD .section ".head.text","ax"
// arch/arm64/kernel/head.S__HEAD
_head:/** DO NOT MODIFY. Image header expected by Linux boot-loaders.*/b stext // branch to kernel start, magic
KIMAGE_VADDR
是vmalloc
区域的起始地址,TEXT_OFFSET
是内核起始地址距离ram
起始地址的偏移(每一版的Linux
内核内存架构都有点不同,此处不做纠结)
/* arch/arm64/include/asm/memory.h */

/* Number of virtual address bits (fixed to 39 for the configuration studied here). */
#define VA_BITS			(39)

/*
 * VA_START    - lowest kernel virtual address: 2^64 - 2^VA_BITS
 *               (0xffffff8000000000 for VA_BITS == 39).
 * PAGE_OFFSET - 2^64 - 2^(VA_BITS - 1)
 *               (0xffffffc000000000 for VA_BITS == 39).
 */
#define VA_START		(UL(0xffffffffffffffff) - \
	(UL(1) << VA_BITS) + 1)
#define PAGE_OFFSET		(UL(0xffffffffffffffff) - \
	(UL(1) << (VA_BITS - 1)) + 1)

/* The kernel image is linked right after the module area. */
#define KIMAGE_VADDR		(MODULES_END)

/* BPF JIT region: 128M, placed after the KASAN shadow that starts at VA_START. */
#define BPF_JIT_REGION_START	(VA_START + KASAN_SHADOW_SIZE)
#define BPF_JIT_REGION_SIZE	(SZ_128M)
#define BPF_JIT_REGION_END	(BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)

/* Module area: 128M, directly after the BPF JIT region. */
#define MODULES_END		(MODULES_VADDR + MODULES_VSIZE)
#define MODULES_VADDR		(BPF_JIT_REGION_END)
#define MODULES_VSIZE		(SZ_128M)

/* Regions laid out downwards from PAGE_OFFSET, each separated by a 2M gap. */
#define VMEMMAP_START		(PAGE_OFFSET - VMEMMAP_SIZE)
#define PCI_IO_END		(VMEMMAP_START - SZ_2M)
#define PCI_IO_START		(PCI_IO_END - PCI_IO_SIZE)
#define FIXADDR_TOP		(PCI_IO_START - SZ_2M)
这个Linear Mapping
区域位置不固定,有时候会在VM_START
下
上述跳转到stext
符号,从这里才正式开始
/*
 * stext - primary kernel entry point, reached from the image header branch.
 *
 * Runs the early boot steps in order: save the bootloader arguments,
 * settle the exception level, record the boot mode, build the initial
 * page tables, set up the CPU, then branch to __primary_switch.
 */
ENTRY(stext)
	bl	preserve_boot_args
	bl	el2_setup			// Drop to EL1, w0=cpu_boot_mode
	adrp	x23, __PHYS_OFFSET
	and	x23, x23, MIN_KIMG_ALIGN - 1	// KASLR offset, defaults to 0
	bl	set_cpu_boot_mode_flag		// consumes w0 returned by el2_setup
	bl	__create_page_tables		// builds idmap_pg_dir and init_pg_dir
	/*
	 * The following calls CPU setup code, see arch/arm64/mm/proc.S for
	 * details.
	 * On return, the CPU will be ready for the MMU to be turned on and
	 * the TCR will have been set.
	 */
	bl	__cpu_setup			// initialise processor
	b	__primary_switch		// does not return here
ENDPROC(stext)
首先是preserve_boot_args
符号
x0
存储的是FDT
设备树文件的物理地址,将其传递给x21
/*
 * Preserve the arguments passed by the bootloader in x0 .. x3
 *
 * In:     x0 .. x3 = bootloader arguments (x0 is the FDT pointer)
 * Out:    x21 = copy of the original x0; boot_args[0..3] = x0 .. x3
 * Leaves: x0 = address of boot_args, x1 = 0x20 for the tail-called
 *         __inval_dcache_area, which returns to this routine's caller.
 */
preserve_boot_args:
	mov	x21, x0				// x21=FDT

	adr_l	x0, boot_args			// record the contents of
	stp	x21, x1, [x0]			// x0 .. x3 at kernel entry
	stp	x2, x3, [x0, #16]

	dmb	sy				// needed before dc ivac with
						// MMU off

	mov	x1, #0x20			// 4 x 8 bytes
	b	__inval_dcache_area		// tail call
ENDPROC(preserve_boot_args)
将x21 x1 x2 x3
四个寄存器的值存入boot_args
数组
/* arch/arm64/kernel/setup.c */

/*
 * The recorded values of x0 .. x3 upon kernel entry.
 * Four 64-bit slots (32 bytes), filled in by preserve_boot_args.
 */
u64 __cacheline_aligned boot_args[4];
__inval_dcache_area
用于使boot_args
所占的32
字节数据缓存失效(invalidate,而不是clean):x0
是boot_args
的地址,x1
是长度32
字节,即数组的四个元素(这部分功能代码,在末尾进行解析)
接下来是el2_setup
ENTRY(el2_setup)msr SPsel, #1 // We want to use SP_EL{1,2}mrs x0, CurrentELcmp x0, #CurrentEL_EL2b.eq 1fmov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1)msr sctlr_el1, x0mov w0, #BOOT_CPU_MODE_EL1 // This cpu booted in EL1isbret
SPsel
用于在SP_EL0
和SP_ELn
中选择SP
寄存器,此处选择使用当前特权级SP
寄存器
CurrentEL
用于获取当前运行级别,并与CurrentEL_EL2
进行比较,我们假设此处不使用虚拟机,即CPU以内核级别(EL1)启动
sctlr_el1
为系统控制寄存器,此处用于设置小端模式(如下,默认是小端)
/* arch/arm64/include/asm/sysreg.h */

/* SCTLR_EL1 bits 11, 20, 22, 28 and 29, grouped under the RES1 name. */
#define SCTLR_EL1_RES1	((_BITUL(11)) | (_BITUL(20)) | (_BITUL(22)) | (_BITUL(28)) | \
			 (_BITUL(29)))

/*
 * Endianness bits for SCTLR_EL1: a big-endian kernel sets E0E and EE,
 * a little-endian kernel sets nothing and lists the same bits as "clear".
 */
#ifdef CONFIG_CPU_BIG_ENDIAN
#define ENDIAN_SET_EL1		(SCTLR_EL1_E0E | SCTLR_ELx_EE)
#define ENDIAN_CLEAR_EL1	0
#else
#define ENDIAN_SET_EL1		0
#define ENDIAN_CLEAR_EL1	(SCTLR_EL1_E0E | SCTLR_ELx_EE)
#endif
BOOT_CPU_MODE_EL1
表示CPU是在EL1下启动的,它作为el2_setup的返回值存放在w0
寄存器中,随后由set_cpu_boot_mode_flag写入__boot_cpu_mode(见下文)
// arch/arm64/include/asm/virt.h#define BOOT_CPU_MODE_EL1 (0xe11)
kaslr
假设不开启,此处略过
直接看set_cpu_boot_mode_flag
/*
 * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed
 * in w0. See arch/arm64/include/asm/virt.h for more info.
 *
 * In:       w0 = boot mode (BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2)
 * Effect:   stores w0 to __boot_cpu_mode[1] when w0 is BOOT_CPU_MODE_EL2,
 *           otherwise to __boot_cpu_mode[0]; then invalidates that line.
 * Clobbers: x1, flags
 */
set_cpu_boot_mode_flag:
	adr_l	x1, __boot_cpu_mode
	cmp	w0, #BOOT_CPU_MODE_EL2
	b.ne	1f
	add	x1, x1, #4			// EL2 boot: use the second word
1:	str	w0, [x1]			// This CPU has booted in EL1
	dmb	sy
	dc	ivac, x1			// Invalidate potentially stale cache line
	ret
ENDPROC(set_cpu_boot_mode_flag)
__boot_cpu_mode
是一个整数数组
/*
 * Two 32-bit words recording the CPU boot mode. The initial values are
 * deliberately "crossed": set_cpu_boot_mode_flag overwrites word 0 on an
 * EL1 boot and word 1 on an EL2 boot.
 */
ENTRY(__boot_cpu_mode)
	.long	BOOT_CPU_MODE_EL2
	.long	BOOT_CPU_MODE_EL1
// arch/arm64/include/asm/virt.hextern u32 __boot_cpu_mode[2];
前面的w0
存储的是BOOT_CPU_MODE_EL1
,由此处可知:This CPU has booted in EL1
,跳转到1
标签
此处让__boot_cpu_mode[0]
等于BOOT_CPU_MODE_EL1
,并且使该地址对应的缓存行失效(dc ivac)
然后是__cpu_setup
,先是清理tlb
缓存
.pushsection ".idmap.text", "awx"
ENTRY(__cpu_setup)tlbi vmalle1 // Invalidate local TLBdsb nsh
cpacr_el1
用于控制对浮点和SIMD
的访问:其FPEN字段(bit[21:20])决定在EL0
或EL1
执行访问浮点/SIMD
寄存器的指令时是否陷入(trap)到EL1;此处写入3 << 20,即不陷入,允许使用FP/ASIMD
mdscr_el1(Monitor Debug System Control Register)
,debug
功能不做概述
mov x0, #3 << 20msr cpacr_el1, x0 // Enable FP/ASIMDmov x0, #1 << 12 // Reset mdscr_el1 and disablemsr mdscr_el1, x0 // access to the DCC from EL0isb // Unmask debug exceptions now,enable_dbg // since this is per-cpureset_pmuserenr_el0 x0 // Disable PMU access from EL0
mair_el1
用于控制存储器属性的编码:分为八段,用于描述不同的内存属性,后续会在页表中使用AttrIndx[2:0]
进行索引
ARMv8
最多可以定义八种不同的内存属性,而Linux
内核只定义了六种
ldr x5, =MAIR(0x00, MT_DEVICE_nGnRnE) | \MAIR(0x04, MT_DEVICE_nGnRE) | \MAIR(0x0c, MT_DEVICE_GRE) | \MAIR(0x44, MT_NORMAL_NC) | \MAIR(0xff, MT_NORMAL) | \MAIR(0xbb, MT_NORMAL_WT)msr mair_el1, x5
TCR
寄存器主要包括了与地址转换相关的控制信息以及与高速缓存相关的配置信息
/** Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for* both user and kernel.*/ldr x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \TCR_TBI0 | TCR_A1 | TCR_KASAN_FLAGSldr_l x9, idmap_t0sztcr_set_t0sz x10, x9/** Set the IPS bits in TCR_EL1.*/tcr_compute_pa_size x10, #TCR_IPS_SHIFT, x5, x6msr tcr_el1, x10ret // return to head.S
ENDPROC(__cpu_setup)
这部分是初始化内存部分
最重要的是__primary_switch
/*
 * __primary_switch - turn the MMU on and jump to the kernel's virtual
 * address space (non-relocatable variant as quoted in this article).
 *
 * Passes the address of init_pg_dir in x1 to __enable_mmu, then loads the
 * link-time (virtual) address of __primary_switched from the literal pool
 * and branches to it with x0 = page address of __PHYS_OFFSET.
 * NOTE(review): any other argument __enable_mmu expects is not visible in
 * this excerpt - confirm against head.S.
 */
__primary_switch:
	adrp	x1, init_pg_dir
	bl	__enable_mmu

	ldr	x8, =__primary_switched		// absolute (virtual) address
	adrp	x0, __PHYS_OFFSET		// PC-relative, consumed by __primary_switched
	br	x8
ENDPROC(__primary_switch)
__enable_mmu
这个看名字即知,用于开启mmu
__create_page_tables
用于创建两个页表:init_pg_dir
和idmap_pg_dir
在执行之前先关闭
mmu
msr sctlr_el1, x20 // disable the MMUisbbl __create_page_tables // recreate kernel mapping
清除init_pg_dir
页表缓存
__create_page_tables:mov x28, lr/** Invalidate the init page tables to avoid potential dirty cache lines* being evicted. Other page tables are allocated in rodata as part of* the kernel image, and thus are clean to the PoC per the boot* protocol.*/adrp x0, init_pg_dir // adrp获取的是物理地址adrp x1, init_pg_endsub x1, x1, x0bl __inval_dcache_area // 清除缓存
将init_pg_dir
页表内存重置为xzr
adrp x0, init_pg_diradrp x1, init_pg_endsub x1, x1, x0 // x1为init_pg_dir占用的字节数
1: stp xzr, xzr, [x0], #16stp xzr, xzr, [x0], #16stp xzr, xzr, [x0], #16stp xzr, xzr, [x0], #16subs x1, x1, #64 // 一次清理64Bb.ne 1b
vabits_user
用于保存虚拟地址位数
/** Create the identity mapping. 恒等映射*/adrp x0, idmap_pg_dir // idmap_pg_dir的物理地址adrp x3, __idmap_text_start // __pa(__idmap_text_start)mov x5, #VA_BITS
1:adr_l x6, vabits_userstr x5, [x6]dmb sydc ivac, x6 // Invalidate potentially stale cache line
idmap_ptrs_per_pgd
用于获得PGD(idmap_pg_dir)
的PGD
表项数
PGDIR_SHIFT
为PGD
的偏移位数
/** If VA_BITS == 48, we don't have to configure an additional* translation level, but the top-level table has more entries.*/mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)str_l x4, idmap_ptrs_per_pgd, x5
创建并更新idmap_pg_dir
页表
ldr_l x4, idmap_ptrs_per_pgdmov x5, x3 // __pa(__idmap_text_start)adr_l x6, __idmap_text_end // __pa(__idmap_text_end)// 创建各个页表map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
这部分空间是什么?
# arch/arm64/kernel/vmlinux.lds.S#define IDMAP_TEXT \. = ALIGN(SZ_4K); \__idmap_text_start = .; \*(.idmap.text) \__idmap_text_end = .;
其中.idmap.text
如下
pushsection .idmap.text, "awx"//
.popsection
使用map_memory
创建页表并映射,map_memory
宏定义如下
/*
 * Map memory for specified virtual address range. Each level of page table needed supports
 * multiple entries. If a level requires n entries the next page table level is assumed to be
 * formed from n pages.
 *
 *	tbl:	location of page table
 *	rtbl:	address to be used for first level page table entry (typically tbl + PAGE_SIZE)
 *	vstart:	start address to map
 *	vend:	end address to map - we map [vstart, vend]
 *	flags:	flags to use to map last level entries
 *	phys:	physical address corresponding to vstart - physical memory is contiguous
 *	pgds:	the number of pgd entries
 *
 * Temporaries:	istart, iend, tmp, count, sv - these need to be different registers
 * Preserves:	vstart, vend, flags
 * Corrupts:	tbl, rtbl, istart, iend, tmp, count, sv
 */
.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
	add	\rtbl, \tbl, #PAGE_SIZE		// next-level tables start one page after tbl
	mov	\sv, \rtbl			// remember where the next level begins
	mov	\count, #0			// top level: no extra entries carried in

	/* Top level (PGD): fill table entries pointing at the next level. */
	compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
	mov	\tbl, \sv			// descend: the saved address becomes the table to fill
	mov	\sv, \rtbl			// and save the start of the level below it

#if SWAPPER_PGTABLE_LEVELS > 2
	/* Intermediate level: again table entries only. */
	compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
	mov	\tbl, \sv
#endif

	/* Last level: map phys (rounded down to the block size) with \flags. */
	compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
	bic	\count, \phys, #SWAPPER_BLOCK_SIZE - 1
	populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
.endm
创建并更新init_pg_dir
表项
/** Map the kernel image (starting with PHYS_OFFSET).*/adrp x0, init_pg_dirmov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text)add x5, x5, x23 // add KASLR displacementmov x4, PTRS_PER_PGDadrp x6, _end // runtime __pa(_end)adrp x3, _text // runtime __pa(_text)sub x6, x6, x3 // _end - _textadd x6, x6, x5 // runtime __va(_end)map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14
由上面可知,前面已经建立了恒等映射和内核映射,下面开启MMU
,并执行到__primary_switched
msr sctlr_el1, x19 // re-enable the MMUisbic iallu // flush instructions fetcheddsb nsh // via old mappingisbldr x8, =__primary_switchedadrp x0, __PHYS_OFFSETbr x8
ENDPROC(__primary_switch)
/*
 * __primary_switched - first code executed at the kernel's virtual address.
 *
 * In:  x0  = page address of __PHYS_OFFSET (set by __primary_switch)
 *      x21 = FDT pointer saved by preserve_boot_args
 *
 * Sets up the init stack and sp_el0, installs the exception vectors,
 * records __fdt_pointer and kimage_voffset, zeroes .bss and branches to
 * start_kernel. Does not return.
 */
__primary_switched:
	adrp	x4, init_thread_union
	add	sp, x4, #THREAD_SIZE		// sp = init_thread_union + THREAD_SIZE
	adr_l	x5, init_task
	msr	sp_el0, x5			// Save thread_info

	// Install the exception vector table
	adr_l	x8, vectors			// load VBAR_EL1 with virtual
	msr	vbar_el1, x8			// vector table address
	isb

	stp	xzr, x30, [sp, #-16]!		// push a 16-byte {0, lr} frame record
	mov	x29, sp

	str_l	x21, __fdt_pointer, x5		// Save FDT pointer

	ldr_l	x4, kimage_vaddr		// Save the offset between
	sub	x4, x4, x0			// the kernel virtual and
	str_l	x4, kimage_voffset, x5		// physical mappings

	// Clear BSS
	adr_l	x0, __bss_start
	mov	x1, xzr
	adr_l	x2, __bss_stop
	sub	x2, x2, x0			// x2 = __bss_stop - __bss_start
	bl	__pi_memset
	dsb	ishst				// Make zero page visible to PTW

	add	sp, sp, #16			// drop the frame record pushed above
	mov	x29, #0
	mov	x30, #0
	b	start_kernel
ENDPROC(__primary_switched)
init_thread_union
是init
栈的起始地址(见下面的链接脚本:__start_init_task = init_thread_union = init_stack),
add sp, x4, #THREAD_SIZE
将sp
寄存器设置为init_thread_union + THREAD_SIZE,即init栈的栈顶;随后将init_task
进程描述符的地址存储到sp_el0
寄存器
// include/asm-generic/vmlinux.lds.h#define INIT_TASK_DATA(align) \. = ALIGN(align); \__start_init_task = .; \init_thread_union = .; \init_stack = .; \KEEP(*(.data..init_task)) \KEEP(*(.data..init_thread_info)) \. = __start_init_task + THREAD_SIZE; \__end_init_task = .;
// init/init_task.c/** Set up the first task table, touch at your own risk!. Base=0,* limit=0x1fffff (=2MB)*/
struct task_struct init_task
#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK__init_task_data
#endif
= {//
};
EXPORT_SYMBOL(init_task);
设置异常向量表(vectors
在后续会进行剖析)
adr_l x8, vectors // load VBAR_EL1 with virtualmsr vbar_el1, x8 // vector table addressisb
将FDT
物理地址(刚开始的时候将其地址存入x21
)存入__fdt_pointer
str_l x21, __fdt_pointer, x5 // Save FDT pointer
计算并保存kimage_voffset:先读取kimage_vaddr
,这个地址是kernel
的虚拟地址,x0
是内核被加载的物理地址(__PHYS_OFFSET),两者相减得到内核虚拟地址与物理地址之间的偏移,存入kimage_voffset
ldr_l x4, kimage_vaddr // Save the offset betweensub x4, x4, x0 // the kernel virtual andstr_l x4, kimage_voffset, x5 // physical mappings
最后是清理bss
段,为执行内核函数做准备
跳转到start_kernel执行
细节剖析
map_memory
宏定义
由上面idmap_pg_dir
页表创建可知,
寄存器 | 地址 |
---|---|
x0 | idmap_pg_dir |
x3 | __idmap_text_start |
x6 | __idmap_text_end |
x7 | SWAPPER_MM_MMUFLAGS |
x4 | idmap_ptrs_per_pgd |
map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
可知,PGD
页表占用一个页:PAGE_SIZE
,tbl
用于存储下一级页表的基址
.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, svadd \rtbl, \tbl, #PAGE_SIZEmov \sv, \rtblmov \count, #0
compute_indices
宏的功能:根据虚拟地址计算各级页表的索引值
compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
populate_entries
宏的功能:填充索引值index
对应的页表项
此处用于设置PGD PUD PMD
页表项,此处不会设置PTE
,使用段映射,一般是2MB
/* arch/arm64/include/asm/kernel-pgtable.h */

/*
 * Initial memory map size.
 *
 * With section maps the last level populated by the early page tables is a
 * SECTION_SIZE block and the table level above it is indexed by PUD_SHIFT;
 * otherwise individual pages are mapped and the table level above is
 * indexed by PMD_SHIFT.
 */
#if ARM64_SWAPPER_USES_SECTION_MAPS
#define SWAPPER_BLOCK_SHIFT	SECTION_SHIFT
#define SWAPPER_BLOCK_SIZE	SECTION_SIZE
#define SWAPPER_TABLE_SHIFT	PUD_SHIFT
#else
#define SWAPPER_BLOCK_SHIFT	PAGE_SHIFT
#define SWAPPER_BLOCK_SIZE	PAGE_SIZE
#define SWAPPER_TABLE_SHIFT	PMD_SHIFT
#endif
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
是怎么切换页表的?每一次切换一级页表
mov \tbl, \svmov \sv, \rtbl
来看看具体的宏定义
/*
 * compute_indices - table indices of vstart and vend at one page-table level.
 *
 *	vstart, vend:	virtual address range being mapped
 *	shift:		address shift for this level (e.g. #PGDIR_SHIFT)
 *	ptrs:		number of entries in one table at this level
 *	istart, iend:	out - index of vstart / vend at this level
 *	count:		in  - value carried over from the previous level
 *			      (0 for the top level, see map_memory)
 *			out - iend - istart, i.e. entries needed minus one
 *
 * Preserves:	vstart, vend, shift, ptrs
 * Corrupts:	istart, iend, count
 */
.macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count
	lsr	\iend, \vend, \shift		// raw index of the end address
	mov	\istart, \ptrs
	sub	\istart, \istart, #1		// istart = ptrs - 1 (index mask)
	and	\iend, \iend, \istart		// iend = (vend >> shift) & (ptrs - 1)
	mov	\istart, \ptrs
	mul	\istart, \istart, \count
	add	\iend, \iend, \istart		// iend += count * ptrs, because
						// our entries span multiple tables

	lsr	\istart, \vstart, \shift
	mov	\count, \ptrs
	sub	\count, \count, #1
	and	\istart, \istart, \count	// istart = (vstart >> shift) & (ptrs - 1)

	sub	\count, \iend, \istart		// extra entries for the next level
.endm
populate_entries
下次补上
__inval_dcache_area
宏定义
dc
指令用于控制数据缓存
civac
:清理(clean)并使指定虚拟地址对应的高速缓存行失效,直到PoC
ivac
:使指定虚拟地址对应的高速缓存行失效,直到PoC
ENTRY(__inval_dcache_area)
	/* FALLTHROUGH */

/*
 *	__dma_inv_area(start, size)
 *	- start   - virtual start address of region (x0)
 *	- size    - size in question (x1)
 *
 * Invalidates the data cache lines covering [start, start + size).
 * A line only partially covered at either edge is cleaned and invalidated
 * (dc civac) rather than just invalidated (dc ivac).
 * Clobbers: x0 - x3, flags
 */
__dma_inv_area:
	add	x1, x1, x0			// x1 = end = start + size
	dcache_line_size x2, x3			// x2 = cache line size, x3 = scratch
	sub	x3, x2, #1			// x3 = line mask
	tst	x1, x3				// end cache line aligned?
	bic	x1, x1, x3			// round end down to a line boundary
	b.eq	1f
	dc	civac, x1			// clean & invalidate D / U line
1:	tst	x0, x3				// start cache line aligned?
	bic	x0, x0, x3			// round start down to a line boundary
	b.eq	2f
	dc	civac, x0			// clean & invalidate D / U line
	b	3f
2:	dc	ivac, x0			// invalidate D / U line
3:	add	x0, x0, x2			// advance one cache line
	cmp	x0, x1
	b.lo	2b				// loop while below end
	dsb	sy
	ret
ENDPIPROC(__inval_dcache_area)
vectors
异常向量表
ARM64
中的中断向量表占用2048B
,分为四组,每组四个表项,每表项占用128B
,四组分别是:
- EL1t:在EL1下,使用的栈指针不是当前异常级的SP_ELx(而是SP_EL0)
- EL1h:在EL1下,使用当前异常级的栈指针SP_ELx(即SP_EL1)
- 从低异常级EL0进入当前异常级EL1(Lower EL, AArch64)
- 从低异常级EL0进入当前异常级EL1(Lower EL, AArch32)
/*
 * Exception vectors.
 *
 * Sixteen entries in four groups of four (Synchronous, IRQ, FIQ, Error):
 * EL1t, EL1h, 64-bit EL0 and 32-bit EL0. The table is 2^11-byte aligned.
 * NOTE(review): kernel_ventry is defined elsewhere; its first argument
 * appears to be the source exception level and the optional third the
 * register width - confirm against entry.S.
 */
	.pushsection ".entry.text", "ax"

	.align	11
ENTRY(vectors)
	kernel_ventry	1, sync_invalid			// Synchronous EL1t
	kernel_ventry	1, irq_invalid			// IRQ EL1t
	kernel_ventry	1, fiq_invalid			// FIQ EL1t
	kernel_ventry	1, error_invalid		// Error EL1t

	kernel_ventry	1, sync				// Synchronous EL1h
	kernel_ventry	1, irq				// IRQ EL1h
	kernel_ventry	1, fiq_invalid			// FIQ EL1h
	kernel_ventry	1, error			// Error EL1h

	kernel_ventry	0, sync				// Synchronous 64-bit EL0
	kernel_ventry	0, irq				// IRQ 64-bit EL0
	kernel_ventry	0, fiq_invalid			// FIQ 64-bit EL0
	kernel_ventry	0, error			// Error 64-bit EL0

	kernel_ventry	0, sync_invalid, 32		// Synchronous 32-bit EL0
	kernel_ventry	0, irq_invalid, 32		// IRQ 32-bit EL0
	kernel_ventry	0, fiq_invalid, 32		// FIQ 32-bit EL0
	kernel_ventry	0, error_invalid, 32		// Error 32-bit EL0
END(vectors)
每四个异常分别对应于
/*
 * Reason codes for the four exception kinds, in the same order as the
 * entries of each group in the vector table: Synchronous, IRQ, FIQ, Error.
 */
#define BAD_SYNC 0
#define BAD_IRQ 1
#define BAD_FIQ 2
#define BAD_ERROR 3