Linux kernel oops
程序员文章站
2022-05-29 09:11:50
...
本文以ARM64为例,介绍内核的Oops机制,我们使用grep搜索一下内核中可能会报Oops的地方:
./arch/arm64/kernel/sys_compat.c:142: arm64_notify_die("Oops - bad compat syscall(2)", regs, &info, scno);
./arch/arm64/kernel/traps.c:771: die("Oops - bad mode", regs, 0);
./arch/arm64/kernel/traps.c:929: die("Oops - BUG", regs, 0);
./arch/arm64/mm/fault.c:270: die("Oops", regs, esr);
搜索结果如上所示,一共有这几个地方定义为Oops,因此Oops可能包含如下一些场景:
- 64bit 系统调用发生了错误,报Oops
- CPU陷入了某种不正常的exception mode,在该exception对应的exception vector entry中直接报Oops
- traps中定义的BUG()函数被调用触发了Oops
- 内核空间中发生了内存地址相关的访问异常
本文着重从第4种情况来入手跟踪Oops的发生过程:
在代码文件 ./arch/arm64/mm/fault.c 中:
do_translation_fault --> do_bad_area --> __do_kernel_fault --> die_kernel_fault
do_alignment_fault --> do_bad_area --> __do_kernel_fault --> die_kernel_fault
do_page_fault --> __do_kernel_fault --> die_kernel_fault
调用路径如上所示,当内核访问一个内存地址发生错误时会分别调用 do_xxx_fault
该函数最终的目标是 die_kernel_fault:
static void die_kernel_fault(const char *msg, unsigned long addr,
unsigned int esr, struct pt_regs *regs)
{
bust_spinlocks(1);
pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
addr);
mem_abort_decode(esr);
show_pte(addr);
die("Oops", regs, esr);
bust_spinlocks(0);
do_exit(SIGKILL);
}
这里最终会调用 die("Oops", regs, esr)
函数:
/*
* This function is protected against re-entrancy.
*/
void die(const char *str, struct pt_regs *regs, int err)
{
int ret;
unsigned long flags;
raw_spin_lock_irqsave(&die_lock, flags);
oops_enter();
console_verbose();
bust_spinlocks(1);
ret = __die(str, err, regs); // 其中会发送 notify_die 通知
if (regs && kexec_should_crash(current))
crash_kexec(regs);
bust_spinlocks(0);
add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
oops_exit();
if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops) // 判断是否要执行panic操作
panic("Fatal exception");
raw_spin_unlock_irqrestore(&die_lock, flags);
if (ret != NOTIFY_STOP)
do_exit(SIGSEGV);
}
在die中可以看到如果配置了panic_on_oops为1,那么才会直接触发panic操作,如果没有配置为1,并不会导致系统panic重启。Oops都会打印内核调用栈。
一种手动触发panic的机制
利用sysrq机制可以触发kernel crash:
echo c > /proc/sysrq-trigger
这种方式就是利用Oops机制来触发panic的:
static void sysrq_handle_crash(int key)
{
char *killer = NULL;
/* we need to release the RCU read lock here,
* otherwise we get an annoying
* 'BUG: sleeping function called from invalid context'
* complaint from the kernel before the panic.
*/
rcu_read_unlock();
panic_on_oops = 1; /* force panic */ //-------- (1)
wmb();
*killer = 1; //---------------------------(2)
}
- 第(1)步先配置panic_on_oops为1,使得当内核oops时直接触发panic操作
- 第(2)步访问一个内核NULL空地址,触发oops操作
到这里可能很多人会有一个疑惑,对一个内核空地址赋值,是如何产生了Oops呢?
查看异常arm64向量表:
/*
* EL1 mode handlers.
*/
el1_da:
/*
* Data abort handling
*/
mrs x3, far_el1
inherit_daif pstate=x23, tmp=x2
clear_address_tag x0, x3
mov x2, sp // struct pt_regs
bl do_mem_abort
kernel_exit 1
......
el0_da:
/*
* Data abort handling
*/
mrs x26, far_el1
enable_daif
ct_user_exit
clear_address_tag x0, x26
mov x1, x25
mov x2, sp
bl do_mem_abort
b ret_to_user
其中el1_da和el1_da中会调用到do_mem_abort,这个向量函数是在CPU运行时发生了data abort异常时进入的一种模式,并且会执行到向量表中对应的函数。
asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
const struct fault_info *inf = esr_to_fault_info(esr);
struct siginfo info;
if (!inf->fn(addr, esr, regs))
return;
if (!user_mode(regs)) {
pr_alert("Unhandled fault at 0x%016lx\n", addr);
mem_abort_decode(esr);
show_pte(addr);
}
clear_siginfo(&info);
info.si_signo = inf->sig;
info.si_errno = 0;
info.si_code = inf->code;
info.si_addr = (void __user *)addr;
arm64_notify_die(inf->name, regs, &info, esr);
}
其中对应一个系统错误处理列表:
static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
{
return fault_info + (esr & 63);
}
static const struct fault_info fault_info[] = {
{ do_bad, SIGKILL, SI_KERNEL, "ttbr address size fault" },
{ do_bad, SIGKILL, SI_KERNEL, "level 1 address size fault" },
{ do_bad, SIGKILL, SI_KERNEL, "level 2 address size fault" },
{ do_bad, SIGKILL, SI_KERNEL, "level 3 address size fault" },
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" },
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" },
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" },
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 8" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 12" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" },
{ do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 17" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 18" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 19" },
{ do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" },
{ do_sea, SIGKILL, SI_KERNEL, "level 1 (translation table walk)" },
{ do_sea, SIGKILL, SI_KERNEL, "level 2 (translation table walk)" },
{ do_sea, SIGKILL, SI_KERNEL, "level 3 (translation table walk)" },
{ do_sea, SIGBUS, BUS_OBJERR, "synchronous parity or ECC error" }, // Reserved when RAS is implemented
{ do_bad, SIGKILL, SI_KERNEL, "unknown 25" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 26" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 27" },
{ do_sea, SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented
{ do_sea, SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented
{ do_sea, SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented
{ do_sea, SIGKILL, SI_KERNEL, "level 3 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented
{ do_bad, SIGKILL, SI_KERNEL, "unknown 32" },
{ do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 34" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 35" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 36" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 37" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 38" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 39" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 40" },
......
经过这一系列的调用,最终内核会运行对应的错误处理函数。
上一篇: Linux2.6内存布局
下一篇: kernel debug(内核调试)