欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

Linux kernel oops

程序员文章站 2022-05-29 09:11:50
...

本文以ARM64为例,介绍内核的Oops机制,我们使用grep搜索一下内核中可能会报Oops的地方:

./arch/arm64/kernel/sys_compat.c:142:	arm64_notify_die("Oops - bad compat syscall(2)", regs, &info, scno);
./arch/arm64/kernel/traps.c:771:	die("Oops - bad mode", regs, 0);
./arch/arm64/kernel/traps.c:929:		die("Oops - BUG", regs, 0);
./arch/arm64/mm/fault.c:270:	die("Oops", regs, esr);

搜索结果如上所示,一共有这几个地方定义为Oops,因此Oops可能包含如下一些场景:

  1. 32位兼容(compat)系统调用遇到了无法处理的调用号(bad compat syscall),报Oops
  2. CPU陷入了某种不正常的exception mode,在该exception对应的exception vector entry中直接报Oops
  3. traps中定义的BUG()函数被调用触发了Oops
  4. 内核空间中发生了内存地址相关的访问异常

本文着重从第4种情况来入手跟踪Oops的发生过程:

在代码文件 ./arch/arm64/mm/fault.c 中:

do_translation_fault --> do_bad_area --> __do_kernel_fault --> die_kernel_fault
do_alignment_fault --> do_bad_area --> __do_kernel_fault --> die_kernel_fault
do_page_fault --> __do_kernel_fault --> die_kernel_fault

调用路径如上所示:当内核访问一个内存地址发生错误时,会根据错误类型分别调用对应的 do_xxx_fault 函数,这些函数最终都会走到 die_kernel_fault:

/*
 * Print diagnostics for a kernel-mode memory fault, raise an "Oops" through
 * die(), and finish with do_exit(SIGKILL).
 *
 * @msg:  text inserted into the "Unable to handle kernel %s ..." alert line
 * @addr: faulting virtual address; printed and handed to show_pte()
 * @esr:  syndrome value; handed to mem_abort_decode() and to die()
 * @regs: register state of the faulting context; handed to die()
 */
static void die_kernel_fault(const char *msg, unsigned long addr,
                 unsigned int esr, struct pt_regs *regs)
{
    /* Paired with bust_spinlocks(0) below; brackets all of the output. */
    bust_spinlocks(1);

    pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
         addr);

    /* Helpers defined elsewhere; presumably they dump the decoded ESR. */
    mem_abort_decode(esr);

    /* Presumably dumps the page-table entries for addr -- confirm. */
    show_pte(addr);
    /* The literal "Oops" string is what appears in the kernel log. */
    die("Oops", regs, esr);
    bust_spinlocks(0);
    do_exit(SIGKILL);
}

这里最终会调用 die("Oops", regs, esr) 函数:

/*
 * This function is protected against re-entrancy.
 *
 * Common oops reporting path. Holds die_lock (with IRQs saved/disabled)
 * while reporting through __die(), then either panics or, unless __die()
 * returned NOTIFY_STOP, ends via do_exit(SIGSEGV).
 *
 * @str:  message string forwarded to __die()
 * @regs: register state of the faulting context; may be NULL (checked
 *        before the crash_kexec() call)
 * @err:  error value forwarded to __die()
 */
void die(const char *str, struct pt_regs *regs, int err)
{
    int ret;
    unsigned long flags;

    raw_spin_lock_irqsave(&die_lock, flags);

    /* oops_enter()/oops_exit() bracket the reporting section. */
    oops_enter();

    console_verbose();
    bust_spinlocks(1);
    ret = __die(str, err, regs); // __die() sends the notify_die notification

    /* Only hand over to crash_kexec() when regs is available. */
    if (regs && kexec_should_crash(current))
        crash_kexec(regs);

    bust_spinlocks(0);
    add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
    oops_exit();

    /* Two panic conditions: interrupt context, or panic_on_oops set. */
    if (in_interrupt())
        panic("Fatal exception in interrupt");
    if (panic_on_oops)     // decide whether the oops must escalate to a panic
        panic("Fatal exception");

    raw_spin_unlock_irqrestore(&die_lock, flags);

    /* NOTIFY_STOP from __die() suppresses the do_exit() call. */
    if (ret != NOTIFY_STOP)
        do_exit(SIGSEGV);
}

在die中可以看到,只有当异常发生在中断上下文中(in_interrupt()为真),或者配置了panic_on_oops为1时,才会直接触发panic操作;否则并不会导致系统panic重启,而是在__die的返回值不是NOTIFY_STOP时通过do_exit(SIGSEGV)结束当前任务。Oops都会打印内核调用栈。

一种手动触发panic的机制

利用sysrq机制可以触发kernel crash:

echo c > /proc/sysrq-trigger

这种方式就是利用Oops机制来触发panic的:

/*
 * SysRq crash handler: deliberately crashes the kernel by forcing
 * panic_on_oops on and then writing through a NULL pointer.
 *
 * @key: the SysRq key that was pressed; not used in this function
 */
static void sysrq_handle_crash(int key)
{
    /* Intentionally NULL: the store at the end of the function must fault. */
    char *killer = NULL;

    /* we need to release the RCU read lock here,
     * otherwise we get an annoying
     * 'BUG: sleeping function called from invalid context'
     * complaint from the kernel before the panic.
     */
    rcu_read_unlock();
    panic_on_oops = 1;  /* force panic */  //-------- (1)
    /* Write barrier placed between the flag update and the faulting store. */
    wmb();
    *killer = 1; //---------------------------(2) deliberate NULL write
}
  • 第(1)步先配置panic_on_oops为1,使得当内核oops时直接触发panic操作
  • 第(2)步访问一个内核NULL空地址,触发oops操作

到这里可能很多人会有一个疑惑,对一个内核空地址赋值,是如何产生了Oops呢?

查看arm64的异常向量表相关代码:

 /*
  * EL1 mode handlers.
  */

 el1_da:
     /*
      * Data abort handling (listed under the "EL1 mode handlers" header).
      * Loads FAR_EL1, prepares x0 and x2, calls do_mem_abort, then leaves
      * through kernel_exit.
      * NOTE(review): x1 is not written in this snippet; presumably it
      * already holds the ESR value from the caller -- confirm.
      */
     mrs x3, far_el1                 // x3 = FAR_EL1
     inherit_daif    pstate=x23, tmp=x2
     clear_address_tag x0, x3        // x0 from x3; presumably with the address tag cleared -- confirm
     mov x2, sp              // struct pt_regs
     bl  do_mem_abort

     kernel_exit 1
......

el0_da:
    /*
     * Data abort handling for the el0 entry.
     * Loads FAR_EL1, prepares x0, x1 and x2, calls do_mem_abort, then
     * branches to ret_to_user.
     */
    mrs x26, far_el1                // x26 = FAR_EL1
    enable_daif
    ct_user_exit
    clear_address_tag x0, x26       // x0 from x26; presumably with the address tag cleared -- confirm
    mov x1, x25                     // x1 = x25; presumably the saved ESR value -- confirm
    mov x2, sp                      // x2 = sp (el1_da labels the same value as struct pt_regs)
    bl  do_mem_abort
    b   ret_to_user


其中el1_da和el0_da中都会调用到do_mem_abort。当CPU运行时发生了data abort异常,就会进入对应的异常向量入口,并执行到向量表中对应的处理函数。

/*
 * Dispatch a memory abort to the handler selected by esr.
 *
 * Looks up the fault_info entry for esr and calls its handler. A zero
 * return from the handler ends processing here. Otherwise a siginfo is
 * built from the entry and passed to arm64_notify_die(); for faults that
 * are not from user mode, diagnostics are printed first.
 *
 * @addr: faulting address
 * @esr:  syndrome value used to select the fault_info entry
 * @regs: register state of the faulting context
 */
asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
                     struct pt_regs *regs)
{
    const struct fault_info *inf = esr_to_fault_info(esr);
    struct siginfo info;

    /* Handler returned 0: nothing further to do. */
    if (!inf->fn(addr, esr, regs))
        return;

    /* Extra diagnostics only when the fault did not come from user mode. */
    if (!user_mode(regs)) {
        pr_alert("Unhandled fault at 0x%016lx\n", addr);
        mem_abort_decode(esr);
        show_pte(addr);
    }

    /* Signal number, code and name all come from the fault_info entry. */
    clear_siginfo(&info);
    info.si_signo = inf->sig;
    info.si_errno = 0;
    info.si_code  = inf->code;
    info.si_addr  = (void __user *)addr;
    arm64_notify_die(inf->name, regs, &info, esr);
}

其中 esr_to_fault_info 会用 esr 的低6位(esr & 63)作为下标,在下面的错误处理表 fault_info 中找到对应的处理项:

/*
 * Return the fault_info table entry selected by esr.
 *
 * @esr: syndrome value; only its low six bits (esr & 63) are used as the
 *       index, so the result is one of entries 0..63 of fault_info.
 *       Presumably those bits are the fault status code field -- confirm
 *       against the ESR definition.
 */
static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
{
    return fault_info + (esr & 63);
}

static const struct fault_info fault_info[] = {
    { do_bad,       SIGKILL, SI_KERNEL, "ttbr address size fault"   },
    { do_bad,       SIGKILL, SI_KERNEL, "level 1 address size fault"    },
    { do_bad,       SIGKILL, SI_KERNEL, "level 2 address size fault"    },
    { do_bad,       SIGKILL, SI_KERNEL, "level 3 address size fault"    },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 0 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 1 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 2 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 3 translation fault" },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 8"         },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 1 access flag fault" },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 2 access flag fault" },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 3 access flag fault" },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 12"            },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 1 permission fault"  },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 2 permission fault"  },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 3 permission fault"  },
    { do_sea,       SIGBUS,  BUS_OBJERR,    "synchronous external abort"    },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 17"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 18"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 19"            },
    { do_sea,       SIGKILL, SI_KERNEL, "level 0 (translation table walk)"  },
    { do_sea,       SIGKILL, SI_KERNEL, "level 1 (translation table walk)"  },
    { do_sea,       SIGKILL, SI_KERNEL, "level 2 (translation table walk)"  },
    { do_sea,       SIGKILL, SI_KERNEL, "level 3 (translation table walk)"  },
    { do_sea,       SIGBUS,  BUS_OBJERR,    "synchronous parity or ECC error" },    // Reserved when RAS is implemented
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 25"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 26"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 27"            },
    { do_sea,       SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_sea,       SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_sea,       SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_sea,       SIGKILL, SI_KERNEL, "level 3 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 32"            },
    { do_alignment_fault,   SIGBUS,  BUS_ADRALN,    "alignment fault"       },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 34"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 35"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 36"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 37"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 38"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 39"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 40"            },
......

经过这一系列的调用,最终内核会运行对应的错误处理函数。

相关标签: kernel