欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  网络运营

Linode Xen 下 grsecurity >= 4.3 崩溃问题

程序员文章站 2024-01-19 18:50:22
自从 linux 4.3 开始,在 linode 上使用 pax/grsecurity 时,内核会在被 pv-grub 执行后不久立即崩溃。由于崩溃是在启动后极早期立刻发生...

自从 linux 4.3 开始,在 linode 上使用 pax/grsecurity 时,内核会在被 pv-grub 执行后不久立即崩溃。由于崩溃是在启动后极早期立刻发生的,没有任何可以用来调试的日志,同时公司也不是盖子开的,也没有办法得到母机上有意义的调试信息。这导致了盖子的 vps 内核从去年 12 月开始被锁定在 4.2.7。由于不知什么时候产生了 linode 东京机房会在 2016 年 6 月从 xen 迁移到 kvm 的错觉,也没有花精力去尝试调试这个问题。

然而今年 linode 周年庆时硬件全部翻倍,惟独东京机房除外。而根据官方最新的说法,新机房乐观估计要第四季度上线。解决内核问题就不得不提上了盖子的日程,首先是手工修复了不少 cve 高危漏洞,随后又祭出 diff 折腾半天,内核始终会在启动后立刻死亡。而由于 grsecurity 并不提供 git 源,所以 git bisect 也是不可能的,唯一可用的工具只有 linux 4.2.7 / 补丁文件,与 linux 4.3.3 / 补丁文件。

在阅读代码差异时,一个很大的挑战是如何区分上游内核的修改与下游 pax/grsecurity 补丁的修改。直接比较补丁文件会导致代码上下文丢失,让代码的意图不可理解。最后盖子打算编写一个名为 metadiff 的工具,自动比较并去除在上游中出现的代码段,以便仅仅对 pax/grsecurity 的代码进行比较,就连名字都想好了就叫 metadiff ,但一直没有动手。

直到上个月和 shawn 聊天时,提到了自己装个 xen 也不是不可行;于是周六终于动手在 virtualbox 虚拟机里装了个 debian + xen,又在 xen 里启动了一个虚拟机,果然很快就得到了内核崩溃的 traceback。

rip: ffffffff8100b2b0 pmu_msr_read+0x10
flags: 00000282 i s nz
rsp: ffffffff81aeff30
rax: 8000000000000000  rcx: 0000000000000001  rdx: ffffffff81aeffcc
rbx: 00000000c0000080  rsi: ffffffff81aeffa0  rdi: 00000000c0000080
rbp: ffffffff81aeffa0  r8: 0000000000000001  r9: 00000000ffffffff
r10: ffffffff81cf9000  r11: 0000000000000000  r12: ffffffff81aeffcc
r13: ffffffff81aeffc4  r14: ffffffff81aeffc0  r15: 6f73b764afec1c9d
 cs: e033    ss: e02b    ds: 0000    es: 0000
 fs: 0000 @ 0000000000000000
 gs: 0000 @ 0000000000000000/0000000000000000
code (instr addr ffffffff8100b2b0)
00 00 00 00 00 41 54 49 89 d4 55 48 89 f5 53 89 fb 48 83 ec 10 <65> 48 8b 04 25 28 00 00 00 48 89 


stack:
 0000000000000001 0000000000000000 0000000000000000 ffffffff8100b2b0
 000000010000e030 0000000000010082 ffffffff81aeff70 000000000000e02b
 0000000000000000 0000000000000000 00000000c0000080 ffffffff81aeffcc
 ffffffff81aeffc8 ffffffff810041c8 ffffffff81aeffc8 ffffffff81aeffcc

call trace:
 [<ffffffff8100b2b0>] pmu_msr_read+0x10 <--
 [<ffffffff8100b2b0>] pmu_msr_read+0x10
 [<ffffffff810041c8>] xen_read_msr_safe+0x18
 [<ffffffff81be93eb>] xen_start_kernel+0x1b9

哦?可见内核在进入 xen_start_kernel 后不久就崩溃了,这是 /* first c function to be called on xen boot */,在如此早期就崩溃,什么错误日志都看不到也就不奇怪了。来看看 xen_read_msr_safe 和 pmu_msr_read 在 4.2 和 4.3 之间有什么改变:

--- ../../4.2.7/linux-4.2.7/arch/x86/xen/enlighten.c  2016-09-11 00:44:12.010022936 +0800
+++ arch/x86/xen/enlighten.c  2015-12-15 13:41:43.000000000 +0800

@@ -1030,6 +1034,9 @@ static u64 xen_read_msr_safe(unsigned in
 {
    u64 val;

+    if (pmu_msr_read(msr, &val, err))
+        return val;
+
    val = native_read_msr_safe(msr, err);
    switch (msr) {
    case msr_ia32_apicbase:
@@ -1074,9 +1081,11 @@ static int xen_write_msr_safe(unsigned i
        /* fast syscall setup is all done in hypercalls, so
          these are all ignored. stub them out here to stop
          xen console noise. */
+        break;

    default:
-        ret = native_write_msr_safe(msr, low, high);
+        if (!pmu_msr_write(msr, low, high, &ret))
+            ret = native_write_msr_safe(msr, low, high);
    }

    return ret;

可见 pmu_msr_read 完全是个新东西,使用 git blame 继续追查。

xen/pmu: initialization code for xen pmu 65d0cf0be79feebeb19e7626fd3ed41ae73f642d
xen/pmu: describe vendor-specific pmu registers e27b72df01109c689062caeba1defa013b759e0e
xen/pmu: intercept pmu-related msr and apic accesses 6b08cd6328c58a2ae190c5ee03a2ffcab5ef828e
xen/pmu: pmu emulation code bf6dfb154d935725c9a2005033ca33017b9df439

发现 pmu 是 xen 在 4.3 进入主线内核的新特性,于是解决方法就很简单了,把 bf6dfb 和 6b08cd 都撤销就好,接下来的事情就让 pax team 和 spender 去追查吧。最后的补丁是:

diff -uprn linux-4.7.3-hardened/arch/x86/xen/apic.c linux-4.7.3-hardened.good/arch/x86/xen/apic.c
  --- linux-4.7.3-hardened/arch/x86/xen/apic.c  2016-07-24 19:23:50.000000000 +0000
  +++ linux-4.7.3-hardened.good/arch/x86/xen/apic.c  2016-09-10 20:05:21.450647009 +0000
  @@ -7,7 +7,6 @@
   #include <xen/xen.h>
   #include <xen/interface/physdev.h>
   #include "xen-ops.h"
  -#include "pmu.h"
   #include "smp.h"

   static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
  @@ -73,10 +72,8 @@ static u32 xen_apic_read(u32 reg)

   static void xen_apic_write(u32 reg, u32 val)
   {
  -  if (reg == apic_lvtpc) {
  -   (void)pmu_apic_update(reg);
  +  if (reg == apic_lvtpc)
      return;
  -  }

    /* warn to see if there's any stray references */
    warn(1,"register: %x, value: %x\n", reg, val);
  diff -uprn linux-4.7.3-hardened/arch/x86/xen/enlighten.c linux-4.7.3-hardened.good/arch/x86/xen/enlighten.c
  --- linux-4.7.3-hardened/arch/x86/xen/enlighten.c  2016-09-10 19:59:29.237313676 +0000
  +++ linux-4.7.3-hardened.good/arch/x86/xen/enlighten.c  2016-09-10 20:06:49.683980342 +0000
  @@ -1031,9 +1031,6 @@ static u64 xen_read_msr_safe(unsigned in
   {
    u64 val;

  -  if (pmu_msr_read(msr, &val, err))
  -   return val;
  -
    val = native_read_msr_safe(msr, err);
    switch (msr) {
    case msr_ia32_apicbase:
  @@ -1081,13 +1078,17 @@ static int xen_write_msr_safe(unsigned i
      break;

    default:
  -   if (!pmu_msr_write(msr, low, high, &ret))
  -     ret = native_write_msr_safe(msr, low, high);
  +   ret = native_write_msr_safe(msr, low, high);
    }

    return ret;
   }

  +unsigned long long xen_read_pmc(int counter)
  +{
  +  return 0;
  +}
  +
   static u64 xen_read_msr(unsigned int msr)
   {
    /*
  diff -uprn linux-4.7.3-hardened/arch/x86/xen/pmu.c linux-4.7.3-hardened.good/arch/x86/xen/pmu.c
  --- linux-4.7.3-hardened/arch/x86/xen/pmu.c  2016-07-24 19:23:50.000000000 +0000
  +++ linux-4.7.3-hardened.good/arch/x86/xen/pmu.c  2016-09-10 20:05:21.450647009 +0000
  @@ -13,20 +13,11 @@
   /* x86_pmu.handle_irq definition */
   #include "../events/perf_event.h"

  -#define xenpmu_irq_processing  1
  -struct xenpmu {
  -  /* shared page between hypervisor and domain */
  -  struct xen_pmu_data *xenpmu_data;

  -  uint8_t flags;
  -};
  -static define_per_cpu(struct xenpmu, xenpmu_shared);
  -#define get_xenpmu_data()  (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
  -#define get_xenpmu_flags()  (this_cpu_ptr(&xenpmu_shared)->flags)
  -
  -/* macro for computing address of a pmu msr bank */
  -#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \
  -          (uintptr_t)ctxt->field))
  +/* shared page between hypervisor and domain */
  +static define_per_cpu(struct xen_pmu_data *, xenpmu_shared);
  +#define get_xenpmu_data()  per_cpu(xenpmu_shared, smp_processor_id())
  +

   /* amd pmu */
   #define f15h_num_counters  6
  @@ -60,8 +51,6 @@ static __read_mostly int amd_num_counter
   /* alias registers (0x4c1) for full-width writes to pmcs */
   #define msr_pmc_alias_mask     (~(msr_ia32_perfctr0 ^ msr_ia32_pmc0))

  -#define intel_pmc_type_shift    30
  -
   static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;


  @@ -178,232 +167,6 @@ static int is_intel_pmu_msr(u32 msr_inde
    }
   }

  -static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
  -       int index, bool is_read)
  -{
  -  uint64_t *reg = null;
  -  struct xen_pmu_intel_ctxt *ctxt;
  -  uint64_t *fix_counters;
  -  struct xen_pmu_cntr_pair *arch_cntr_pair;
  -  struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
  -  uint8_t xenpmu_flags = get_xenpmu_flags();
  -
  -
  -  if (!xenpmu_data || !(xenpmu_flags & xenpmu_irq_processing))
  -   return false;
  -
  -  ctxt = &xenpmu_data->pmu.c.intel;
  -
  -  switch (msr) {
  -  case msr_core_perf_global_ovf_ctrl:
  -   reg = &ctxt->global_ovf_ctrl;
  -   break;
  -  case msr_core_perf_global_status:
  -   reg = &ctxt->global_status;
  -   break;
  -  case msr_core_perf_global_ctrl:
  -   reg = &ctxt->global_ctrl;
  -   break;
  -  case msr_core_perf_fixed_ctr_ctrl:
  -   reg = &ctxt->fixed_ctrl;
  -   break;
  -  default:
  -   switch (type) {
  -   case msr_type_counter:
  -     fix_counters = field_offset(ctxt, fixed_counters);
  -     reg = &fix_counters[index];
  -     break;
  -   case msr_type_arch_counter:
  -     arch_cntr_pair = field_offset(ctxt, arch_counters);
  -     reg = &arch_cntr_pair[index].counter;
  -     break;
  -   case msr_type_arch_ctrl:
  -     arch_cntr_pair = field_offset(ctxt, arch_counters);
  -     reg = &arch_cntr_pair[index].control;
  -     break;
  -   default:
  -     return false;
  -   }
  -  }
  -
  -  if (reg) {
  -   if (is_read)
  -     *val = *reg;
  -   else {
  -     *reg = *val;
  -
  -     if (msr == msr_core_perf_global_ovf_ctrl)
  -      ctxt->global_status &= (~(*val));
  -   }
  -   return true;
  -  }
  -
  -  return false;
  -}
  -
  -static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
  -{
  -  uint64_t *reg = null;
  -  int i, off = 0;
  -  struct xen_pmu_amd_ctxt *ctxt;
  -  uint64_t *counter_regs, *ctrl_regs;
  -  struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
  -  uint8_t xenpmu_flags = get_xenpmu_flags();
  -
  -  if (!xenpmu_data || !(xenpmu_flags & xenpmu_irq_processing))
  -   return false;
  -
  -  if (k7_counters_mirrored &&
  -    ((msr >= msr_k7_evntsel0) && (msr <= msr_k7_perfctr3)))
  -   msr = get_fam15h_addr(msr);
  -
  -  ctxt = &xenpmu_data->pmu.c.amd;
  -  for (i = 0; i < amd_num_counters; i++) {
  -   if (msr == amd_ctrls_base + off) {
  -     ctrl_regs = field_offset(ctxt, ctrls);
  -     reg = &ctrl_regs[i];
  -     break;
  -   } else if (msr == amd_counters_base + off) {
  -     counter_regs = field_offset(ctxt, counters);
  -     reg = &counter_regs[i];
  -     break;
  -   }
  -   off += amd_msr_step;
  -  }
  -
  -  if (reg) {
  -   if (is_read)
  -     *val = *reg;
  -   else
  -     *reg = *val;
  -
  -   return true;
  -  }
  -  return false;
  -}
  -
  -bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
  -{
  -  if (boot_cpu_data.x86_vendor == x86_vendor_amd) {
  -   if (is_amd_pmu_msr(msr)) {
  -     if (!xen_amd_pmu_emulate(msr, val, 1))
  -      *val = native_read_msr_safe(msr, err);
  -     return true;
  -   }
  -  } else {
  -   int type, index;
  -
  -   if (is_intel_pmu_msr(msr, &type, &index)) {
  -     if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
  -      *val = native_read_msr_safe(msr, err);
  -     return true;
  -   }
  -  }
  -
  -  return false;
  -}
  -
  -bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
  -{
  -  uint64_t val = ((uint64_t)high << 32) | low;
  -
  -  if (boot_cpu_data.x86_vendor == x86_vendor_amd) {
  -   if (is_amd_pmu_msr(msr)) {
  -     if (!xen_amd_pmu_emulate(msr, &val, 0))
  -      *err = native_write_msr_safe(msr, low, high);
  -     return true;
  -   }
  -  } else {
  -   int type, index;
  -
  -   if (is_intel_pmu_msr(msr, &type, &index)) {
  -     if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
  -      *err = native_write_msr_safe(msr, low, high);
  -     return true;
  -   }
  -  }
  -
  -  return false;
  -}
  -
  -static unsigned long long xen_amd_read_pmc(int counter)
  -{
  -  struct xen_pmu_amd_ctxt *ctxt;
  -  uint64_t *counter_regs;
  -  struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
  -  uint8_t xenpmu_flags = get_xenpmu_flags();
  -
  -  if (!xenpmu_data || !(xenpmu_flags & xenpmu_irq_processing)) {
  -   uint32_t msr;
  -   int err;
  -
  -   msr = amd_counters_base + (counter * amd_msr_step);
  -   return native_read_msr_safe(msr, &err);
  -  }
  -
  -  ctxt = &xenpmu_data->pmu.c.amd;
  -  counter_regs = field_offset(ctxt, counters);
  -  return counter_regs[counter];
  -}
  -
  -static unsigned long long xen_intel_read_pmc(int counter)
  -{
  -  struct xen_pmu_intel_ctxt *ctxt;
  -  uint64_t *fixed_counters;
  -  struct xen_pmu_cntr_pair *arch_cntr_pair;
  -  struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
  -  uint8_t xenpmu_flags = get_xenpmu_flags();
  -
  -  if (!xenpmu_data || !(xenpmu_flags & xenpmu_irq_processing)) {
  -   uint32_t msr;
  -   int err;
  -
  -   if (counter & (1 << intel_pmc_type_shift))
  -     msr = msr_core_perf_fixed_ctr0 + (counter & 0xffff);
  -   else
  -     msr = msr_ia32_perfctr0 + counter;
  -
  -   return native_read_msr_safe(msr, &err);
  -  }
  -
  -  ctxt = &xenpmu_data->pmu.c.intel;
  -  if (counter & (1 << intel_pmc_type_shift)) {
  -   fixed_counters = field_offset(ctxt, fixed_counters);
  -   return fixed_counters[counter & 0xffff];
  -  }
  -
  -  arch_cntr_pair = field_offset(ctxt, arch_counters);
  -  return arch_cntr_pair[counter].counter;
  -}
  -
  -unsigned long long xen_read_pmc(int counter)
  -{
  -  if (boot_cpu_data.x86_vendor == x86_vendor_amd)
  -   return xen_amd_read_pmc(counter);
  -  else
  -   return xen_intel_read_pmc(counter);
  -}
  -
  -int pmu_apic_update(uint32_t val)
  -{
  -  int ret;
  -  struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
  -
  -  if (!xenpmu_data) {
  -   pr_warn_once("%s: pmudata not initialized\n", __func__);
  -   return -einval;
  -  }
  -
  -  xenpmu_data->pmu.l.lapic_lvtpc = val;
  -
  -  if (get_xenpmu_flags() & xenpmu_irq_processing)
  -   return 0;
  -
  -  ret = hypervisor_xenpmu_op(xenpmu_lvtpc_set, null);
  -
  -  return ret;
  -}
  -
   /* perf callbacks */
   static int xen_is_in_guest(void)
   {
  @@ -476,37 +239,26 @@ static void xen_convert_regs(const struc

   irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
   {
  -  int err, ret = irq_none;
  +  int ret = irq_none;
    struct pt_regs regs;
    const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
  -  uint8_t xenpmu_flags = get_xenpmu_flags();

    if (!xenpmu_data) {
      pr_warn_once("%s: pmudata not initialized\n", __func__);
      return ret;
    }

  -  this_cpu_ptr(&xenpmu_shared)->flags =
  -   xenpmu_flags | xenpmu_irq_processing;
    xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs,
        xenpmu_data->pmu.pmu_flags);
    if (x86_pmu.handle_irq(&regs))
      ret = irq_handled;

  -  /* write out cached context to hw */
  -  err = hypervisor_xenpmu_op(xenpmu_flush, null);
  -  this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags;
  -  if (err) {
  -   pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err);
  -   return irq_none;
  -  }
  -
    return ret;
   }

   bool is_xen_pmu(int cpu)
   {
  -  return (get_xenpmu_data() != null);
  +  return (per_cpu(xenpmu_shared, cpu) != null);
   }

   void xen_pmu_init(int cpu)
  @@ -536,8 +288,7 @@ void xen_pmu_init(int cpu)
    if (err)
      goto fail;

  -  per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
  -  per_cpu(xenpmu_shared, cpu).flags = 0;
  +  per_cpu(xenpmu_shared, cpu) = xenpmu_data;

    if (cpu == 0) {
      perf_register_guest_info_callbacks(&xen_guest_cbs);
  @@ -565,6 +316,6 @@ void xen_pmu_finish(int cpu)

    (void)hypervisor_xenpmu_op(xenpmu_finish, &xp);

  -  free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0);
  -  per_cpu(xenpmu_shared, cpu).xenpmu_data = null;
  +  free_pages((unsigned long)per_cpu(xenpmu_shared, cpu), 0);
  +  per_cpu(xenpmu_shared, cpu) = null;
   }
  diff -uprn linux-4.7.3-hardened/arch/x86/xen/pmu.h linux-4.7.3-hardened.good/arch/x86/xen/pmu.h
  --- linux-4.7.3-hardened/arch/x86/xen/pmu.h  2016-07-24 19:23:50.000000000 +0000
  +++ linux-4.7.3-hardened.good/arch/x86/xen/pmu.h  2016-09-10 20:05:21.453980342 +0000
  @@ -7,9 +7,5 @@ irqreturn_t xen_pmu_irq_handler(int irq,
   void xen_pmu_init(int cpu);
   void xen_pmu_finish(int cpu);
   bool is_xen_pmu(int cpu);
  -bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
  -bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
  -int pmu_apic_update(uint32_t reg);
  -unsigned long long xen_read_pmc(int counter);

   #endif /* __xen_pmu_h */

打好补丁再编译内核,被智子锁定版本的内核果然升级成功了。

$ uname -r
4.7.3-hardened

更新:官方已在 grsecurity-3.1-4.7.4-201609152234.patch 中修复问题,不再需要此 workaround。