aarch64架构 4.19 nfs_readpage_async()空指针解引用问题

1 问题描述

环境信息:

crash> sys
      KERNEL: vmlinux
    DUMPFILE: vmcore  [PARTIAL DUMP]
        CPUS: 128
        DATE: Sat Nov  4 17:05:14 CST 2023
      UPTIME: 16 days, 19:08:18
LOAD AVERAGE: 5.32, 5.21, 5.04
       TASKS: 25391
    NODENAME: kc-fz02-node-arm-5
     RELEASE: 4.19.90-25.26.1.v2101.fortest.ky10.aarch64
     VERSION: #1 SMP Sun Oct 1 09:43:19 CST 2023
     MACHINE: aarch64  (unknown Mhz)
      MEMORY: 512 GB
       PANIC: "Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000"

日志:

crash> dmesg | less
[1451283.455528] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
...
[1451283.971377] Call trace:
[1451283.977098]  nfs_readpage_async+0x54/0x258 [nfs]
[1451283.984942]  nfs_readpage+0x164/0x220 [nfs]
[1451283.992230]  generic_file_buffered_read+0x3cc/0x718
[1451284.000159]  generic_file_read_iter+0x114/0x190
[1451284.007680]  nfs_file_read+0x8c/0x100 [nfs]
[1451284.014750]  new_sync_read+0xcc/0x128
[1451284.021192]  __vfs_read+0x74/0x80
[1451284.027187]  vfs_read+0x94/0x150
[1451284.032998]  ksys_read+0x5c/0xc8
[1451284.038714]  __arm64_sys_read+0x24/0x30
[1451284.045009]  el0_svc_common+0x78/0x130
[1451284.051191]  el0_svc_handler+0x38/0x78
[1451284.057363]  el0_svc+0x8/0x1b0

faddr2line脚本解析:

./faddr2line 4.19.90-25+/kernel/fs/nfs/nfs.ko nfs_readpage_async+0x54/0x258
nfs_readpage_async+0x54/0x258:
i_size_read 于 include/linux/fs.h:849
(已内连入)nfs_page_length 于 fs/nfs/internal.h:681
(已内连入)nfs_readpage_async 于 fs/nfs/read.c:118

空指针解引用发生在 nfs_readpage_async() -> nfs_page_length() -> i_size_read(page_file_mapping(page)->host)。由于 host 成员在 struct address_space 中的偏移量为 0,当 page_file_mapping(page) 返回 NULL 时,读取 ->host 访问的正是虚拟地址 0,与 panic 信息中的地址一致。

2 vmcore解析

加载nfs相关ko文件:

crash> mod -s nfs 4.19.90-25+/kernel/fs/nfs/nfs.ko
crash> mod -s nfsv3 4.19.90-25+/kernel/fs/nfs/nfsv3.ko
crash> mod -s nfsv4 4.19.90-25+/kernel/fs/nfs/nfsv4.ko

查看栈的信息:

crash> bt -FF
 #9 [ffffec3216ffbad0] el1_ia at ffff2f770862310c
     PC: ffff2f76d1d9634c  [nfs_readpage_async+84]
     LR: ffff2f76d1d966b4  [nfs_readpage+356]
     SP: ffffec3216ffbae0  PSTATE: 40400009
    X29: ffffec3216ffbae0  X28: ffff7ff30a0911c0  X27: ffff7ff30a0911c0
    X26: ffffcc2f3041d790  X25: ffffcc3756c18ee0  X24: ffffec3216ffbd68
    X23: ffffcc2b512d9b00  X22: ffffcc3756c18ee0  X21: ffffcc3756c18ee0
    X20: ffffcc2b512d9b00  X19: ffff7ff30a0911c0  X18: 0000000000000000
    X17: 0000000000000000  X16: ffff2f7708825990  X15: 0000000000000000
    X14: ffffcc27e5ea9d68  X13: ffffcc3756c1905c  X12: 0000000000000007
    X11: ffffcc27e5ea9d90  X10: 0000000000000000   X9: 0000000000000008
     X8: 0000000000000060   X7: ffffcc3681c26b00   X6: ffff7ff30da07080
     X5: 0000000000210d00   X4: ffff7ff30da070a0   X3: 01ffffc000000081
     X2: ffff7ff30a0911c0   X1: dead000000000100   X0: 0000000000000000
    ffffec3216ffbad0: [ffffec3216ffbae0:thread_stack] nfs_readpage_async+84 
#10 [ffffec3216ffbae0] nfs_readpage_async at ffff2f76d1d96348 [nfs]
    ffffec3216ffbae0: [ffffec3216ffbbc0:thread_stack] nfs_readpage+356 
    ffffec3216ffbaf0: ffff7ff30a0911c0 0000000000000000 
    ffffec3216ffbb00: [ffffcc3756c18ee0:nfs_inode_cache] 0000000000000000 
    ffffec3216ffbb10: [ffffec3216ffbd40:thread_stack] 0000000000000000 
    ffffec3216ffbb20: [ffffec3216ffbbc0:thread_stack] nfs_readpage+188 
    ffffec3216ffbb30: ffff7ff30a0911c0 [ffffcc2f3041d700:filp] 
    ffffec3216ffbb40: [ffffcc3756c18ee0:nfs_inode_cache] activate_task+132 
    ffffec3216ffbb50: 0000000000000000 0000000000000000 
    ffffec3216ffbb60: 0000000003870000 000000000387ffff 
    ffffec3216ffbb70: 0000000000000001 0000000000000000 
    ffffec3216ffbb80: 0000000000000000 0000000000000000 
    ffffec3216ffbb90: 0000000000000000 0000000000000000 
    ffffec3216ffbba0: [ffffec3216ffbbc0:thread_stack] nfs_readpage+284 
    ffffec3216ffbbb0: ffff7ff30a0911c0 0000000000000000 

再结合nfs_readpage_async的汇编代码:

crash> dis nfs_readpage_async
0xffff2f76d1d962f8 <nfs_readpage_async>:        stp     x29, x30, [sp,#-224]!
0xffff2f76d1d962fc <nfs_readpage_async+4>:      mov     x29, sp
0xffff2f76d1d96300 <nfs_readpage_async+8>:      stp     x21, x22, [sp,#32]
0xffff2f76d1d96304 <nfs_readpage_async+12>:     str     x19, [sp,#16]
0xffff2f76d1d96308 <nfs_readpage_async+16>:     str     x23, [sp,#48]
0xffff2f76d1d9630c <nfs_readpage_async+20>:     mov     x19, x2 # aarch64下该指令将x2的值复制到x19
0xffff2f76d1d96310 <nfs_readpage_async+24>:     mov     x22, x1
0xffff2f76d1d96314 <nfs_readpage_async+28>:     mov     x23, x0
0xffff2f76d1d96318 <nfs_readpage_async+32>:     mov     x0, x30
0xffff2f76d1d9631c <nfs_readpage_async+36>:     nop
0xffff2f76d1d96320 <nfs_readpage_async+40>:     ldr     x1, [x19,#8]
0xffff2f76d1d96324 <nfs_readpage_async+44>:     sub     x0, x1, #0x1
0xffff2f76d1d96328 <nfs_readpage_async+48>:     tst     x1, #0x1
0xffff2f76d1d9632c <nfs_readpage_async+52>:     csel    x0, x0, x19, ne
0xffff2f76d1d96330 <nfs_readpage_async+56>:     ldr     x0, [x0]
0xffff2f76d1d96334 <nfs_readpage_async+60>:     tst     w0, #0x40000
0xffff2f76d1d96338 <nfs_readpage_async+64>:     b.eq    0xffff2f76d1d96348 <nfs_readpage_async+80>
0xffff2f76d1d9633c <nfs_readpage_async+68>:     ldr     x0, [x19]
0xffff2f76d1d96340 <nfs_readpage_async+72>:     tst     w0, #0x200
0xffff2f76d1d96344 <nfs_readpage_async+76>:     b.ne    0xffff2f76d1d96418 <nfs_readpage_async+288>
0xffff2f76d1d96348 <nfs_readpage_async+80>:     ldr     x0, [x19,#24]
0xffff2f76d1d9634c <nfs_readpage_async+84>:     ldr     x0, [x0]
...

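aarch64架构下整数参数依次使用寄存器x0~x7传递,因此函数nfs_readpage_async的第三个参数page通过x2传入。从异常现场的寄存器数据中可以看出X2: ffff7ff30a0911c0,且函数入口处保存x2副本的x19(mov x19, x2)也是同一个值,说明x2寄存器的值未被覆盖。出错指令nfs_readpage_async+84(ldr x0, [x0])所解引用的x0来自上一条指令ldr x0, [x19,#24]的加载结果,而此时X0为0。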

crash> struct page ffff7ff30a0911c0 -x
struct page {
...
      mapping = 0x0, 
...
  _refcount = {
    counter = 0x3
  }, 
}

可以看到page->mapping已经为NULL。

3 分析与解决方案

具体的分析请查看《4.19 nfs_readpage_async()空指针解引用问题》。