4.19 nfs_readpage_async()空指针解引用问题

1 问题描述

环境信息:

# uname -a
Linux server-v10-sp2-x86 4.19.90-25.30.v2101.ky10.x86_64 #1 SMP Thu Aug 17 18:12:26 CST 2023 x86_64 x86_64 x86_64 GNU/Linux

# mount | grep nfs
172.20.190.58:/data/iso on /data/iso type nfs4 (rw,relatime,vers=4.2,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,clientaddr=172.20.190.55,local_lock=none,addr=172.20.190.58,_netdev)
nfsd on /proc/fs/nfsd type nfsd (rw,relatime)

dmesg日志:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000000      
CPU: 104 PID: 1861326 Comm: filebeat Kdump: loaded Tainted: G        W         4.19.90-24.4.v2101.ky10.x86_64 #1
Hardware name: Lenovo ThinkServer SR658H V2/HengShan    , BIOS HSE166A-1.16 01/05/2023
RIP: 0010:nfs_readpage_async+0x67/0x2d0 [nfs]
Call Trace:
 nfs_readpage+0x121/0x1e0 [nfs]
 generic_file_read_iter+0x60f/0xba0
 nfs_file_read+0x66/0xa0 [nfs]
 new_sync_read+0xfa/0x160
 vfs_read+0x89/0x130
 ksys_read+0x5a/0xd0
 do_syscall_64+0x5b/0x1d0
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

faddr2line脚本解析:

server-v10-sp2-x86:~# ./faddr2line fs/nfs/nfs.ko.debug nfs_readpage_async+0x67/0x2d0
nfs_readpage_async+0x67/0x2d0:
page_file_mapping 于 /usr/src/debug/kernel-4.19.90/linux-4.19.90-24.4.v2101.ky10.x86_64/./include/linux/mm.h:1242
(已内连入)nfs_page_length 于 /usr/src/debug/kernel-4.19.90/linux-4.19.90-24.4.v2101.ky10.x86_64/fs/nfs/internal.h:681
(已内连入)nfs_readpage_async 于 /usr/src/debug/kernel-4.19.90/linux-4.19.90-24.4.v2101.ky10.x86_64/fs/nfs/read.c:118

空指针解引用发生在nfs_readpage_async -> nfs_page_length -> page_file_mapping的语句return page->mapping,因为page_file_mapping是内联函数,所以空指针解引用是发生在nfs_page_length函数中的i_size_read(page_file_mapping(page)->host)host成员在struct address_space中的偏移量为0。

2 vmcore解析

加载nfs相关ko文件:

crash> mod -s nfs fs/nfs/nfs.ko.debug 
crash> mod -s nfsv3 fs/nfs/nfsv3.ko.debug
crash> mod -s nfsv4 fs/nfs/nfsv4.ko.debug
crash> mod -s nfs_acl fs/nfs_common/nfs_acl.ko.debug
crash> mod -s grace fs/nfs_common/grace.ko.debug

查看栈的信息:

crash> bt -FF
#7 [ffffbcb34f133ba0] page_fault at ffffffff8c80116e
    [exception RIP: nfs_readpage_async+103]
    RIP: ffffffffc0a43047  RSP: ffffbcb34f133c50  RFLAGS: 00010246
    RAX: 0000000000000000  RBX: ffffe3e4768a6080  RCX: 0000000000000000
    RDX: 01d7ffffc0000001  RSI: ffff97a5b452a940  RDI: ffff979b656b3280
    RBP: ffff97a5b452a940   R8: 0000000000000001   R9: 0000000000000000
    R10: ffff97a6f6213680  R11: 0000000000000000  R12: ffff97a5b452a940
    R13: ffff979b656b3280  R14: ffff97a5b452aab0  R15: ffffbcb34f133e60
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
    ffffbcb34f133ba8: ffffbcb34f133e60 [ffff97a5b452aab0:nfs_inode_cache]
    ffffbcb34f133bb8: [ffff979b656b3280:kmalloc-128] [ffff97a5b452a940:nfs_inode_cache]
    ffffbcb34f133bc8: [ffff97a5b452a940:nfs_inode_cache] ffffe3e4768a6080
    ffffbcb34f133bd8: 0000000000000000 [ffff97a6f6213680:secpath_cache]
    ffffbcb34f133be8: 0000000000000000 0000000000000001
    ffffbcb34f133bf8: 0000000000000000 0000000000000000
    ffffbcb34f133c08: 01d7ffffc0000001 [ffff97a5b452a940:nfs_inode_cache]
    ffffbcb34f133c18: [ffff979b656b3280:kmalloc-128] ffffffffffffffff
    ffffbcb34f133c28: nfs_readpage_async+103 0000000000000010
    ffffbcb34f133c38: 0000000000010246 ffffbcb34f133c50
    ffffbcb34f133c48: 0000000000000018 ffffbcb34f133c80
    ffffbcb34f133c58: ffffbcb34f133dd8 [ffff97a5b452aab0:nfs_inode_cache]
    ffffbcb34f133c68: ffffbcb34f133e60 nfs_wb_page+190
    ffffbcb34f133c78: 0000000000000246 0000000000000000
    ffffbcb34f133c88: 0000000000000000 00000000019d3000
    ffffbcb34f133c98: 00000000019d3fff 0000000000000001
    ffffbcb34f133ca8: 0000000000000000 0000000000000000
    ffffbcb34f133cb8: 0000000000000000 0000000000000000
    ffffbcb34f133cc8: 0000000000000000 0000000000000000
    ffffbcb34f133cd8: 0000000000000000 0000000000000000
    ffffbcb34f133ce8: 0000000000000000 faff81de0e046600
    ffffbcb34f133cf8: ffffe3e4768a6080 [ffff97a5b452a940:nfs_inode_cache]
    ffffbcb34f133d08: [ffff979b656b3280:kmalloc-128] 0000000000000000
    ffffbcb34f133d18: nfs_readpage+289
#8 [ffffbcb34f133d18] nfs_readpage at ffffffffc0a433d1 [nfs]
    ffffbcb34f133d20: ffffe3e4768a6080 0000000000004000
    ffffbcb34f133d30: [ffff97a5b452a940:nfs_inode_cache] ffffbcb34f133dd8
    ffffbcb34f133d40: generic_file_read_iter+1551

反汇编nfs_readpage_async崩溃前的代码:

crash> dis nfs_readpage_async
0xffffffffc0a42fe0 <nfs_readpage_async>:        nopl   0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffc0a42fe5 <nfs_readpage_async+5>:      push   %r13
0xffffffffc0a42fe7 <nfs_readpage_async+7>:      push   %r12
0xffffffffc0a42fe9 <nfs_readpage_async+9>:      mov    %rdi,%r13
0xffffffffc0a42fec <nfs_readpage_async+12>:     push   %rbp
0xffffffffc0a42fed <nfs_readpage_async+13>:     push   %rbx
0xffffffffc0a42fee <nfs_readpage_async+14>:     mov    %rdx,%rbx
0xffffffffc0a42ff1 <nfs_readpage_async+17>:     mov    %rsi,%r12
0xffffffffc0a42ff4 <nfs_readpage_async+20>:     sub    $0xa8,%rsp
0xffffffffc0a42ffb <nfs_readpage_async+27>:     mov    %gs:0x28,%rax
0xffffffffc0a43004 <nfs_readpage_async+36>:     mov    %rax,0xa0(%rsp)
0xffffffffc0a4300c <nfs_readpage_async+44>:     xor    %eax,%eax
0xffffffffc0a4300e <nfs_readpage_async+46>:     mov    0x8(%rdx),%rdx
0xffffffffc0a43012 <nfs_readpage_async+50>:     lea    -0x1(%rdx),%rax
0xffffffffc0a43016 <nfs_readpage_async+54>:     and    $0x1,%edx
0xffffffffc0a43019 <nfs_readpage_async+57>:     cmove  %rbx,%rax
0xffffffffc0a4301d <nfs_readpage_async+61>:     mov    0x8(%rax),%rcx
0xffffffffc0a43021 <nfs_readpage_async+65>:     lea    -0x1(%rcx),%rdx
0xffffffffc0a43025 <nfs_readpage_async+69>:     and    $0x1,%ecx
0xffffffffc0a43028 <nfs_readpage_async+72>:     cmove  %rax,%rdx
0xffffffffc0a4302c <nfs_readpage_async+76>:     mov    (%rdx),%rdx
0xffffffffc0a4302f <nfs_readpage_async+79>:     test   $0x40000,%edx
0xffffffffc0a43035 <nfs_readpage_async+85>:     je     0xffffffffc0a43043 <nfs_readpage_async+99>
0xffffffffc0a43037 <nfs_readpage_async+87>:     mov    (%rax),%rax
0xffffffffc0a4303a <nfs_readpage_async+90>:     test   $0x2,%ah
0xffffffffc0a4303d <nfs_readpage_async+93>:     jne    0xffffffffc0a4315e <nfs_readpage_async+382>
0xffffffffc0a43043 <nfs_readpage_async+99>:     mov    0x18(%rbx),%rax
0xffffffffc0a43047 <nfs_readpage_async+103>:    mov    (%rax),%rax
...

x86_64架构下整数参数使用的寄存器依次为: RDI,RSI,RDX,RCX,R8,R9,从栈帧数据中可以看出函数int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page)第三个参数为rdx,但从汇编代码mov %rdx,%rbx可以看出rdx已经赋值给rbx,而且rdx的值已经被覆盖,所以第三个参数pageRBX: ffffe3e4768a6080:

crash> struct page ffffe3e4768a6080 -x
struct page {
      mapping = 0x0,
  ...
  _refcount = {
    counter = 0x2
  },
...

可以看到page->mapping已经为NULL

再看一下nfs_readpage_async函数的第二个参数RSI: ffff97a5b452a940:

crash> kmem ffff97a5b452a940
  NAME
  nfs_inode_cache
  ...
  FREE / [ALLOCATED]
  [ffff97a5b452a760] # 栈中[ffff97a5b452a940:nfs_inode_cache]的含义

crash> struct nfs_inode ffff97a5b452a760 -ox
struct nfs_inode {
  ...
  [ffff97a5b452a940] struct inode vfs_inode; # 可以看出`struct inode`是嵌套在`struct nfs_inode`中的
}
SIZE: 0x458

crash> struct inode ffff97a5b452a940 -x
struct inode {
  ...
  i_mapping = 0xffff97a5b452aab0,
  ...
}

地址0xffff97a5b452aab0存在于栈中,具体为什么,有待分析。

3 代码分析

读文件时,如果有数据需要写到server端,而这时又刚好有个致命错误发生,page->mapping被置为NULL,接着执行到nfs_readpage_async就发生了空指针解引用。具体根因和解决方案请查看《4.19 nfs_updatepage()空指针解引用问题》。

read
  ksys_read
    vfs_read
      __vfs_read
        new_sync_read
          call_read_iter
            nfs_file_read
              generic_file_read_iter
                generic_file_buffered_read
                  page_cache_sync_readahead
                    ondemand_readahead
                      ra_submit
                        __do_page_cache_readahead
                          read_pages // 修改成总是执行mapping->a_ops->readpage
                            nfs_readpage

nfs_readpage
  inode = page_file_mapping(page)->host // 这里没问题
  nfs_wb_page // return 0,不退出,而是继续往下执行
    nfs_writepage_locked // return 0
      nfs_do_writepage // return 0
        // 合入补丁 14bebe3c90b3 NFS: Don't interrupt file writeout due to fatal errors 后返回 0
        nfs_page_async_flush // return 0
          nfs_error_is_fatal_on_server // 发生致命错误时
          generic_error_remove_page
            truncate_inode_page
              delete_from_page_cache
                __delete_from_page_cache
                  page_cache_tree_delete
                    page->mapping = NULL
  nfs_readpage_async
    nfs_page_length
      page_file_mapping(page)->host // 发生空指针解引用
        page->mapping == 0

4 构造

以下修改使得执行buffer读可以调用到函数nfs_readpage:

Author: ChenXiaoSong <chenxiaosongemail@foxmail.com>
Date:   Fri Nov 10 02:21:20 2023 +0000

    always use mapping->a_ops->readpage
    
    Signed-off-by: ChenXiaoSong <chenxiaosongemail@foxmail.com>

diff --git a/mm/readahead.c b/mm/readahead.c
index 4e630143a0ba..5cb303815e5f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -119,7 +119,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 
        blk_start_plug(&plug);
 
-       if (mapping->a_ops->readpages) {
+       if (0) { // mapping->a_ops->readpages) {
                ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
                /* Clean up the remaining pages */
                put_pages_list(pages);

这里我只找到办法在direct读的时候触发回写,暂时还未找到在buffer读的时候触发回写(因为buffer读直接从page cache里读取数据),有兴趣的朋友可以和我一起尝试。direct读触发回写的用户态程序是direct-read-trigger-writeback.c,执行以下脚本:

touch /mnt/file
sync
gcc direct-read-trigger-writeback.c
echo 3 > /proc/sys/vm/drop_caches
./a.out

direct读没有执行到nfs_readpage,以下是gdb断点的栈(有关gdb调试内核的方法可以查看《Linux内核课程》):

(gdb) bt
#0  generic_error_remove_page (mapping=0xffff88812d04c298, page=0xffffea0004b28700) at ../mm/truncate.c:235
#1  0xffffffff815310bf in nfs_write_error_remove_page (req=0xffff88813899dc80) at ../fs/nfs/write.c:602
#2  0xffffffff815321aa in nfs_page_async_flush (pgio=0xffffc90000dbfb50, page=0xffffea0004b28700) at ../fs/nfs/write.c:664
#3  0xffffffff815323d6 in nfs_do_writepage (page=0xffffea0004b28700, wbc=0xffffc90000dbfc38, pgio=0xffffc90000dbfb50) at ../fs/nfs/write.c:674
#4  0xffffffff81532442 in nfs_writepages_callback (page=0xffffea0004b28700, wbc=0xffffc90000dbfc38, data=0xffffc90000dbfb50) at ../fs/nfs/write.c:717
#5  0xffffffff812eb6ea in write_cache_pages (mapping=0xffff88812d04c298, wbc=0xffffc90000dbfc38, writepage=<optimized out>, data=<optimized out>) at ../mm/page-writeback.c:2240
#6  0xffffffff81533571 in nfs_writepages (mapping=0xffff88812d04c298, wbc=0xffffc90000dbfc38) at ../fs/nfs/write.c:742
#7  0xffffffff812ec9e4 in do_writepages (mapping=0xffff88812d04c298, wbc=0xffffc90000dbfc38) at ../mm/page-writeback.c:2344
#8  0xffffffff812d8c97 in __filemap_fdatawrite_range (mapping=0xffff88812d04c298, start=0, end=-131386645534720, sync_mode=<optimized out>) at ../mm/filemap.c:446
#9  0xffffffff812d8ce4 in __filemap_fdatawrite (sync_mode=<optimized out>, mapping=<optimized out>) at ../mm/filemap.c:454
#10 filemap_fdatawrite (mapping=0xffff88812d04c298) at ../mm/filemap.c:459
#11 0xffffffff812d8d0e in filemap_write_and_wait (mapping=0xffff88812d04c298) at ../mm/filemap.c:640
#12 0xffffffff81533d4b in nfs_wb_all (inode=0xffff88812d04c120) at ../fs/nfs/write.c:2000
#13 0xffffffff81522c57 in nfs_sync_mapping (mapping=0xffff88812d04c298) at ../fs/nfs/inode.c:153
#14 0xffffffff8152917d in nfs_block_buffered (nfsi=0xffff88812d04bf48, inode=0xffff88812d04c120) at ../fs/nfs/io.c:102
#15 0xffffffff815292a6 in nfs_start_io_direct (inode=0xffff88812d04c120) at ../fs/nfs/io.c:133
#16 0xffffffff8152acc2 in nfs_file_direct_read (iocb=0xffffc90000dbfdc8, iter=0xffffc90000dbfdf0, swap=false) at ../fs/nfs/direct.c:616
#17 0xffffffff8151d75a in nfs_file_read (iocb=0xffffc90000dbfdc8, to=0xffffc90000dbfdf0) at ../fs/nfs/file.c:160
#18 0xffffffff813c2ab2 in call_read_iter (iter=<optimized out>, kio=<optimized out>, file=<optimized out>) at ../include/linux/fs.h:1817
#19 new_sync_read (filp=0xffff88812c2f8000, buf=<optimized out>, len=100, ppos=0xffffc90000dbfec0) at ../fs/read_write.c:406
#20 0xffffffff813c67fb in __vfs_read (file=0xffff88812c2f8000, buf=0x7ffd836abc80 "", count=100, pos=0xffffc90000dbfec0) at ../fs/read_write.c:418
#21 0xffffffff813c68d0 in vfs_read (file=0xffff88812c2f8000, buf=0x7ffd836abc80 "", count=100, pos=0xffffc90000dbfec0) at ../fs/read_write.c:452
#22 0xffffffff813c6ee9 in ksys_read (fd=<optimized out>, buf=0x7ffd836abc80 "", count=100) at ../fs/read_write.c:579
#23 0xffffffff813c6f86 in __do_sys_read (count=<optimized out>, buf=<optimized out>, fd=<optimized out>) at ../fs/read_write.c:589
...