4.19 nfs_wb_page() soft lockup

1 Symptom

A soft lockup is reported on a 4.19 kernel; the log is as follows:

[44257027.415633] watchdog: BUG: soft lockup - CPU#73 stuck for 22s! [ICCCCEKF/ICCEqu:2649529]
[44257027.505122] CPU: 73 PID: 2649529 Comm: ICCCCEKF/ICCEqu Kdump: loaded Tainted: G        W  OEL    4.19.90-23.29.v2101.ky10.aarch64 #1
[44257027.505123] pstate: 40400009 (nZcv daif +PAN -UAO)
[44257027.505142] pc : nfs_commit_end+0x28/0x48 [nfs]
[44257027.505147] lr : 0x1
[44257027.547172] sp : fffffadb78c7b930
[44257027.547173] x29: fffffadb78c7b930 x28: 0000000000000000 
[44257027.547175] x27: fffffaa393635708 x26: ffff3d862f4bf000 
[44257027.547175] x25: 0000000000010000 x24: fffffaa393635598 
[44257027.547177] x23: 00000000fffffffe x22: 0000000000000000 
[44257027.547178] x21: fffffadb78c7b998 x20: 0000000000000000 
[44257027.547179] x19: fffffaa3936354b8 x18: 0000000000000000 
[44257027.547180] x17: 0000000000000000 x16: ffff3d8680813920 
[44257027.547181] x15: 0000000000000000 x14: d293b0140efe4f58 
[44257027.547182] x13: afa441f967f4fab6 x12: ffffffffffffff83 
[44257027.547183] x11: ffffffffffffff83 x10: fffffae27fffcec0 
[44257027.547183] x9 : 000000000000001a x8 : 0000000000000000 
[44257027.547184] x7 : 0000000000000000 x6 : fffffae259a314c0 
[44257027.547185] x5 : fffffadb78c7b898 x4 : ffff3d8681d905d8 
[44257027.547186] x3 : 01ffffc00000102b x2 : ffff3d8629f81000 
[44257027.547187] x1 : fffffaa3936354b8 x0 : 00000000ffffffff 
[44257027.570469] Call trace:
[44257027.570479]  nfs_commit_end+0x28/0x48 [nfs]
[44257027.570489]  __nfs_commit_inode+0x114/0x1a8 [nfs]
[44257027.570501]  nfs_wb_page+0xc0/0x210 [nfs]
[44257027.581163]  nfs_writepage_setup+0xb4/0x5c0 [nfs]
[44257027.581175]  nfs_updatepage+0x144/0x420 [nfs]
[44257027.592961]  nfs_write_end+0x80/0x348 [nfs]
[44257027.592966]  generic_perform_write+0xfc/0x188
[44257027.592977]  nfs_file_write+0xc8/0x260 [nfs]
[44257027.614899]  __vfs_write+0x74/0x80
[44257027.614903]  vfs_write+0xac/0x1c0
[44257027.626860]  ksys_write+0x5c/0xc8
[44257027.626862]  __arm64_sys_write+0x24/0x30

2 Log analysis

Parsing a soft lockup log is honestly not very useful, but out of habit I still wanted to work through it; practice makes perfect, and there is always something to gain. Sure enough, I found that the mainline scripts/faddr2line (as of 2024.12.17) can no longer resolve a 4.19 vmlinux:

# use the faddr2line from the 4.19 tree; the mainline script does not work
# PC holds the address of the instruction currently being executed
faddr2line vmlinux clear_page_dirty_for_io+0x1f8/0x3f0
clear_page_dirty_for_io+0x1f8/0x3f0:
unlocked_inode_to_wb_end at include/linux/backing-dev.h:393
(inlined by) clear_page_dirty_for_io at mm/page-writeback.c:2737 # unlocked_inode_to_wb_end(inode, &cookie);

# LR holds the return address of the function call
faddr2line vmlinux clear_page_dirty_for_io+0x84/0x3f0
clear_page_dirty_for_io+0x84/0x3f0:
clear_page_dirty_for_io at mm/page-writeback.c:2720 # if (page_mkclean(page))
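
As a cross-check, gdb can resolve the same symbol+offset addresses, assuming the vmlinux was built with debug info:

gdb vmlinux
(gdb) list *(clear_page_dirty_for_io+0x1f8)
(gdb) list *(clear_page_dirty_for_io+0x84)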

3 vmcore analysis

md5sum vmcore 
8931169f2c9a7e43ad917ecf73b24b25  vmcore

Look at process 2649529:

crash> bt 2649529
PID: 2649529  TASK: fffffae18c896480  CPU: 73  COMMAND: "ICCCCEKF/ICCEqu"
 #0 [fffffae259a2fe30] crash_save_cpu at ffff3d868073fb1c
 #1 [fffffae259a2ffe0] handle_IPI at ffff3d86806274f0
 #2 [fffffae259a30050] gic_handle_irq at ffff3d8680611740
     PC: 0000000040400009   LR: fffffadb78c7b810   SP: ffff3d868082f848
    X29: fffffae259a30080  X28: fffffae259a2c080  X27: fffffaa393635598
    X26: 0000000040400009  X25: ffff3d8629f67d60  X24: fffffadb78c7b7f0
    X23: 0000000000000003  X22: 0000000000000003  X21: ffff3d8680611744
    X20: fffffae259a30050  X19: 0000000000000000  X18: ffff3d8681980000
    X17: ffff3d8680627534  X16: fffffae259a2ffe0  X15: 0000000000000000
    X14: 0000000000000049  X13: ffff3d8680620ab0  X12: fffffae259a2ffc0
    X11: ffff3d862f4bf000  X10: 0000000000000049   X9: 0000000000000003
     X8: 0000000000000000   X7: fffffadb78c7b7f0   X6: ffff3d8681a0c000
     X5: 0000000000000049   X4: 0000000000010000   X3: ffff3d8680f5fa08
     X2: fffffae259a2ffa0   X1: 0000000000000000   X0: fffffae259a2ff80
    ORIG_X0: fffffadb78c7b920  SYSCALLNO: ffff3d86806133b8  PSTATE: 00000000

This stack does not reveal the page-related information we need.
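
If you want to fish for a page pointer by hand, crash's "bt -f" option dumps the raw contents of every stack frame:

crash> bt -f 2649529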

Next, look at the processes named ICCCCEKF/ICCEqu:

crash> ps ICCCCEKF/ICCEqu
   PID    PPID  CPU       TASK        ST  %MEM     VSZ    RSS  COMM
  2395050      1  66  fffffabb04ff3580  UN   0.0 12894528 1267776  ICCCCEKF/ICCEqu
  2395054      1  32  fffffae0e22c8b80  UN   0.0 13167744 1199168  ICCCCEKF/ICCEqu
  2409446      1  93  ffffdaa35b0c1f00  UN   0.0 10686784 1151232  ICCCCEKF/ICCEqu
  2649513      1  17  fffffad84ab0ba00  UN   0.0 12496000 1198976  ICCCCEKF/ICCEqu
> 2649529      1  73  fffffae18c896480  RU   0.0 12493120 1187456  ICCCCEKF/ICCEqu

The first four processes have similar stacks; take the first one, PID 2395050:

crash> bt 2395050
PID: 2395050  TASK: fffffabb04ff3580  CPU: 66  COMMAND: "ICCCCEKF/ICCEqu"
 #0 [fffffadb7cabba70] __switch_to at ffff3d8680618ba4
 #1 [fffffadb7cabba90] __schedule at ffff3d86811859ac
 #2 [fffffadb7cabbb10] schedule at ffff3d8681186098
 #3 [fffffadb7cabbb20] rwsem_down_write_failed at ffff3d8681189c00
 #4 [fffffadb7cabbbb0] down_write at ffff3d8681188e04
 #5 [fffffadb7cabbbd0] nfs_start_io_write at ffff3d8629f618cc [nfs]
 #6 [fffffadb7cabbbf0] nfs_file_write at ffff3d8629f59038 [nfs]
 #7 [fffffadb7cabbc30] new_sync_write at ffff3d86808bb4d0
 #8 [fffffadb7cabbcc0] __vfs_write at ffff3d86808be080
 #9 [fffffadb7cabbcf0] vfs_write at ffff3d86808be290
#10 [fffffadb7cabbd30] ksys_write at ffff3d86808be5c0
#11 [fffffadb7cabbd70] __arm64_sys_write at ffff3d86808be650
#12 [fffffadb7cabbd90] _MODULE_START_syshook_linux at ffff3d862a4514b0 [syshook_linux]
#13 [fffffadb7cabbdf0] _MODULE_START_syshook_linux at ffff3d862a453988 [syshook_linux]
#14 [fffffadb7cabbe60] el0_svc_common at ffff3d86806283ec
#15 [fffffadb7cabbea0] el0_svc_handler at ffff3d86806284dc
#16 [fffffadb7cabbff0] el0_svc at ffff3d8680614084

These processes cannot acquire the rwsem in nfs_start_io_write() and have gone to sleep.
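
For reference, nfs_start_io_write() in 4.19 is just a write lock on the inode plus a direct I/O barrier (abridged from fs/nfs/io.c; check your tree for the exact code):

/* fs/nfs/io.c (4.19, abridged) */
void
nfs_start_io_write(struct inode *inode)
{
        down_write(&inode->i_rwsem);    /* the rwsem all of these writers are queued on */
        nfs_block_o_direct(NFS_I(inode), inode);
}

So the one task that is stuck looping in the write path while holding inode->i_rwsem blocks every other writer to the same file, which is why the four other ICCCCEKF/ICCEqu processes are in UN state.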

4 Debugging

Apply the following change:

--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -659,6 +659,8 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 out:
        return ret;
 out_launder:
+       printk("%s:%d, err:%d\n", __func__, __LINE__, ret);
+       dump_stack();
        nfs_write_error_remove_page(req);
        return 0;
 }

Then trace the following functions with kprobes, setting kprobe_func_name to each of them in turn and running the script below:

kprobe_func_name=nfs_wb_page
kprobe_func_name=nfs_writepage_locked
kprobe_func_name=nfs_do_writepage
kprobe_func_name=nfs_page_async_flush
kprobe_func_name=nfs_error_is_fatal_on_server

cd /sys/kernel/debug/tracing/
# functions that can be traced with kprobes
cat available_filter_functions | grep ${kprobe_func_name}
echo 1 > tracing_on
# on aarch64, function arguments are passed in registers X0 ~ X7
echo "p:p_${kprobe_func_name} ${kprobe_func_name} err=%x0:x32" >> kprobe_events
echo 1 > events/kprobes/p_${kprobe_func_name}/enable
echo stacktrace > events/kprobes/p_${kprobe_func_name}/trigger
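# teardown: remove the trigger, then disable and delete the probe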
echo '!stacktrace' > events/kprobes/p_${kprobe_func_name}/trigger
echo 0 > events/kprobes/p_${kprobe_func_name}/enable
echo "-:p_${kprobe_func_name}" >> kprobe_events

echo 0 > trace # clear the trace buffer
cat trace_pipe

Although nfs_error_is_fatal_on_server() can be found in /sys/kernel/debug/tracing/available_filter_functions (which means it should be traceable with kprobes), the kprobe never fires on the nfs_page_async_flush() -> nfs_error_is_fatal_on_server() call path.
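
The most likely reason is inlining: nfs_error_is_fatal_on_server() is a static inline helper in fs/nfs/internal.h, so the call inside nfs_page_async_flush() is compiled into the caller, and a kprobe placed on the out-of-line copy of the symbol never fires on this path. After 14bebe3c90b3 it looks roughly like this (a sketch; check your tree for the exact error list):

static inline bool nfs_error_is_fatal_on_server(int err)
{
        switch (err) {
        case 0:
        case -ERESTARTSYS:
        case -EINTR:
                return false;
        }
        return nfs_error_is_fatal(err);
}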

5 Code analysis and reproduction

See the earlier article "4.19 nfs_updatepage() NULL pointer dereference".

5.1 Reproduction

Apply the patches 22876f540bdf ("NFS: Don't call generic_error_remove_page() while holding locks") and 89047634f5ce ("NFS: Don't interrupt file writeout due to fatal errors"), plus the constructed reproduction patch 0001-reproduce-4.19-null-ptr-deref-in-nfs_updatepage.patch. Reproduction steps:

mount -t nfs -o vers=4.1 ${nfs_server_ip}:/server/export/dir /mnt

echo something > something
echo something_else > something_else
echo something_else_again > something_else_again
# why not just "echo something > /mnt/file"? Because echo is a shell builtin, so ps cannot show an echo process
cat something > /mnt/file &
cat something_else > /mnt/file &
cat something_else_again > /mnt/file &

We can see:

root@syzkaller:~# jobs
[1]-  Running                 cat something > /mnt/file &
[3]+  Running                 cat something_else_again > /mnt/file &

Check the states of the processes:

root@syzkaller:~# ps aux | grep cat
root         448  100  0.0   4740   568 ttyS0    R    07:16   8:30 cat something
root         450  0.0  0.0   4740   504 ttyS0    D    07:16   0:00 cat something_else_again

As an aside, look up the PIDs from the file name (not a necessary step; just recording an attempt I once made):

root@syzkaller:~# lsof /mnt/file
COMMAND PID USER   FD   TYPE DEVICE SIZE/OFF NODE NAME
cat     448 root    1w   REG   0,35       15   12 /mnt/file
cat     450 root    1w   REG   0,35       15   12 /mnt/file
root@syzkaller:~# fuser /mnt/file
/mnt/file:             448   450

Look at the process stacks:

root@syzkaller:~# ls /proc/448/task/
448 # only one thread
root@syzkaller:~# cat /proc/448/stack
[<0>] 0xffffffffffffffff
root@syzkaller:~# ls /proc/450/task/
450 # only one thread
root@syzkaller:~# cat /proc/450/stack 
[<0>] call_rwsem_down_write_failed+0x13/0x20
[<0>] nfs_start_io_write+0x1a/0x2b
[<0>] nfs_file_write+0x1be/0x6e5
[<0>] new_sync_write+0x442/0x560
[<0>] __vfs_write+0xda/0xef
[<0>] vfs_write+0x176/0x48b
[<0>] ksys_write+0x10a/0x1e9
[<0>] __se_sys_write+0x24/0x29
[<0>] __x64_sys_write+0x79/0x93
[<0>] do_syscall_64+0x16d/0x4bb
[<0>] entry_SYSCALL_64_after_hwframe+0x5c/0xc1
[<0>] 0xffffffffffffffff
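
PID 448's stack shows only 0xffffffffffffffff because the task is running: /proc/<pid>/stack can only unwind a sleeping task. That matches the ps output above, where cat 448 is in R state eating 100% CPU while cat 450 sleeps in D state on the rwsem.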

5.2 Code analysis

With 22876f540bdf ("NFS: Don't call generic_error_remove_page() while holding locks") applied, generic_error_remove_page() is not executed and the page's private flag is never cleared, so nfs_wb_page() spins in an infinite loop.
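
The loop in question, abridged from 4.19 fs/nfs/write.c (a sketch; check your tree for the exact code):

int nfs_wb_page(struct inode *inode, struct page *page)
{
        loff_t range_start = page_file_offset(page);
        loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = 0,
                .range_start = range_start,
                .range_end = range_end,
        };
        int ret;

        for (;;) {
                wait_on_page_writeback(page);
                if (clear_page_dirty_for_io(page)) {
                        /* first pass: writes the page and hits the fatal-error path */
                        ret = nfs_writepage_locked(page, &wbc);
                        if (ret < 0)
                                goto out_error;
                        continue;
                }
                ret = 0;
                /* never true here, because the private flag was never cleared */
                if (!PagePrivate(page))
                        break;
                /* nothing left to commit, returns 0, and we go around again */
                ret = nfs_commit_inode(inode, FLUSH_SYNC);
                if (ret < 0)
                        goto out_error;
        }
out_error:
        return ret;
}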

// the cat process that runs first, i.e. the one whose /proc stack shows only 0xffffffffffffffff
write
  ksys_write
    vfs_write
      __vfs_write
        new_sync_write
          nfs_file_write
            nfs_start_io_write
              // add debug prints here
              if (!strcmp(inode->i_sb->s_type->name, "nfs4"))
              printk("%s:%d, inode:%p\n", __func__, __LINE__, inode);
              dump_stack()
              down_write(&inode->i_rwsem); // takes the lock; the other cat process, which runs later, then fails to take it
            generic_perform_write
              nfs_write_end
                nfs_updatepage
                  nfs_writepage_setup
                    nfs_setup_write_request
                      // look up an existing request; if one exists, update it and return non-NULL
                      nfs_try_to_update_request // return NULL
                        nfs_wb_page // return 0
                          if (clear_page_dirty_for_io(page)) // condition is true
                          nfs_writepage_locked // return 0
                            nfs_do_writepage // return 0
                              // returns 0 once 14bebe3c90b3 ("NFS: Don't interrupt file writeout due to fatal errors") is applied
                              nfs_page_async_flush // return 0
                                nfs_error_is_fatal_on_server // a fatal error has occurred
                                nfs_write_error_remove_page
                                  // with 22876f540bdf ("NFS: Don't call generic_error_remove_page() while holding locks") applied,
                                  // generic_error_remove_page() is not executed; SetPageError() is called instead
                                  generic_error_remove_page
                                    truncate_inode_page
                                      truncate_cleanup_page
                                        do_invalidatepage
                                          nfs_invalidate_page
                                            nfs_wb_page_cancel
                                              nfs_inode_remove_request
                                                // because generic_error_remove_page() is not executed, this is never reached,
                                                // so the later if (!PagePrivate(page)) check never succeeds and we loop forever
                                                ClearPagePrivate(head->wb_page) // clears the private flag
                                      delete_from_page_cache
                                        __delete_from_page_cache
                                          page_cache_tree_delete
                                            page->mapping = NULL
                          continue
                          // next pass of the loop
                          if (clear_page_dirty_for_io(page)) // condition is false
                          if (!PagePrivate(page)) // condition is false: generic_error_remove_page() did not run, so the private flag is still set
                          ret = nfs_commit_inode = 0
                          // go around again; the loop never terminates
            nfs_end_io_write // never reached

// the other cat process, the one that runs later
write
  ksys_write
    vfs_write
      __vfs_write
        new_sync_write
          nfs_file_write
            nfs_start_io_write
              down_write(&inode->i_rwsem);
                call_rwsem_down_write_failed // fails to acquire the lock
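
The hinge of the analysis is the change to nfs_write_error_remove_page(). Roughly (sketched from the commit description; the exact 4.19 backport may differ in detail):

/* before 22876f540bdf: the page is truncated, clearing PagePrivate */
static void nfs_write_error_remove_page(struct nfs_page *req)
{
        nfs_end_page_writeback(req);
        generic_error_remove_page(page_file_mapping(req->wb_page),
                                  req->wb_page);
        nfs_release_request(req);
}

/* after: no truncation, so PagePrivate and page->mapping survive */
static void nfs_write_error_remove_page(struct nfs_page *req)
{
        SetPageError(req->wb_page);
        nfs_end_page_writeback(req);
        nfs_release_request(req);
}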

5.3 vmcore analysis

Reproducing with the steps above, the task does enter the infinite loop, but no soft lockup is reported (likely because the loop can still sleep and schedule, so the watchdog is never starved); dump a vmcore and analyze it.

crash> ps cat
      PID    PPID  CPU       TASK        ST  %MEM      VSZ      RSS  COMM
>     528       1  14  ffff88813b1c0000  RU   0.0     4800     1824  cat
      529       1   3  ffff88813b1c2f00  UN   0.0     4800     1688  cat

crash> bt 528
PID: 528      TASK: ffff88813b1c0000  CPU: 14   COMMAND: "cat"
    [exception RIP: page_mapping+50]
    RIP: ffffffff81258672  RSP: ffffc900010abb70  RFLAGS: 00000246
    RAX: ffffea0004cbebc0  RBX: ffff88812a8cc380  RCX: 0000000000000000
    RDX: 0017ffffc000002b  RSI: 0000000000000001  RDI: ffffea0004cbebc0
    RBP: ffffc900010abb70   R8: ffffffff82606600   R9: ffff88813bdaa778
    R10: 0000000000000000  R11: 0000000002d6ac5c  R12: 0000000000000000
    R13: ffffc900010abbd0  R14: ffffea0004cbebc0  R15: ffffea0004cbebc0
    CS: 0010  SS: 0018
 #0 [ffffc900010abb78] clear_page_dirty_for_io at ffffffff8123dc3e
 #1 [ffffc900010abbc8] nfs_wb_page at ffffffff81429077
 #2 [ffffc900010abc70] nfs_writepage_setup at ffffffff814293a0
 #3 [ffffc900010abcb0] nfs_updatepage at ffffffff8142966e
 #4 [ffffc900010abcf8] nfs_write_end at ffffffff81416d17
 #5 [ffffc900010abd40] generic_perform_write at ffffffff8122df1a
 #6 [ffffc900010abdd0] nfs_file_write at ffffffff81417972
 #7 [ffffc900010abe08] new_sync_write at ffffffff812dfea7
 #8 [ffffc900010abe90] __vfs_write at ffffffff812e33a9
 #9 [ffffc900010abea0] vfs_write at ffffffff812e356d
#10 [ffffc900010abed8] ksys_write at ffffffff812e39ae
#11 [ffffc900010abf18] __x64_sys_write at ffffffff812e3a49
#12 [ffffc900010abf28] do_syscall_64 at ffffffff81004384
#13 [ffffc900010abf50] entry_SYSCALL_64_after_hwframe at ffffffff81c00088
    RIP: 00007f07cbd624b3  RSP: 00007ffdbdd43748  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 000000000000000a  RCX: 00007f07cbd624b3
    RDX: 000000000000000a  RSI: 00007f07cba25000  RDI: 0000000000000001
    RBP: 00007f07cba25000   R8: 00007f07cba24010   R9: 0000000000000000
    R10: fffffffffffffbc5  R11: 0000000000000246  R12: 0000000000000001
    R13: 0000000000000001  R14: 0000000000080000  R15: 0000000000080000
    ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b

Disassemble:

/home/sonvhi/chenxiaosong/code/klinux-4.19/x86_64-build/../mm/util.c: 529
0xffffffff81258640 <page_mapping>:      nopl   0x0(%rax,%rax,1) [FTRACE NOP]
/home/sonvhi/chenxiaosong/code/klinux-4.19/x86_64-build/../include/linux/compiler.h: 294
0xffffffff81258645 <page_mapping+5>:    mov    0x8(%rdi),%rdx
/home/sonvhi/chenxiaosong/code/klinux-4.19/x86_64-build/../include/linux/page-flags.h: 144
0xffffffff81258649 <page_mapping+9>:    mov    %rdi,%rax
/home/sonvhi/chenxiaosong/code/klinux-4.19/x86_64-build/../mm/util.c: 529
0xffffffff8125864c <page_mapping+12>:   push   %rbp
/home/sonvhi/chenxiaosong/code/klinux-4.19/x86_64-build/../include/linux/page-flags.h: 145
0xffffffff8125864d <page_mapping+13>:   lea    -0x1(%rdx),%rcx
0xffffffff81258651 <page_mapping+17>:   and    $0x1,%edx
0xffffffff81258654 <page_mapping+20>:   cmovne %rcx,%rax
/home/sonvhi/chenxiaosong/code/klinux-4.19/x86_64-build/../include/linux/compiler.h: 294
0xffffffff81258658 <page_mapping+24>:   mov    %rsp,%rbp
0xffffffff8125865b <page_mapping+27>:   mov    0x8(%rax),%rcx
/home/sonvhi/chenxiaosong/code/klinux-4.19/x86_64-build/../include/linux/page-flags.h: 144
0xffffffff8125865f <page_mapping+31>:   lea    -0x1(%rcx),%rdx
0xffffffff81258663 <page_mapping+35>:   and    $0x1,%ecx
0xffffffff81258666 <page_mapping+38>:   cmove  %rax,%rdx
/home/sonvhi/chenxiaosong/code/klinux-4.19/x86_64-build/../arch/x86/include/asm/bitops.h: 317
0xffffffff8125866a <page_mapping+42>:   mov    (%rdx),%rdx
/home/sonvhi/chenxiaosong/code/klinux-4.19/x86_64-build/../mm/util.c: 535
0xffffffff8125866d <page_mapping+45>:   and    $0x1,%dh
0xffffffff81258670 <page_mapping+48>:   jne    0xffffffff812586e5 <page_mapping+165>
/home/sonvhi/chenxiaosong/code/klinux-4.19/x86_64-build/../include/linux/compiler.h: 294
0xffffffff81258672 <page_mapping+50>:   mov    0x8(%rax),%rcx
/home/sonvhi/chenxiaosong/code/klinux-4.19/x86_64-build/../include/linux/page-flags.h: 144

We can see that RDI, which holds the first argument of page_mapping(), has not been clobbered, so RDI: ffffea0004cbebc0 is the pointer to the page:

crash> kmem ffffea0004cbebc0
      PAGE       PHYSICAL      MAPPING       INDEX CNT FLAGS
ffffea0004cbebc0 132faf000 ffff88812a8cc4f0        0  3 17ffffc000102b locked,error,uptodate,lru,private
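
The flags line is consistent with the analysis above: private is still set, so the if (!PagePrivate(page)) check in nfs_wb_page() can never break out of the loop; error was set by SetPageError() on the fatal-error path; and MAPPING is still non-NULL because the page was never truncated out of the page cache.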