环境信息:
# uname -a
Linux server-v10-sp2-x86 4.19.90-25.30.v2101.ky10.x86_64 #1 SMP Thu Aug 17 18:12:26 CST 2023 x86_64 x86_64 x86_64 GNU/Linux
# mount | grep nfs
172.20.190.58:/data/iso on /data/iso type nfs4 (rw,relatime,vers=4.2,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,clientaddr=172.20.190.55,local_lock=none,addr=172.20.190.58,_netdev)
nfsd on /proc/fs/nfsd type nfsd (rw,relatime)
dmesg日志:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
CPU: 104 PID: 1861326 Comm: filebeat Kdump: loaded Tainted: G W 4.19.90-24.4.v2101.ky10.x86_64 #1
Hardware name: Lenovo ThinkServer SR658H V2/HengShan , BIOS HSE166A-1.16 01/05/2023
RIP: 0010:nfs_readpage_async+0x67/0x2d0 [nfs]
Call Trace:
nfs_readpage+0x121/0x1e0 [nfs]
generic_file_read_iter+0x60f/0xba0
nfs_file_read+0x66/0xa0 [nfs]
new_sync_read+0xfa/0x160
vfs_read+0x89/0x130
ksys_read+0x5a/0xd0
do_syscall_64+0x5b/0x1d0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
用faddr2line
脚本解析:
server-v10-sp2-x86:~# ./faddr2line fs/nfs/nfs.ko.debug nfs_readpage_async+0x67/0x2d0
nfs_readpage_async+0x67/0x2d0:
page_file_mapping 于 /usr/src/debug/kernel-4.19.90/linux-4.19.90-24.4.v2101.ky10.x86_64/./include/linux/mm.h:1242
(已内连入)nfs_page_length 于 /usr/src/debug/kernel-4.19.90/linux-4.19.90-24.4.v2101.ky10.x86_64/fs/nfs/internal.h:681
(已内连入)nfs_readpage_async 于 /usr/src/debug/kernel-4.19.90/linux-4.19.90-24.4.v2101.ky10.x86_64/fs/nfs/read.c:118
空指针解引用发生在nfs_readpage_async -> nfs_page_length -> page_file_mapping
的语句return page->mapping
,因为page_file_mapping
是内联函数,所以空指针解引用是发生在nfs_page_length
函数中的i_size_read(page_file_mapping(page)->host)
,host
成员在struct address_space
中的偏移量为0。
加载nfs相关ko文件:
crash> mod -s nfs fs/nfs/nfs.ko.debug
crash> mod -s nfsv3 fs/nfs/nfsv3.ko.debug
crash> mod -s nfsv4 fs/nfs/nfsv4.ko.debug
crash> mod -s nfs_acl fs/nfs_common/nfs_acl.ko.debug
crash> mod -s grace fs/nfs_common/grace.ko.debug
查看栈的信息:
crash> bt -FF
#7 [ffffbcb34f133ba0] page_fault at ffffffff8c80116e
exception RIP: nfs_readpage_async+103]
[RIP: ffffffffc0a43047 RSP: ffffbcb34f133c50 RFLAGS: 00010246
RAX: 0000000000000000 RBX: ffffe3e4768a6080 RCX: 0000000000000000
RDX: 01d7ffffc0000001 RSI: ffff97a5b452a940 RDI: ffff979b656b3280
RBP: ffff97a5b452a940 R8: 0000000000000001 R9: 0000000000000000
R10: ffff97a6f6213680 R11: 0000000000000000 R12: ffff97a5b452a940
R13: ffff979b656b3280 R14: ffff97a5b452aab0 R15: ffffbcb34f133e60
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
ffffbcb34f133ba8: ffffbcb34f133e60 [ffff97a5b452aab0:nfs_inode_cache]
ffffbcb34f133bb8: [ffff979b656b3280:kmalloc-128] [ffff97a5b452a940:nfs_inode_cache]
ffffbcb34f133bc8: [ffff97a5b452a940:nfs_inode_cache] ffffe3e4768a6080
ffffbcb34f133bd8: 0000000000000000 [ffff97a6f6213680:secpath_cache]
ffffbcb34f133be8: 0000000000000000 0000000000000001
ffffbcb34f133bf8: 0000000000000000 0000000000000000
ffffbcb34f133c08: 01d7ffffc0000001 [ffff97a5b452a940:nfs_inode_cache]
ffffbcb34f133c18: [ffff979b656b3280:kmalloc-128] ffffffffffffffff
ffffbcb34f133c28: nfs_readpage_async+103 0000000000000010
ffffbcb34f133c38: 0000000000010246 ffffbcb34f133c50
ffffbcb34f133c48: 0000000000000018 ffffbcb34f133c80
ffffbcb34f133c58: ffffbcb34f133dd8 [ffff97a5b452aab0:nfs_inode_cache]
ffffbcb34f133c68: ffffbcb34f133e60 nfs_wb_page+190
ffffbcb34f133c78: 0000000000000246 0000000000000000
ffffbcb34f133c88: 0000000000000000 00000000019d3000
ffffbcb34f133c98: 00000000019d3fff 0000000000000001
ffffbcb34f133ca8: 0000000000000000 0000000000000000
ffffbcb34f133cb8: 0000000000000000 0000000000000000
ffffbcb34f133cc8: 0000000000000000 0000000000000000
ffffbcb34f133cd8: 0000000000000000 0000000000000000
ffffbcb34f133ce8: 0000000000000000 faff81de0e046600
ffffbcb34f133cf8: ffffe3e4768a6080 [ffff97a5b452a940:nfs_inode_cache]
ffffbcb34f133d08: [ffff979b656b3280:kmalloc-128] 0000000000000000
ffffbcb34f133d18: nfs_readpage+289
#8 [ffffbcb34f133d18] nfs_readpage at ffffffffc0a433d1 [nfs]
ffffbcb34f133d20: ffffe3e4768a6080 0000000000004000
ffffbcb34f133d30: [ffff97a5b452a940:nfs_inode_cache] ffffbcb34f133dd8
ffffbcb34f133d40: generic_file_read_iter+1551
反汇编nfs_readpage_async
崩溃前的代码:
crash> dis nfs_readpage_async
0xffffffffc0a42fe0 <nfs_readpage_async>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffc0a42fe5 <nfs_readpage_async+5>: push %r13
0xffffffffc0a42fe7 <nfs_readpage_async+7>: push %r12
0xffffffffc0a42fe9 <nfs_readpage_async+9>: mov %rdi,%r13
0xffffffffc0a42fec <nfs_readpage_async+12>: push %rbp
0xffffffffc0a42fed <nfs_readpage_async+13>: push %rbx
0xffffffffc0a42fee <nfs_readpage_async+14>: mov %rdx,%rbx
0xffffffffc0a42ff1 <nfs_readpage_async+17>: mov %rsi,%r12
0xffffffffc0a42ff4 <nfs_readpage_async+20>: sub $0xa8,%rsp
0xffffffffc0a42ffb <nfs_readpage_async+27>: mov %gs:0x28,%rax
0xffffffffc0a43004 <nfs_readpage_async+36>: mov %rax,0xa0(%rsp)
0xffffffffc0a4300c <nfs_readpage_async+44>: xor %eax,%eax
0xffffffffc0a4300e <nfs_readpage_async+46>: mov 0x8(%rdx),%rdx
0xffffffffc0a43012 <nfs_readpage_async+50>: lea -0x1(%rdx),%rax
0xffffffffc0a43016 <nfs_readpage_async+54>: and $0x1,%edx
0xffffffffc0a43019 <nfs_readpage_async+57>: cmove %rbx,%rax
0xffffffffc0a4301d <nfs_readpage_async+61>: mov 0x8(%rax),%rcx
0xffffffffc0a43021 <nfs_readpage_async+65>: lea -0x1(%rcx),%rdx
0xffffffffc0a43025 <nfs_readpage_async+69>: and $0x1,%ecx
0xffffffffc0a43028 <nfs_readpage_async+72>: cmove %rax,%rdx
0xffffffffc0a4302c <nfs_readpage_async+76>: mov (%rdx),%rdx
0xffffffffc0a4302f <nfs_readpage_async+79>: test $0x40000,%edx
0xffffffffc0a43035 <nfs_readpage_async+85>: je 0xffffffffc0a43043 <nfs_readpage_async+99>
0xffffffffc0a43037 <nfs_readpage_async+87>: mov (%rax),%rax
0xffffffffc0a4303a <nfs_readpage_async+90>: test $0x2,%ah
0xffffffffc0a4303d <nfs_readpage_async+93>: jne 0xffffffffc0a4315e <nfs_readpage_async+382>
0xffffffffc0a43043 <nfs_readpage_async+99>: mov 0x18(%rbx),%rax
0xffffffffc0a43047 <nfs_readpage_async+103>: mov (%rax),%rax
...
x86_64架构下整数参数使用的寄存器依次为: RDI,RSI,RDX,RCX,R8,R9,从栈帧数据中可以看出函数int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page)
第三个参数为rdx
,但从汇编代码mov %rdx,%rbx
可以看出rdx
已经赋值给rbx
,而且rdx
的值已经被覆盖,所以第三个参数page
为RBX: ffffe3e4768a6080
:
crash> struct page ffffe3e4768a6080 -x
struct page {
mapping = 0x0,
...
_refcount = {
counter = 0x2
},...
可以看到page->mapping
已经为NULL
。
再看一下nfs_readpage_async
函数的第二个参数RSI: ffff97a5b452a940
:
crash> kmem ffff97a5b452a940
NAME
nfs_inode_cache
...
FREE / [ALLOCATED]
ffff97a5b452a760] # 栈中[ffff97a5b452a940:nfs_inode_cache]的含义
[
crash> struct nfs_inode ffff97a5b452a760 -ox
struct nfs_inode {
...
ffff97a5b452a940] struct inode vfs_inode; # 可以看出`struct inode`是嵌套在`struct nfs_inode`中的
[
}SIZE: 0x458
crash> struct inode ffff97a5b452a940 -x
struct inode {
...
i_mapping = 0xffff97a5b452aab0,
...
}
地址0xffff97a5b452aab0
存在于栈中,具体为什么,有待分析。
读文件时,如果有数据需要写到server端,而这时又刚好有个致命错误发生,page->mapping
被置为NULL
,接着执行到nfs_readpage_async
就发生了空指针解引用。具体根因和解决方案请查看《4.19 nfs_updatepage()空指针解引用问题》。
read
ksys_read
vfs_read
__vfs_read
new_sync_read
call_read_iter
nfs_file_read
generic_file_read_iter
generic_file_buffered_read
page_cache_sync_readahead
ondemand_readahead
ra_submit
__do_page_cache_readahead// 修改成总是执行mapping->a_ops->readpage
read_pages
nfs_readpage
nfs_readpage// 这里没问题
inode = page_file_mapping(page)->host // return 0,不退出,而是继续往下执行
nfs_wb_page // return 0
nfs_writepage_locked // return 0
nfs_do_writepage // 合入补丁 14bebe3c90b3 NFS: Don't interrupt file writeout due to fatal errors 后返回 0
// return 0
nfs_page_async_flush // 发生致命错误时
nfs_error_is_fatal_on_server
generic_error_remove_page
truncate_inode_page
delete_from_page_cache
__delete_from_page_cache
page_cache_tree_delete
page->mapping = NULL
nfs_readpage_async
nfs_page_length// 发生空指针解引用
page_file_mapping(page)->host 0 page->mapping ==
以下修改使得执行buffer读可以调用到函数nfs_readpage
:
Author: ChenXiaoSong <chenxiaosongemail@foxmail.com>
Date: Fri Nov 10 02:21:20 2023 +0000
always use mapping->a_ops->readpage
Signed-off-by: ChenXiaoSong <chenxiaosongemail@foxmail.com>
diff --git a/mm/readahead.c b/mm/readahead.c
index 4e630143a0ba..5cb303815e5f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -119,7 +119,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
blk_start_plug(&plug);
- if (mapping->a_ops->readpages) {
+ if (0) { // mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
put_pages_list(pages);
这里我只找到办法在direct读的时候触发回写,暂时还未找到在buffer读的时候触发回写(因为buffer读直接从page cache里读取数据),有兴趣的朋友可以和我一起尝试。direct读触发回写的用户态程序是direct-read-trigger-writeback.c,执行以下脚本:
touch /mnt/file
sync
gcc direct-read-trigger-writeback.c
echo 3 > /proc/sys/vm/drop_caches
./a.out
direct读没有执行到nfs_readpage
,以下是gdb断点的栈(有关gdb调试内核的方法可以查看《Linux内核课程》):
(gdb) bt
#0 generic_error_remove_page (mapping=0xffff88812d04c298, page=0xffffea0004b28700) at ../mm/truncate.c:235
#1 0xffffffff815310bf in nfs_write_error_remove_page (req=0xffff88813899dc80) at ../fs/nfs/write.c:602
#2 0xffffffff815321aa in nfs_page_async_flush (pgio=0xffffc90000dbfb50, page=0xffffea0004b28700) at ../fs/nfs/write.c:664
#3 0xffffffff815323d6 in nfs_do_writepage (page=0xffffea0004b28700, wbc=0xffffc90000dbfc38, pgio=0xffffc90000dbfb50) at ../fs/nfs/write.c:674
#4 0xffffffff81532442 in nfs_writepages_callback (page=0xffffea0004b28700, wbc=0xffffc90000dbfc38, data=0xffffc90000dbfb50) at ../fs/nfs/write.c:717
#5 0xffffffff812eb6ea in write_cache_pages (mapping=0xffff88812d04c298, wbc=0xffffc90000dbfc38, writepage=<optimized out>, data=<optimized out>) at ../mm/page-writeback.c:2240
#6 0xffffffff81533571 in nfs_writepages (mapping=0xffff88812d04c298, wbc=0xffffc90000dbfc38) at ../fs/nfs/write.c:742
#7 0xffffffff812ec9e4 in do_writepages (mapping=0xffff88812d04c298, wbc=0xffffc90000dbfc38) at ../mm/page-writeback.c:2344
#8 0xffffffff812d8c97 in __filemap_fdatawrite_range (mapping=0xffff88812d04c298, start=0, end=-131386645534720, sync_mode=<optimized out>) at ../mm/filemap.c:446
#9 0xffffffff812d8ce4 in __filemap_fdatawrite (sync_mode=<optimized out>, mapping=<optimized out>) at ../mm/filemap.c:454
#10 filemap_fdatawrite (mapping=0xffff88812d04c298) at ../mm/filemap.c:459
#11 0xffffffff812d8d0e in filemap_write_and_wait (mapping=0xffff88812d04c298) at ../mm/filemap.c:640
#12 0xffffffff81533d4b in nfs_wb_all (inode=0xffff88812d04c120) at ../fs/nfs/write.c:2000
#13 0xffffffff81522c57 in nfs_sync_mapping (mapping=0xffff88812d04c298) at ../fs/nfs/inode.c:153
#14 0xffffffff8152917d in nfs_block_buffered (nfsi=0xffff88812d04bf48, inode=0xffff88812d04c120) at ../fs/nfs/io.c:102
#15 0xffffffff815292a6 in nfs_start_io_direct (inode=0xffff88812d04c120) at ../fs/nfs/io.c:133
#16 0xffffffff8152acc2 in nfs_file_direct_read (iocb=0xffffc90000dbfdc8, iter=0xffffc90000dbfdf0, swap=false) at ../fs/nfs/direct.c:616
#17 0xffffffff8151d75a in nfs_file_read (iocb=0xffffc90000dbfdc8, to=0xffffc90000dbfdf0) at ../fs/nfs/file.c:160
#18 0xffffffff813c2ab2 in call_read_iter (iter=<optimized out>, kio=<optimized out>, file=<optimized out>) at ../include/linux/fs.h:1817
#19 new_sync_read (filp=0xffff88812c2f8000, buf=<optimized out>, len=100, ppos=0xffffc90000dbfec0) at ../fs/read_write.c:406
#20 0xffffffff813c67fb in __vfs_read (file=0xffff88812c2f8000, buf=0x7ffd836abc80 "", count=100, pos=0xffffc90000dbfec0) at ../fs/read_write.c:418
#21 0xffffffff813c68d0 in vfs_read (file=0xffff88812c2f8000, buf=0x7ffd836abc80 "", count=100, pos=0xffffc90000dbfec0) at ../fs/read_write.c:452
#22 0xffffffff813c6ee9 in ksys_read (fd=<optimized out>, buf=0x7ffd836abc80 "", count=100) at ../fs/read_write.c:579
#23 0xffffffff813c6f86 in __do_sys_read (count=<optimized out>, buf=<optimized out>, fd=<optimized out>) at ../fs/read_write.c:589
...