重点强调一下一个可能有用的线索: nfs client系统日志写到nfs
server,client定期进行日志切割,先锁定原日志文件(如messages
),重命名为messages-20250410
,
再生成新的messages
文件,然后再将锁定期间缓存日志数据写入messages
文件。
4.19内核发生panic,日志如下:
[2305675.245995] NFS: Invalid unlock attempted
[2305675.256784] ------------[ cut here ]------------
[2305675.256787] kernel BUG at fs/nfs/pagelist.c:382!
[2305675.259330] invalid opcode: 0000 [#1] SMP NOPTI
...
[2305675.301490] Call Trace:
[2305675.302501] nfs_end_page_writeback+0x6d/0x100 [nfs]
[2305675.304672] nfs_redirty_request+0x28/0x40 [nfs]
[2305675.306123] nfs_do_writepage+0x1f6/0x2a0 [nfs]
[2305675.307856] nfs_writepages_callback+0xf/0x20 [nfs]
[2305675.309453] write_cache_pages+0x187/0x410
[2305675.313517] nfs_writepages+0xb0/0x170 [nfs]
[2305675.315238] do_writepages+0x4b/0xe0
[2305675.319478] __filemap_fdatawrite_range+0xcf/0x110
[2305675.321603] filemap_write_and_wait_range+0x30/0x80
[2305675.323860] nfs_file_fsync+0x44/0x1f0 [nfs]
[2305675.328512] filp_close+0x3e/0xd0
[2305675.330914] __x64_sys_close+0x1e/0x50
[2305675.333686] do_syscall_64+0x5b/0x1c0
[2305675.336788] entry_SYSCALL_64_after_hwframe+0x44/0xa9
# 另一台机器的日志
[2305678.431213] NFS: Invalid unlock attempted
[2305678.432588] ------------[ cut here ]------------
[2305678.432590] kernel BUG at fs/nfs/pagelist.c:382!
[2305678.433859] invalid opcode: 0000 [#1] SMP NOPTI
...
[2305678.468588] Call Trace:
[2305678.469602] nfs_end_page_writeback+0x6d/0x100 [nfs]
[2305678.471389] nfs_redirty_request+0x28/0x40 [nfs]
[2305678.473202] nfs_do_writepage+0x1f6/0x2a0 [nfs]
[2305678.474824] nfs_writepages_callback+0xf/0x20 [nfs]
[2305678.476719] write_cache_pages+0x187/0x410
[2305678.481762] nfs_writepages+0xb0/0x170 [nfs]
[2305678.483415] do_writepages+0x4b/0xe0
[2305678.490717] __writeback_single_inode+0x3d/0x310
[2305678.494506] writeback_sb_inodes+0x1ad/0x4b0
[2305678.498275] __writeback_inodes_wb+0x5d/0xb0
[2305678.501829] wb_writeback+0x26c/0x300
[2305678.511473] wb_workfn+0x1dc/0x4c0
[2305678.515136] process_one_work+0x195/0x3d0
[2305678.518714] worker_thread+0x30/0x390
[2305678.525743] kthread+0x113/0x130
[2305678.532686] ret_from_fork+0x35/0x40
md5sum vmcore
# 4d708113fe3f28fa0fd3ef88c5b3f0e3 vmcore # 其中一个vmcore
rpm2cpio kernel-debuginfo-4.19.xxx.x86_64.rpm | cpio -div
crash vmcore usr/lib/debug/lib/modules/4.19.xxx.x86_64/vmlinux
crash> mod | grep nfs
# ffffffffc084c000 nfs_acl 16384 (not loaded) [CONFIG_KALLSYMS]
# ffffffffc0858100 nfsv3 49152 (not loaded) [CONFIG_KALLSYMS]
# ffffffffc0985c80 nfs 311296 (not loaded) [CONFIG_KALLSYMS]
# ffffffffc0c51400 nfsv4 679936 (not loaded) [CONFIG_KALLSYMS]
crash> mod -s nfs
crash> mod -s nfsv3
crash> mod -s nfsv4
nfs_unlock_request()
参数req
的值crash> bt
PID: 3930345 TASK: ffff9c13db3bdf00 CPU: 30 COMMAND: "java"
#0 [ffffb2f160b236a0] machine_kexec at ffffffff8e45adde
#1 [ffffb2f160b236f8] __crash_kexec at ffffffff8e55df01
...
[exception RIP: nfs_unlock_request+55]
RIP: ffffffffc095df27 RSP: ffffb2f160b239a0 RFLAGS: 00010286
RAX: 000000000000001d RBX: ffff9bf9f048da00 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff9c147cf968f8 RDI: ffff9c147cf968f8
RBP: ffff9c114b4c8aa0 R8: 000000000000163c R9: 0000000000000018
R10: ffffffffffffff83 R11: ffff9c14733a1b10 R12: ffff9c131b2e5800
R13: 0000000000000001 R14: ffff9bf9f048da00 R15: ffff9c114b4c8aa0
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffffb2f160b239a0] nfs_end_page_writeback at ffffffffc096165d [nfs]
#8 [ffffb2f160b239c8] nfs_redirty_request at ffffffffc0961718 [nfs]
#9 [ffffb2f160b239d8] nfs_do_writepage at ffffffffc0962596 [nfs]
...
crash> dis -l nfs_unlock_request
0xffffffffc095def0 <nfs_unlock_request>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffc095def5 <nfs_unlock_request+5>: mov 0x40(%rdi),%rax
0xffffffffc095def9 <nfs_unlock_request+9>: test $0x1,%al
0xffffffffc095defb <nfs_unlock_request+11>: je 0xffffffffc095df1b <nfs_unlock_request+43>
0xffffffffc095defd <nfs_unlock_request+13>: lea 0x40(%rdi),%rdx
0xffffffffc095df01 <nfs_unlock_request+17>: lock andb $0xfe,0x40(%rdi)
0xffffffffc095df06 <nfs_unlock_request+22>: mov 0x40(%rdi),%rax
0xffffffffc095df0a <nfs_unlock_request+26>: test $0x10,%ah
0xffffffffc095df0d <nfs_unlock_request+29>: jne 0xffffffffc095df11 <nfs_unlock_request+33>
0xffffffffc095df0f <nfs_unlock_request+31>: repz ret
0xffffffffc095df11 <nfs_unlock_request+33>: xor %esi,%esi
0xffffffffc095df13 <nfs_unlock_request+35>: mov %rdx,%rdi
0xffffffffc095df16 <nfs_unlock_request+38>: jmp 0xffffffff8e5017b0 <wake_up_bit>
0xffffffffc095df1b <nfs_unlock_request+43>: mov $0xffffffffc0970df0,%rdi
0xffffffffc095df22 <nfs_unlock_request+50>: call 0xffffffff8e51d0fe <printk>
0xffffffffc095df27 <nfs_unlock_request+55>: ud2
x86_64下整数参数使用的寄存器依次为:
RDI,RSI,RDX,RCX,R8,R9,可以看出nfs_unlock_request()
的第一个参数%rdi
寄存器的值被覆盖了(mov $0xffffffffc0970df0,%rdi
),
所以无法通过传参寄存器%rdi
的值得到nfs_page
的值。
接下来再分析栈中上面几层函数的汇编,反汇编nfs_end_page_writeback()
:
crash> dis -l nfs_end_page_writeback
...
0xffffffffc09615fb <nfs_end_page_writeback+11>: mov %rdi,%rbx # rdi是第一个参数
...
0xffffffffc0961652 <nfs_end_page_writeback+98>: mov %rbx,%rdi # nfs_unlock_request()的第一个参数
...
0xffffffffc0961658 <nfs_end_page_writeback+104>: call 0xffffffffc095def0 <nfs_unlock_request>
再结合前面nfs_unlock_request()
的反汇编(%rbx
的值没有被覆盖),可以推出在崩溃时%rbx
存的是nfs_unlock_request()
第一个参数的值。
# RBX: ffff9bf9f048da00
crash> struct nfs_page ffff9bf9f048da00 -x
struct nfs_page {
wb_list = {
next = 0xffff9bf9f048da00,
prev = 0xffff9bf9f048da00
},
wb_page = 0xfffffb9dd6f34dc0,
wb_context = 0xffff9c13143a9a80,
wb_lock_context = 0xffff9c13143a9a80,
wb_index = 0x6af69,
wb_offset = 0x0,
wb_pgbase = 0x0,
wb_bytes = 0xfff,
wb_kref = {
refcount = {
refs = {
counter = 0x1
}
}
},
wb_flags = 0x12, # 0b10010, PG_MAPPED, PG_INODE_REF
wb_verf = {
data = "\000\000\000\000\000\000\000"
},
wb_this_page = 0xffff9bf9f048da00,
wb_head = 0xffff9bf9f048da00
}
crash> kmem ffff9bf9f048da00 # nfs_init_nfspagecache
CACHE OBJSIZE ALLOCATED TOTAL SLABS SSIZE NAME
ffff9c0a2749c700 128 11389 53376 834 8k secpath_cache
SLAB MEMORY NODE TOTAL ALLOCATED FREE
fffffb9d91c12300 ffff9bf9f048c000 0 64 50 14
FREE / [ALLOCATED]
[ffff9bf9f048da00]
PAGE PHYSICAL MAPPING INDEX CNT FLAGS
fffffb9d91c12340 247048d000 dead000000000400 0 0 17ffffc0000000
crash> struct page 0xfffffb9dd6f34dc0 -x
struct page {
flags = 0x17ffffc0001039, # 第12位PG_private为1,在nfs_inode_add_request()设置的
{
{
...
private = 0xffff9bf9f048da00 # 就是nfs_page的值
},
crash> search 0xfffffb9dd6f34dc0 # nfs_page->wb_page
ffff9bf9f048da10: fffffb9dd6f34dc0 # 这个是崩溃时的nfs_page
ffff9c01b80175c8: fffffb9dd6f34dc0 # radix_tree_node
ffff9c028515b9d0: fffffb9dd6f34dc0 # 不是slab
ffff9c028515b9e0: fffffb9dd6f34dc0 # 不是slab
ffff9c028515ba08: fffffb9dd6f34dc0 # 不是slab
ffff9c028515ba98: fffffb9dd6f34dc0 # 不是slab
ffffb2f160b239d0: fffffb9dd6f34dc0 # 不是slab
ffffb2f160b239e0: fffffb9dd6f34dc0 # 不是slab
ffffb2f160b23a08: fffffb9dd6f34dc0 # 不是slab
ffffb2f160b23a98: fffffb9dd6f34dc0 # 不是slab
struct nfs_page
结构体中的引用计数为1
,wb_flags
值为0x12
(二进制0b10010
,设置了PG_MAPPED
和PG_INODE_REF
)。
从下面的代码分析可知,这里正常情况下wb_flags
值为0x13
,引用计数为2
。
在nfs中只有在nfs_unlock_request()
一个地方清除PG_BUSY
位,发生panic肯定是两次执行到了nfs_unlock_request()
函数。
正常情况下执行nfs_redirty_request()
前会先执行nfs_lock_and_join_requests()
,nfs_lock_and_join_requests()
中引用计数变为2
,wb_flags
变为0x13
。
从前面的vmcore解释中可知崩溃时struct nfs_page
结构体中的引用计数为1
,wb_flags
值为0x12
(二进制0b10010
,设置了PG_MAPPED
和PG_INODE_REF
)。初步怀疑是在另一个并行执行的进程中同时操作了减引用计数和清除wb_flags
的PG_BUSY
位,同时执行这两个操作的函数是nfs_unlock_and_release_request()
。
初始化nfs_page
的流程:
nfs_file_write
generic_perform_write
nfs_write_end
nfs_updatepage// 内联
nfs_writepage_setup
nfs_setup_write_request
nfs_create_request// 引用计数设置为1
(&req->wb_kref)
kref_init
nfs_inode_add_request// wb_flags == 0x1
nfs_lock_request (PG_MAPPED, // wb_flags == 0x3
set_bit(req->wb_page, (unsigned long)req) // 设置page的private字段
set_page_private(PG_INODE_REF, // wb_flags == 0x13
test_and_set_bit(&req->wb_kref) // 引用计数为2
kref_get
nfs_unlock_and_release_request// wb_flags == 0x12
nfs_unlock_request // 引用计数为1 nfs_release_request
回写流程:
// 可能是由close文件触发,也可能是由内核线程(wb_workfn)触发
do_writepages
nfs_writepages
write_cache_pages
nfs_writepages_callback
nfs_do_writepage// 内联
nfs_page_async_flush
nfs_lock_and_join_requests
nfs_page_find_head_request
nfs_page_find_private_request(&req->wb_kref) // 引用计数为2
kref_get// lock成功了就不执行nfs_wait_on_request()
nfs_lock_request (PG_BUSY, // wb_flags == 0x13
test_and_set_bit// 其他进程lock了nfs_page时才会执行到这里等待
nfs_wait_on_request (&req->wb_flags, PG_BUSY,
wait_on_bit_io// 发生了错误
nfs_pageio_add_request // 错误码不是以下情况时会执行到nfs_redirty_request
// 不是 -EACCES, -EDQUOT, -EFBIG, -EIO, -ENOSPC, -EROFS, -ESTALE, -E2BIG
nfs_redirty_request
nfs_end_page_writeback
nfs_unlock_requestif (!NFS_WBACK_BUSY(req)) // 正常情况下条件不满足
// 异常情况下panic了
BUG (PG_BUSY, // 只在nfs_unlock_request 中清除busy位
clear_bit(&req->wb_flags, PG_BUSY) // 唤醒进程 wake_up_bit
异步释放nfs_page
的流程:
rpc_async_release
nfs_pgio_release
nfs_write_completion
nfs_inode_remove_request(head->wb_page, 0); // 清除page的private字段
set_page_private(PG_MAPPED, // wb_flags == 0x11
clear_bit(PG_INODE_REF, // wb_flags == 0x1
test_and_clear_bit// 引用计数为1
nfs_release_request
nfs_end_page_writeback// wb_flags == 0x0
nfs_unlock_request // 引用计数为0,释放nfs_page nfs_release_request
调用nfs_unlock_and_release_request()
的地方(已排除的放后面):
// 600 fs/nfs/write.c
// 这种情况不会执行到nfs_redirty_request
// 有没可能是其他并行的进程?
nfs_lock_and_join_requests// 引用计数加1
nfs_page_find_head_request // 设置PG_BUSY
nfs_lock_request
nfs_unlock_and_release_request
// 591 fs/nfs/write.c
// 有没可能是其他并行的进程?
nfs_lock_and_join_requests
nfs_page_find_head_request
nfs_unlock_and_release_request
// 552 fs/nfs/write.c
// 有没可能是其他并行的进程?
nfs_lock_and_join_requests
nfs_page_find_head_request
nfs_unlock_and_release_request
// 504 fs/nfs/write.c
// 有没可能是其他并行的进程?
nfs_lock_and_join_requests
nfs_page_find_head_request
nfs_unlock_and_release_request
// 452 fs/nfs/write.c
// 有没可能是其他并行的进程?
nfs_lock_and_join_requests
nfs_page_find_head_request
nfs_destroy_unlinked_subrequests
nfs_unlock_and_release_request
// 396 fs/nfs/write.c
nfs_lock_and_join_requests
nfs_page_find_head_requestfor (subreq = head->wb_this_page // 在其他进程执行循环?
nfs_unroll_locks
nfs_unlock_and_release_request
// 947 fs/nfs/direct.c
nfs_file_direct_write
nfs_direct_write_schedule_iovec
nfs_unlock_and_release_request
// 856 fs/nfs/direct.c
// .error_cleanup
nfs_write_sync_pgio_error
nfs_unlock_and_release_request
// 840 fs/nfs/direct.c
// .completion
nfs_direct_write_completion
nfs_unlock_and_release_request
// 729 fs/nfs/direct.c
// .completion
nfs_direct_commit_complete
nfs_unlock_and_release_request
// 703 fs/nfs/direct.c
(&dreq->work, nfs_direct_write_schedule_work)
INIT_WORK
nfs_direct_write_schedule_work
nfs_direct_write_reschedule
nfs_unlock_and_release_request
// 已排除
// 2053 fs/nfs/write.c
// .invalidatepage
nfs_invalidate_page
nfs_wb_page_cancel// 假设这里真有情况会返回NULL
nfs_lock_and_join_requests // 这里执行完wb_flags肯定清除了`PG_MAPPED`和`PG_INODE_REF`
nfs_inode_remove_request
nfs_unlock_and_release_request
// 已排除
// 1873 fs/nfs/write.c
// .completion
nfs_commit_release_pages if (req->wb_page) // 满足条件
// 这里执行完wb_flags肯定清除了`PG_MAPPED`和`PG_INODE_REF`
nfs_inode_remove_request
nfs_unlock_and_release_request
// 已排除
// 1784 fs/nfs/write.c
// pnfs相关,不涉及
pnfs_generic_retry_commit
nfs_retry_commit
nfs_unlock_and_release_request
// 已排除
// 1207 fs/nfs/write.c
// 初始化nfs_page,不会重复初始化
nfs_updatepage
nfs_writepage_setup
nfs_unlock_and_release_request
// 已排除
// 1166 fs/nfs/write.c
// 初始化nfs_page,不会重复初始化
nfs_writepage_setup
nfs_setup_write_request
nfs_try_to_update_request nfs_unlock_and_release_request
查找含有函数关键字的相关补丁:
git log origin/master --oneline --date=short --format="%cd %h %s %an <%ae>" --grep=nfs_end_page_writeback
# 都是2012年前的补丁,4.19内核肯定包含2012年的补丁
# 4.19内核肯定包含2012年的补丁
git log origin/master --oneline --date=short --format="%cd %h %s %an <%ae>" --grep=nfs_unlock_request
# 都是2012年前的补丁,4.19内核肯定包含2012年的补丁
git log origin/master --oneline --date=short --format="%cd %h %s %an <%ae>" --grep=nfs_lock_and_join_requests
# 2024-11-18 66f9dac9077c Revert "nfs: don't reuse partially completed requests in nfs_lock_and_join_requests" Trond Myklebust <trond.myklebust@hammerspace.com>
# 2024-07-08 b571cfcb9dca nfs: don't reuse partially completed requests in nfs_lock_and_join_requests Christoph Hellwig <hch@lst.de>
# 2024-07-08 25edbcac6e32 nfs: fold nfs_page_group_lock_subrequests into nfs_lock_and_join_requests Christoph Hellwig <hch@lst.de>
# 2024-07-08 c3f2235782c3 nfs: fold nfs_folio_find_and_lock_request into nfs_lock_and_join_requests Christoph Hellwig <hch@lst.de>
# 2020-04-01 e00ed89d7bd5 NFS: Refactor nfs_lock_and_join_requests() Trond Myklebust <trond.myklebust@hammerspace.com>
# 2020-04-01 a62f8e3bd836 NFS: Clean up nfs_lock_and_join_requests() Trond Myklebust <trond.myklebust@hammerspace.com>
# 2020-04-01 08ca8b21f760 NFS: Fix races nfs_page_group_destroy() vs nfs_destroy_unlinked_subrequests() Trond Myklebust <trond.myklebust@hammerspace.com>
# 2020-04-01 add42de31721 NFS: Fix a page leak in nfs_destroy_unlinked_subrequests() Trond Myklebust <trond.myklebust@hammerspace.com>
查找nfs_unlock_request()
函数修改记录:
git log -L:nfs_unlock_request:fs/nfs/pagelist.c
# v5.16-rc1 43d20e80e288 NFS: Fix a few more clear_bit() instances that need release semantics
补丁0001-debug-nfs_unlock_request.patch
。
还在尝试中,暂未构造出来。
nfs_redirty_request()
所做的修改内核做以下修改,写文件就能执行到nfs_redirty_request()
:
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -869,22 +870,35 @@ static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
{
unsigned int mirror_count = 1;
+ if (1) {
+ pgio->pg_error = -ERESTARTSYS;
+ pgio->pg_mirrors = pgio->pg_mirrors_static;
+ mirror_count = 1;
+ pgio->pg_mirror_count = mirror_count;
+ return;
+ }
+
if (pgio->pg_ops->pg_get_mirror_count)
mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
nfs_redirty_request()
中delay--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1404,6 +1405,9 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
*/
static void nfs_redirty_request(struct nfs_page *req)
{
+ printk("%s:%d, delay begin\n", __func__, __LINE__);
+ mdelay(20 * 1000);
+ printk("%s:%d, delay end\n", __func__, __LINE__);
nfs_mark_request_dirty(req);
set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
nfs_end_page_writeback(req);
write.c
:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
int main(void)
{
const char *filename = "/mnt/file";
const size_t data_size = 4100;
FILE *fp = fopen(filename, "wb");
if (!fp) {
("fopen");
perrorreturn EXIT_FAILURE;
}
// 分配 4090 字节的缓冲区,并填充字符 'A'
char *buffer = malloc(data_size);
if (!buffer) {
("malloc");
perror(fp);
fclosereturn EXIT_FAILURE;
}
(buffer, 'A', data_size);
memset
// 写入数据
size_t written = fwrite(buffer, 1, data_size, fp);
if (written != data_size) {
("fwrite");
perror(buffer);
free(fp);
fclosereturn EXIT_FAILURE;
}
// 在关闭文件之前,睡眠
(100000);
sleep
// 释放资源并关闭文件
(buffer);
free(fp);
fclosereturn EXIT_SUCCESS;
}
truncate.c
:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
int main(void)
{
const char *filename = "/mnt/file";
int fd = open(filename, O_RDWR | O_CREAT, 0666);
if (fd < 0) {
("open");
perrorreturn EXIT_FAILURE;
}
// 调用 ftruncate 将文件截断到较小尺寸,比如 100 字节,
// 这会使得超过 100 字节部分的页缓存被标记为无效(触发 invalidate page)
= 100;
off_t new_size if (ftruncate(fd, new_size) < 0) {
("ftruncate");
perror(fd);
closereturn EXIT_FAILURE;
}
(100000);
sleep
(fd);
closereturn EXIT_SUCCESS;
}
gcc -o write write.c
gcc -o truncate truncate.c
./write &
sleep 1
./truncate &