4.19内核nfs_unlock_request()报BUG()的问题

点击这里查看配套的教学视频

点击跳转到nfs课程所有目录

1 问题描述

redhat遇到的类似问题

重点强调一下一个可能有用的线索: nfs client系统日志写到nfs server,client定期进行日志切割,先锁定原日志文件(如messages),重命名为messages-20250410, 再生成新的messages文件,然后再将锁定期间缓存日志数据写入messages文件。

4.19内核发生panic,日志如下:

[2305675.245995] NFS: Invalid unlock attempted
[2305675.256784] ------------[ cut here ]------------
[2305675.256787] kernel BUG at fs/nfs/pagelist.c:382!
[2305675.259330] invalid opcode: 0000 [#1] SMP NOPTI
...
[2305675.301490] Call Trace:
[2305675.302501]  nfs_end_page_writeback+0x6d/0x100 [nfs]
[2305675.304672]  nfs_redirty_request+0x28/0x40 [nfs]
[2305675.306123]  nfs_do_writepage+0x1f6/0x2a0 [nfs]
[2305675.307856]  nfs_writepages_callback+0xf/0x20 [nfs]
[2305675.309453]  write_cache_pages+0x187/0x410
[2305675.313517]  nfs_writepages+0xb0/0x170 [nfs]
[2305675.315238]  do_writepages+0x4b/0xe0
[2305675.319478]  __filemap_fdatawrite_range+0xcf/0x110
[2305675.321603]  filemap_write_and_wait_range+0x30/0x80
[2305675.323860]  nfs_file_fsync+0x44/0x1f0 [nfs]
[2305675.328512]  filp_close+0x3e/0xd0
[2305675.330914]  __x64_sys_close+0x1e/0x50
[2305675.333686]  do_syscall_64+0x5b/0x1c0
[2305675.336788]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

# 另一台机器的日志
[2305678.431213] NFS: Invalid unlock attempted
[2305678.432588] ------------[ cut here ]------------
[2305678.432590] kernel BUG at fs/nfs/pagelist.c:382!
[2305678.433859] invalid opcode: 0000 [#1] SMP NOPTI
...
[2305678.468588] Call Trace:
[2305678.469602]  nfs_end_page_writeback+0x6d/0x100 [nfs]
[2305678.471389]  nfs_redirty_request+0x28/0x40 [nfs]
[2305678.473202]  nfs_do_writepage+0x1f6/0x2a0 [nfs]
[2305678.474824]  nfs_writepages_callback+0xf/0x20 [nfs]
[2305678.476719]  write_cache_pages+0x187/0x410
[2305678.481762]  nfs_writepages+0xb0/0x170 [nfs]
[2305678.483415]  do_writepages+0x4b/0xe0
[2305678.490717]  __writeback_single_inode+0x3d/0x310
[2305678.494506]  writeback_sb_inodes+0x1ad/0x4b0
[2305678.498275]  __writeback_inodes_wb+0x5d/0xb0
[2305678.501829]  wb_writeback+0x26c/0x300
[2305678.511473]  wb_workfn+0x1dc/0x4c0
[2305678.515136]  process_one_work+0x195/0x3d0
[2305678.518714]  worker_thread+0x30/0x390
[2305678.525743]  kthread+0x113/0x130
[2305678.532686]  ret_from_fork+0x35/0x40

2 解析vmcore

详细的crash命令输出信息请点击这里查看

2.1 准备

md5sum vmcore
# 4d708113fe3f28fa0fd3ef88c5b3f0e3  vmcore # 其中一个vmcore

rpm2cpio kernel-debuginfo-4.19.xxx.x86_64.rpm | cpio -div
crash vmcore usr/lib/debug/lib/modules/4.19.xxx.x86_64/vmlinux
crash> mod | grep nfs
  # ffffffffc084c000  nfs_acl                 16384  (not loaded)  [CONFIG_KALLSYMS]
  # ffffffffc0858100  nfsv3                   49152  (not loaded)  [CONFIG_KALLSYMS]
  # ffffffffc0985c80  nfs                    311296  (not loaded)  [CONFIG_KALLSYMS]
  # ffffffffc0c51400  nfsv4                  679936  (not loaded)  [CONFIG_KALLSYMS]
crash> mod -s nfs
crash> mod -s nfsv3
crash> mod -s nfsv4

2.2 寻找nfs_unlock_request()参数req的值

crash> bt
PID: 3930345  TASK: ffff9c13db3bdf00  CPU: 30  COMMAND: "java"
 #0 [ffffb2f160b236a0] machine_kexec at ffffffff8e45adde
 #1 [ffffb2f160b236f8] __crash_kexec at ffffffff8e55df01
 ...
    [exception RIP: nfs_unlock_request+55]
    RIP: ffffffffc095df27  RSP: ffffb2f160b239a0  RFLAGS: 00010286
    RAX: 000000000000001d  RBX: ffff9bf9f048da00  RCX: 0000000000000000
    RDX: 0000000000000000  RSI: ffff9c147cf968f8  RDI: ffff9c147cf968f8
    RBP: ffff9c114b4c8aa0   R8: 000000000000163c   R9: 0000000000000018
    R10: ffffffffffffff83  R11: ffff9c14733a1b10  R12: ffff9c131b2e5800
    R13: 0000000000000001  R14: ffff9bf9f048da00  R15: ffff9c114b4c8aa0
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #7 [ffffb2f160b239a0] nfs_end_page_writeback at ffffffffc096165d [nfs]
 #8 [ffffb2f160b239c8] nfs_redirty_request at ffffffffc0961718 [nfs]
 #9 [ffffb2f160b239d8] nfs_do_writepage at ffffffffc0962596 [nfs]
 ...

crash> dis -l nfs_unlock_request
0xffffffffc095def0 <nfs_unlock_request>:        nopl   0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffc095def5 <nfs_unlock_request+5>:      mov    0x40(%rdi),%rax
0xffffffffc095def9 <nfs_unlock_request+9>:      test   $0x1,%al
0xffffffffc095defb <nfs_unlock_request+11>:     je     0xffffffffc095df1b <nfs_unlock_request+43>
0xffffffffc095defd <nfs_unlock_request+13>:     lea    0x40(%rdi),%rdx
0xffffffffc095df01 <nfs_unlock_request+17>:     lock andb $0xfe,0x40(%rdi)
0xffffffffc095df06 <nfs_unlock_request+22>:     mov    0x40(%rdi),%rax
0xffffffffc095df0a <nfs_unlock_request+26>:     test   $0x10,%ah
0xffffffffc095df0d <nfs_unlock_request+29>:     jne    0xffffffffc095df11 <nfs_unlock_request+33>
0xffffffffc095df0f <nfs_unlock_request+31>:     repz ret 
0xffffffffc095df11 <nfs_unlock_request+33>:     xor    %esi,%esi
0xffffffffc095df13 <nfs_unlock_request+35>:     mov    %rdx,%rdi
0xffffffffc095df16 <nfs_unlock_request+38>:     jmp    0xffffffff8e5017b0 <wake_up_bit>
0xffffffffc095df1b <nfs_unlock_request+43>:     mov    $0xffffffffc0970df0,%rdi
0xffffffffc095df22 <nfs_unlock_request+50>:     call   0xffffffff8e51d0fe <printk>
0xffffffffc095df27 <nfs_unlock_request+55>:     ud2 

x86_64下整数参数使用的寄存器依次为: RDI,RSI,RDX,RCX,R8,R9,可以看出nfs_unlock_request()的第一个参数%rdi寄存器的值被覆盖了(mov $0xffffffffc0970df0,%rdi), 所以无法通过传参寄存器%rdi的值得到nfs_page的值。

接下来再分析栈中上面几层函数的汇编,反汇编nfs_end_page_writeback():

crash> dis -l nfs_end_page_writeback
...
0xffffffffc09615fb <nfs_end_page_writeback+11>: mov    %rdi,%rbx # rdi是第一个参数
...
0xffffffffc0961652 <nfs_end_page_writeback+98>: mov    %rbx,%rdi # nfs_unlock_request()的第一个参数
...
0xffffffffc0961658 <nfs_end_page_writeback+104>:    call   0xffffffffc095def0 <nfs_unlock_request>

再结合前面nfs_unlock_request()的反汇编(%rbx的值没有被覆盖),可以推出在崩溃时%rbx存的是nfs_unlock_request()第一个参数的值。

# RBX: ffff9bf9f048da00
crash> struct nfs_page ffff9bf9f048da00 -x
struct nfs_page {
  wb_list = {
    next = 0xffff9bf9f048da00,
    prev = 0xffff9bf9f048da00
  },
  wb_page = 0xfffffb9dd6f34dc0,
  wb_context = 0xffff9c13143a9a80,
  wb_lock_context = 0xffff9c13143a9a80,
  wb_index = 0x6af69,
  wb_offset = 0x0,
  wb_pgbase = 0x0,
  wb_bytes = 0xfff,
  wb_kref = {
    refcount = {
      refs = {
        counter = 0x1
      }
    }
  },
  wb_flags = 0x12, # 0b10010, PG_MAPPED, PG_INODE_REF
  wb_verf = {
    data = "\000\000\000\000\000\000\000"
  },
  wb_this_page = 0xffff9bf9f048da00,
  wb_head = 0xffff9bf9f048da00
}

crash> kmem ffff9bf9f048da00 # nfs_init_nfspagecache
CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
ffff9c0a2749c700      128      11389     53376    834     8k  secpath_cache
  SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
  fffffb9d91c12300  ffff9bf9f048c000     0     64         50    14
  FREE / [ALLOCATED]
  [ffff9bf9f048da00]

      PAGE         PHYSICAL      MAPPING       INDEX CNT FLAGS
fffffb9d91c12340 247048d000 dead000000000400        0  0 17ffffc0000000

crash> struct page 0xfffffb9dd6f34dc0 -x
struct page {
  flags = 0x17ffffc0001039, # 第12位PG_private为1,在nfs_inode_add_request()设置的
  {
    {
      ...
      private = 0xffff9bf9f048da00 # 就是nfs_page的值
    },

crash> search 0xfffffb9dd6f34dc0 # nfs_page->wb_page
ffff9bf9f048da10: fffffb9dd6f34dc0 # 这个是崩溃时的nfs_page
ffff9c01b80175c8: fffffb9dd6f34dc0 # radix_tree_node
ffff9c028515b9d0: fffffb9dd6f34dc0 # 不是slab
ffff9c028515b9e0: fffffb9dd6f34dc0 # 不是slab
ffff9c028515ba08: fffffb9dd6f34dc0 # 不是slab
ffff9c028515ba98: fffffb9dd6f34dc0 # 不是slab
ffffb2f160b239d0: fffffb9dd6f34dc0 # 不是slab
ffffb2f160b239e0: fffffb9dd6f34dc0 # 不是slab
ffffb2f160b23a08: fffffb9dd6f34dc0 # 不是slab
ffffb2f160b23a98: fffffb9dd6f34dc0 # 不是slab

struct nfs_page结构体中的引用计数为1wb_flags值为0x12(二进制0b10010,设置了PG_MAPPEDPG_INODE_REF)。

从下面的代码分析可知,这里正常情况下wb_flags值为0x13,引用计数为2

3 代码分析

在nfs中只有在nfs_unlock_request()一个地方清除PG_BUSY位,发生panic肯定是两次执行到了nfs_unlock_request()函数。 正常情况下执行nfs_redirty_request()前会先执行nfs_lock_and_join_requests()nfs_lock_and_join_requests()中引用计数变为2wb_flags变为0x13

从前面的vmcore解释中可知崩溃时struct nfs_page结构体中的引用计数为1wb_flags值为0x12(二进制0b10010,设置了PG_MAPPEDPG_INODE_REF)。初步怀疑是在另一个并行执行的进程中同时操作了减引用计数和清除wb_flagsPG_BUSY位,同时执行这两个操作的函数是nfs_unlock_and_release_request()

初始化nfs_page的流程:

nfs_file_write
  generic_perform_write
    nfs_write_end
      nfs_updatepage
        nfs_writepage_setup // 内联
          nfs_setup_write_request
            nfs_create_request
              // 引用计数设置为1
              kref_init(&req->wb_kref)
            nfs_inode_add_request
              nfs_lock_request // wb_flags == 0x1
              set_bit(PG_MAPPED, // wb_flags == 0x3
              set_page_private(req->wb_page, (unsigned long)req) // 设置page的private字段
              test_and_set_bit(PG_INODE_REF, // wb_flags == 0x13
              kref_get(&req->wb_kref) // 引用计数为2
          nfs_unlock_and_release_request
            nfs_unlock_request // wb_flags == 0x12
            nfs_release_request // 引用计数为1

回写流程:

// 可能是由close文件触发,也可能是由内核线程(wb_workfn)触发
do_writepages
  nfs_writepages
    write_cache_pages
      nfs_writepages_callback
        nfs_do_writepage
          nfs_page_async_flush // 内联
            nfs_lock_and_join_requests
              nfs_page_find_head_request
                nfs_page_find_private_request
                  kref_get(&req->wb_kref) // 引用计数为2
              nfs_lock_request // lock成功了就不执行nfs_wait_on_request()
                test_and_set_bit(PG_BUSY, // wb_flags == 0x13
              nfs_wait_on_request // 其他进程lock了nfs_page时才会执行到这里等待
                wait_on_bit_io(&req->wb_flags, PG_BUSY,
            nfs_pageio_add_request // 发生了错误
            // 错误码不是以下情况时会执行到nfs_redirty_request
            //   不是 -EACCES, -EDQUOT, -EFBIG, -EIO, -ENOSPC, -EROFS, -ESTALE, -E2BIG
            nfs_redirty_request
              nfs_end_page_writeback
                nfs_unlock_request
                  if (!NFS_WBACK_BUSY(req)) // 正常情况下条件不满足
                  BUG // 异常情况下panic了
                  clear_bit(PG_BUSY, // 只在nfs_unlock_request 中清除busy位
                  wake_up_bit(&req->wb_flags, PG_BUSY) // 唤醒进程

异步释放nfs_page的流程:

rpc_async_release
  nfs_pgio_release
    nfs_write_completion
      nfs_inode_remove_request
        set_page_private(head->wb_page, 0); // 清除page的private字段
        clear_bit(PG_MAPPED, // wb_flags == 0x11
        test_and_clear_bit(PG_INODE_REF, // wb_flags == 0x1
        nfs_release_request // 引用计数为1
      nfs_end_page_writeback
        nfs_unlock_request // wb_flags == 0x0
      nfs_release_request // 引用计数为0,释放nfs_page

调用nfs_unlock_and_release_request()的地方(已排除的放后面):

// 600 fs/nfs/write.c
// 这种情况不会执行到nfs_redirty_request
// 有没可能是其他并行的进程?
nfs_lock_and_join_requests
  nfs_page_find_head_request // 引用计数加1
  nfs_lock_request // 设置PG_BUSY
  nfs_unlock_and_release_request

// 591 fs/nfs/write.c
// 有没可能是其他并行的进程?
nfs_lock_and_join_requests
  nfs_page_find_head_request
  nfs_unlock_and_release_request

// 552 fs/nfs/write.c
// 有没可能是其他并行的进程?
nfs_lock_and_join_requests
  nfs_page_find_head_request
  nfs_unlock_and_release_request

// 504 fs/nfs/write.c
// 有没可能是其他并行的进程?
nfs_lock_and_join_requests
  nfs_page_find_head_request
  nfs_unlock_and_release_request

// 452 fs/nfs/write.c
// 有没可能是其他并行的进程?
nfs_lock_and_join_requests
  nfs_page_find_head_request
  nfs_destroy_unlinked_subrequests
    nfs_unlock_and_release_request

// 396 fs/nfs/write.c
nfs_lock_and_join_requests
  nfs_page_find_head_request
  for (subreq = head->wb_this_page // 在其他进程执行循环?
  nfs_unroll_locks
    nfs_unlock_and_release_request

// 947 fs/nfs/direct.c
nfs_file_direct_write
  nfs_direct_write_schedule_iovec
    nfs_unlock_and_release_request

// 856 fs/nfs/direct.c
nfs_write_sync_pgio_error // .error_cleanup
  nfs_unlock_and_release_request

// 840 fs/nfs/direct.c
nfs_direct_write_completion // .completion
  nfs_unlock_and_release_request

// 729 fs/nfs/direct.c
nfs_direct_commit_complete // .completion
  nfs_unlock_and_release_request

// 703 fs/nfs/direct.c
INIT_WORK(&dreq->work, nfs_direct_write_schedule_work)
  nfs_direct_write_schedule_work
    nfs_direct_write_reschedule
      nfs_unlock_and_release_request

// 已排除
// 2053 fs/nfs/write.c
nfs_invalidate_page // .invalidatepage
  nfs_wb_page_cancel
    nfs_lock_and_join_requests // 假设这里真有情况会返回NULL
    nfs_inode_remove_request // 这里执行完wb_flags肯定清除了`PG_MAPPED`和`PG_INODE_REF`
    nfs_unlock_and_release_request

// 已排除
// 1873 fs/nfs/write.c
nfs_commit_release_pages // .completion
  if (req->wb_page) // 满足条件
  nfs_inode_remove_request // 这里执行完wb_flags肯定清除了`PG_MAPPED`和`PG_INODE_REF`
  nfs_unlock_and_release_request

// 已排除
// 1784 fs/nfs/write.c
// pnfs相关,不涉及
pnfs_generic_retry_commit
  nfs_retry_commit
    nfs_unlock_and_release_request

// 已排除
// 1207 fs/nfs/write.c
// 初始化nfs_page,不会重复初始化
nfs_updatepage
  nfs_writepage_setup
    nfs_unlock_and_release_request

// 已排除
// 1166 fs/nfs/write.c
// 初始化nfs_page,不会重复初始化
nfs_writepage_setup
  nfs_setup_write_request
    nfs_try_to_update_request
      nfs_unlock_and_release_request

4 补丁

查找含有函数关键字的相关补丁:

git log origin/master --oneline --date=short --format="%cd %h %s %an <%ae>" --grep=nfs_end_page_writeback
  # 都是2012年前的补丁,4.19内核肯定包含2012年的补丁

# 4.19内核肯定包含2012年的补丁
git log origin/master --oneline --date=short --format="%cd %h %s %an <%ae>" --grep=nfs_unlock_request
  # 都是2012年前的补丁,4.19内核肯定包含2012年的补丁

git log origin/master --oneline --date=short --format="%cd %h %s %an <%ae>" --grep=nfs_lock_and_join_requests
  # 2024-11-18 66f9dac9077c Revert "nfs: don't reuse partially completed requests in nfs_lock_and_join_requests" Trond Myklebust <trond.myklebust@hammerspace.com>
  # 2024-07-08 b571cfcb9dca nfs: don't reuse partially completed requests in nfs_lock_and_join_requests Christoph Hellwig <hch@lst.de>
  # 2024-07-08 25edbcac6e32 nfs: fold nfs_page_group_lock_subrequests into nfs_lock_and_join_requests Christoph Hellwig <hch@lst.de>
  # 2024-07-08 c3f2235782c3 nfs: fold nfs_folio_find_and_lock_request into nfs_lock_and_join_requests Christoph Hellwig <hch@lst.de>
  # 2020-04-01 e00ed89d7bd5 NFS: Refactor nfs_lock_and_join_requests() Trond Myklebust <trond.myklebust@hammerspace.com>
  # 2020-04-01 a62f8e3bd836 NFS: Clean up nfs_lock_and_join_requests() Trond Myklebust <trond.myklebust@hammerspace.com>
  # 2020-04-01 08ca8b21f760 NFS: Fix races nfs_page_group_destroy() vs nfs_destroy_unlinked_subrequests() Trond Myklebust <trond.myklebust@hammerspace.com>
  # 2020-04-01 add42de31721 NFS: Fix a page leak in nfs_destroy_unlinked_subrequests() Trond Myklebust <trond.myklebust@hammerspace.com>

查找nfs_unlock_request()函数修改记录:

git log -L:nfs_unlock_request:fs/nfs/pagelist.c
  # v5.16-rc1 43d20e80e288 NFS: Fix a few more clear_bit() instances that need release semantics

5 调试

补丁0001-debug-nfs_unlock_request.patch

6 构造复现

还在尝试中,暂未构造出来。

6.1 让内核执行到nfs_redirty_request()所做的修改

内核做以下修改,写文件就能执行到nfs_redirty_request():

--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -869,22 +870,35 @@ static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
 {
        unsigned int mirror_count = 1;
 
+       if (1) {
+               pgio->pg_error = -ERESTARTSYS;
+               pgio->pg_mirrors = pgio->pg_mirrors_static;
+               mirror_count = 1;
+               pgio->pg_mirror_count = mirror_count;
+               return;
+       }
+
        if (pgio->pg_ops->pg_get_mirror_count)
                mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);

6.2nfs_redirty_request()中delay

--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1404,6 +1405,9 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
  */
 static void nfs_redirty_request(struct nfs_page *req)
 {
+       printk("%s:%d, delay begin\n", __func__, __LINE__);
+       mdelay(20 * 1000);
+       printk("%s:%d, delay end\n", __func__, __LINE__);
        nfs_mark_request_dirty(req);
        set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
        nfs_end_page_writeback(req);

6.3 触发invalidatepage

write.c:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const char *filename = "/mnt/file";
    const size_t data_size = 4100;
    FILE *fp = fopen(filename, "wb");
    if (!fp) {
        perror("fopen");
        return EXIT_FAILURE;
    }

    // 分配 4090 字节的缓冲区,并填充字符 'A'
    char *buffer = malloc(data_size);
    if (!buffer) {
        perror("malloc");
        fclose(fp);
        return EXIT_FAILURE;
    }
    memset(buffer, 'A', data_size);

    // 写入数据
    size_t written = fwrite(buffer, 1, data_size, fp);
    if (written != data_size) {
        perror("fwrite");
        free(buffer);
        fclose(fp);
        return EXIT_FAILURE;
    }

    // 在关闭文件之前,睡眠
    sleep(100000);

    // 释放资源并关闭文件
    free(buffer);
    fclose(fp);
    return EXIT_SUCCESS;
}

truncate.c:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>

int main(void)
{
    const char *filename = "/mnt/file";
    int fd = open(filename, O_RDWR | O_CREAT, 0666);
    if (fd < 0) {
        perror("open");
        return EXIT_FAILURE;
    }

    // 调用 ftruncate 将文件截断到较小尺寸,比如 100 字节,
    // 这会使得超过 100 字节部分的页缓存被标记为无效(触发 invalidate page)
    off_t new_size = 100;
    if (ftruncate(fd, new_size) < 0) {
        perror("ftruncate");
        close(fd);
        return EXIT_FAILURE;
    }

    sleep(100000);

    close(fd);
    return EXIT_SUCCESS;
}
gcc -o write write.c
gcc -o truncate truncate.c
./write &
sleep 1
./truncate &